# Setup (on Colab)

In [None]:
# Fetch the project repository into the current working directory.
!git clone https://github.com/alicebarbosam/fraud-detection-kaggle.git
# Make the cloned repository the working directory for the following cells.
# NOTE(review): later cells read "../models", "../data" and write "../outputs";
# from the repository root those resolve to the parent of the clone — confirm
# the intended working directory (e.g. a subfolder of the repo) before running.
%cd fraud-detection-kaggle

In [28]:
import os
import joblib
import pandas as pd
import numpy as np

# Load the persisted estimator from disk (path is relative to the working directory).
# Presumably an Optuna-tuned XGBoost classifier, judging by the file name — confirm.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load artifacts that come from a trusted source.
model = joblib.load("../models/xgb_optuna_final.joblib")

# Read the test set; it must provide an "id" column plus the feature columns
# used below ("Amount", "V4", "V10", "V12", "V14").
test_df = pd.read_csv("../data/test.csv")

def add_features(df: "pd.DataFrame") -> "pd.DataFrame":
    """Return a copy of *df* extended with the engineered feature columns.

    The input frame is left untouched. It must contain the columns
    ``Amount``, ``V4``, ``V10``, ``V12`` and ``V14``. The following columns
    are appended, in this order:

    * ``Amount_log`` -- ``log(1 + Amount)``
    * ``V14_abs``, ``V10_abs``, ``V12_abs``, ``V4_abs`` -- absolute values
    * ``V14_x_V10``, ``V14_x_V12`` -- element-wise products with ``V14``
    """
    enriched = df.copy()

    # Log-transform the amount to dampen the effect of outliers.
    enriched["Amount_log"] = np.log1p(enriched["Amount"])

    # Magnitude of the most dominant features (new column -> source column).
    magnitude_sources = {
        "V14_abs": "V14",
        "V10_abs": "V10",
        "V12_abs": "V12",
        "V4_abs": "V4",
    }
    for new_col, source_col in magnitude_sources.items():
        enriched[new_col] = enriched[source_col].abs()

    # Interactions of V14 with the other dominant features.
    interaction_partners = {
        "V14_x_V10": "V10",
        "V14_x_V12": "V12",
    }
    for new_col, partner_col in interaction_partners.items():
        enriched[new_col] = enriched["V14"] * enriched[partner_col]

    return enriched

# Extend the raw test set with the engineered columns.
test_df = add_features(test_df)

# "id" is carried into the submission but excluded from the model inputs.
test_ids = test_df["id"]
X_test = test_df.drop(columns=["id"])

# predict_proba yields one column per class; column index 1 is kept as the score.
test_probs = model.predict_proba(X_test)[:, 1]

# Two-column submission: the identifier and its predicted score.
submission = pd.DataFrame({"id": test_ids, "target": test_probs})

# Create the output folder if needed, then write the CSV without the index column.
os.makedirs("../outputs", exist_ok=True)
submission.to_csv("../outputs/submission.csv", index=False)

print("Saved: ../outputs/submission.csv")
print(submission.head())

Saved: ../outputs/submission.csv
       id    target
0  263021  0.000007
1   11379  0.000011
2  147284  0.000090
3  219440  0.000003
4   36940  0.000529


In [29]:
# Quick sanity check of the predicted score distribution.
scores = submission["target"]

print(scores.describe())
print("max:", scores.max())
print("min:", scores.min())

# Number of rows whose score exceeds each cutoff.
for label, cutoff in (
    ("count > 0.5:", 0.5),
    ("count > 0.1:", 0.1),
    ("count > 0.01:", 0.01),
):
    print(label, (scores > cutoff).sum())

count    5.696200e+04
mean     2.163549e-03
std      4.197936e-02
min      1.517142e-07
25%      7.839470e-06
50%      1.997999e-05
75%      5.891907e-05
max      9.999971e-01
Name: target, dtype: float64
max: 0.99999714
min: 1.5171419e-07
count > 0.5: 104
count > 0.1: 150
count > 0.01: 345
