In [2]:
import os
import numpy as np, pandas as pd, xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import joblib

In [3]:
# The training CSV is resolved relative to the parent of the current working
# directory: <parent of cwd>/data/training/final_anomaly_training.csv
train_csv_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'training')
train_csv_path = os.path.join(train_csv_dir, 'final_anomaly_training.csv')

train_data = pd.read_csv(train_csv_path)

# Summary statistics as a quick sanity check of the loaded columns.
train_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,0.7620937,1.32714,0.5115073,-0.118658,0.350732,0.10073,-0.349274,0.087404
std,2.952638,9.519952,1.749233,0.323386,0.4772,0.300971,0.476741,0.282426
min,-0.4522426,-0.326715,-0.6123311,-1.0,0.0,0.0,-1.0,0.0
25%,-0.2770901,-0.2293651,-0.3214841,0.0,0.0,0.0,-1.0,0.0
50%,4.03482e-17,-1.8200200000000003e-17,3.426058e-17,0.0,0.0,0.0,0.0,0.0
75%,0.7229099,0.7706349,0.6785159,0.0,1.0,0.0,0.0,0.0
max,262.9377,3885.243,164.6674,0.0,1.0,1.0,0.0,1.0


In [4]:
# Separate the label column ('fraud') from the predictor columns.
y_train = train_data['fraud']
X_train = train_data.drop(columns=['fraud'])

# Negative-to-positive ratio, later passed to XGBoost as scale_pos_weight.
# max(1, ...) keeps the division defined if the data contains no positives.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
spw = n_negative / max(1, n_positive)

In [5]:
# Hyper-parameters are gathered in one mapping so they can be inspected or
# logged before the (still unfitted) classifier is constructed from them.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",          # better for rare positives
    "scale_pos_weight": spw,         # re-weights the positive class by the neg/pos ratio
    "n_estimators": 2000,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": 42,              # fixed seed for reproducible runs
}

model = xgb.XGBClassifier(**xgb_params)

In [6]:
# The held-out CSV is resolved relative to the parent of the current working
# directory: <parent of cwd>/data/testing/final_heldout_testing.csv
heldout_csv_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'testing')
heldout_csv_path = os.path.join(heldout_csv_dir, 'final_heldout_testing.csv')

eval_data = pd.read_csv(heldout_csv_path)

# Summary statistics as a quick sanity check of the loaded columns.
eval_data.describe()


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,0.751245,1.322413,0.508681,-0.118036,0.34984,0.100684,-0.349688,0.087404
std,3.044706,7.214491,1.702321,0.322651,0.47692,0.30091,0.476872,0.282427
min,-0.453118,-0.326649,-0.612094,-1.0,0.0,0.0,-1.0,0.0
25%,-0.277074,-0.22956,-0.322691,0.0,0.0,0.0,-1.0,0.0
50%,-0.000638,0.002164,2.4e-05,0.0,0.0,0.0,0.0,0.0
75%,0.708908,0.780644,0.676347,0.0,1.0,0.0,0.0,0.0
max,482.571443,872.947532,103.155886,0.0,1.0,1.0,0.0,1.0


In [7]:
# Separate the label column ('fraud') from the predictors of the held-out set.
y_val = eval_data['fraud']
X_val = eval_data.drop(columns=['fraud'])

# The held-out split is handed to XGBoost as the evaluation set.
# NOTE(review): no early stopping is requested in this call, so the evaluation
# set appears to be used only for recording the metric per boosting round.
validation_sets = [(X_val, y_val)]

model.fit(X_train, y_train, eval_set=validation_sets, verbose=False)

In [8]:
# Positive-class probability for every held-out row (column 1 of predict_proba).
p_val = model.predict_proba(X_val)[:, 1]

# Precision/recall at every candidate threshold. The returned precision and
# recall arrays have one more element than `thr`; the final point has no
# associated threshold, hence the [:-1] slice below.
prec, rec, thr = precision_recall_curve(y_val, p_val)

# F1 at each operating point; the small epsilon avoids a 0/0 division.
f1s = (2 * prec * rec) / (prec + rec + 1e-12)

# Pick the threshold whose operating point maximises F1.
best_idx = np.argmax(f1s[:-1])
best_thr = float(thr[best_idx])

best_f1 = f1_score(y_val, p_val >= best_thr)
print("Best F1:", best_f1, " @ thr=", best_thr)

Best F1: 0.9887953264416603  @ thr= 0.7745236158370972


In [9]:
# The test CSV is resolved relative to the parent of the current working
# directory: <parent of cwd>/data/testing/final_split_testing.csv
test_csv_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'testing')
test_csv_path = os.path.join(test_csv_dir, 'final_split_testing.csv')

test_data = pd.read_csv(test_csv_path)

# Summary statistics as a quick sanity check of the loaded columns.
test_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,0.749996,1.321497,0.508218,-0.118504,0.350292,0.100288,-0.349556,0.0874
std,2.931037,7.339187,1.7101,0.323205,0.477062,0.300384,0.476831,0.282421
min,-0.45208,-0.326656,-0.613159,-1.0,0.0,0.0,-1.0,0.0
25%,-0.277367,-0.229685,-0.323477,0.0,0.0,0.0,-1.0,0.0
50%,-0.001399,0.000538,-0.000515,0.0,0.0,0.0,0.0,0.0
75%,0.710764,0.772658,0.678479,0.0,1.0,0.0,0.0,0.0
max,398.275579,892.86877,89.624772,0.0,1.0,1.0,0.0,1.0


In [13]:
# Evaluate the fitted model on the test split.
# roc_auc_score, confusion_matrix and classification_report are imported in the
# notebook's top import cell so that all dependencies are declared in one place.

# Separate the label column ('fraud') from the predictors.
X_test = test_data.drop(columns=['fraud'])
y_test = test_data['fraud']

# Positive-class probabilities, turned into hard 0/1 labels with the
# previously selected decision threshold `best_thr`.
p_test = model.predict_proba(X_test)[:, 1]
y_pred = (p_test >= best_thr).astype(int)

# ROC AUC is computed from the raw scores; the confusion matrix and the
# per-class report are computed from the thresholded labels.
roc_auc = roc_auc_score(y_test, p_test)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0, digits=4)

# One-row table holding the scalar summary metric.
metrics_table = pd.DataFrame({
    'Metric': ['ROC AUC'],
    'Value': [roc_auc]
})

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)
print("\nMetrics Table:\n", metrics_table)

Confusion Matrix:
 [[227875    275]
 [   182  21668]]

Classification Report:
               precision    recall  f1-score   support

         0.0     0.9992    0.9988    0.9990    228150
         1.0     0.9875    0.9917    0.9896     21850

    accuracy                         0.9982    250000
   macro avg     0.9933    0.9952    0.9943    250000
weighted avg     0.9982    0.9982    0.9982    250000


Metrics Table:
     Metric     Value
0  ROC AUC  0.999971


In [12]:
# Persist the fitted model to <parent of cwd>/results/xgboost_model.joblib.
dirpath = os.path.join(os.path.dirname(os.getcwd()), "results")
filepath = os.path.join(dirpath, "xgboost_model.joblib")

# exist_ok=True creates the folder when it is missing and is a no-op otherwise,
# replacing the separate check-then-create sequence.
os.makedirs(dirpath, exist_ok=True)

# joblib.dump writes the target file afresh, replacing any previous copy, so no
# explicit delete is needed beforehand. The trailing ';' suppresses the cell
# output, which would otherwise embed the absolute local path in the notebook.
joblib.dump(model, filepath);

['c:\\Users\\abudi\\OneDrive\\Documents\\Uni Work\\Capstone Project\\fraud-detection-capstone\\results\\xgboost_model.joblib']