In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import numpy as np

# ------------------------
# STEP 1: Load preprocessed data + artifacts
# ------------------------
X_train, X_test, y_train, y_test = joblib.load(
    "/Users/akashbhat/credit-card-fraud-detection/data/processed_data.pkl"
)

# Load fitted scaler + feature order from preprocessing
scaler = joblib.load("/Users/akashbhat/credit-card-fraud-detection/data/scaler.pkl")
feature_columns = joblib.load("/Users/akashbhat/credit-card-fraud-detection/data/feature_columns.pkl")

# Apply the scaler ONLY on the Amount column (it’s already created as Amount_scaled)
# No need to refit scaler! Just ensure feature order matches
X_train = X_train[feature_columns]
X_test = X_test[feature_columns]

# ------------------------
# STEP 2: Train baseline models
# ------------------------
ratio = len(y_train[y_train==0]) / len(y_train[y_train==1])  # imbalance ratio

models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42),
    "XGBoost": XGBClassifier(
        scale_pos_weight=ratio,
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
}

results = {}

for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(X_train, y_train)   # <── train on aligned DataFrame, not re-scaled numpy array
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = report["1"]  # fraud metrics only
    print(classification_report(y_test, y_pred, digits=4))

# ------------------------
# STEP 3: Hyperparameter Tuning for XGBoost
# ------------------------
param_grid = {
    "n_estimators": [200, 500, 800],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

xgb = XGBClassifier(
    scale_pos_weight=ratio,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

print("\n🔎 Running RandomizedSearchCV for XGBoost... (this may take a while)")
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    scoring="f1",
    cv=3,
    verbose=2,
    n_iter=10,
    n_jobs=-1
)

search.fit(X_train, y_train)
print("✅ Best parameters from tuning:", search.best_params_)

best_xgb = search.best_estimator_
y_pred_best = best_xgb.predict(X_test)
report_best = classification_report(y_test, y_pred_best, output_dict=True)
results["XGBoost Tuned"] = report_best["1"]

print("\n📊 Classification Report (XGBoost Tuned):")
print(classification_report(y_test, y_pred_best, digits=4))

# ------------------------
# STEP 4: Save the best model
# ------------------------
joblib.dump(best_xgb, "/Users/akashbhat/credit-card-fraud-detection/data/fraud_model.pkl")
print("✅ Saved fraud_model.pkl")



🔹 Training Logistic Regression...
              precision    recall  f1-score   support

           0     0.9999    0.9746    0.9870     56864
           1     0.0586    0.9184    0.1101        98

    accuracy                         0.9745     56962
   macro avg     0.5292    0.9465    0.5486     56962
weighted avg     0.9982    0.9745    0.9855     56962


🔹 Training Random Forest...
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998     56864
           1     0.9610    0.7551    0.8457        98

    accuracy                         0.9995     56962
   macro avg     0.9803    0.8775    0.9227     56962
weighted avg     0.9995    0.9995    0.9995     56962


🔹 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0     0.9997    0.9998    0.9997     56864
           1     0.8632    0.8367    0.8497        98

    accuracy                         0.9995     56962
   macro avg     0.9314    0.9183    0.9247     56962
weighted avg     0.9995    0.9995    0.9995     56962


🔎 Running RandomizedSearchCV for XGBoost... (this may take a while)
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


✅ Best parameters from tuning: {'subsample': 0.7, 'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

📊 Classification Report (XGBoost Tuned):
              precision    recall  f1-score   support

           0     0.9997    0.9998    0.9997     56864
           1     0.8632    0.8367    0.8497        98

    accuracy                         0.9995     56962
   macro avg     0.9314    0.9183    0.9247     56962
weighted avg     0.9995    0.9995    0.9995     56962

✅ Saved fraud_model.pkl and scaler.pkl
