Imports

In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, confusion_matrix

Load Processed Data

In [2]:
# Read the pre-processed train/test splits from disk.
print("Loading processed data...")

# Feature tables are kept as DataFrames exactly as stored in the CSVs.
X_train = pd.read_csv('../data/processed/train_features.csv')
X_test = pd.read_csv('../data/processed/test_features.csv')

# Target files are read as DataFrames and then flattened into 1-D arrays
# (presumably one label column per file -- TODO confirm against the
# preprocessing step that wrote them).
train_target_frame = pd.read_csv('../data/processed/train_target.csv')
test_target_frame = pd.read_csv('../data/processed/test_target.csv')
y_train = train_target_frame.to_numpy().ravel()
y_test = test_target_frame.to_numpy().ravel()

Loading processed data...


Baseline Model (Logistic Regression)

In [3]:
# Baseline: logistic regression fitted on the training split.
print("Training Baseline: Logistic Regression...")
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Score the test split. Column index 1 of the probability matrix is kept as
# the score fed to the ranking metrics; the hard labels feed the report.
lr_test_proba = lr_model.predict_proba(X_test)
y_prob_lr = lr_test_proba[:, 1]
y_pred_lr = lr_model.predict(X_test)

# Per-class precision/recall/F1 first, then the two threshold-free summaries.
print("\n--- Baseline Results ---")
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob_lr):.4f}")
print(f"AUC-PR: {average_precision_score(y_test, y_prob_lr):.4f}")

Training Baseline: Logistic Regression...

--- Baseline Results ---
              precision    recall  f1-score   support

           0       0.95      0.65      0.77     27393
           1       0.17      0.70      0.27      2830

    accuracy                           0.65     30223
   macro avg       0.56      0.67      0.52     30223
weighted avg       0.88      0.65      0.73     30223

AUC-ROC: 0.7557
AUC-PR: 0.4351


Ensemble Model (Random Forest)

In [4]:
# Ensemble candidate: a random forest with 100 depth-limited trees,
# trained using all available cores (n_jobs=-1).
print("\nTraining Ensemble: Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the test split. Column index 1 of the probability matrix is kept as
# the score fed to the ranking metrics; the hard labels feed the report.
rf_test_proba = rf_model.predict_proba(X_test)
y_prob_rf = rf_test_proba[:, 1]
y_pred_rf = rf_model.predict(X_test)

# Same evaluation as the baseline cell so the two models can be compared.
print("\n--- Random Forest Results ---")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob_rf):.4f}")
print(f"AUC-PR: {average_precision_score(y_test, y_prob_rf):.4f}")


Training Ensemble: Random Forest...

--- Random Forest Results ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     27393
           1       0.99      0.55      0.71      2830

    accuracy                           0.96     30223
   macro avg       0.97      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

AUC-ROC: 0.7695
AUC-PR: 0.6369


Save Best Model

In [5]:
# Persist the Random Forest. In the recorded run it scored higher than the
# logistic-regression baseline on both AUC-ROC and AUC-PR; re-check that
# comparison if the data or hyper-parameters change, because the choice of
# model here is hard-coded rather than computed.
MODEL_PATH = '../models/fraud_detection_model.pkl'

# joblib.dump does not create missing parent directories. Create the target
# folder first so this cell also succeeds on a fresh checkout where
# ../models/ does not exist yet (exist_ok makes re-runs a no-op).
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

joblib.dump(rf_model, MODEL_PATH)
# NOTE(review): the message omits the leading "../" -- presumably it is meant
# to be read relative to the project root; left unchanged so the output matches.
print("Best model saved to models/fraud_detection_model.pkl")

Best model saved to models/fraud_detection_model.pkl
