# Interim‑2 Modeling — E‑commerce Fraud

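Dataset: **fraud** (`../data/processed/fraud_data_processed.csv`)  
Target: **class** (integer label; its mean is reported below as the fraud rate)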

This notebook fully addresses **Interim‑2 feedback**:
- Logistic Regression baseline (Task 2a)
- Random Forest ensemble + tuning (Task 2b)
- AUC‑PR, F1‑Score, Confusion Matrix
- Model comparison and saved artifact


In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix, classification_report

import joblib

# One seed shared by every stochastic step in this notebook
# (train/test split, CV shuffling, model initialisation).
RANDOM_STATE = 42

# Also seed NumPy's global RNG so code that draws from np.random
# without an explicit random_state is repeatable too.
np.random.seed(seed=RANDOM_STATE)


In [3]:
# Load the processed fraud table, then separate the label from the features.
target_col = "class"
df = pd.read_csv("../data/processed/fraud_data_processed.csv")
y = df[target_col].astype(int)
X = df.drop(columns=target_col)

# Stratified 80/20 hold-out so train and test keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Mean of the integer label = share of positive rows in the full dataset.
print("Fraud rate:", y.mean())


Fraud rate: 0.09364577267192546


In [4]:
# Route columns by dtype: numeric columns are standardised, everything else
# is one-hot encoded. Column lists come from the training split only.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

numeric_step = ("num", StandardScaler(), num_cols)
# handle_unknown="ignore": categories not seen during fit encode as all zeros
# instead of raising at predict time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [5]:
# Logistic Regression baseline
# Class-weighted logistic regression on top of the shared preprocessing step.
logreg_model = LogisticRegression(
    class_weight="balanced",
    max_iter=3000,
    random_state=RANDOM_STATE,
)
logreg = Pipeline(steps=[("preprocess", preprocess), ("model", logreg_model)])
logreg.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the positive class; predict()
# gives the hard labels used for F1 and the confusion matrix.
y_proba = logreg.predict_proba(X_test)[:, 1]
y_pred = logreg.predict(X_test)

# Hold-out metrics: AUC-PR from the scores, F1 / confusion matrix from labels.
logreg_auc_pr = average_precision_score(y_test, y_proba)
logreg_f1 = f1_score(y_test, y_pred)
logreg_cm = confusion_matrix(y_test, y_pred)

print("LogReg AUC‑PR:", logreg_auc_pr)
print("LogReg F1:", logreg_f1)
print(logreg_cm)


LogReg AUC‑PR: 0.6543021345975666
LogReg F1: 0.6684503901895206
[[27237   156]
 [ 1331  1499]]


In [None]:
# Random Forest with tuning
# Class-weighted random forest behind the shared preprocessing step, tuned by
# a stratified 5-fold grid search scored on average precision (AUC-PR).
rf = Pipeline([
    ("preprocess", preprocess),
    # The forest parallelises tree building across all cores (n_jobs=-1).
    ("model", RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1))
])

# 2 x 3 = 6 candidates; with 5 folds that is 30 fits plus the final refit.
param_grid = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [8, 12, None]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Parallelise at ONE level only. The forest already uses every core, so the
# search itself runs sequentially (n_jobs=1). Setting n_jobs=-1 on both the
# search and the forest lets every search worker start its own full set of
# forest threads, which can oversubscribe the CPU and hold several forests
# in memory at once. random_state is fixed, so the selected model does not
# depend on the n_jobs setting.
gs = GridSearchCV(rf, param_grid, scoring="average_precision", cv=cv, n_jobs=1)
gs.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refit on the full training split.
best_rf = gs.best_estimator_

# Report what the search selected so the tuning result is visible in the output.
print("RF best params:", gs.best_params_)
print("RF CV AUC‑PR:", gs.best_score_)

# Hold-out evaluation, same metrics as the logistic-regression baseline.
y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)[:, 1]

print("RF AUC‑PR:", average_precision_score(y_test, y_proba))
print("RF F1:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


: 

In [None]:
# Persist the tuned random-forest pipeline (preprocessing + model) to disk.
model_dir = "../models"
model_path = "../models/best_model_fraud.pkl"

# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_rf, model_path)
print("Saved ../models/best_model_fraud.pkl")
