In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Imbalance tools
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

RANDOM_STATE = 42

# ---- Load your processed dataset ----
DATA_PATH = "../data/processed/cleaned_transactions.csv"
TIMESTAMP_COL = "timestamp"
TARGET_COL = "is_fraud"

df = pd.read_csv(DATA_PATH)
df[TIMESTAMP_COL] = pd.to_datetime(df[TIMESTAMP_COL], errors="coerce")
df = df.dropna(subset=[TIMESTAMP_COL]).sort_values(TIMESTAMP_COL).reset_index(drop=True)

# ---- Feature groups ----
numeric_features = (
    df.select_dtypes(include=["number"])
      .columns
      .drop([TARGET_COL], errors="ignore")
      .tolist()
)
categorical_features = (
    df.select_dtypes(include=["object", "category", "bool"])
      .columns
      .drop([TARGET_COL, TIMESTAMP_COL], errors="ignore")
      .tolist()
)

X = df[numeric_features + categorical_features]
y = df[TARGET_COL].astype(int)

# ---- Time split (same approach as Day 1) ----
split_index = int(len(df) * 0.8)
X_train_raw, X_test_raw = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print("Train shape:", X_train_raw.shape, "Test shape:", X_test_raw.shape)
print("Train fraud rate:", y_train.mean(), "Test fraud rate:", y_test.mean())

# ---- Preprocessing ----
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
], remainder="drop")


# ---- Experiment logger ----
def evaluate_probs(y_true, y_proba, threshold=0.5):
    y_pred = (y_proba >= threshold).astype(int)
    return {
        "threshold": threshold,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_true, y_proba)
    }

experiment_log = []

Train shape: (8832, 26) Test shape: (2208, 26)
Train fraud rate: 0.07676630434782608 Test fraud rate: 0.1408514492753623


In [6]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
print("imblearn version:", imblearn.__version__)

imblearn version: 0.11.0


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve

def eval_at_threshold(y_true, y_proba, threshold=0.5):
    y_pred = (y_proba >= threshold).astype(int)
    return {
        "threshold": threshold,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_true, y_proba),
    }

experiment_log = []

def log_result(model_name, imbalance_method, y_true, y_proba, threshold=0.5, notes=""):
    row = {
        "model": model_name,
        "imbalance_method": imbalance_method,
        **eval_at_threshold(y_true, y_proba, threshold),
        "notes": notes
    }

    # remove any existing row with same model + method + threshold
    global experiment_log
    experiment_log = [
        r for r in experiment_log
        if not (r["model"] == model_name and r["imbalance_method"] == imbalance_method and r["threshold"] == threshold)
    ]

    experiment_log.append(row)
    return row

## Random Forest + Random UnderSampling (Imbalance Handling)

### Why are we doing this?
Fraud datasets are imbalanced (few fraud cases, many legitimate cases). Many models become biased toward predicting “legit” because that’s the majority class.

**RandomUnderSampler** reduces the number of legitimate transactions in the training set so the model sees a more balanced dataset during learning.  
This often improves **recall** (catching more fraud), but it can sometimes reduce **precision** (more false alarms).

### What this experiment does
1. Applies the same preprocessing pipeline (scaling numeric + encoding categorical).
2. Undersamples the majority class (legit) **only on the training data**.
3. Trains a Random Forest model.
4. Predicts fraud probabilities on the test set.
5. Evaluates precision, recall, F1, and ROC-AUC at a chosen threshold (0.5).
6. Logs the results into our experiment tracker.

In [13]:
# =========================
# Random Forest + UnderSampling (Imbalance Handling)
# =========================

# 1) Import the model and imbalance tool
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# 2) Build an imbalanced-learn pipeline
#    Why ImbPipeline?
#    - It allows sampling steps (like RandomUnderSampler) inside a pipeline
#    - It ensures the sampler runs ONLY during training (fit), not during predict
rf_under = ImbPipeline(steps=[
    ("preprocess", preprocess),  # (a) apply preprocessing: impute + scale + one-hot encode
    ("under", RandomUnderSampler(random_state=42)),  # (b) undersample majority class in TRAIN only
    ("model", RandomForestClassifier(
        n_estimators=300,      # number of trees (more trees = more stable, but slower)
        random_state=42,       # for reproducibility
        n_jobs=-1              # use all CPU cores
    ))
])

# 3) Train the pipeline on the training split
rf_under.fit(X_train_raw, y_train)

# 4) Predict probabilities on the test split
#    We take [:, 1] because column 1 is the probability of class 1 (fraud)
y_proba = rf_under.predict_proba(X_test_raw)[:, 1]

# 5) Log results (evaluated at threshold=0.5)
#    This stores metrics in your experiment_log list so we can compare many runs later
log_result(
    model_name="RandomForest",
    imbalance_method="RandomUnderSampler",
    y_true=y_test,
    y_proba=y_proba,
    threshold=0.5,
    notes="Baseline RF + undersampling"
)

# 6) Display experiment results sorted by best F1 score
pd.DataFrame(experiment_log).sort_values("f1", ascending=False)

Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc,notes
0,RandomForest,RandomUnderSampler,0.5,0.903427,0.932476,0.917722,0.978038,Baseline RF + undersampling


## Random Forest + SMOTE (Oversampling the Minority Class)

### Why SMOTE?
Undersampling throws away many legitimate transactions, which can remove useful patterns.
**SMOTE** (Synthetic Minority Oversampling Technique) instead *adds* synthetic fraud samples to the training set, making the classes more balanced **without deleting majority-class data**.

### What this experiment does
1. Preprocess the data (impute + scale + one-hot encode).
2. Apply SMOTE **on the training data only** to generate additional fraud-like samples.
3. Train a Random Forest model on the SMOTE-balanced training data.
4. Predict fraud probabilities on the untouched test set.
5. Evaluate and log performance (precision, recall, F1, ROC-AUC).

In [14]:
# =========================
# Random Forest + SMOTE (Imbalance Handling)
# =========================

from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# NOTE:
# SMOTE creates synthetic minority samples by looking at nearest neighbors.
# It works best on numeric feature spaces. Since we one-hot encode categoricals,
# SMOTE can still run, but it's not always ideal for one-hot encoded data.
# For today (Day 3), it's still a valid experiment. Later, we can consider SMOTENC.

rf_smote = ImbPipeline(steps=[
    ("preprocess", preprocess),                 # (a) preprocessing
    ("smote", SMOTE(random_state=42)),          # (b) oversample fraud class in TRAIN only
    ("model", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ))
])

# Train
rf_smote.fit(X_train_raw, y_train)

# Predict probabilities on TEST
y_proba = rf_smote.predict_proba(X_test_raw)[:, 1]

# Log results
log_result(
    model_name="RandomForest",
    imbalance_method="SMOTE",
    y_true=y_test,
    y_proba=y_proba,
    threshold=0.5,
    notes="RF + SMOTE oversampling"
)

# View results
pd.DataFrame(experiment_log).sort_values("f1", ascending=False)

Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc,notes
1,RandomForest,SMOTE,0.5,1.0,0.916399,0.956376,0.971303,RF + SMOTE oversampling
0,RandomForest,RandomUnderSampler,0.5,0.903427,0.932476,0.917722,0.978038,Baseline RF + undersampling


## Random Forest + Class Weights (No Sampling)

### Why this approach?
Instead of changing the dataset size (undersampling) or generating synthetic samples (SMOTE),
we tell the model that fraud errors are more costly by using **class_weight**.

This often improves recall while keeping the dataset intact.

### What this experiment does
1. Preprocess data (impute + scale + one-hot encode)
2. Train a Random Forest with class weighting
3. Predict probabilities on the test set
4. Evaluate precision/recall/F1/ROC-AUC and log results

In [15]:
# =========================
# Random Forest + Class Weights (No Sampling)
# =========================

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rf_weighted = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample"  # balances classes per tree sample
    ))
])

# Train
rf_weighted.fit(X_train_raw, y_train)

# Predict probabilities
y_proba = rf_weighted.predict_proba(X_test_raw)[:, 1]

# Log results
log_result(
    model_name="RandomForest",
    imbalance_method="class_weight=balanced_subsample",
    y_true=y_test,
    y_proba=y_proba,
    threshold=0.5,
    notes="RF without sampling; uses class weights"
)

# View log (latest runs only)
pd.DataFrame(experiment_log).sort_values("f1", ascending=False)

Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc,notes
1,RandomForest,SMOTE,0.5,1.0,0.916399,0.956376,0.971303,RF + SMOTE oversampling
2,RandomForest,class_weight=balanced_subsample,0.5,1.0,0.916399,0.956376,0.974102,RF without sampling; uses class weights
0,RandomForest,RandomUnderSampler,0.5,0.903427,0.932476,0.917722,0.978038,Baseline RF + undersampling


In [16]:
import importlib

for pkg in ["xgboost", "lightgbm"]:
    try:
        importlib.import_module(pkg)
        print(f"✅ {pkg} is installed")
    except Exception as e:
        print(f"❌ {pkg} not installed -> {e}")

✅ xgboost is installed
❌ lightgbm not installed -> No module named 'lightgbm'


## XGBoost (Advanced Model) + Imbalance Handling

### Why XGBoost?
XGBoost is a strong gradient-boosting model that often outperforms bagging models (like Random Forest)
on structured/tabular datasets.

### How we handle imbalance in XGBoost
Instead of SMOTE/undersampling, XGBoost can directly weight the minority class using:

**scale_pos_weight = (#negative / #positive)**

This tells the model that fraud errors are more costly and often improves recall.

In [17]:
from xgboost import XGBClassifier

# 1) Compute scale_pos_weight from training set
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)

# 2) XGBoost pipeline (uses same preprocessing)
xgb_weighted = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        eval_metric="logloss",
        scale_pos_weight=scale_pos_weight
    ))
])

# 3) Train
xgb_weighted.fit(X_train_raw, y_train)

# 4) Predict probabilities (fraud class)
y_proba = xgb_weighted.predict_proba(X_test_raw)[:, 1]

# 5) Log results
log_result(
    model_name="XGBoost",
    imbalance_method=f"scale_pos_weight={scale_pos_weight:.2f}",
    y_true=y_test,
    y_proba=y_proba,
    threshold=0.5,
    notes="XGB with class weighting (no sampling)"
)

pd.DataFrame(experiment_log).sort_values("f1", ascending=False)

scale_pos_weight: 12.026548672566372


Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc,notes
1,RandomForest,SMOTE,0.5,1.0,0.916399,0.956376,0.971303,RF + SMOTE oversampling
2,RandomForest,class_weight=balanced_subsample,0.5,1.0,0.916399,0.956376,0.974102,RF without sampling; uses class weights
0,RandomForest,RandomUnderSampler,0.5,0.903427,0.932476,0.917722,0.978038,Baseline RF + undersampling
3,XGBoost,scale_pos_weight=12.03,0.5,0.899371,0.919614,0.90938,0.975314,XGB with class weighting (no sampling)


In [20]:
xgb_t07 = eval_at_threshold(y_test, y_proba, threshold=0.7)
xgb_t07

{'threshold': 0.7,
 'precision': 0.9827586206896551,
 'recall': 0.9163987138263665,
 'f1': 0.9484193011647254,
 'roc_auc': 0.9753138734878392}

In [21]:
thresholds = np.round(np.arange(0.1, 0.8, 0.1), 1)  # 0.1 to 0.7

rows = []
for t in thresholds:
    m = eval_at_threshold(y_test, y_proba, threshold=float(t))
    rows.append({
        "model": "XGBoost",
        "imbalance_method": f"scale_pos_weight={scale_pos_weight:.2f}",
        "threshold": float(t),
        "precision": m["precision"],
        "recall": m["recall"],
        "f1": m["f1"],
        "roc_auc": m["roc_auc"],
    })

xgb_threshold_df = pd.DataFrame(rows).sort_values("f1", ascending=False)
xgb_threshold_df

Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc
6,XGBoost,scale_pos_weight=12.03,0.7,0.982759,0.916399,0.948419,0.975314
5,XGBoost,scale_pos_weight=12.03,0.6,0.937705,0.919614,0.928571,0.975314
4,XGBoost,scale_pos_weight=12.03,0.5,0.899371,0.919614,0.90938,0.975314
3,XGBoost,scale_pos_weight=12.03,0.4,0.857567,0.92926,0.891975,0.975314
2,XGBoost,scale_pos_weight=12.03,0.3,0.776596,0.938907,0.850073,0.975314
1,XGBoost,scale_pos_weight=12.03,0.2,0.653422,0.951768,0.774869,0.975314
0,XGBoost,scale_pos_weight=12.03,0.1,0.473016,0.958199,0.633369,0.975314


## LightGBM (Advanced Model) + Imbalance Handling

### Why LightGBM?
LightGBM is a fast gradient-boosting model (like XGBoost) that often performs very well on tabular data.
It can handle imbalance using built-in weighting instead of resampling.

### Imbalance strategy used
We use **class_weight="balanced"** so the model penalizes mistakes on fraud (minority) more heavily.


In [22]:
from lightgbm import LGBMClassifier

lgbm_weighted = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LGBMClassifier(
        n_estimators=800,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced"   # imbalance handling
    ))
])

# Train
lgbm_weighted.fit(X_train_raw, y_train)

# Predict probabilities
y_proba = lgbm_weighted.predict_proba(X_test_raw)[:, 1]

# Log
log_result(
    model_name="LightGBM",
    imbalance_method="class_weight=balanced",
    y_true=y_test,
    y_proba=y_proba,
    threshold=0.5,
    notes="LGBM with class weighting (no sampling)"
)

pd.DataFrame(experiment_log).sort_values("f1", ascending=False)

[LightGBM] [Info] Number of positive: 678, number of negative: 8154
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2396
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 184
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc,notes
1,RandomForest,SMOTE,0.5,1.0,0.916399,0.956376,0.971303,RF + SMOTE oversampling
2,RandomForest,class_weight=balanced_subsample,0.5,1.0,0.916399,0.956376,0.974102,RF without sampling; uses class weights
4,LightGBM,class_weight=balanced,0.5,0.972696,0.916399,0.943709,0.966474,LGBM with class weighting (no sampling)
0,RandomForest,RandomUnderSampler,0.5,0.903427,0.932476,0.917722,0.978038,Baseline RF + undersampling
3,XGBoost,scale_pos_weight=12.03,0.5,0.899371,0.919614,0.90938,0.975314,XGB with class weighting (no sampling)


In [23]:
thresholds = np.round(np.arange(0.1, 0.8, 0.1), 1)

rows = []
for t in thresholds:
    m = eval_at_threshold(y_test, y_proba, threshold=float(t))  # y_proba from LightGBM
    rows.append({
        "model": "LightGBM",
        "imbalance_method": "class_weight=balanced",
        "threshold": float(t),
        "precision": m["precision"],
        "recall": m["recall"],
        "f1": m["f1"],
        "roc_auc": m["roc_auc"],
    })

lgbm_threshold_df = pd.DataFrame(rows).sort_values("f1", ascending=False)
lgbm_threshold_df

Unnamed: 0,model,imbalance_method,threshold,precision,recall,f1,roc_auc
6,LightGBM,class_weight=balanced,0.7,0.989583,0.916399,0.951586,0.966474
5,LightGBM,class_weight=balanced,0.6,0.982759,0.916399,0.948419,0.966474
4,LightGBM,class_weight=balanced,0.5,0.972696,0.916399,0.943709,0.966474
3,LightGBM,class_weight=balanced,0.4,0.969388,0.916399,0.942149,0.966474
2,LightGBM,class_weight=balanced,0.3,0.962838,0.916399,0.939044,0.966474
1,LightGBM,class_weight=balanced,0.2,0.946844,0.916399,0.931373,0.966474
0,LightGBM,class_weight=balanced,0.1,0.92233,0.916399,0.919355,0.966474


## Reflection (fraud prevalence over time & stability):
Fraud prevalence increased from the training period (7.7%) to the test period (14.1%), showing the distribution of the target changes over time. 
When prevalence shifts, a fixed threshold can produce different precision/recall behaviour because the base rate of fraud affects how many alerts the model generates and how many are true positives. This can make model performance less stable in production, especially precision, because more (or fewer) cases fall above the decision threshold depending on the time window. 

As a result, models should be validated on time-based splits and monitored continuously, with periodic threshold recalibration. Using imbalance-aware approaches (class weights, SMOTE, tuned thresholds) helps maintain recall while controlling false positives under changing fraud rates.