In [None]:
from typing import Dict, List
from project_utils.autosave_plots import enable_autosave

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.tree import export_text
from tqdm.auto import tqdm
from xgboost import XGBClassifier

In [None]:
# Turn on the project's plot autosave helper under the "ml_baselines" label.
# NOTE(review): the original note says figures land in results/; the helper lives in
# project_utils and is not visible here — confirm the target directory.
enable_autosave("ml_baselines", quiet=True)

In [None]:
# Render figures inline and ask the inline backend for high-DPI ("retina") output.
%matplotlib inline
%config InlineBackend.figure_format = "retina"

In [None]:
# Single random seed passed as random_state to every split and model below.
SEED = 42

## Load the cleaned-up data

In [None]:
# Read the processed dataset and preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/processed/creditcard_clean.csv")
df.head(5)

## Creating the training/validation/testing datasets

In [None]:
# The label is the "Class" column; the features are every other column.
y = df["Class"]
X = df.drop(columns=["Class"])

# First cut: 80% train, 20% held out (the held-out part is halved below).
X_train, X_test_validation, y_train, y_test_validation = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Second cut: halve the held-out 20% into test and validation sets.
# train_test_split returns its "train" half first, which is the test set here.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test_validation, y_test_validation, test_size=0.5, random_state=SEED
)

In [None]:
# Report the size and fraud rate of each split, then the feature-matrix shapes.
split_labels = {"Train": y_train, "Validation": y_validation, "Test": y_test}
for name, labels in split_labels.items():
    fraud_percent = labels.mean() * 100
    print(f"{name}: {len(labels)} samples, {fraud_percent:.3f}% fraud")

print(
    "Shapes: "
    f"X_train={X_train.shape}, "
    f"X_validation={X_validation.shape}, "
    f"X_test={X_test.shape}"
)

#### Stratification

In [None]:
# Rebuild the label vector and feature matrix from the loaded frame.
y = df["Class"]
X = df.drop(columns=["Class"])

# Same 80:20 cut as before, now stratified on the label so both parts keep
# the overall class balance.
X_train, X_test_validation, y_train, y_test_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Halve the held-out part into test and validation, stratified on its own labels.
# train_test_split returns its "train" half first, which is the test set here.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test_validation,
    y_test_validation,
    test_size=0.5,
    stratify=y_test_validation,
    random_state=SEED,
)

In [None]:
# Re-run the split report on the stratified splits: size and fraud rate, then shapes.
split_labels = {"Train": y_train, "Validation": y_validation, "Test": y_test}
for name, labels in split_labels.items():
    fraud_percent = labels.mean() * 100
    print(f"{name}: {len(labels)} samples, {fraud_percent:.3f}% fraud")

print(
    "Shapes: "
    f"X_train={X_train.shape}, "
    f"X_validation={X_validation.shape}, "
    f"X_test={X_test.shape}"
)

## Logistic Regression

In [None]:
# Logistic-regression baseline: L2 penalty, lbfgs solver, up to 2000 iterations.
log_reg = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    C=1,
    max_iter=2000,
    class_weight="balanced",  # reweight classes; the dataset is heavily imbalanced
    random_state=SEED,
)

In [None]:
# Fit the logistic-regression baseline on the training split.
log_reg.fit(X_train, y_train)

In [None]:
# Keep only column 1 of predict_proba: the probability of the fraud class.
y_validation_probabilities = log_reg.predict_proba(X_validation)[:, 1]

In [None]:
# Average precision of the logistic regression on the validation split.
aps_lr = average_precision_score(
    y_true=y_validation, y_score=y_validation_probabilities
)
print(f"Average APS score: {aps_lr:.3f}")

In [None]:
# Precision/recall pairs for the logistic regression on the validation split.
# precision_recall_curve returns (precision, recall, thresholds); thresholds are discarded.
precision_lr, recall_lr, _ = precision_recall_curve(
    y_validation, y_validation_probabilities
)

In [None]:
# Single-feature baseline: rank validation rows by the negated V14 column alone.
# The original note gives the reason for the sign flip as "V14 mean is greater than
# sample mean". NOTE(review): not verifiable from this notebook — confirm against the EDA.
v14_validation_scores = -X_validation["V14"]
aps_v14 = average_precision_score(y_validation, v14_validation_scores)
precision_v14, recall_v14, _ = precision_recall_curve(
    y_validation, v14_validation_scores
)

plt.figure(figsize=(6, 5))

# precision_recall_curve returns precision first and recall second. Recall must be
# passed as x and precision as y so the curves match the axis labels set below.

# plot LR PRC
plt.plot(recall_lr, precision_lr, lw=2, label=f"LT | APS = {aps_lr:.3f}")
plt.fill_between(recall_lr, precision_lr, alpha=0.2)

# plot V14 PRC
plt.plot(recall_v14, precision_v14, lw=2, label=f"V14 | APS = {aps_v14:.3f}")
plt.fill_between(recall_v14, precision_v14, alpha=0.2)

plt.xlabel("Recall (frauds caught)")
plt.ylabel("Precision (alerts correct)")
plt.title("Precision-Recall Curve | V14 vs Logistic Regression")
plt.legend()
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.grid()
plt.tight_layout()
plt.show()

## Random Forest

In [None]:
# Shallow forest (depth 3), small enough that a single tree can be printed below.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1,  # use all processors
    verbose=1,
)

# Fit on the training split; the fitted estimator is shown as the cell output.
rf.fit(X_train, y_train)

### Visualize a sample tree

In [None]:
# Print the first tree of the forest as indented text rules.
sample_tree = rf.estimators_[0]
print("Single Decision Tree")
print(export_text(sample_tree, feature_names=X_train.columns.tolist()))

In [None]:
# Print the per-node class values stored in the first tree.
# NOTE(review): depending on the scikit-learn version, tree_.value may hold class
# fractions rather than weighted counts — confirm against the installed version.
sample_tree = rf.estimators_[0]
print("Single Decision Tree Weighted Class Count")
print(sample_tree.tree_.value)

### Implementation

In [None]:
# Forest used for the comparison: same settings as the shallow one, without a depth cap.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1,  # use all processors
    verbose=1,
)

# Fit on the training split; the fitted estimator is shown as the cell output.
rf.fit(X_train, y_train)

In [None]:
# Fraud-class probability (column 1 of predict_proba) for each validation row.
y_validation_probabilities = rf.predict_proba(X_validation)[:, 1]

In [None]:
# Average precision of the random forest on the validation split.
aps_rf = average_precision_score(
    y_true=y_validation, y_score=y_validation_probabilities
)
print(f"Average APS score: {aps_rf:.3f}")

In [None]:
# Precision/recall pairs for the random forest on the validation split.
# precision_recall_curve returns (precision, recall, thresholds); thresholds are discarded.
precision_rf, recall_rf, _ = precision_recall_curve(
    y_validation, y_validation_probabilities
)

In [None]:
plt.figure(figsize=(6, 5))

# Each curve is drawn with recall as x and precision as y so the data matches the
# axis labels set below (the arrays come from precision_recall_curve, which
# returns precision first and recall second).

# plot V14 PRC
plt.plot(recall_v14, precision_v14, lw=2, label=f"V14 | APS = {aps_v14:.3f}")
plt.fill_between(recall_v14, precision_v14, alpha=0.2)

# plot LR PRC
plt.plot(recall_lr, precision_lr, lw=2, label=f"LT | APS = {aps_lr:.3f}")
plt.fill_between(recall_lr, precision_lr, alpha=0.2)

# plot RF PRC
plt.plot(recall_rf, precision_rf, lw=2, label=f"RF | APS = {aps_rf:.3f}")
plt.fill_between(recall_rf, precision_rf, alpha=0.2)

plt.xlabel("Recall (frauds caught)")
plt.ylabel("Precision (alerts correct)")
plt.title("Precision-Recall Curve | V14 vs Logistic Regression vs Random Forest")
plt.legend()
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.grid()
plt.tight_layout()
plt.show()

## XGBoost

### Initial model

In [None]:
# First XGBoost fit with hand-picked settings, before any tuning.
xgb = XGBClassifier(
    # boosting schedule
    n_estimators=2000,  # number of trees
    learning_rate=0.3,  # eta
    # tree shape
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    max_delta_step=0,
    # row / column subsampling
    subsample=0.8,  # subsample 80% of instances to reduce overfitting
    colsample_bytree=0.8,  # subsample 80% of features to reduce dependence on few features
    sampling_method="uniform",
    # regularization
    reg_alpha=0,  # L1 regularization
    reg_lambda=1,  # L2 regularization
    # class imbalance: weight positives by the negative/positive ratio in y_train
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric="aucpr",
    random_state=SEED,
    verbosity=0,
)

# Train with the validation split as the evaluation set; the fitted estimator
# is shown as the cell output.
xgb.fit(X_train, y_train, eval_set=[(X_validation, y_validation)], verbose=False)

In [None]:
# Fraud-class probability (column 1 of predict_proba) for each validation row.
y_validation_probabilities = xgb.predict_proba(X_validation)[:, 1]

In [None]:
# Average precision of the initial XGBoost model on the validation split.
aps_xgb = average_precision_score(
    y_true=y_validation, y_score=y_validation_probabilities
)
print(f"Average APS score: {aps_xgb:.3f}")

### Hyperparameter tuning

#### Mini training sweep

In [None]:
# Alias for a search space: parameter name -> list of candidate values.
ParamSpace = Dict[str, List[float]]

# Candidate values for the three parameters searched in this sweep.
hyperparameter_space: ParamSpace = {
    "learning_rate": [0.03, 0.05, 0.1],
    "max_depth": [1, 3, 4, 5, 10],
    "min_child_weight": [1, 2, 4, 10],
}

# The imbalance weight (negatives / positives) depends only on y_train, so it is
# computed once here instead of once per fitted model.
sweep_scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# One flat loop over the full Cartesian product, using the same ParameterGrid
# helper as the regularization search further down.
results = []
for sweep_params in tqdm(list(ParameterGrid(hyperparameter_space)), desc="sweep"):
    model = XGBClassifier(
        n_estimators=2000,  # number of trees
        verbosity=0,
        n_jobs=-1,
        gamma=0,
        max_delta_step=0,
        subsample=0.8,  # subsample 80% of rows to reduce overfitting
        sampling_method="uniform",
        colsample_bytree=0.8,  # subsample 80% of features to reduce dependence on few features
        reg_alpha=0,  # L1 regularization
        reg_lambda=1,  # L2 regularization
        scale_pos_weight=sweep_scale_pos_weight,  # balance imbalance in dataset
        random_state=SEED,
        eval_metric="aucpr",
        **sweep_params,  # learning_rate, max_depth, min_child_weight
    )
    model.fit(
        X_train, y_train, eval_set=[(X_validation, y_validation)], verbose=False
    )

    # Score this configuration by validation APS on the fraud-class probability.
    y_validation_probabilities = model.predict_proba(X_validation)[:, 1]
    aps = average_precision_score(y_validation, y_validation_probabilities)
    results.append(
        {
            "learning_rate": sweep_params["learning_rate"],
            "max_depth": sweep_params["max_depth"],
            "min_child_weight": sweep_params["min_child_weight"],
            "val_aps": aps,
        }
    )

In [None]:
# Rank the sweep results by validation APS, best first, with a fresh 0..n-1 index.
tune_df = pd.DataFrame(results)
tune_df = tune_df.sort_values("val_aps", ascending=False)
tune_df = tune_df.reset_index(drop=True)
print(tune_df.head(10))

In [None]:
# Re-fit the top-ranked configuration from tune_df (row 0) once per candidate
# early_stopping_rounds value, recording where boosting stopped and the validation APS.
early_stopping_rounds_results = []
for early_stopping_rounds in tqdm(range(10, 200, 10)):
    model = XGBClassifier(
        # best tree-shape / learning-rate settings from the mini sweep
        learning_rate=tune_df.loc[0, "learning_rate"],  # eta
        max_depth=tune_df.loc[0, "max_depth"],
        min_child_weight=tune_df.loc[0, "min_child_weight"],
        # boosting budget and the stopping rule under test
        n_estimators=2000,  # number of trees
        early_stopping_rounds=early_stopping_rounds,
        eval_metric="aucpr",
        # fixed settings shared with the other XGBoost fits
        gamma=0,
        max_delta_step=0,
        subsample=0.8,  # subsample 80% of rows to reduce overfitting
        colsample_bytree=0.8,  # subsample 80% of features to reduce dependence on few features
        sampling_method="uniform",
        reg_alpha=0,  # L1 regularization
        reg_lambda=1,  # L2 regularization
        scale_pos_weight=(y_train == 0).sum()
        / (y_train == 1).sum(),  # balance imbalance in dataset
        random_state=SEED,
        n_jobs=-1,
        verbosity=0,
    )

    # The validation split is the evaluation set that early stopping monitors.
    model.fit(
        X_train, y_train, eval_set=[(X_validation, y_validation)], verbose=False
    )

    # Validation APS on the fraud-class probability.
    y_validation_probabilities = model.predict_proba(X_validation)[:, 1]
    aps = average_precision_score(y_validation, y_validation_probabilities)

    early_stopping_rounds_results.append(
        {
            "early_stopping_rounds": early_stopping_rounds,
            "best_iteration": model.best_iteration,
            "num_boosted_rounds": model.get_booster().num_boosted_rounds(),
            "val_aps": aps,
        }
    )

In [None]:
# Show one row per early_stopping_rounds value tried.
early_stopping_table = pd.DataFrame(early_stopping_rounds_results)
print("Early stopping rounds by APS")
print(early_stopping_table)

In [None]:
# Order by highest validation APS; ties go to the smaller early_stopping_rounds.
early_stopping_ranking = pd.DataFrame(early_stopping_rounds_results).sort_values(
    by=["val_aps", "early_stopping_rounds"], ascending=[False, True]
)
# Read from the column (not the row) so the value keeps its integer dtype.
optimal_early_stopping_rounds = early_stopping_ranking["early_stopping_rounds"].iloc[0]

print(f"Optimal early stopping rounds: {optimal_early_stopping_rounds}")

#### Broader grid search

In [None]:
# Search space for the regularization parameters, expanded to every combination.
hyperparameter_space_regularization: ParamSpace = {
    "reg_alpha": [0.0, 0.1, 0.5],
    "reg_lambda": [0.5, 1.0, 2.0, 5.0],
    "gamma": [0.0, 0.1, 1.0],
}
hyperparameters_list: List[Dict[str, float]] = list(
    ParameterGrid(hyperparameter_space_regularization)
)

# Fit one model per combination, reusing the best sweep settings (row 0 of tune_df)
# and the early-stopping value chosen above.
broad_grid_search_results: List[Dict[str, float]] = []
for hyperparameters in tqdm(hyperparameters_list):
    model = XGBClassifier(
        **hyperparameters,  # reg_alpha, reg_lambda, gamma
        learning_rate=tune_df.loc[0, "learning_rate"],  # eta
        max_depth=tune_df.loc[0, "max_depth"],
        min_child_weight=tune_df.loc[0, "min_child_weight"],
        n_estimators=2000,  # number of trees
        early_stopping_rounds=optimal_early_stopping_rounds,
        eval_metric="aucpr",
        max_delta_step=0,
        subsample=0.8,  # subsample 80% of instances to reduce overfitting
        colsample_bytree=0.8,  # subsample 80% of features to reduce dependence on few features
        sampling_method="uniform",
        scale_pos_weight=(y_train == 0).sum()
        / (y_train == 1).sum(),  # balance imbalance in dataset
        random_state=SEED,
        n_jobs=-1,
        verbosity=0,
    )
    model.fit(X_train, y_train, eval_set=[(X_validation, y_validation)], verbose=False)

    # Validation APS on the fraud-class probability.
    y_validation_probabilities = model.predict_proba(X_validation)[:, 1]
    aps = average_precision_score(y_validation, y_validation_probabilities)

    broad_grid_search_results.append(
        {
            "reg_alpha": hyperparameters["reg_alpha"],
            "reg_lambda": hyperparameters["reg_lambda"],
            "gamma": hyperparameters["gamma"],
            "val_aps": aps,
        }
    )

In [None]:
# Rank the regularization combinations by validation APS, best first, with a fresh index.
regularization_parameters = pd.DataFrame(broad_grid_search_results)
regularization_parameters = regularization_parameters.sort_values(
    by="val_aps", ascending=False
)
regularization_parameters = regularization_parameters.reset_index(drop=True)
print(regularization_parameters.head(10))

### Train best model

In [None]:
# Final XGBoost model: row 0 of each ranking table supplies the tuned values.
model = XGBClassifier(
    # from the mini sweep
    learning_rate=tune_df.loc[0, "learning_rate"],  # eta
    max_depth=tune_df.loc[0, "max_depth"],
    min_child_weight=tune_df.loc[0, "min_child_weight"],
    # from the regularization grid search
    gamma=regularization_parameters.loc[0, "gamma"],
    reg_alpha=regularization_parameters.loc[0, "reg_alpha"],  # L1 regularization
    reg_lambda=regularization_parameters.loc[0, "reg_lambda"],  # L2 regularization
    # boosting budget and stopping rule
    n_estimators=2000,  # number of trees
    early_stopping_rounds=optimal_early_stopping_rounds,
    eval_metric="aucpr",
    # fixed settings shared with the other XGBoost fits
    max_delta_step=0,
    subsample=0.8,  # subsample 80% of instances to reduce overfitting
    colsample_bytree=0.8,  # subsample 80% of features to reduce dependence on few features
    sampling_method="uniform",
    scale_pos_weight=(y_train == 0).sum()
    / (y_train == 1).sum(),  # balance imbalance in dataset
    random_state=SEED,
    n_jobs=-1,
    verbosity=0,
)
model.fit(X_train, y_train, eval_set=[(X_validation, y_validation)], verbose=False)

# Fraud-class probability (column 1 of predict_proba) for each validation row.
y_validation_probabilities = model.predict_proba(X_validation)[:, 1]

In [None]:
# Show the final model's best_iteration (set by early stopping) as the cell output.
model.best_iteration

In [None]:
# Average precision of the tuned XGBoost model on the validation split.
aps_xgb = average_precision_score(
    y_true=y_validation, y_score=y_validation_probabilities
)
print(f"Average APS score: {aps_xgb:.3f}")

In [None]:
# Precision/recall pairs for the tuned XGBoost model on the validation split.
# precision_recall_curve returns (precision, recall, thresholds); thresholds are discarded.
precision_xgb, recall_xgb, _ = precision_recall_curve(
    y_validation, y_validation_probabilities
)

In [None]:
plt.figure(figsize=(6, 5))

# Each curve is drawn with recall as x and precision as y so the data matches the
# axis labels set below (the arrays come from precision_recall_curve, which
# returns precision first and recall second).

# plot V14 PRC
plt.plot(recall_v14, precision_v14, lw=2, label=f"V14 | APS = {aps_v14:.3f}")
plt.fill_between(recall_v14, precision_v14, alpha=0.2)

# plot LR PRC
plt.plot(recall_lr, precision_lr, lw=2, label=f"LT | APS = {aps_lr:.3f}")
plt.fill_between(recall_lr, precision_lr, alpha=0.2)

# plot RF PRC
plt.plot(recall_rf, precision_rf, lw=2, label=f"RF | APS = {aps_rf:.3f}")
plt.fill_between(recall_rf, precision_rf, alpha=0.2)

# plot XGB PRC
plt.plot(recall_xgb, precision_xgb, lw=2, label=f"XGB | APS = {aps_xgb:.3f}")
plt.fill_between(recall_xgb, precision_xgb, alpha=0.2)

plt.xlabel("Recall (frauds caught)")
plt.ylabel("Precision (alerts correct)")
plt.title(
    "Precision-Recall Curve | V14 vs Logistic Regression vs Random Forest vs XGBoost"
)
plt.legend()
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the current APS value of each model, one bar per model.
model_names = ["V14", "Logistic Regression", "Random Forest", "XGBoost"]
aps_scores = [aps_v14, aps_lr, aps_rf, aps_xgb]
bar_colors = ["gray", "blue", "green", "orange"]

plt.figure(figsize=(6, 4))
plt.bar(x=model_names, height=aps_scores, color=bar_colors)
plt.title("Model Comparison on Validation Set")
plt.ylabel("Average Precision Score (APS)")
plt.ylim(0, 1)
plt.grid(axis="y")
plt.tight_layout()
plt.show()

## Evaluate on Test Set

In [None]:
# Recompute BOTH the APS and the precision/recall arrays of every model on the
# test split. All four aps_* values must be refreshed here; otherwise the
# LR/RF/XGB values would still be the validation scores computed earlier.
# This overwrites the validation-set values held in these variables.

# scores for V14 (negated, as in the validation baseline)
# NOTE(review): the original note gives the reason for the sign flip as "V14 mean is
# greater than sample mean" — not verifiable from this notebook.
v14_test_scores = -X_test["V14"]
aps_v14 = average_precision_score(y_test, v14_test_scores)
precision_v14, recall_v14, _ = precision_recall_curve(y_test, v14_test_scores)

# scores for LR
y_test_probabilities = log_reg.predict_proba(X_test)[:, 1]  # probabilities of fraud class
aps_lr = average_precision_score(y_test, y_test_probabilities)
precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_test_probabilities)

# scores for RF
y_test_probabilities = rf.predict_proba(X_test)[:, 1]  # probabilities of fraud class
aps_rf = average_precision_score(y_test, y_test_probabilities)
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_test_probabilities)

# scores for XGB (`model` is the tuned XGBoost model trained above)
y_test_probabilities = model.predict_proba(X_test)[:, 1]  # probabilities of fraud class
aps_xgb = average_precision_score(y_test, y_test_probabilities)
precision_xgb, recall_xgb, _ = precision_recall_curve(y_test, y_test_probabilities)

In [None]:
plt.figure(figsize=(6, 5))

# Each curve is drawn with recall as x and precision as y so the data matches the
# axis labels set below (the arrays come from precision_recall_curve, which
# returns precision first and recall second).

# plot V14 PRC
plt.plot(recall_v14, precision_v14, lw=2, label=f"V14 | APS = {aps_v14:.3f}")
plt.fill_between(recall_v14, precision_v14, alpha=0.2)

# plot LR PRC
plt.plot(recall_lr, precision_lr, lw=2, label=f"LT | APS = {aps_lr:.3f}")
plt.fill_between(recall_lr, precision_lr, alpha=0.2)

# plot RF PRC
plt.plot(recall_rf, precision_rf, lw=2, label=f"RF | APS = {aps_rf:.3f}")
plt.fill_between(recall_rf, precision_rf, alpha=0.2)

# plot XGB PRC
plt.plot(recall_xgb, precision_xgb, lw=2, label=f"XGB | APS = {aps_xgb:.3f}")
plt.fill_between(recall_xgb, precision_xgb, alpha=0.2)

plt.xlabel("Recall (frauds caught)")
plt.ylabel("Precision (alerts correct)")
plt.title("Precision-Recall Curve | Test Set | V14 vs Classic ML Baselines")
plt.legend()
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.grid()
plt.tight_layout()
plt.show()