<a href="https://colab.research.google.com/github/ad-batrisyia/money-laundering-detection-ml/blob/main/notebooks/05_Stratified_Sampling_With_RUS_Hyperparameter_Tuning_Adjust_Threshold_Retrain_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import math
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from PIL import Image
from scipy.stats import skew
from matplotlib.transforms import Bbox
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be read by
# ordinary file paths. `google.colab` is a Colab-specific module, so this cell
# is only expected to work inside a Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the preprocessed dataset on the mounted Drive.
# NOTE(review): presumably written by an earlier preprocessing notebook -
# confirm the file exists at this path before running.
load_path = '/content/drive/MyDrive/Money Laundering Detection/PreprocessedData.csv'

# Read the whole CSV into memory as a DataFrame.
df = pd.read_csv(load_path)

# Show the first five rows as a quick sanity check of columns and values.
df.head()

Unnamed: 0,Amount,Payment_type,Is_laundering,TimeOfDay,Is_Weekend,Tx_per_Day,Tx_Interval,Avg_Amount_Sent,Pair_bank_location,Laundering_Structure
0,1459.15,1,0,0,0,1,0.0,1857.866667,304,2
1,6019.64,5,0,0,0,102,0.0,5889.320323,303,6
2,14328.44,3,0,0,0,3,0.0,10167.677647,304,6
3,11895.0,0,0,0,0,9,0.0,11937.773333,304,5
4,115.25,1,0,0,0,1,0.0,1068.905,304,2


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(9504852, 10)

In [None]:
# Target: the laundering flag. Features: every remaining column.
y = df['Is_laundering']
X = df.drop(columns='Is_laundering')

## Split Train-Validation-Test 70:15:15 (Stratify)


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# ====================
# Train-Validate-Test Split (70 : 15 : 15, stratified on the label)
# ====================
# Target share of the FULL dataset for each held-out split.
test_fraction = 0.15
val_fraction = 0.15

# First split off the test set (15% of all rows).
x_trainval, x_test, y_trainval, y_test = train_test_split(
    X, y, test_size=test_fraction, stratify=y, random_state=42
)

# Then carve the validation set out of the remaining 85%.
# The second split's test_size is relative to that remainder, so it has to be
# 0.15 / 0.85 (~0.1765) for validation to be 15% of the full data. Deriving it
# from the fractions avoids a rounded literal that would leave the validation
# set slightly smaller than the test set.
x_train, x_val, y_train, y_val = train_test_split(
    x_trainval,
    y_trainval,
    test_size=val_fraction / (1 - test_fraction),
    stratify=y_trainval,
    random_state=42,
)

print("x_train:", x_train.shape, "y_train:", y_train.shape)
print("x_val:", x_val.shape, "y_val:", y_val.shape)
print("x_test:", x_test.shape, "y_test:", y_test.shape)

x_train: (6657198, 9) y_train: (6657198,)
x_val: (1421926, 9) y_val: (1421926,)
x_test: (1425728, 9) y_test: (1425728,)


In [None]:
# Per-split class counts. The splits are stratified on the label, so the
# positive/negative ratio should be (nearly) the same in all three.

# Class distribution in training set
print("\nClass distribution in y_train:")
print(y_train.value_counts())

# Class distribution in validation set
# (heading names y_val so the printed counts are attributed to the right split)
print("\nClass distribution in y_val:")
print(y_val.value_counts())

# Class distribution in testing set
print("\nClass distribution in y_test:")
print(y_test.value_counts())


Class distribution in y_train:
Is_laundering
0    6650283
1       6915
Name: count, dtype: int64

Class distribution in y_train:
Is_laundering
0    1420449
1       1477
Name: count, dtype: int64

Class distribution in y_test:
Is_laundering
0    1424247
1       1481
Name: count, dtype: int64


## Train Model

### Baseline Model Result

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)

def evaluate_model(name, y_true, y_pred):
    """Print a short evaluation summary for one set of predictions.

    Writes, in order: a heading containing ``name``, the accuracy
    (4 decimals), the per-class classification report (4 digits) and the
    confusion matrix. Nothing is returned.

    Args:
        name: Label shown in the heading line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    sections = (
        f"\n=== {name} ===",
        f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
        "Classification Report:",
        classification_report(y_true, y_pred, digits=4),
        "Confusion Matrix:",
        confusion_matrix(y_true, y_pred),
    )
    # One item per line, matching a sequence of individual print() calls.
    print(*sections, sep="\n")

# Baselines: default hyperparameters, trained on the original (imbalanced)
# training split and scored on the validation split.

# --- Random Forest ---
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
y_val_pred_rf = rf.fit(x_train, y_train).predict(x_val)

# --- XGBoost ---
xgb = XGBClassifier(n_jobs=-1, eval_metric="logloss", random_state=42)
y_val_pred_xgb = xgb.fit(x_train, y_train).predict(x_val)

# Report both models after training so the two summaries print back to back.
evaluate_model("Random Forest", y_val, y_val_pred_rf)
evaluate_model("XGBoost", y_val, y_val_pred_xgb)


=== Random Forest ===
Accuracy: 0.9997
Classification Report:
              precision    recall  f1-score   support

           0     0.9997    1.0000    0.9998   1420449
           1     0.9894    0.6919    0.8143      1477

    accuracy                         0.9997   1421926
   macro avg     0.9945    0.8460    0.9071   1421926
weighted avg     0.9997    0.9997    0.9996   1421926

Confusion Matrix:
[[1420438      11]
 [    455    1022]]

=== XGBoost ===
Accuracy: 0.9996
Classification Report:
              precision    recall  f1-score   support

           0     0.9997    0.9999    0.9998   1420449
           1     0.8923    0.6845    0.7747      1477

    accuracy                         0.9996   1421926
   macro avg     0.9460    0.8422    0.8873   1421926
weighted avg     0.9996    0.9996    0.9996   1421926

Confusion Matrix:
[[1420327     122]
 [    466    1011]]


### Stratified Sampling with RUS

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# ----------------------------------------------------------------------
# Random Under-Sampling (RUS), applied to the TRAINING split only.
# Validation data is left untouched, so scores below are measured on the
# original class distribution.
# ----------------------------------------------------------------------
rus = RandomUnderSampler(random_state=42)
x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)

print("\nAfter RUS - class distribution:", y_train_rus.value_counts())

# Same default-parameter models as the baseline, now fitted on the resample.
rf_rus = RandomForestClassifier(n_jobs=-1, random_state=42)
y_val_pred_rf_rus = rf_rus.fit(x_train_rus, y_train_rus).predict(x_val)

xgb_rus = XGBClassifier(n_jobs=-1, eval_metric="logloss", random_state=42)
y_val_pred_xgb_rus = xgb_rus.fit(x_train_rus, y_train_rus).predict(x_val)

print("\n=== After RUS (Default Params) ===")
evaluate_model("Random Forest (RUS)", y_val, y_val_pred_rf_rus)
evaluate_model("XGBoost (RUS)", y_val, y_val_pred_xgb_rus)


After RUS - class distribution: Is_laundering
0    6915
1    6915
Name: count, dtype: int64

=== After RUS (Default Params) ===

=== Random Forest (RUS) ===
Accuracy: 0.9550
Classification Report:
              precision    recall  f1-score   support

           0     0.9999    0.9550    0.9770   1420449
           1     0.0214    0.9479    0.0419      1477

    accuracy                         0.9550   1421926
   macro avg     0.5107    0.9514    0.5095   1421926
weighted avg     0.9989    0.9550    0.9760   1421926

Confusion Matrix:
[[1356570   63879]
 [     77    1400]]

=== XGBoost (RUS) ===
Accuracy: 0.9585
Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9585    0.9788   1420449
           1     0.0234    0.9553    0.0456      1477

    accuracy                         0.9585   1421926
   macro avg     0.5117    0.9569    0.5122   1421926
weighted avg     0.9989    0.9585    0.9778   1421926

Confusion Matrix:
[[1361470 

### Hyperparameter Tuning

In [None]:
# ----------------------------------------------------------------------
# Randomized hyperparameter search (5 sampled candidates, 3-fold CV, F1)
# for both model families, fitted on the under-sampled training data.
#
# NOTE(review): the CV folds are cut from the balanced resample, so the F1
# that drives selection is measured at a 1:1 class ratio rather than on the
# original distribution - confirm this is the intended selection criterion.
# ----------------------------------------------------------------------

# --- Random Forest ---
rf_param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
}
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_search = RandomizedSearchCV(
    rf,
    rf_param_dist,
    n_iter=5,
    cv=3,
    scoring="f1",
    n_jobs=-1,
    random_state=42,
)
rf_search.fit(x_train_rus, y_train_rus)

# Best candidate found by the search; score it on the untouched validation split.
best_rf = rf_search.best_estimator_
y_val_pred_rf = best_rf.predict(x_val)
print("\nBest RF Params:", rf_search.best_params_)
evaluate_model("Random Forest (RUS + tuned)", y_val, y_val_pred_rf)

# --- XGBoost ---
xgb_param_dist = {
    "n_estimators": [100, 200],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
}
xgb = XGBClassifier(n_jobs=-1, eval_metric="logloss", random_state=42)
xgb_search = RandomizedSearchCV(
    xgb,
    xgb_param_dist,
    n_iter=5,
    cv=3,
    scoring="f1",
    n_jobs=-1,
    random_state=42,
)
xgb_search.fit(x_train_rus, y_train_rus)

best_xgb = xgb_search.best_estimator_
y_val_pred_xgb = best_xgb.predict(x_val)
print("\nBest XGB Params:", xgb_search.best_params_)
evaluate_model("XGBoost (RUS + tuned)", y_val, y_val_pred_xgb)


Best RF Params: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 20}

=== Random Forest (RUS + tuned) ===
Accuracy: 0.9549
Classification Report:
              precision    recall  f1-score   support

           0     0.9999    0.9549    0.9769   1420449
           1     0.0214    0.9506    0.0419      1477

    accuracy                         0.9549   1421926
   macro avg     0.5107    0.9527    0.5094   1421926
weighted avg     0.9989    0.9549    0.9759   1421926

Confusion Matrix:
[[1356347   64102]
 [     73    1404]]

Best XGB Params: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}

=== XGBoost (RUS + tuned) ===
Accuracy: 0.9572
Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9572    0.9781   1420449
           1     0.0227    0.9546    0.0443      1477

    accuracy                         0.9572   1421926
   macro avg     0.5113    0.9559    0.5112   1421926
weighted avg     0.9

### Adjust Threshold

In [None]:
from sklearn.metrics import f1_score

def tune_threshold(model, X_val, y_val, name, thresholds=None):
    """Find the decision threshold that maximises F1 on a validation set.

    The model's positive-class score (column 1 of ``predict_proba``) is
    computed once, then each candidate threshold is turned into hard 0/1
    predictions and scored with ``f1_score``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_val: Validation features.
        y_val: Validation labels.
        name: Label used in the printed summary line.
        thresholds: Optional iterable of candidate cut-offs. Defaults to
            0.01 .. 0.99 in steps of 0.01, so an optimum near either end of
            the probability range can still be found instead of being
            clipped to the edge of a narrower grid.

    Returns:
        The candidate with the highest F1 (first one wins on ties). Falls
        back to 0.5 if no candidate achieves an F1 above zero.
    """
    if thresholds is None:
        thresholds = np.linspace(0.01, 0.99, 99)
    thresholds = np.asarray(thresholds, dtype=float)

    probs = model.predict_proba(X_val)[:, 1]
    best_thresh, best_f1 = 0.5, 0
    for thresh in thresholds:
        preds = (probs >= thresh).astype(int)
        f1 = f1_score(y_val, preds)
        if f1 > best_f1:
            best_f1, best_thresh = f1, thresh
    print(f"{name} best threshold: {best_thresh:.2f}, best F1: {best_f1:.4f}")

    # A winner sitting on the first/last candidate means the search range,
    # not the model, determined the result - tell the reader to widen it.
    if thresholds.size > 1 and best_f1 > 0 and best_thresh in (thresholds.min(), thresholds.max()):
        print(f"  note: {name} optimum is at the edge of the searched range; "
              "consider passing a wider `thresholds` grid")
    return best_thresh

# ----------------------------------------------------------------------
# Random Forest: choose the threshold on validation, then re-score the
# validation split using that threshold instead of the default cut-off.
# ----------------------------------------------------------------------
rf_thresh = tune_threshold(best_rf, x_val, y_val, "Random Forest")
rf_val_scores = best_rf.predict_proba(x_val)[:, 1]
y_val_pred_rf = (rf_val_scores >= rf_thresh).astype(int)
evaluate_model("Random Forest (Validation)", y_val, y_val_pred_rf)

# ----------------------------------------------------------------------
# XGBoost: same procedure.
# ----------------------------------------------------------------------
xgb_thresh = tune_threshold(best_xgb, x_val, y_val, "XGBoost")
xgb_val_scores = best_xgb.predict_proba(x_val)[:, 1]
y_val_pred_xgb = (xgb_val_scores >= xgb_thresh).astype(int)
evaluate_model("XGBoost (Validation)", y_val, y_val_pred_xgb)


Random Forest best threshold: 0.90, best F1: 0.2991

=== Random Forest (Validation) ===
Accuracy: 0.9959
Classification Report:
              precision    recall  f1-score   support

           0     0.9998    0.9961    0.9980   1420449
           1     0.1822    0.8362    0.2991      1477

    accuracy                         0.9959   1421926
   macro avg     0.5910    0.9161    0.6485   1421926
weighted avg     0.9990    0.9959    0.9972   1421926

Confusion Matrix:
[[1414904    5545]
 [    242    1235]]
XGBoost best threshold: 0.90, best F1: 0.2556

=== XGBoost (Validation) ===
Accuracy: 0.9948
Classification Report:
              precision    recall  f1-score   support

           0     0.9999    0.9949    0.9974   1420449
           1     0.1501    0.8605    0.2556      1477

    accuracy                         0.9948   1421926
   macro avg     0.5750    0.9277    0.6265   1421926
weighted avg     0.9990    0.9948    0.9966   1421926

Confusion Matrix:
[[1413251    7198]
 [    20

### Retrain on Train+Validation Set with Best Threshold

In [None]:
# Under-sample the combined train+validation data; both final models are
# refitted on this single resample.
x_trainval_rus, y_trainval_rus = rus.fit_resample(x_trainval, y_trainval)

# --- Random Forest: refit, then score the held-out test split ---
# fit() is called on the existing best_rf object, and the threshold is the
# one selected earlier on the validation split.
best_rf.fit(x_trainval_rus, y_trainval_rus)
rf_test_scores = best_rf.predict_proba(x_test)[:, 1]
y_test_pred_rf = (rf_test_scores >= rf_thresh).astype(int)
evaluate_model("Random Forest (Final Test)", y_test, y_test_pred_rf)

# --- XGBoost: same procedure ---
best_xgb.fit(x_trainval_rus, y_trainval_rus)
xgb_test_scores = best_xgb.predict_proba(x_test)[:, 1]
y_test_pred_xgb = (xgb_test_scores >= xgb_thresh).astype(int)
evaluate_model("XGBoost (Final Test)", y_test, y_test_pred_xgb)


=== Random Forest (Final Test) ===
Accuracy: 0.9958
Classification Report:
              precision    recall  f1-score   support

           0     0.9998    0.9960    0.9979   1424247
           1     0.1769    0.8298    0.2916      1481

    accuracy                         0.9958   1425728
   macro avg     0.5884    0.9129    0.6448   1425728
weighted avg     0.9990    0.9958    0.9972   1425728

Confusion Matrix:
[[1418529    5718]
 [    252    1229]]

=== XGBoost (Final Test) ===
Accuracy: 0.9935
Classification Report:
              precision    recall  f1-score   support

           0     0.9999    0.9936    0.9967   1424247
           1     0.1226    0.8596    0.2146      1481

    accuracy                         0.9935   1425728
   macro avg     0.5612    0.9266    0.6057   1425728
weighted avg     0.9989    0.9935    0.9959   1425728

Confusion Matrix:
[[1415139    9108]
 [    208    1273]]
