In [1]:
import sys
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib
import scipy.sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

import xgboost as xgb


Load Preprocessed Data

In [3]:
X_train = joblib.load("../models/X_train_processed.pkl")
X_val = joblib.load("../models/X_val_processed.pkl")
y_train = joblib.load("../models/y_train.pkl")
y_val = joblib.load("../models/y_val.pkl")

print(X_train.shape, X_val.shape)


(2120440, 4798810) (530110, 4798810)


RANDOM FOREST

In [3]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train, y_train)

rf_preds = rf.predict(X_val)
rf_probs = rf.predict_proba(X_val)[:, 1]

print("===== RANDOM FOREST RESULTS =====")
print(f"Accuracy: {accuracy_score(y_val, rf_preds):.4f}")
print(f"F1-score: {f1_score(y_val, rf_preds):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, rf_probs):.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_val, rf_preds))
print("\nClassification Report:\n", classification_report(y_val, rf_preds))


===== RANDOM FOREST RESULTS =====
Accuracy: 0.7776
F1-score: 0.4596
ROC-AUC: 0.9247

Confusion Matrix:
 [[362088 113066]
 [  4818  50138]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.76      0.86    475154
           1       0.31      0.91      0.46     54956

    accuracy                           0.78    530110
   macro avg       0.65      0.84      0.66    530110
weighted avg       0.92      0.78      0.82    530110



Save Random Forest Model

In [4]:
joblib.dump(rf, "../models/rf_baseline.pkl")
print("Saved: ../models/rf_baseline.pkl")


Saved: ../models/rf_baseline.pkl


LIGHTGBM

In [3]:
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

X_train = joblib.load("../models/X_train_processed.pkl")
X_val = joblib.load("../models/X_val_processed.pkl")
y_train = joblib.load("../models/y_train.pkl")
y_val = joblib.load("../models/y_val.pkl")

print("Original shapes:", X_train.shape, X_val.shape)


Original shapes: (2120440, 4798810) (530110, 4798810)


In [5]:
#Convert to SPARSE
X_train_sparse = sp.csr_matrix(X_train)
X_val_sparse = sp.csr_matrix(X_val)

print("Converted to sparse format")


Converted to sparse format


In [6]:
#TRAIN LIGHTGBM
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    n_jobs=-1,
    random_state=42
)

lgb_model.fit(
    X_train_sparse,
    y_train,
    eval_set=[(X_val_sparse, y_val)]
)


[LightGBM] [Info] Number of positive: 219823, number of negative: 1900617
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 532.540317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 982543
[LightGBM] [Info] Number of data points in the train set: 2120440, number of used features: 485090
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103669 -> initscore=-2.157111
[LightGBM] [Info] Start training from score -2.157111


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,7
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


Evaluate LightGBM

In [7]:
lgb_preds = lgb_model.predict(X_val_sparse)
lgb_probs = lgb_model.predict_proba(X_val_sparse)[:, 1]

print("===== LIGHTGBM RESULTS =====")
print(f"Accuracy: {accuracy_score(y_val, lgb_preds):.4f}")
print(f"F1-score: {f1_score(y_val, lgb_preds):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, lgb_probs):.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_val, lgb_preds))
print("\nClassification Report:\n", classification_report(y_val, lgb_preds))




===== LIGHTGBM RESULTS =====
Accuracy: 0.9504
F1-score: 0.8002
ROC-AUC: 0.9907

Confusion Matrix:
 [[451217  23937]
 [  2344  52612]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97    475154
           1       0.69      0.96      0.80     54956

    accuracy                           0.95    530110
   macro avg       0.84      0.95      0.89    530110
weighted avg       0.96      0.95      0.95    530110



Threshold Tuning

In [8]:
thresholds = np.linspace(0.2, 0.8, 7)

results = []
for t in thresholds:
    preds_t = (lgb_probs >= t).astype(int)
    results.append((t,
                    accuracy_score(y_val, preds_t),
                    f1_score(y_val, preds_t)))

results_df = pd.DataFrame(results, columns=["threshold", "accuracy", "f1"])
print(results_df)


   threshold  accuracy        f1
0        0.2  0.905884  0.684004
1        0.3  0.926515  0.733468
2        0.4  0.940069  0.769983
3        0.5  0.950423  0.800152
4        0.6  0.958633  0.825521
5        0.7  0.965734  0.848609
6        0.8  0.971370  0.866760


In [9]:
#Pick the best threshold:
best_row = results_df.loc[results_df["f1"].idxmax()]
best_threshold = best_row["threshold"]

print(f"Best threshold: {best_threshold}")

final_preds = (lgb_probs >= best_threshold).astype(int)

print("Tuned Metrics:")
print(f"Accuracy: {accuracy_score(y_val, final_preds):.4f}")
print(f"F1-score: {f1_score(y_val, final_preds):.4f}")


Best threshold: 0.8
Tuned Metrics:
Accuracy: 0.9714
F1-score: 0.8668


SAVE THE MODEL

In [10]:
joblib.dump(lgb_model, "../models/lightgbm_baseline.pkl")
print("Saved: ../models/lightgbm_baseline.pkl")


Saved: ../models/lightgbm_baseline.pkl


Compare Models

In [11]:
results = pd.DataFrame({
    "Model": ["LogReg", "RandomForest", "LightGBM"],
    "Accuracy": [
        accuracy_score(y_val, joblib.load("../models/logreg_baseline.pkl").predict(X_val)),
        accuracy_score(y_val, joblib.load("../models/rf_baseline.pkl").predict(X_val)),
        accuracy_score(y_val, lgb_preds)
    ],
    "F1": [
        f1_score(y_val, joblib.load("../models/logreg_baseline.pkl").predict(X_val)),
        f1_score(y_val, joblib.load("../models/rf_baseline.pkl").predict(X_val)),
        f1_score(y_val, lgb_preds)
    ],
    "ROC-AUC": [
        roc_auc_score(y_val, joblib.load("../models/logreg_baseline.pkl").predict_proba(X_val)[:,1]),
        roc_auc_score(y_val, joblib.load("../models/rf_baseline.pkl").predict_proba(X_val)[:,1]),
        roc_auc_score(y_val, lgb_probs)
    ]
})

results


Unnamed: 0,Model,Accuracy,F1,ROC-AUC
0,LogReg,0.958939,0.822117,0.986183
1,RandomForest,0.777624,0.459644,0.924662
2,LightGBM,0.950423,0.800152,0.990662


“Because of very high-dimensional sparse features, I switched from XGBoost to LightGBM, which is optimized for large sparse datasets. I used class weighting, threshold tuning, and proper validation.”