In [6]:
import os
import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp

from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

import shap


In [7]:
X_train = joblib.load("../models/X_train_processed.pkl")
X_val = joblib.load("../models/X_val_processed.pkl")
y_train = joblib.load("../models/y_train.pkl")
y_val = joblib.load("../models/y_val.pkl")

# Convert to sparse (critical)
X_train_sparse = sp.csr_matrix(X_train)
X_val_sparse = sp.csr_matrix(X_val)

print("Shapes:", X_train_sparse.shape, X_val_sparse.shape)


Shapes: (2120440, 4798810) (530110, 4798810)


LIGHTGBM HYPERPARAMETER TUNING

In [None]:
#Define Search Space
param_dist = {
    "n_estimators": [300, 500, 800],
    "learning_rate": [0.01, 0.03, 0.05],
    "max_depth": [5, 7, 9, -1],
    "num_leaves": [31, 63, 127],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_samples": [20, 50, 100]
}


Randomized Search

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV

lgb = LGBMClassifier(
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    n_jobs=-1,
    random_state=42,
    n_estimators=500,
    early_stopping_round=50
)

param_dist = {
    "num_leaves": [31, 63],
    "max_depth": [5, 7],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.7, 0.9],
    "colsample_bytree": [0.6, 0.8]
}

search = RandomizedSearchCV(
    estimator=lgb,
    param_distributions=param_dist,
    n_iter=3,
    scoring="roc_auc",
    cv=None,
    verbose=1,
    random_state=42
)

search.fit(
    X_train_sparse, y_train,
    eval_set=[(X_val_sparse, y_val)],
    eval_metric="auc"
)

best_lgb = search.best_estimator_
print("Best Params:", search.best_params_)

Evaluate Tuned LightGBM

In [8]:
lgb_probs = best_lgb.predict_proba(X_val_sparse)[:, 1]
lgb_preds = (lgb_probs >= 0.5).astype(int)

print("===== TUNED LIGHTGBM =====")
print(f"Accuracy: {accuracy_score(y_val, lgb_preds):.4f}")
print(f"F1-score: {f1_score(y_val, lgb_preds):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, lgb_probs):.4f}")
print("\nClassification Report:\n", classification_report(y_val, lgb_preds))




===== TUNED LIGHTGBM =====
Accuracy: 0.9535
F1-score: 0.8102
ROC-AUC: 0.9914

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97    475154
           1       0.70      0.96      0.81     54956

    accuracy                           0.95    530110
   macro avg       0.85      0.96      0.89    530110
weighted avg       0.96      0.95      0.96    530110



Save

In [None]:
joblib.dump(best_lgb, "../models/lightgbm_tuned.pkl")
print("Saved: ../models/lightgbm_tuned.pkl")


ðŸ§ SHAP EXPLAINABILITY

In [17]:
import joblib

best_lgb = joblib.load("../models/lightgbm_tuned.pkl")
print("Loaded LightGBM model")

Loaded LightGBM model


In [None]:
best_lgb = joblib.load("lightgbm_tuned_model.joblib")
# LightGBM feature importance
importances = best_lgb.feature_importances_

# Create a DataFrame
feat_imp = pd.DataFrame({
    "feature": np.arange(len(importances)),
    "importance": importances
})

# Sort
feat_imp = feat_imp.sort_values("importance", ascending=False)

# Show top 20
feat_imp.head(20)

In [None]:
import numpy as np
import shap

# Small sample to avoid memory issues
sample_idx = np.random.choice(X_val_sparse.shape[0], size=200, replace=False)
X_sample = X_val_sparse[sample_idx, :]

# Native LightGBM SHAP (works with sparse matrices)
shap_values = best_lgb.predict(X_sample, pred_contrib=True)

# Bar plot (top features only, no feature_names needed)
shap.summary_plot(shap_values, plot_type="bar")

In [None]:
#Load Logistic Regression
logreg = joblib.load("../models/logreg_baseline.pkl")


In [None]:
#Get Probabilities from Both Models
logreg_probs = logreg.predict_proba(X_val)[:, 1]
lgb_probs = best_lgb.predict_proba(X_val_sparse)[:, 1]


In [None]:
#SIMPLE AVERAGE ENSEMBLE
ensemble_probs = (logreg_probs + lgb_probs) / 2
ensemble_preds = (ensemble_probs >= 0.5).astype(int)

print("===== ENSEMBLE RESULTS =====")
print(f"Accuracy: {accuracy_score(y_val, ensemble_preds):.4f}")
print(f"F1-score: {f1_score(y_val, ensemble_preds):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, ensemble_probs):.4f}")


In [None]:
#THRESHOLD TUNING
thresholds = np.linspace(0.2, 0.8, 7)

results = []
for t in thresholds:
    preds_t = (ensemble_probs >= t).astype(int)
    results.append((t,
                    accuracy_score(y_val, preds_t),
                    f1_score(y_val, preds_t)))

results_df = pd.DataFrame(results, columns=["threshold", "accuracy", "f1"])
print(results_df)

best_t = results_df.loc[results_df["f1"].idxmax(), "threshold"]
print("Best threshold:", best_t)


In [None]:
#Final predictions:
final_preds = (ensemble_probs >= best_t).astype(int)
print("Tuned Ensemble F1:", f1_score(y_val, final_preds))


In [None]:
#Save Ensemble
joblib.dump({
    "logreg": logreg,
    "lightgbm": best_lgb,
    "threshold": best_t
}, "../models/ensemble_model.pkl")

print("Saved: ../models/ensemble_model.pkl")
