# 05 – Model Training and Ensemble Learning

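In this notebook we train several models on the preprocessed data and compare their performance on a held-out validation split taken from the end of the training set. The base models are logistic regression, random forest, extra trees, bagged logistic regression, histogram gradient boosting and XGBoost. We then build soft-voting and stacking ensembles with scikit-learn, refit the best model on train + validation, evaluate it on the test set, and calibrate its probabilities.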


In [6]:
# ============================================
# 05_model_training_temporal_val.ipynb
# - Temporal VAL split (mimics future TEST)
# - Robust XGBoost early stopping (old/new versions)
# - Base models + Voting + Stacking + Calibration
# - Honest TEST evaluation
# ============================================

import os
import numpy as np
import pandas as pd
from scipy import sparse
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.ensemble import HistGradientBoostingClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    roc_auc_score,
    log_loss,
    average_precision_score,
    precision_recall_curve,
    precision_score,
    recall_score,
)

from sklearn.calibration import CalibratedClassifierCV

# -----------------------------
# Helpers
# -----------------------------
def evaluate_probas(y_true, probas, name="Model"):
    """Score positive-class probabilities with threshold-free metrics.

    Args:
        y_true: Ground-truth labels.
        probas: Predicted positive-class scores, aligned with ``y_true``.
        name: Label stored under the ``"Model"`` key of the result.

    Returns:
        A dict with the keys ``"Model"``, ``"ROC_AUC"``, ``"LogLoss"`` and
        ``"PR_AUC"`` (average precision); metric values are plain floats.
    """
    roc_auc = roc_auc_score(y_true, probas)
    cross_entropy = log_loss(y_true, probas)
    avg_precision = average_precision_score(y_true, probas)
    return dict(
        Model=name,
        ROC_AUC=float(roc_auc),
        LogLoss=float(cross_entropy),
        PR_AUC=float(avg_precision),
    )

def best_threshold_by_f1(y_true, probas):
    """Find the decision threshold that maximises F1 on the given data.

    Args:
        y_true: Ground-truth labels.
        probas: Predicted positive-class scores, aligned with ``y_true``.

    Returns:
        A 4-tuple of floats ``(threshold, f1, precision, recall)`` taken at
        the point of the precision-recall curve with the highest F1.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, probas)
    # The curve has one more (precision, recall) point than thresholds;
    # drop the trailing point so all three arrays line up index by index.
    prec, rec = prec[:-1], rec[:-1]
    numerator = 2 * prec * rec
    denominator = prec + rec + 1e-12  # epsilon avoids 0/0 when both are zero
    f1_scores = numerator / denominator
    winner = int(np.nanargmax(f1_scores))
    return tuple(float(series[winner]) for series in (cutoffs, f1_scores, prec, rec))

def summarize_threshold_metrics(y_true, probas, name="Model"):
    """Report precision/recall/F1 at the F1-maximising threshold.

    The threshold is chosen by ``best_threshold_by_f1`` on the very data
    passed in, so the reported numbers are optimistic for that data.

    Args:
        y_true: Ground-truth labels.
        probas: Predicted positive-class scores (array-like), aligned with
            ``y_true``.
        name: Label stored under the ``"Model"`` key of the result.

    Returns:
        A dict with the keys ``"Model"``, ``"BestThr_F1"``, ``"F1_at_best"``,
        ``"Precision_at_best"`` and ``"Recall_at_best"``.
    """
    # Coerce once so plain lists work with the vectorised comparison below.
    probas = np.asarray(probas)
    thr, best_f1, _, _ = best_threshold_by_f1(y_true, probas)
    preds = (probas >= thr).astype(int)
    return {
        "Model": name,
        "BestThr_F1": thr,
        "F1_at_best": best_f1,
        # Recomputed from hard predictions rather than read off the curve.
        "Precision_at_best": float(precision_score(y_true, preds, zero_division=0)),
        "Recall_at_best": float(recall_score(y_true, preds, zero_division=0)),
    }

def safe_predict_proba(model, X):
    """Return a 1-D array of positive-class scores for ``X``.

    Prefers ``model.predict_proba``; when the result has two or more columns
    the second column (index 1) is returned, otherwise the output is
    flattened. Falls back to ``model.decision_function`` squashed through a
    logistic sigmoid.

    Args:
        model: Fitted estimator exposing ``predict_proba`` or
            ``decision_function``.
        X: Feature matrix passed through to the estimator unchanged.

    Returns:
        numpy.ndarray of scores, one per row of ``X``.

    Raises:
        AttributeError: If the model offers neither method.
    """
    # Local import: only scipy.sparse is imported at file level.
    from scipy.special import expit

    if hasattr(model, "predict_proba"):
        # asarray lets estimators that return nested lists work too.
        p = np.asarray(model.predict_proba(X))
        if p.ndim == 2 and p.shape[1] >= 2:
            return p[:, 1]
        return p.reshape(-1)
    if hasattr(model, "decision_function"):
        scores = np.asarray(model.decision_function(X), dtype=float)
        # expit replaces the hand-written 1 / (1 + exp(-x)), which overflows
        # inside np.exp for large negative scores.
        return expit(scores)
    raise AttributeError(f"{model.__class__.__name__} has no predict_proba/decision_function.")

def xgb_fit_with_early_stop_robust(model, Xtr, ytr, Xva, yva):
    """
    Fit an XGBoost sklearn-API model with early stopping across versions.

    Strategy, in order:
    1. If the estimator exposes ``early_stopping_rounds`` as a parameter,
       set it for the duration of this fit and pass only ``eval_set`` to
       ``fit``. The previous value is restored afterwards so that later
       plain ``fit`` calls and clones made by ensembles, which have no
       ``eval_set``, keep working.
    2. Otherwise pass ``early_stopping_rounds`` to ``fit`` (legacy API).
    3. If that keyword is rejected, warn and fit without early stopping
       instead of failing silently.

    Args:
        model: Unfitted estimator with ``get_params``/``set_params``/``fit``.
        Xtr, ytr: Training features and labels.
        Xva, yva: Validation features and labels monitored for early stopping.

    Returns:
        None. ``model`` is fitted in place.
    """
    import warnings  # local import: not among the file-level imports

    rounds = 200
    eval_set = [(Xva, yva)]

    params = model.get_params()
    if "early_stopping_rounds" in params:
        previous = params["early_stopping_rounds"]
        model.set_params(early_stopping_rounds=rounds)
        try:
            model.fit(Xtr, ytr, eval_set=eval_set, verbose=False)
        finally:
            # Leave the estimator configuration as we found it.
            model.set_params(early_stopping_rounds=previous)
        return

    try:
        model.fit(
            Xtr,
            ytr,
            eval_set=eval_set,
            early_stopping_rounds=rounds,
            verbose=False,
        )
    except TypeError as exc:
        warnings.warn(
            "Early stopping is not supported by this estimator's fit() "
            f"({exc}); fitting all boosting rounds instead.",
            RuntimeWarning,
            stacklevel=2,
        )
        model.fit(Xtr, ytr)

# -----------------------------
# 1) Load processed data
# -----------------------------
# All paths hang off the parent of the current working directory, so this
# cell expects to be run from a direct sub-folder of the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "data", "processed")
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)

# Artifacts that must already exist in data/processed before training.
required = [
    "X_train_processed.npz",
    "X_test_processed.npz",
    "y_train.csv",
    "y_test.csv",
    "scale_pos_weight.txt",
]

# Fail fast, naming the full path of the first missing artifact.
for fname in required:
    p = os.path.join(processed_dir, fname)
    if not os.path.exists(p):
        raise FileNotFoundError(f"Required file not found: {p}")

# Feature matrices are loaded as scipy sparse matrices and converted to CSR
# (rows are sliced further down for the validation split).
X_train_full = sparse.load_npz(os.path.join(processed_dir, "X_train_processed.npz")).tocsr()
X_test = sparse.load_npz(os.path.join(processed_dir, "X_test_processed.npz")).tocsr()
# squeeze() collapses a one-column frame to a Series.
# NOTE(review): assumes each label CSV holds exactly one column — confirm.
y_train_full = pd.read_csv(os.path.join(processed_dir, "y_train.csv")).squeeze().astype(int)
y_test = pd.read_csv(os.path.join(processed_dir, "y_test.csv")).squeeze().astype(int)

# Single float read from disk; passed to XGBClassifier(scale_pos_weight=...).
with open(os.path.join(processed_dir, "scale_pos_weight.txt"), "r") as f:
    scale_pos_weight = float(f.read().strip())

# Quick sanity print: shapes, positive-class rates and the imbalance weight.
print("X_train_full:", X_train_full.shape)
print("X_test      :", X_test.shape)
print("pos_rate train_full:", float(y_train_full.mean()), "test:", float(y_test.mean()))
print("scale_pos_weight:", scale_pos_weight)

# -----------------------------
# 2) Temporal VAL split (last 20% as validation)
# -----------------------------
# The first 80% of rows become TRAIN and the remaining rows become VAL; no
# shuffling is applied.
# NOTE(review): this is only a *temporal* split if the rows of
# X_train_processed.npz / y_train.csv are stored in chronological order —
# confirm against the preprocessing step.
n = X_train_full.shape[0]
split_idx = int(n * 0.8)

X_train = X_train_full[:split_idx]
# Labels are re-indexed from 0 so both label Series carry a fresh index.
y_train = y_train_full.iloc[:split_idx].reset_index(drop=True)

X_val = X_train_full[split_idx:]
y_val = y_train_full.iloc[split_idx:].reset_index(drop=True)

print("\nTemporal VAL split:")
print("X_train:", X_train.shape, "pos_rate:", float(y_train.mean()))
print("X_val  :", X_val.shape, "pos_rate:", float(y_val.mean()))

# -----------------------------
# 3) Dense conversion if safe
# -----------------------------
def can_dense(X, max_bytes=2.5e9, itemsize=4):
    """Tell whether densifying ``X`` stays under a memory budget.

    Args:
        X: Any object with a 2-tuple ``shape`` attribute ``(rows, cols)``.
        max_bytes: Exclusive upper bound on the estimated dense size, in
            bytes. Defaults to ~2.5 GB.
        itemsize: Bytes per element of the dense array. Defaults to 4
            (float32).

    Returns:
        True when ``rows * cols * itemsize`` is strictly below ``max_bytes``.
    """
    rows, cols = X.shape
    approx_bytes = rows * cols * itemsize
    return bool(approx_bytes < max_bytes)

# The budget is checked on the full training matrix (TRAIN + VAL rows).
dense_allowed = can_dense(X_train_full)
print("\nDense conversion allowed:", dense_allowed)

if dense_allowed:
    # Cast to float32 while still sparse, then densify. Densifying first
    # would allocate a full dense array in the stored dtype before the cast,
    # exceeding the float32 budget that can_dense() assumed.
    X_train_dense = X_train.astype(np.float32).toarray()
    X_val_dense   = X_val.astype(np.float32).toarray()
    X_test_dense  = X_test.astype(np.float32).toarray()
else:
    # Dense-only models are skipped later when these are None.
    X_train_dense = X_val_dense = X_test_dense = None

# -----------------------------
# 4) Define models
# -----------------------------
# Registry of unfitted base estimators, keyed by the display name used in
# every results table below. Insertion order is the training order.
base_models = {
    # Linear baseline with class re-weighting.
    "LogisticRegression": LogisticRegression(max_iter=3000, n_jobs=-1, class_weight="balanced"),
    "RandomForest": RandomForestClassifier(
        n_estimators=500,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample",
    ),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=800,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced",
    ),
    # 30 logistic regressions, each fitted on a 75% bootstrap sample.
    # NOTE(review): n_jobs=-1 is set on both the bagger and its inner
    # estimator — check for CPU oversubscription.
    "Bagging_LR": BaggingClassifier(
        estimator=LogisticRegression(max_iter=3000, n_jobs=-1, class_weight="balanced"),
        n_estimators=30,
        max_samples=0.75,
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    ),
    # No class weighting is configured for this model.
    "HistGradientBoosting": HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.06,
        max_iter=400,
        random_state=42,
    ),
    "XGBoost_Tuned": XGBClassifier(
        n_estimators=3000,            # upper bound; meant to be cut by early stopping
        learning_rate=0.02,
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        gamma=0.0,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight,  # imbalance weight loaded from disk
        tree_method="hist",
    ),
}

# Models listed here are fed the dense float32 arrays instead of the sparse
# matrices; any model not listed defaults to sparse input.
requires_dense = {"HistGradientBoosting": True}

# -----------------------------
# 5) Train base models (select by VAL PR_AUC)
# -----------------------------
results_val = []   # one evaluate_probas() dict per trained model
thr_val = []       # one summarize_threshold_metrics() dict per trained model
trained = {}       # name -> estimator fitted on TRAIN
input_type = {}    # name -> "dense" | "sparse", the input format it was fitted on

for name, model in base_models.items():
    print(f"\nTraining (Temporal VAL): {name}")

    # Pick the matrix flavour this model was registered for.
    use_dense = bool(requires_dense.get(name, False))
    if use_dense:
        if not dense_allowed:
            # Dense arrays were never built, so this model cannot be trained.
            print(f"[SKIP] {name} needs dense but dense conversion not allowed.")
            continue
        Xtr, Xva = X_train_dense, X_val_dense
    else:
        Xtr, Xva = X_train, X_val

    # XGB early stop robust: only XGBoost sees VAL during fitting.
    if name == "XGBoost_Tuned":
        xgb_fit_with_early_stop_robust(model, Xtr, y_train, Xva, y_val)
    else:
        model.fit(Xtr, y_train)

    # Score on VAL with threshold-free metrics and at the best-F1 threshold.
    probas_val = safe_predict_proba(model, Xva)
    results_val.append(evaluate_probas(y_val, probas_val, name=name))
    thr_val.append(summarize_threshold_metrics(y_val, probas_val, name=name))

    trained[name] = model
    input_type[name] = "dense" if use_dense else "sparse"

# Rank models: PR_AUC for the metric table, best F1 for the threshold table.
val_df = pd.DataFrame(results_val).sort_values("PR_AUC", ascending=False)
val_thr_df = pd.DataFrame(thr_val).sort_values("F1_at_best", ascending=False)

print("\nVAL performance (temporal):")
print(val_df)

print("\nVAL best-threshold summary:")
print(val_thr_df)

# Top row after sorting is the best base model by VAL PR_AUC.
best_name = val_df.iloc[0]["Model"]
print(f"\nSelected best by VAL PR_AUC: {best_name} | input: {input_type[best_name]}")

# -----------------------------
# 6) Ensembles
# -----------------------------
# Ensemble members are a fixed shortlist; any that were skipped during base
# training (and so are missing from `trained`) are dropped.
voting_candidates = ["LogisticRegression", "Bagging_LR", "HistGradientBoosting", "XGBoost_Tuned"]
estimators = [(m, trained[m]) for m in voting_candidates if m in trained]

# An ensemble passes one matrix to all members, so a single dense-only
# member forces dense input for the whole ensemble.
ensemble_needs_dense = any(input_type.get(m, "sparse") == "dense" for m, _ in estimators)

def X_for(inp, split):
    """Look up the module-level feature matrix for an input kind and split.

    Args:
        inp: ``"dense"`` selects the dense arrays; any other value selects
            the sparse matrices.
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        The matching module-level matrix.

    Raises:
        RuntimeError: If dense input is requested but ``dense_allowed`` is
            false.
        ValueError: If ``split`` is not a known split name.
    """
    if inp == "dense":
        if not dense_allowed:
            raise RuntimeError("Dense required but not allowed.")
        by_split = (
            ("train", X_train_dense),
            ("val", X_val_dense),
            ("test", X_test_dense),
        )
    else:
        by_split = (
            ("train", X_train),
            ("val", X_val),
            ("test", X_test),
        )

    for label, matrix in by_split:
        if split == label:
            return matrix
    raise ValueError(split)

# All three ensembles share the same input flavour and the same TRAIN/VAL
# matrices.
ens_input = "dense" if ensemble_needs_dense else "sparse"
Xtr_ens = X_for(ens_input, "train")
Xva_ens = X_for(ens_input, "val")

# Voting soft: unweighted soft vote over the member list.
# NOTE(review): scikit-learn ensembles fit fresh clones of their members, so
# the XGBoost member is presumably trained here without the early stopping
# used for the stand-alone model — confirm this is intended.
print("\nTraining ensemble: Voting_Soft (Temporal VAL)")
voting_soft = VotingClassifier(estimators=estimators, voting="soft", n_jobs=-1)
voting_soft.fit(Xtr_ens, y_train)
probas_vote = safe_predict_proba(voting_soft, Xva_ens)

# Weighted soft (use VAL PR_AUC weights)
# NOTE(review): the weights come from the same VAL split the ensemble is
# scored on below, so its VAL numbers are somewhat optimistic.
pr_map = {row["Model"]: float(row["PR_AUC"]) for _, row in val_df.iterrows()}
raw_weights = np.array([pr_map.get(m, 0.0) for m, _ in estimators], dtype=float)
weights = None  # None means uniform weights in VotingClassifier
if np.any(raw_weights > 0):
    # Small offset keeps every member's weight strictly positive, then the
    # weights are normalised to sum to 1.
    raw_weights = raw_weights + 1e-6
    weights = (raw_weights / raw_weights.sum()).tolist()

print("Voting models:", [m for m, _ in estimators])
print("Weights:", weights)

print("\nTraining ensemble: Voting_WeightedSoft (Temporal VAL)")
voting_weighted = VotingClassifier(estimators=estimators, voting="soft", weights=weights, n_jobs=-1)
voting_weighted.fit(Xtr_ens, y_train)
probas_vote_w = safe_predict_proba(voting_weighted, Xva_ens)

# Stacking: a class-weighted logistic regression is the meta-learner over the
# members' predict_proba outputs, with cv=5.
print("\nTraining ensemble: Stacking (Temporal VAL)")
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=3000, n_jobs=-1, class_weight="balanced"),
    stack_method="predict_proba",
    n_jobs=-1,
    cv=5,
)
stacking.fit(Xtr_ens, y_train)
probas_stack = safe_predict_proba(stacking, Xva_ens)

# Add ensemble results
# Shallow copies keep the base-only lists (results_val / thr_val) untouched.
results_val2 = results_val.copy()
thr_val2 = thr_val.copy()

for nm, pr in [
    ("Voting_Soft", probas_vote),
    ("Voting_WeightedSoft", probas_vote_w),
    ("Stacking", probas_stack),
]:
    results_val2.append(evaluate_probas(y_val, pr, name=nm))
    thr_val2.append(summarize_threshold_metrics(y_val, pr, name=nm))

# Register the ensembles next to the base models so the final selection can
# look any winner up by name.
trained["Voting_Soft"] = voting_soft
trained["Voting_WeightedSoft"] = voting_weighted
trained["Stacking"] = stacking

input_type["Voting_Soft"] = ens_input
input_type["Voting_WeightedSoft"] = ens_input
input_type["Stacking"] = ens_input

val_all_df = pd.DataFrame(results_val2).sort_values("PR_AUC", ascending=False)
print("\nAll models (base + ensembles) VAL performance:")
print(val_all_df)

# Final winner across base models and ensembles, by VAL PR_AUC.
final_best = val_all_df.iloc[0]["Model"]
final_input = input_type[final_best]
print(f"\nFINAL selected (by temporal VAL PR_AUC): {final_best} | input: {final_input}")

# -----------------------------
# 7) Refit best on TRAIN+VAL then TEST
# -----------------------------
print("\nRefitting best model on TRAIN+VAL then evaluating on TEST...")

X_trainval = X_train_full
y_trainval = y_train_full.reset_index(drop=True)

if final_input == "dense":
    if not dense_allowed:
        raise RuntimeError("Dense required but not allowed.")
    # Cast while still sparse so no dense intermediate in the stored dtype
    # is allocated before the float32 conversion.
    X_trainval_in = X_train_full.astype(np.float32).toarray()
    X_test_in = X_test_dense
else:
    X_trainval_in = X_trainval
    X_test_in = X_test

best_model = trained[final_best]

# Refit
if final_best == "XGBoost_Tuned":
    # n_estimators=3000 is only an upper bound meant to be cut by early
    # stopping. There is no held-out set left for the refit, so reuse the
    # number of rounds the early-stopped TRAIN fit settled on; otherwise the
    # model scored on TEST would differ from the one selected on VAL.
    # best_iteration is absent when early stopping did not run.
    best_iter = getattr(best_model, "best_iteration", None)
    if best_iter is not None:
        best_model.set_params(n_estimators=int(best_iter) + 1)
        print("XGBoost refit uses n_estimators =", int(best_iter) + 1)
best_model.fit(X_trainval_in, y_trainval)

probas_test = safe_predict_proba(best_model, X_test_in)
test_df = pd.DataFrame([evaluate_probas(y_test, probas_test, name=f"{final_best}_TEST")])
test_thr = pd.DataFrame([summarize_threshold_metrics(y_test, probas_test, name=f"{final_best}_TEST")])

print("\nTEST metrics (honest):")
print(test_df)

# This summary picks its threshold using TEST labels, so it is an optimistic
# upper bound rather than an honest estimate.
print("\nTEST best-threshold summary:")
print(test_thr)

# Honest operating point: apply the threshold that was chosen on VAL for the
# selected model to the TEST probabilities.
selected_thr = next(
    (row["BestThr_F1"] for row in thr_val2 if row["Model"] == final_best),
    None,
)
if selected_thr is not None:
    preds_at_val_thr = (np.asarray(probas_test) >= selected_thr).astype(int)
    prec_at_val_thr = float(precision_score(y_test, preds_at_val_thr, zero_division=0))
    rec_at_val_thr = float(recall_score(y_test, preds_at_val_thr, zero_division=0))
    pr_sum = prec_at_val_thr + rec_at_val_thr
    test_at_val_thr = pd.DataFrame([{
        "Model": f"{final_best}_TEST",
        "Thr_from_VAL": float(selected_thr),
        "F1": (2 * prec_at_val_thr * rec_at_val_thr / pr_sum) if pr_sum > 0 else 0.0,
        "Precision": prec_at_val_thr,
        "Recall": rec_at_val_thr,
    }])
    print("\nTEST metrics at the VAL-selected threshold:")
    print(test_at_val_thr)

# -----------------------------
# 8) Calibration (optional)
# -----------------------------
# Wrap the selected model in a sigmoid calibrator with 3-fold CV and fit it
# on TRAIN+VAL; TEST is only used for scoring afterwards.
print("\nCalibrating best model (sigmoid, CV=3) on TRAIN+VAL only...")
calibrated = CalibratedClassifierCV(best_model, method="sigmoid", cv=3)
calibrated.fit(X_trainval_in, y_trainval)
probas_cal = calibrated.predict_proba(X_test_in)[:, 1]

cal_df = pd.DataFrame([evaluate_probas(y_test, probas_cal, name=f"{final_best}_Calibrated_TEST")])
# NOTE(review): the threshold in this summary is chosen on TEST labels, so
# the F1/precision/recall shown are optimistic.
cal_thr = pd.DataFrame([summarize_threshold_metrics(y_test, probas_cal, name=f"{final_best}_Calibrated_TEST")])

print("\nCalibrated TEST metrics:")
print(cal_df)
print("\nCalibrated TEST best-threshold summary:")
print(cal_thr)

# -----------------------------
# 9) Save best + calibrated
# -----------------------------
# File names embed the winning model's name, so a run that selects a
# different model writes different files instead of overwriting these.
best_path = os.path.join(models_dir, f"{final_best}_model.pkl")
cal_path  = os.path.join(models_dir, f"{final_best}_Calibrated_model.pkl")

# best_model is the TRAIN+VAL refit; calibrated wraps it with the sigmoid
# calibrator fitted above.
joblib.dump(best_model, best_path)
joblib.dump(calibrated, cal_path)

print("\nSaved best model:", best_path)
print("Saved calibrated model:", cal_path)
print("\n[Training completed successfully - temporal VAL selection + honest TEST evaluation.]")


X_train_full: (40000, 29)
X_test      : (10000, 29)
pos_rate train_full: 0.051525 test: 0.0506
scale_pos_weight: 18.40805434255216

Temporal VAL split:
X_train: (32000, 29) pos_rate: 0.0515625
X_val  : (8000, 29) pos_rate: 0.051375

Dense conversion allowed: True

Training (Temporal VAL): LogisticRegression

Training (Temporal VAL): RandomForest

Training (Temporal VAL): ExtraTrees

Training (Temporal VAL): Bagging_LR

Training (Temporal VAL): HistGradientBoosting

Training (Temporal VAL): XGBoost_Tuned

VAL performance (temporal):
                  Model   ROC_AUC   LogLoss    PR_AUC
4  HistGradientBoosting  0.550186  0.201382  0.067160
1          RandomForest  0.552556  0.325918  0.060533
5         XGBoost_Tuned  0.523871  0.294011  0.059590
2            ExtraTrees  0.524061  0.525319  0.056851
0    LogisticRegression  0.490578  0.676817  0.049854
3            Bagging_LR  0.492193  0.671473  0.049130

VAL best-threshold summary:
                  Model  BestThr_F1  F1_at_best  Precis

### Modifications Summary
This training notebook has been updated to remove dependence on SMOTE-oversampled data.
Processed feature matrices (`X_train_processed.npz`, `X_test_processed.npz`) and labels are loaded instead.
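Class imbalance is addressed via the `scale_pos_weight` parameter on the XGBoost model (computed from the training data and loaded from `scale_pos_weight.txt`) and via `class_weight` on the logistic-regression, random-forest and extra-trees models.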
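Evaluation metrics now focus on ROC-AUC, LogLoss, and PR-AUC. Accuracy has been removed because it is uninformative for imbalanced data; F1 is no longer a headline metric and is reported only at the F1-maximizing threshold, together with the precision and recall at that threshold.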
