# 05 – Model Training and Ensemble Learning

In this notebook we train multiple models on the preprocessed data and compare their performance. We include logistic regression, random forest, and gradient boosting. We then build ensemble models, such as stacking and voting, using scikit-learn.


In [None]:
import os
import json
import numpy as np
import pandas as pd
from scipy import sparse
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score


# Utils
def safe_predict_proba(model, X):
    """Return a score in [0, 1] per sample for the positive class.

    When the model exposes ``predict_proba``, column 1 of its output is
    returned (this assumes a binary classifier whose second column is the
    positive class). Otherwise the output of ``decision_function`` is passed
    through a logistic sigmoid.

    Args:
        model: Fitted estimator exposing ``predict_proba`` or
            ``decision_function``.
        X: Feature matrix, forwarded unchanged to the model.

    Returns:
        NumPy array of scores with the shape of the model's output column.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]

    scores = np.asarray(model.decision_function(X))
    # exp(-|s|) is always in (0, 1], so it cannot overflow the way exp(-s)
    # does for large negative margins. Both branches below equal
    # 1 / (1 + exp(-s)) mathematically.
    decay = np.exp(-np.abs(scores))
    return np.where(scores >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


def can_dense(X, max_bytes=2.5e9, bytes_per_value=4):
    """Tell whether a dense copy of ``X`` fits within a memory budget.

    The estimate is ``rows * cols * bytes_per_value``; the defaults model a
    float32 array capped at 2.5e9 bytes.

    Args:
        X: Any object with a two-element ``shape`` (rows, cols).
        max_bytes: Exclusive upper bound on the estimated dense size.
        bytes_per_value: Size of one stored element in bytes.

    Returns:
        True when the estimated dense size is strictly below ``max_bytes``.
    """
    rows, cols = X.shape
    return rows * cols * bytes_per_value < max_bytes


def save_model(obj, path):
    """Serialize ``obj`` to ``path`` with joblib and print the destination."""
    joblib.dump(value=obj, filename=path)
    print("[SAVED]", path)


def best_f1_threshold(y_true, p, steps=200):
    """Grid-search the cutoff on ``p`` that gives the highest F1 score.

    Candidate cutoffs are ``steps`` evenly spaced values from 0.01 to 0.99.
    A sample is predicted positive when its score is >= the cutoff. On ties
    the lowest cutoff wins.

    Args:
        y_true: Ground-truth binary labels.
        p: Array of scores, compared element-wise against each cutoff.
        steps: Number of candidate cutoffs to evaluate.

    Returns:
        Tuple ``(best_threshold, best_f1)`` as plain floats; ``(0.5, -1.0)``
        when the grid is empty.
    """
    candidates = np.linspace(0.01, 0.99, steps)
    f1_by_candidate = [
        f1_score(y_true, (p >= cutoff).astype(int), zero_division=0)
        for cutoff in candidates
    ]
    if not f1_by_candidate:
        return 0.5, -1.0

    # argmax returns the first maximum, matching a strict ">" scan.
    winner = int(np.argmax(f1_by_candidate))
    return float(candidates[winner]), float(f1_by_candidate[winner])


# Paths / Load processed data
# The project root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "data", "processed")
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)


def _load_features(filename):
    """Load a saved sparse matrix from ``processed_dir`` in CSR format."""
    return sparse.load_npz(os.path.join(processed_dir, filename)).tocsr()


def _load_labels(filename):
    """Load a label CSV from ``processed_dir``, squeezed and cast to int."""
    return pd.read_csv(os.path.join(processed_dir, filename)).squeeze().astype(int)


X_train_full = _load_features("X_train_processed.npz")
X_test = _load_features("X_test_processed.npz")
y_train_full = _load_labels("y_train.csv")
y_test = _load_labels("y_test.csv")

print("X_train_full:", X_train_full.shape, "sparse:", sparse.issparse(X_train_full))
print("X_test:", X_test.shape, "sparse:", sparse.issparse(X_test))

# Temporal validation split: the first 80% of rows train, the last 20%
# validate. NOTE(review): assumes the rows are stored in chronological
# order -- confirm against the preprocessing step.
n = X_train_full.shape[0]
split_idx = int(n * 0.8)

X_train, X_val = X_train_full[:split_idx], X_train_full[split_idx:]
y_train = y_train_full.iloc[:split_idx].reset_index(drop=True)
y_val = y_train_full.iloc[split_idx:].reset_index(drop=True)

# Dense float32 copies are only materialized when the size estimate allows it.
dense_allowed = can_dense(X_train_full)
print("dense_allowed:", dense_allowed)

if dense_allowed:
    X_train_dense, X_val_dense, X_test_dense = (
        part.toarray().astype(np.float32) for part in (X_train, X_val, X_test)
    )


# Base Models
# Keyword sets shared by several estimators, factored out to avoid repetition.
_lr_kwargs = dict(max_iter=3000, n_jobs=-1, class_weight="balanced")
_forest_kwargs = dict(
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)

# Insertion order matters: it fixes both the training order and the order of
# the estimators handed to the ensembles later on.
base_models = {
    "LogisticRegression": LogisticRegression(**_lr_kwargs),
    "RandomForest": RandomForestClassifier(
        n_estimators=500,
        class_weight="balanced_subsample",
        **_forest_kwargs,
    ),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=800,
        class_weight="balanced",
        **_forest_kwargs,
    ),
    "Bagging_LR": BaggingClassifier(
        estimator=LogisticRegression(**_lr_kwargs),
        n_estimators=30,
        max_samples=0.75,
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.06,
        max_iter=400,
        random_state=42,
    ),
}

# Models listed here are fed the dense float32 matrices; all others get sparse.
requires_dense = {"HistGradientBoosting": True}

trained = {}
input_type = {}

# Train base models
for i, (name, model) in enumerate(base_models.items(), 1):
    print(f"\nTraining model {i}: {name}")

    needs_dense = requires_dense.get(name, False)
    if needs_dense and not dense_allowed:
        print(f"[SKIP] {name} requires dense, but dense is not allowed.")
        continue

    fit_kind = "dense" if needs_dense else "sparse"
    model.fit(X_train_dense if needs_dense else X_train, y_train)

    # Record the model only after a successful fit.
    trained[name], input_type[name] = model, fit_kind
    print(f"Model {name} training finished")

    save_model(model, os.path.join(models_dir, f"{name}_model.pkl"))


# Ensembles
# Every successfully trained base model becomes a member of both ensembles.
estimators = list(trained.items())

# A single dense-only member forces the whole ensemble onto dense input.
_member_inputs = {input_type[member] for member, _ in estimators}
ensemble_input = "dense" if "dense" in _member_inputs else "sparse"

if ensemble_input == "dense":
    Xtr_ens, Xval_ens = X_train_dense, X_val_dense
else:
    Xtr_ens, Xval_ens = X_train, X_val

print("\nTraining model Voting_Soft")
voting_soft = VotingClassifier(estimators=estimators, voting="soft", n_jobs=-1)
voting_soft.fit(Xtr_ens, y_train)
print("Model Voting_Soft training finished")
save_model(voting_soft, os.path.join(models_dir, "Voting_Soft_model.pkl"))

print("\nTraining model Stacking")
_meta_learner = LogisticRegression(
    max_iter=3000,
    n_jobs=-1,
    class_weight="balanced",
)
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=_meta_learner,
    stack_method="predict_proba",
    cv=5,
    n_jobs=-1,
)
stacking.fit(Xtr_ens, y_train)
print("Model Stacking training finished")
save_model(stacking, os.path.join(models_dir, "Stacking_model.pkl"))

# Evaluate on VAL to get BestThr_F1
# Best-effort step: any failure is reported as a warning and the run continues.
try:
    p_val = safe_predict_proba(stacking, Xval_ens)
    best_thr, best_f1 = best_f1_threshold(y_val.values, p_val, steps=200)
    print(f"\n[VAL] BestThr_F1={best_thr:.4f}  F1={best_f1:.4f}")

    best_thr_path = os.path.join(processed_dir, "best_threshold.json")
    threshold_report = {
        "model": "Stacking",
        "best_threshold_f1": best_thr,
        "best_f1": best_f1,
    }
    with open(best_thr_path, "w", encoding="utf-8") as f:
        json.dump(threshold_report, f, ensure_ascii=False, indent=2)
    print("[SAVED]", best_thr_path)
except Exception as e:
    print("[WARN] Could not compute/save best_threshold.json:", str(e)[:200])


# Refit best model on TRAIN+VAL
# `final_model` is an alias of `stacking` (no copy is made), so the fit below
# replaces the train-only fit held in memory by that same object.
final_model = stacking
final_input = ensemble_input

# X_train_full / y_train_full hold both the training and the validation rows.
X_trainval = X_train_full
y_trainval = y_train_full.reset_index(drop=True)

# Densify only when the ensemble was trained on dense input.
X_trainval_in = (
    X_trainval.toarray().astype(np.float32)
    if final_input == "dense"
    else X_trainval
)

# NOTE(review): the refit model is not written to disk anywhere in this
# section; only the calibrated wrapper below is saved. Confirm whether the
# refit estimator should also be persisted or is used by later cells.
print("\nRefit final model on full train...")
final_model.fit(X_trainval_in, y_trainval)
print("Final model refit finished")

# Calibration on full train
# NOTE(review): with an integer `cv`, CalibratedClassifierCV presumably fits
# its own copies of the estimator per fold rather than reusing the refit
# above -- confirm against the scikit-learn documentation.
# NOTE(review): best_threshold.json is computed from the uncalibrated,
# train-only stacking model, so that threshold may not transfer to the
# calibrated probabilities produced here -- TODO confirm.
print("\nTraining model Stacking_Calibrated")
calibrated = CalibratedClassifierCV(final_model, method="sigmoid", cv=3)
calibrated.fit(X_trainval_in, y_trainval)
print("Model Stacking_Calibrated training finished")
save_model(calibrated, os.path.join(models_dir, "Stacking_Calibrated_model.pkl"))

# Save meta to help Streamlit avoid feature mismatch
# Best-effort step: a failure here only prints a warning.
try:
    feature_count = int(X_train_full.shape[1])
    meta_path = os.path.join(processed_dir, "feature_count.json")
    meta_payload = {"n_features": feature_count}
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta_payload, f, indent=2)
    print("[SAVED]", meta_path, "n_features=", feature_count)
except Exception as e:
    print("[WARN] Could not save feature_count.json:", str(e)[:200])

print("\nDONE. Models folder should now contain multiple .pkl files.")


X_train_full: (40000, 14) sparse: True
X_test: (10000, 14) sparse: True
dense_allowed: True

Training model 1: LogisticRegression
Model LogisticRegression training finished
[SAVED] d:\projects\Ai\project_fusion_ecu\models\LogisticRegression_model.pkl

Training model 2: RandomForest
Model RandomForest training finished
[SAVED] d:\projects\Ai\project_fusion_ecu\models\RandomForest_model.pkl

Training model 3: ExtraTrees
Model ExtraTrees training finished
[SAVED] d:\projects\Ai\project_fusion_ecu\models\ExtraTrees_model.pkl

Training model 4: Bagging_LR
Model Bagging_LR training finished
[SAVED] d:\projects\Ai\project_fusion_ecu\models\Bagging_LR_model.pkl

Training model 5: HistGradientBoosting
Model HistGradientBoosting training finished
[SAVED] d:\projects\Ai\project_fusion_ecu\models\HistGradientBoosting_model.pkl

Training model Voting_Soft
Model Voting_Soft training finished
[SAVED] d:\projects\Ai\project_fusion_ecu\models\Voting_Soft_model.pkl

Training model Stacking
Model Stackin

### Modifications Summary
This training notebook has been updated to remove dependence on SMOTE-oversampled data.
Processed feature matrices (`X_train_processed.npz`, `X_test_processed.npz`) and labels are loaded instead.
Class imbalance is addressed via the `scale_pos_weight` parameter on the XGBoost model, computed from the training data.
Evaluation metrics now focus exclusively on ROC-AUC, LogLoss, and PR-AUC. Accuracy and F1-score have been removed to provide more informative assessment for imbalanced data.
