# 05 – Model Training and Ensemble Learning

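In this notebook we train several models on the preprocessed data and compare their performance. The base models are logistic regression, random forest, extra trees, bagged logistic regression, histogram-based gradient boosting, and XGBoost. We then build soft-voting (equal-weight and PR-AUC-weighted) and stacking ensembles with scikit-learn, and finally calibrate the model with the best PR-AUC.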


In [6]:
# ============================================
# 05_model_training.ipynb  (Model Training + Ensembles)
# - Base models
# - Bagging (LR)  -> Bagging
# - Boosting (HistGB + XGB) -> Boosting
# - Soft Voting + Weighted Soft Voting (fusion)
# - Stacking (fusion)
# - Calibration for best PR_AUC
# ============================================

# ============================================
# 0. Imports and configuration
# ============================================

import os
import numpy as np
import pandas as pd
from scipy import sparse
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    roc_auc_score,
    log_loss,
    average_precision_score,
    precision_recall_curve,
    precision_score,
    recall_score,
)

from sklearn.calibration import CalibratedClassifierCV


# ============================================
# 1. Load processed data and labels
# ============================================

# Resolve the project folders. The parent of the current working directory is
# treated as the project root, so the notebook must be launched from a
# direct sub-folder of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "data", "processed")
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)

for _folder_label, _folder_path in (
    ("Project root:", project_root),
    ("Processed dir:", processed_dir),
    ("Models dir:", models_dir),
):
    print(_folder_label, _folder_path)


def _processed_path(filename):
    """Return the path of `filename` inside the processed-data folder."""
    return os.path.join(processed_dir, filename)


# Artifacts that must already exist in the processed-data folder.
# Not every file checked here is read below (e.g. preprocessor.joblib).
required = [
    "X_train_processed.npz",
    "X_test_processed.npz",
    "y_train.csv",
    "y_test.csv",
    "preprocessor.joblib",
    "scale_pos_weight.txt",
]

# Fail fast on the first missing artifact, before any loading starts.
for _required_name in required:
    _required_path = _processed_path(_required_name)
    if os.path.exists(_required_path):
        continue
    raise FileNotFoundError(f"Required file not found: {_required_path}")
print("All required files exist.")

# Feature matrices are stored as scipy sparse .npz files and converted to CSR.
X_train = sparse.load_npz(_processed_path("X_train_processed.npz")).tocsr()
X_test = sparse.load_npz(_processed_path("X_test_processed.npz")).tocsr()

# Labels are read from CSV, squeezed and cast to int.
y_train = pd.read_csv(_processed_path("y_train.csv")).squeeze().astype(int)
y_test = pd.read_csv(_processed_path("y_test.csv")).squeeze().astype(int)

for _matrix_label, _matrix in (("X_train:", X_train), ("X_test :", X_test)):
    print(_matrix_label, _matrix.shape, "type:", type(_matrix))
for _target_label, _target in (("y_train:", y_train), ("y_test :", y_test)):
    print(_target_label, _target.shape, "pos_rate:", float(_target.mean()))

# Class-imbalance weight stored as plain text in the processed-data folder.
with open(_processed_path("scale_pos_weight.txt"), "r") as _weight_file:
    scale_pos_weight = float(_weight_file.read().strip())
print(f"Loaded scale_pos_weight: {scale_pos_weight:.4f}")

# Dense float32 copies for HistGradientBoosting and any ensemble containing it;
# float32 keeps the dense copies smaller than the default float64.
X_train_dense = X_train.toarray().astype(np.float32)
X_test_dense = X_test.toarray().astype(np.float32)

for _dense_label, _dense in (("X_train_dense:", X_train_dense), ("X_test_dense :", X_test_dense)):
    print(_dense_label, _dense.shape, "dtype:", _dense.dtype)


# ============================================
# 2. Utility: evaluation helpers
# ============================================

def evaluate_probas(y_true, probas, name="Model"):
    """Score predicted probabilities with threshold-free metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    probas : array-like
        Predicted scores/probabilities, one per sample.
    name : str
        Label stored under the "Model" key of the result.

    Returns
    -------
    dict
        Keys "Model", "ROC_AUC", "LogLoss" and "PR_AUC"; the three metric
        values are plain Python floats.
    """
    scorers = (
        ("ROC_AUC", roc_auc_score),
        ("LogLoss", log_loss),
        ("PR_AUC", average_precision_score),
    )
    row = {"Model": name}
    for metric_label, scorer in scorers:
        row[metric_label] = float(scorer(y_true, probas))
    return row

def best_threshold_by_f1(y_true, probas):
    """Find the decision threshold with the highest F1 on the PR curve.

    Returns
    -------
    tuple of float
        (threshold, f1, precision, recall) at the F1-maximising threshold.
    """
    prec_curve, rec_curve, cutoffs = precision_recall_curve(y_true, probas)

    # Drop the last precision/recall entry so both arrays line up
    # one-to-one with the thresholds array.
    prec_at_cutoff = prec_curve[:-1]
    rec_at_cutoff = rec_curve[:-1]

    # The small epsilon keeps the division defined when precision and recall
    # are both zero.
    numerator = 2 * prec_at_cutoff * rec_at_cutoff
    denominator = prec_at_cutoff + rec_at_cutoff + 1e-12
    f1_scores = numerator / denominator

    winner = int(np.nanargmax(f1_scores))
    return (
        float(cutoffs[winner]),
        float(f1_scores[winner]),
        float(prec_at_cutoff[winner]),
        float(rec_at_cutoff[winner]),
    )

def summarize_threshold_metrics(y_true, probas, name="Model"):
    """Report precision and recall at the F1-optimal threshold.

    The threshold comes from `best_threshold_by_f1`; precision and recall are
    then recomputed from the hard predictions `probas >= threshold`.

    Returns
    -------
    dict
        Keys "Model", "BestThr_F1", "F1_at_best", "Precision_at_best" and
        "Recall_at_best".
    """
    # Only the threshold and its F1 are used from the helper's result.
    cutoff, f1_peak, _, _ = best_threshold_by_f1(y_true, probas)
    hard_labels = (probas >= cutoff).astype(int)

    summary = {"Model": name, "BestThr_F1": cutoff, "F1_at_best": f1_peak}
    summary["Precision_at_best"] = float(precision_score(y_true, hard_labels, zero_division=0))
    summary["Recall_at_best"] = float(recall_score(y_true, hard_labels, zero_division=0))
    return summary

def precision_at_k(y_true, probas, k):
    """Fraction of positives among the `k` highest-scored samples.

    Parameters
    ----------
    y_true : array-like
        Binary labels (list, NumPy array or pandas Series). Rows are matched
        to `probas` by position, never by pandas index label.
    probas : array-like
        Scores, one per sample; higher means "more likely positive".
    k : int
        Number of top-ranked samples to inspect. Values larger than the number
        of samples are clamped to the sample count.

    Returns
    -------
    float
        Mean of `y_true` over the top-`k` samples, or NaN when there are no
        samples at all.

    Raises
    ------
    ValueError
        If `k` is not positive, or if `y_true` and `probas` differ in length.
    """
    k = int(k)
    if k < 1:
        # A zero k would average an empty selection and a negative k would
        # slice from the wrong end, so both are rejected explicitly.
        raise ValueError(f"k must be a positive integer, got {k}")

    labels = np.asarray(y_true)
    scores = np.asarray(probas, dtype=float)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and probas must have the same length, got {labels.shape[0]} and {scores.shape[0]}"
        )
    if labels.shape[0] == 0:
        return float("nan")

    # A stable sort makes the result deterministic when scores are tied:
    # tied samples keep their original order.
    top_idx = np.argsort(-scores, kind="stable")[:k]
    return float(labels[top_idx].mean())

def safe_predict_proba(model, X):
    """Return a positive-class score for every row of `X`.

    Uses `model.predict_proba(X)[:, 1]` when available. Otherwise the output
    of `model.decision_function(X)` is squashed through a logistic sigmoid;
    the result lies in [0, 1] but is a monotone transform of the margin, not
    a fitted probability.

    Raises
    ------
    AttributeError
        If the model offers neither `predict_proba` nor `decision_function`.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        # expit is a numerically stable sigmoid: unlike 1 / (1 + exp(-s)) it
        # does not overflow (and warn) for large negative scores.
        from scipy.special import expit

        return expit(model.decision_function(X))
    raise AttributeError(f"Model {model.__class__.__name__} has no predict_proba/decision_function.")


# ============================================
# 3. Define base models (Bagging + Boosting included)
# ============================================

# Hyperparameters shared by the two tree-bagging ensembles
# (RandomForest and ExtraTrees differ only in the number of trees).
_forest_params = dict(
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)

# Insertion order is the order in which the models are trained and reported.
base_models = {}

base_models["LogisticRegression"] = LogisticRegression(max_iter=3000, n_jobs=-1)

base_models["RandomForest"] = RandomForestClassifier(n_estimators=350, **_forest_params)

base_models["ExtraTrees"] = ExtraTreesClassifier(n_estimators=600, **_forest_params)

# Bagging: 25 logistic regressions, each fit on a bootstrap sample of 75% of
# the rows and all of the features.
base_models["Bagging_LR"] = BaggingClassifier(
    estimator=LogisticRegression(max_iter=3000, n_jobs=-1),
    n_estimators=25,
    max_samples=0.75,
    max_features=1.0,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# Boosting (scikit-learn histogram-based gradient boosting).
base_models["HistGradientBoosting"] = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.08,
    max_iter=300,
    random_state=42,
)

# Boosting (XGBoost). This is the only model that receives the class-imbalance
# weight loaded from scale_pos_weight.txt.
base_models["XGBoost_Tuned"] = XGBClassifier(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    gamma=0.0,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    tree_method="hist",
)

# Models that must be fed the dense matrix; any model not listed here is
# treated as accepting sparse input.
requires_dense = {"HistGradientBoosting": True}


# ============================================
# 4. Train and evaluate base models
# ============================================

# Accumulators shared with the ensemble, calibration and saving sections.
results, threshold_results = [], []
trained_models = {}
model_input_type = {}  # model name -> "sparse" or "dense"

for model_name, estimator in base_models.items():
    print(f"\nTraining: {model_name}")

    # Pick the matrix flavour this model was declared to need.
    if requires_dense.get(model_name, False):
        input_kind = "dense"
        fit_X, eval_X = X_train_dense, X_test_dense
    else:
        input_kind = "sparse"
        fit_X, eval_X = X_train, X_test

    estimator.fit(fit_X, y_train)
    test_scores = safe_predict_proba(estimator, eval_X)

    # All metrics in this notebook are computed on the test split.
    results.append(evaluate_probas(y_test, test_scores, name=model_name))
    threshold_results.append(summarize_threshold_metrics(y_test, test_scores, name=model_name))

    trained_models[model_name] = estimator
    model_input_type[model_name] = input_kind

# Rank by PR_AUC (threshold-free) and by best achievable F1, best first.
base_df = pd.DataFrame(results).sort_values(by="PR_AUC", ascending=False)
thr_df = pd.DataFrame(threshold_results).sort_values(by="F1_at_best", ascending=False)

for heading, table in (
    ("\nBase models performance:", base_df),
    ("\nBest-threshold (by F1) summary:", thr_df),
):
    print(heading)
    print(table)


# ============================================
# 5. Fusion: Soft Voting + Weighted Soft Voting
# ============================================

# Ensemble members: the candidates below that were actually trained.
voting_candidates = ["LogisticRegression", "Bagging_LR", "HistGradientBoosting", "XGBoost_Tuned"]
estimators = [
    (mname, trained_models[mname])
    for mname in voting_candidates
    if mname in trained_models
]

# One feature matrix is shared by every member, so if any member needs dense
# input the whole ensemble is fit and scored on the dense copy.
# NOTE(review): members trained on the sparse matrix as base models (e.g.
# XGBoost) are therefore refit on dense input here; XGBoost is documented to
# treat implicit zeros of sparse input as missing, so its behaviour inside the
# ensemble may differ from its standalone score -- confirm this is intended.
ensemble_needs_dense = any(
    model_input_type.get(mname, "sparse") == "dense" for mname, _ in estimators
)
if ensemble_needs_dense:
    ensemble_input = "dense"
    Xtr_ens, Xte_ens = X_train_dense, X_test_dense
else:
    ensemble_input = "sparse"
    Xtr_ens, Xte_ens = X_train, X_test


def _fit_soft_voting(weights):
    """Fit a soft-voting ensemble over `estimators`; return (model, test scores)."""
    ensemble = VotingClassifier(
        estimators=estimators,
        voting="soft",
        weights=weights,
        n_jobs=-1,
    )
    ensemble.fit(Xtr_ens, y_train)
    return ensemble, safe_predict_proba(ensemble, Xte_ens)


def _register_voting(label, fitted, test_scores):
    """Record metrics, the fitted model and its input type under `label`."""
    results.append(evaluate_probas(y_test, test_scores, name=label))
    threshold_results.append(summarize_threshold_metrics(y_test, test_scores, name=label))
    trained_models[label] = fitted
    model_input_type[label] = ensemble_input


# 5.1 Soft Voting (equal weights)
print("\nTraining: Voting_Soft (fusion)")
voting_soft, probas_vote = _fit_soft_voting(weights=None)
_register_voting("Voting_Soft", voting_soft, probas_vote)

# 5.2 Weighted Soft Voting (weights derived from PR_AUC of base models)
print("\nTraining: Voting_WeightedSoft (fusion)")

# Base-model PR_AUC lookup; members without an entry get weight 0.
pr_map = {
    model_label: float(pr_value)
    for model_label, pr_value in zip(base_df["Model"], base_df["PR_AUC"])
}
used_names = [mname for mname, _ in estimators]
raw_weights = np.array([pr_map.get(mname, 0.0) for mname in used_names], dtype=float)

if np.all(raw_weights <= 0):
    weights = None
    print("WARNING: All computed weights were zero; fallback to equal weights.")
else:
    # Small offset so no member ends up with an exactly-zero weight, then
    # normalise the weights to sum to one.
    raw_weights = raw_weights + 1e-6
    weights = (raw_weights / raw_weights.sum()).tolist()

print("Weighted voting models:", used_names)
print("Weights (normalized):", weights)

voting_weighted, probas_vote_w = _fit_soft_voting(weights=weights)
_register_voting("Voting_WeightedSoft", voting_weighted, probas_vote_w)


# ============================================
# 6. Fusion: Stacking
# ============================================

print("\nTraining: Stacking (fusion)")

# A logistic-regression meta-learner is trained on the members' predicted
# probabilities produced with 5-fold cross-validation; raw features are not
# passed through to it.
meta_learner = LogisticRegression(max_iter=3000, n_jobs=-1)
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    stack_method="predict_proba",
    passthrough=False,
    n_jobs=-1,
    cv=5,
)
stacking.fit(Xtr_ens, y_train)
probas_stack = safe_predict_proba(stacking, Xte_ens)

stacking_label = "Stacking"
results.append(evaluate_probas(y_test, probas_stack, name=stacking_label))
threshold_results.append(summarize_threshold_metrics(y_test, probas_stack, name=stacking_label))
trained_models[stacking_label] = stacking
if ensemble_needs_dense:
    model_input_type[stacking_label] = "dense"
else:
    model_input_type[stacking_label] = "sparse"


# ============================================
# 7. Summaries
# ============================================

# Leaderboards over everything trained so far, best first.
all_df = pd.DataFrame(results).sort_values(by="PR_AUC", ascending=False)
all_thr_df = pd.DataFrame(threshold_results).sort_values(by="F1_at_best", ascending=False)

for heading, table in (
    ("\nAll models (base + ensembles) performance:", all_df),
    ("\nAll models best-threshold summary:", all_thr_df),
):
    print(heading)
    print(table)


# ============================================
# 8. Calibrate best model by PR_AUC
# ============================================

# The leaderboard is sorted by PR_AUC descending, so its first row is the winner.
# NOTE(review): the winner is selected using test-set PR_AUC and the calibrated
# model is then scored on the same test set, so its reported metrics are
# optimistic -- consider selecting on a separate validation split.
best_name = all_df["Model"].iloc[0]
best_model = trained_models[best_name]
best_input = model_input_type.get(best_name, "sparse")

print(f"\nBest model by PR_AUC: {best_name}")
print(f"Best model input type: {best_input}")

# Feed the calibrator the same matrix flavour the winner was trained on.
if best_input == "dense":
    Xtr_cal, Xte_cal = X_train_dense, X_test_dense
else:
    Xtr_cal, Xte_cal = X_train, X_test

print("Training calibrated best model (sigmoid, CV=3)...")
calibrated = CalibratedClassifierCV(best_model, method="sigmoid", cv=3)
calibrated.fit(Xtr_cal, y_train)
probas_cal = calibrated.predict_proba(Xte_cal)[:, 1]

# Register the calibrated variant alongside the other models.
cal_name = f"{best_name}_Calibrated"
results.append(evaluate_probas(y_test, probas_cal, name=cal_name))
threshold_results.append(summarize_threshold_metrics(y_test, probas_cal, name=cal_name))
trained_models[cal_name] = calibrated
model_input_type[cal_name] = best_input

all_df2 = pd.DataFrame(results).sort_values(by="PR_AUC", ascending=False)
all_thr_df2 = pd.DataFrame(threshold_results).sort_values(by="F1_at_best", ascending=False)

for heading, table in (
    ("\nAll models (including calibrated) performance:", all_df2),
    ("\nAll models best-threshold summary (including calibrated):", all_thr_df2),
):
    print(heading)
    print(table)


# ============================================
# 9. Optional: Precision@K ranking proxy
# ============================================

# Cut-offs for the ranking report; each is capped at the test-set size.
Ks = [500, 1000, 2000, 5000]
rank_rows = []

n_test = len(y_test)
capped_ks = [min(cutoff, n_test) for cutoff in Ks]

for model_name in all_df2["Model"].tolist():
    fitted = trained_models.get(model_name)
    if fitted is None:
        continue

    if model_input_type.get(model_name, "sparse") == "dense":
        Xte_rank = X_test_dense
    else:
        Xte_rank = X_test

    # The calibrated model's test scores were already computed; every other
    # model is scored again here.
    if model_name == cal_name:
        ranking_scores = probas_cal
    else:
        ranking_scores = safe_predict_proba(fitted, Xte_rank)

    rank_rows.extend(
        {
            "Model": model_name,
            "K": cutoff,
            "Precision@K": precision_at_k(y_test, ranking_scores, cutoff),
        }
        for cutoff in capped_ks
    )

rank_df = pd.DataFrame(rank_rows)
print("\nPrecision@K (ranking proxy):")
# Group by K, best model first within each K.
print(rank_df.sort_values(by=["K", "Precision@K"], ascending=[True, False]))


# ============================================
# 10. Save models
# ============================================

# Everything to persist: base models, the three ensembles and the calibrated winner.
_base_names = (
    "LogisticRegression",
    "RandomForest",
    "ExtraTrees",
    "Bagging_LR",
    "HistGradientBoosting",
    "XGBoost_Tuned",
)
_ensemble_names = ("Voting_Soft", "Voting_WeightedSoft", "Stacking")
save_list = [*_base_names, *_ensemble_names, cal_name]

for model_name in save_list:
    fitted = trained_models.get(model_name)
    if fitted is None:
        # Names with no stored model are skipped rather than treated as errors.
        continue
    target_path = os.path.join(models_dir, f"{model_name}_model.pkl")
    joblib.dump(fitted, target_path)
    print(f"Saved {model_name} model to: {target_path}")

print("[Training + ensembles (including weighted voting) + calibration completed successfully.]")


Project root: d:\projects\Ai\project_fusion_ecu
Processed dir: d:\projects\Ai\project_fusion_ecu\data\processed
Models dir: d:\projects\Ai\project_fusion_ecu\models
All required files exist.
X_train: (160000, 19) type: <class 'scipy.sparse._csr.csr_matrix'>
X_test : (40000, 19) type: <class 'scipy.sparse._csr.csr_matrix'>
y_train: (160000,) pos_rate: 0.04765625
y_test : (40000,) pos_rate: 0.0457
Loaded scale_pos_weight: 19.9836
X_train_dense: (160000, 19) dtype: float32
X_test_dense : (40000, 19) dtype: float32

Training: LogisticRegression

Training: RandomForest

Training: ExtraTrees

Training: Bagging_LR

Training: HistGradientBoosting

Training: XGBoost_Tuned

Base models performance:
                  Model   ROC_AUC   LogLoss    PR_AUC
3            Bagging_LR  0.546442  0.184862  0.056800
0    LogisticRegression  0.546306  0.184884  0.056705
4  HistGradientBoosting  0.552093  0.184951  0.054692
5         XGBoost_Tuned  0.539574  0.650461  0.053649
2            ExtraTrees  0.53540

### Modifications Summary
This training notebook has been updated to remove dependence on SMOTE-oversampled data.
Processed feature matrices (`X_train_processed.npz`, `X_test_processed.npz`) and labels are loaded instead.
Class imbalance is addressed via the `scale_pos_weight` parameter on the XGBoost model, computed from the training data.
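The model comparison tables now report ROC-AUC, LogLoss, and PR-AUC (models are ranked by PR-AUC); accuracy has been removed because it is uninformative for imbalanced data. F1 is no longer used as a headline metric: it is used only to pick a per-model decision threshold, with precision and recall reported at that threshold, and Precision@K is reported as a ranking proxy.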
