# 05 – Model Training and Ensemble Learning

In this notebook we train multiple models on the preprocessed data and compare their performance. We include logistic regression, random forest, gradient boosting, and XGBoost. We then build ensemble models such as stacking and voting using scikit-learn.


In [4]:
# ============================================
# 0. Imports and configuration
# ============================================

import os
import pandas as pd
from scipy import sparse
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
    VotingClassifier,
)
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# ============================================
# 1. Load NPZ data and labels
# ============================================

# The project root is taken to be the parent of the current working directory;
# every other location is derived from it.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "data", "processed")
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)

print("Project root:", project_root)
print("Processed dir:", processed_dir)

# Artifacts this notebook reads from `processed_dir`.
required = [
    "X_train_resampled.npz",
    "X_test_transformed.npz",
    "y_train_resampled.csv",
    "y_test.csv",
    "preprocessor.joblib",
]

# Fail fast, before any loading starts, if an artifact is missing.
for fname in required:
    path = os.path.join(processed_dir, fname)
    if not os.path.exists(path):
        raise FileNotFoundError(f"Required file not found: {path}")

print("All required files exist.")

X_train = sparse.load_npz(os.path.join(processed_dir, "X_train_resampled.npz"))
X_test = sparse.load_npz(os.path.join(processed_dir, "X_test_transformed.npz"))
# squeeze("columns") turns a single-column frame into a Series but, unlike a
# bare squeeze(), never collapses a one-row file down to a scalar.
y_train = pd.read_csv(os.path.join(processed_dir, "y_train_resampled.csv")).squeeze("columns")
y_test = pd.read_csv(os.path.join(processed_dir, "y_test.csv")).squeeze("columns")

# Validate the loaded arrays up front: a label file with an extra column
# (for example a saved index) or a row-count mismatch would otherwise surface
# much later as an obscure error inside fit()/predict().
for split_name, X_split, y_split in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    if getattr(y_split, "ndim", 1) != 1:
        raise ValueError(
            f"y_{split_name} must contain exactly one label column, "
            f"got shape {y_split.shape}"
        )
    if X_split.shape[0] != len(y_split):
        raise ValueError(
            f"X_{split_name} has {X_split.shape[0]} rows but "
            f"y_{split_name} has {len(y_split)} labels"
        )
if X_train.shape[1] != X_test.shape[1]:
    raise ValueError(
        f"Feature count mismatch: X_train has {X_train.shape[1]} columns, "
        f"X_test has {X_test.shape[1]}"
    )

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# produced by this project's own pipeline.
# The preprocessor is not used for training below (inputs are already
# transformed); it is loaded here for later deployment.
preprocessor = joblib.load(os.path.join(processed_dir, "preprocessor.joblib"))
print("Preprocessor loaded (for deployment later).")

# ============================================
# 2. Define base models (input already preprocessed)
# ============================================

# Registry of base estimators keyed by display name. Insertion order matters:
# it is the order in which the models are trained and reported below.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=2000, n_jobs=-1),
    RandomForest=RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
    ),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    XGBoost=XGBClassifier(
        # Ensemble size and per-tree settings.
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        # Row and column subsampling applied per tree.
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
    ),
)

# ============================================
# 3. Train and evaluate base models
# ============================================

# Accumulators: the fitted estimators by name, and one metrics row per model.
trained_models = {}
results = []

for name, model in models.items():
    print(f"\nTraining base model: {name}")
    model.fit(X_train, y_train)

    # Hard labels feed accuracy and F1; the second predict_proba column
    # (presumably the positive class) feeds ROC AUC.
    preds = model.predict(X_test)
    probas = model.predict_proba(X_test)[:, 1]

    acc, f1, auc = (
        accuracy_score(y_test, preds),
        f1_score(y_test, preds),
        roc_auc_score(y_test, probas),
    )
    print(f"{name} – Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

    trained_models[name] = model
    results.append((name, acc, f1, auc))

# Tabulate the base-model scores, best ROC AUC first.
results_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "F1", "ROC_AUC"],
)
print("\nBase model performance:")
print(results_df.sort_values(by="ROC_AUC", ascending=False))

# ============================================
# 4. Voting ensemble (soft)
# ============================================

# Soft-voting ensemble over all four base estimators; each short alias maps
# to the corresponding entry of the `models` registry.
voting_clf = VotingClassifier(
    estimators=[
        (alias, models[key])
        for alias, key in (
            ("lr", "LogisticRegression"),
            ("rf", "RandomForest"),
            ("gb", "GradientBoosting"),
            ("xgb", "XGBoost"),
        )
    ],
    voting="soft",
    n_jobs=-1,
)

print("\nTraining VotingClassifier...")
voting_clf.fit(X_train, y_train)

# Same evaluation protocol as the base models: labels for accuracy and F1,
# second probability column for ROC AUC.
preds_vot = voting_clf.predict(X_test)
probas_vot = voting_clf.predict_proba(X_test)[:, 1]

acc_vot, f1_vot, auc_vot = (
    accuracy_score(y_test, preds_vot),
    f1_score(y_test, preds_vot),
    roc_auc_score(y_test, probas_vot),
)

print(f"Voting – Accuracy: {acc_vot:.4f}, F1: {f1_vot:.4f}, AUC: {auc_vot:.4f}")
results.append(("Voting", acc_vot, f1_vot, auc_vot))

# ============================================
# 5. Stacking ensemble
# ============================================

# Stacking ensemble: the three tree-based models are the first-level
# estimators and a fresh logistic regression combines their outputs.
stack_clf = StackingClassifier(
    estimators=[
        (alias, models[key])
        for alias, key in (
            ("rf", "RandomForest"),
            ("gb", "GradientBoosting"),
            ("xgb", "XGBoost"),
        )
    ],
    final_estimator=LogisticRegression(max_iter=2000),
    n_jobs=-1,
)

print("\nTraining StackingClassifier...")
stack_clf.fit(X_train, y_train)

# Same evaluation protocol as the other models: labels for accuracy and F1,
# second probability column for ROC AUC.
preds_stack = stack_clf.predict(X_test)
probas_stack = stack_clf.predict_proba(X_test)[:, 1]

acc_stack, f1_stack, auc_stack = (
    accuracy_score(y_test, preds_stack),
    f1_score(y_test, preds_stack),
    roc_auc_score(y_test, probas_stack),
)

print(f"Stacking – Accuracy: {acc_stack:.4f}, F1: {f1_stack:.4f}, AUC: {auc_stack:.4f}")
results.append(("Stacking", acc_stack, f1_stack, auc_stack))

# ============================================
# 6. Summary and saving models
# ============================================

# Rebuild the comparison table now that the ensemble rows have been appended,
# and show it best ROC AUC first.
results_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "F1", "ROC_AUC"],
)
print("\nAll models performance (base + ensembles):")
print(results_df.sort_values(by="ROC_AUC", ascending=False))

# Persist every fitted base model as <name>_model.pkl inside models_dir.
for name, model in trained_models.items():
    path = os.path.join(models_dir, f"{name}_model.pkl")
    joblib.dump(model, path)
    print(f"Saved {name} model to: {path}")

# Persist the two ensembles next to the base models.
voting_path, stacking_path = (
    os.path.join(models_dir, fname)
    for fname in ("Voting_model.pkl", "Stacking_model.pkl")
)
joblib.dump(voting_clf, voting_path)
joblib.dump(stack_clf, stacking_path)

print("\nSaved ensembles:")
for saved_path in (voting_path, stacking_path):
    print(" -", saved_path)

print("\n[Model training and saving completed successfully.]")


Project root: d:\projects\Ai\project_fusion_ecu
Processed dir: d:\projects\Ai\project_fusion_ecu\data\processed
All required files exist.
X_train: (304036, 14032)
X_test: (40036, 14032)
y_train: (304036,)
y_test: (40036,)
Preprocessor loaded (for deployment later).

Training base model: LogisticRegression
LogisticRegression – Accuracy: 0.5765, F1: 0.0988, AUC: 0.5378

Training base model: RandomForest
RandomForest – Accuracy: 0.9232, F1: 0.0339, AUC: 0.5147

Training base model: GradientBoosting
GradientBoosting – Accuracy: 0.8865, F1: 0.0646, AUC: 0.5019

Training base model: XGBoost
XGBoost – Accuracy: 0.9341, F1: 0.0287, AUC: 0.4978

Base model performance:
                Model  Accuracy        F1   ROC_AUC
0  LogisticRegression  0.576456  0.098847  0.537782
1        RandomForest  0.923219  0.033941  0.514692
2    GradientBoosting  0.886477  0.064622  0.501899
3             XGBoost  0.934134  0.028729  0.497844

Training VotingClassifier...
Voting – Accuracy: 0.9241, F1: 0.0368, AU