# 05 – Model Training and Ensemble Learning

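This notebook trains several baseline models and ensemble classifiers on the preprocessed data. The baselines are logistic regression, random forest, gradient boosting, XGBoost and CatBoost; each is fitted inside a pipeline with the saved preprocessor and scored by accuracy on the test split. The ensembles are a hard-voting classifier (logistic regression, random forest, gradient boosting) and a stacking classifier (random forest, gradient boosting and XGBoost, with a logistic-regression final estimator). All fitted pipelines are saved to the `models` directory.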

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Load preprocessed data
# The train/test splits are read from ../data/processed relative to the
# working directory (presumably written by an earlier preprocessing
# notebook -- confirm).
processed_dir = os.path.join(os.path.pardir, 'data', 'processed')
X_train = pd.read_csv(os.path.join(processed_dir, 'X_train.csv'))
X_test = pd.read_csv(os.path.join(processed_dir, 'X_test.csv'))
# Squeeze only the column axis: a bare squeeze() collapses every length-1
# axis, so a target file holding a single row would come back as a scalar
# instead of a Series. For the usual one-column, many-row file the result is
# the same Series as before.
y_train = pd.read_csv(os.path.join(processed_dir, 'y_train.csv')).squeeze(axis='columns')
y_test = pd.read_csv(os.path.join(processed_dir, 'y_test.csv')).squeeze(axis='columns')

# Load preprocessor
# NOTE(review): joblib.load unpickles arbitrary objects, so this file must
# only ever be an artifact produced by this project, never external input.
preprocessor = joblib.load(os.path.join(processed_dir, 'preprocessor.pkl'))

# Helper function to build pipeline
def build_model(model):
    """Return a two-step pipeline: a copy of the preprocessor, then ``model``.

    ``Pipeline.fit`` fits the step objects it was given in place (steps are
    only cloned when a ``memory`` cache is configured). Passing the
    module-level ``preprocessor`` directly would therefore make every
    pipeline built here share, and repeatedly refit, one transformer object.
    ``clone`` gives each pipeline its own unfitted copy with identical
    parameters.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline. It is used as passed, not copied.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'preprocessor'`` and ``'model'``.
    """
    return Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])

# Baseline estimators, keyed by the name that appears in the log output and
# in the saved model filenames.
models = {
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        n_jobs=-1,
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=42,
    ),
    'GradientBoosting': GradientBoostingClassifier(
        random_state=42,
    ),
    'XGBoost': XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
    ),
    'CatBoost': CatBoostClassifier(
        verbose=False,
        random_state=42,
    ),
}

# Fitted pipelines are collected here, keyed by the same names as `models`.
trained_models = dict()

# Fit every baseline inside its own preprocessing pipeline, report accuracy
# on the test split, and keep the fitted pipeline for saving.
for name in models:
    model = models[name]
    print(f"Training {name}…")
    clf = build_model(model)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    acc = accuracy_score(y_true=y_test, y_pred=preds)
    print(f"{name} Accuracy: {acc:.4f}")
    trained_models.update({name: clf})

# Persist each fitted baseline pipeline as ../models/<name>_model.pkl,
# creating the directory on first use.
models_dir = os.path.join(os.path.pardir, 'models')
os.makedirs(models_dir, exist_ok=True)
for name, model in trained_models.items():
    joblib.dump(
        value=model,
        filename=os.path.join(models_dir, f'{name}_model.pkl'),
    )

# Build ensemble models
# Voting classifier (hard voting)
# The entries in `models` were already fitted in place by the baseline
# training loop. VotingClassifier fits its own clones of whatever it is
# given, so handing it unfitted clones produces the same ensemble while
# keeping the fitted baselines out of its `estimators` parameter -- and so
# out of the pickle written for this pipeline.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clone(models['LogisticRegression'])),
        ('rf', clone(models['RandomForest'])),
        ('gb', clone(models['GradientBoosting'])),
    ],
    voting='hard',  # majority vote over predicted class labels
)
voting_pipeline = build_model(voting_clf)
voting_pipeline.fit(X_train, y_train)
preds_voting = voting_pipeline.predict(X_test)
acc_voting = accuracy_score(y_test, preds_voting)
print(f"Voting Classifier Accuracy: {acc_voting:.4f}")

# Stacking classifier
# The entries in `models` were already fitted in place by the baseline
# training loop. StackingClassifier fits its own clones of the base
# estimators, so passing unfitted clones gives the same ensemble without
# carrying the fitted baselines along in its `estimators` parameter (and in
# the pickle written for this pipeline).
estimators = [
    ('rf', clone(models['RandomForest'])),
    ('gb', clone(models['GradientBoosting'])),
    ('xgb', clone(models['XGBoost'])),
]
# Meta-learner trained on the base estimators' predictions.
final_estimator = LogisticRegression(max_iter=1000)
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)
stacking_pipeline = build_model(stacking_clf)
stacking_pipeline.fit(X_train, y_train)
preds_stack = stacking_pipeline.predict(X_test)
acc_stack = accuracy_score(y_test, preds_stack)
print(f"Stacking Classifier Accuracy: {acc_stack:.4f}")

# Persist both fitted ensemble pipelines next to the baseline models.
voting_path = os.path.join(models_dir, 'VotingClassifier_model.pkl')
stacking_path = os.path.join(models_dir, 'StackingClassifier_model.pkl')
joblib.dump(voting_pipeline, voting_path)
joblib.dump(stacking_pipeline, stacking_path)
