# 03 — Model Training & Evaluation
Train 9 classifiers and compare performance on train and test sets.

In [None]:
import sys
sys.path.insert(0, '..')

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

from src.preprocessing import load_and_clean, build_preprocessor

## 1. Data Setup

In [None]:
# Load the cleaned dataset via the project's preprocessing helper.
df = load_and_clean()

# Encode the target as integers: 'good' -> 1 (positive class), 'bad' -> 0.
# Any label outside this mapping becomes NaN.
risk_codes = {'good': 1, 'bad': 0}
y = df['Risk'].map(risk_codes)

# Every remaining column is used as a feature.
X = df.drop(columns=['Risk'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
split_parts = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print(f'Train: {X_train.shape}  |  Test: {X_test.shape}')

## 2. Define Models

In [None]:
# Seed shared by every stochastic estimator so the comparison table is
# reproducible from run to run (same value as the train/test split seed).
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the reports.
# LogisticRegression (default lbfgs solver) and SVC (probability=False) do
# not draw random numbers in this configuration, so they take no seed.
models = {
    'Logistic'        : LogisticRegression(),
    'RandomForest'    : RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM'             : SVC(),
    'DecisionTree'    : DecisionTreeClassifier(random_state=RANDOM_STATE),
    'AdaBoost'        : AdaBoostClassifier(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'XGBoost'         : XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
    'LightGBM'        : LGBMClassifier(random_state=RANDOM_STATE),
    'ExtraTrees'      : ExtraTreesClassifier(random_state=RANDOM_STATE),
}

## 3. Train & Evaluate

In [None]:
def print_performance(name, tr_acc, tr_f1, tr_prec, tr_rec, tr_auc,
                      te_acc, te_f1, te_prec, te_rec, te_auc):
    """Print a short train/test metric report for one model.

    The report consists of the model name, one line of training metrics,
    one line of test metrics and a 60-character '=' separator.

    Args:
        name: Label printed on the first line.
        tr_acc, tr_f1, tr_prec, tr_rec: Training accuracy, F1, precision
            and recall, each rendered with four decimals.
        tr_auc: Training ROC AUC, or None to print 'N/A'.
        te_acc, te_f1, te_prec, te_rec: Same metrics on the test set.
        te_auc: Test ROC AUC, or None to print 'N/A'.
    """
    def _metric_row(split, acc, f1, prec, rec, auc):
        # AUC is optional: some models expose no continuous score.
        if auc is None:
            auc_text = 'N/A'
        else:
            auc_text = f'{auc:.4f}'
        # The split label is padded to 7 characters so both rows line up.
        return (f'  {split:<7}— Acc: {acc:.4f}  F1: {f1:.4f}  '
                f'Prec: {prec:.4f}  Rec: {rec:.4f}  AUC: {auc_text}')

    report = (
        name,
        _metric_row('Train', tr_acc, tr_f1, tr_prec, tr_rec, tr_auc),
        _metric_row('Test', te_acc, te_f1, te_prec, te_rec, te_auc),
        '=' * 60,
    )
    for line in report:
        print(line)

In [None]:
def _positive_class_scores(pipe, X):
    """Return a continuous score per row for ROC AUC, or None if unavailable.

    Prefers the probability of class 1 from ``predict_proba``. Models that
    lack it (e.g. ``SVC`` with the default ``probability=False``) fall back
    to ``decision_function``, which ``roc_auc_score`` also accepts as
    ``y_score`` for binary targets.
    """
    if hasattr(pipe, 'predict_proba'):
        return pipe.predict_proba(X)[:, 1]
    if hasattr(pipe, 'decision_function'):
        return pipe.decision_function(X)
    return None


def _evaluate(pipe, X, y):
    """Score a fitted pipeline on (X, y).

    Returns:
        dict with keys 'acc', 'prec', 'rec', 'f1' and 'auc'; 'auc' is None
        when the model exposes neither probabilities nor decision scores.
    """
    y_pred = pipe.predict(X)
    y_score = _positive_class_scores(pipe, X)
    return dict(
        acc=accuracy_score(y, y_pred),
        prec=precision_score(y, y_pred),
        rec=recall_score(y, y_pred),
        f1=f1_score(y, y_pred),
        auc=roc_auc_score(y, y_score) if y_score is not None else None,
    )


# One preprocessor instance is shared by every pipeline below; each
# pipe.fit() call refits it on X_train before fitting the model.
preprocessor = build_preprocessor()

# Per-model metric dicts, keyed by model name; consumed by the summary cells.
train_results = {}
test_results  = {}

for name, model in models.items():
    pipe = Pipeline([('prep', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)

    # Same metric set on both partitions so train/test gaps are comparable.
    tr = _evaluate(pipe, X_train, y_train)
    te = _evaluate(pipe, X_test, y_test)

    print_performance(name, tr['acc'], tr['f1'], tr['prec'], tr['rec'], tr['auc'],
                            te['acc'], te['f1'], te['prec'], te['rec'], te['auc'])

    train_results[name] = dict(Train_Accuracy=tr['acc'], Train_Precision=tr['prec'],
                               Train_Recall=tr['rec'], Train_F1=tr['f1'],
                               Train_ROC_AUC=tr['auc'])
    test_results[name]  = dict(Test_Accuracy=te['acc'],  Test_Precision=te['prec'],
                               Test_Recall=te['rec'],  Test_F1=te['f1'],
                               Test_ROC_AUC=te['auc'])

## 4. Results Summary

In [None]:
# One row per model, one column per training metric; best recall first.
print('Train Summary (sorted by Recall):')
train_summary = pd.DataFrame(train_results).T
train_summary.sort_values(by='Train_Recall', ascending=False)

In [None]:
# One row per model, one column per test metric; best recall first.
print('Test Summary (sorted by Recall):')
test_summary = pd.DataFrame(test_results).T
test_summary.sort_values(by='Test_Recall', ascending=False)