# Model Training: Gradient Boosting and Deep Learning

This notebook demonstrates training and evaluation of the following classifiers on tabular datasets:
1. XGBoost
2. LightGBM
3. Transformer-based models

In [None]:
# Make the sibling ../src directory importable. Presumably this is where the
# project-local `utils` and `models` packages imported below live -- confirm.
# The path is relative, so it is resolved against the process working directory.
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning category for the rest of the session, including any
# warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

# Project-local helpers: the dataset loader and the model wrapper classes.
# NOTE(review): MLPClassifier is imported but not used in the cells visible here.
from utils.data_loader import DataLoader
from models import XGBoostClassifier, LightGBMClassifier, MLPClassifier, TransformerClassifier

# Plot defaults shared by every figure in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

%matplotlib inline

## Load and Prepare Data

In [None]:
# Dataset to run the whole notebook on.
# Options: 'breast_cancer', 'adult_income', 'bank_marketing'
dataset_name = 'breast_cancer'

# Load the raw features/labels, then have the loader build the train/test
# pieces (called with test_size=0.2 and scale_features=True).
loader = DataLoader(dataset_name, random_state=42)
X, y = loader.load_data()
data = loader.prepare_data(X, y, test_size=0.2, scale_features=True)

# Unpack the four split pieces from the returned mapping in a single step.
X_train, X_test, y_train, y_test = (
    data[split_key]
    for split_key in ('X_train', 'X_test', 'y_train', 'y_test')
)

# Report split shapes and how many distinct labels the full target contains.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
n_classes = len(np.unique(y))
print(f"Number of classes: {n_classes}")

## 1. XGBoost Classifier

In [None]:
# Fit the XGBoost wrapper on the training split.
print("Training XGBoost...")
xgb_model = XGBoostClassifier(n_estimators=100, max_depth=6, random_state=42)
xgb_model.train(X_train, y_train)

# Score the fitted model on both splits before printing anything.
train_metrics = xgb_model.evaluate(X_train, y_train)
test_metrics = xgb_model.evaluate(X_test, y_test)

# Print each report under its header; a metric whose value is None is shown as N/A.
xgb_reports = (
    ("\nXGBoost Training Metrics:", train_metrics),
    ("\nXGBoost Test Metrics:", test_metrics),
)
for header, scores in xgb_reports:
    print(header)
    for metric_name, value in scores.items():
        if value is None:
            print(f"  {metric_name}: N/A")
        else:
            print(f"  {metric_name}: {value:.4f}")

In [None]:
# Ask the fitted XGBoost model for per-feature importance, keyed by column name.
feature_names = X_train.columns.tolist()
xgb_importance = xgb_model.get_feature_importance(feature_names)
print("\nTop 10 Important Features (XGBoost):")
print(xgb_importance.head(10))

# Horizontal bar chart of the first 20 rows, drawn through the explicit Axes API.
xgb_top = xgb_importance.head(20)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(xgb_top['feature'], xgb_top['importance'])
ax.set_xlabel('Importance')
ax.set_title('XGBoost Feature Importance')
ax.invert_yaxis()  # put the first row at the top of the chart
fig.tight_layout()
plt.show()

## 2. LightGBM Classifier

In [None]:
# Fit the LightGBM wrapper on the training split.
print("Training LightGBM...")
lgb_model = LightGBMClassifier(n_estimators=100, max_depth=6, random_state=42)
lgb_model.train(X_train, y_train)

# Score the fitted model on both splits before printing anything.
train_metrics = lgb_model.evaluate(X_train, y_train)
test_metrics = lgb_model.evaluate(X_test, y_test)

# Print each report under its header; a metric whose value is None is shown as N/A.
lgb_reports = (
    ("\nLightGBM Training Metrics:", train_metrics),
    ("\nLightGBM Test Metrics:", test_metrics),
)
for header, scores in lgb_reports:
    print(header)
    for metric_name, value in scores.items():
        if value is None:
            print(f"  {metric_name}: N/A")
        else:
            print(f"  {metric_name}: {value:.4f}")

In [None]:
# Ask the fitted LightGBM model for per-feature importance, keyed by column name.
feature_names = X_train.columns.tolist()
lgb_importance = lgb_model.get_feature_importance(feature_names)
print("\nTop 10 Important Features (LightGBM):")
print(lgb_importance.head(10))

# Horizontal bar chart of the first 20 rows, drawn through the explicit Axes API.
lgb_top = lgb_importance.head(20)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(lgb_top['feature'], lgb_top['importance'])
ax.set_xlabel('Importance')
ax.set_title('LightGBM Feature Importance')
ax.invert_yaxis()  # put the first row at the top of the chart
fig.tight_layout()
plt.show()

## 3. Transformer Classifier

In [None]:
# Fit the Transformer wrapper on the training split.
# NOTE(review): unlike the XGBoost and LightGBM cells, no random_state/seed is
# passed here, so results may differ between runs -- confirm whether
# TransformerClassifier accepts one.
print("Training Transformer...")
transformer_config = dict(
    d_model=64,
    nhead=4,
    num_layers=2,
    epochs=30,
    batch_size=32,
    learning_rate=0.001,
)
transformer_model = TransformerClassifier(**transformer_config)
transformer_model.train(X_train, y_train)

# Score the fitted model on both splits before printing anything.
train_metrics = transformer_model.evaluate(X_train, y_train)
test_metrics = transformer_model.evaluate(X_test, y_test)

# Print each report under its header; a metric whose value is None is shown as N/A.
transformer_reports = (
    ("\nTransformer Training Metrics:", train_metrics),
    ("\nTransformer Test Metrics:", test_metrics),
)
for header, scores in transformer_reports:
    print(header)
    for metric_name, value in scores.items():
        if value is None:
            print(f"  {metric_name}: N/A")
        else:
            print(f"  {metric_name}: {value:.4f}")

## Model Comparison

In [None]:
# Compare all models on the held-out test split.
models = {
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
    'Transformer': transformer_model
}

# Columns of the comparison table, in display order.
comparison_columns = ['Model', 'accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']

# Build one record per model. The metrics are copied into a fresh dict so the
# object returned by evaluate() is never mutated by adding the 'Model' label.
comparison_data = []
for name, model in models.items():
    metrics = dict(model.evaluate(X_test, y_test))
    metrics['Model'] = name
    comparison_data.append(metrics)

# reindex (rather than a hard column selection) keeps the table well-formed
# when a model's evaluate() omits one of the metrics: the missing column is
# filled with NaN instead of raising KeyError. When every column is present
# the result is the same as selecting the columns directly.
comparison_df = pd.DataFrame(comparison_data).reindex(columns=comparison_columns)

print("\nModel Comparison on Test Set:")
print(comparison_df.to_string(index=False))

In [None]:
# Visualize comparison: one bar chart per metric on a 2x2 grid.
metrics_to_plot = ['accuracy', 'f1_score', 'precision', 'recall']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()  # flatten the 2x2 grid so it can be paired with the metric list

for ax, metric in zip(axes, metrics_to_plot):
    # Use a dedicated name for the per-metric slice. Rebinding `data` here
    # would overwrite the split dictionary created in the data-loading cell
    # with an unrelated DataFrame.
    metric_scores = comparison_df[['Model', metric]].dropna()  # skip models without this metric
    ax.bar(metric_scores['Model'], metric_scores[metric])
    ax.set_title(metric.replace('_', ' ').title())
    ax.set_ylabel('Score')
    ax.set_ylim([0, 1])  # fixed 0-1 axis range on every panel
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated:
- Training three different model types on tabular data
- Evaluating model performance with multiple metrics
- Comparing models across different metrics
- Analyzing feature importance for tree-based models

Next steps:
- Apply SHAP and LIME explanations (see next notebook)
- Calculate interpretability metrics
- Compare explainability across models