# 05 - Modeling & Evaluation

Train multiple models (LightGBM, CatBoost, TabNet) to predict podium probability (top-3 finish).

**Models:**
1. **LightGBM**: Primary tree-based ensemble with categorical feature support
2. **CatBoost**: Alternative tree ensemble with built-in categorical handling
3. **TabNet**: Deep learning model with attention-based feature selection

**Train/Val/Test Split:**
- Train: 1994-2022
- Validation: 2023
- Test: 2024

**Input:** `data/processed/features.csv`  
**Output:** `models/best_podium_model.pkl` (best model) and `data/processed/model_comparison.csv` (per-model validation/test metrics)


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    log_loss,
    precision_recall_fscore_support,
    classification_report
)
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

# Models
try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("Warning: LightGBM not available")

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False
    print("Warning: CatBoost not available")

try:
    from pytorch_tabnet.tab_model import TabNetClassifier
    TABNET_AVAILABLE = True
except ImportError:
    TABNET_AVAILABLE = False
    print("Warning: TabNet not available")

# Directory layout, resolved relative to the current working directory
# (".." is taken as the project root).
PROJECT_ROOT = Path("..").resolve()
PROCESSED_ROOT = PROJECT_ROOT.joinpath("data", "processed")
MODELS_ROOT = PROJECT_ROOT.joinpath("models")
# Make sure the model output directory exists before anything is written to it
MODELS_ROOT.mkdir(parents=True, exist_ok=True)

# Read the engineered feature table; unparseable dates become NaT (errors='coerce')
features_csv = PROCESSED_ROOT.joinpath("features.csv")
features = pd.read_csv(features_csv)
features['date'] = pd.to_datetime(features['date'], errors='coerce')

# Quick sanity check on what was loaded
print(f"Loaded features: {features.shape}")
print(f"Date range: {features['date'].min()} to {features['date'].max()}")



In [None]:
## 1. Data Preparation

Prepare train/val/test splits and handle missing values.


In [None]:
# Prepare features and target: drop the label plus identifier/time columns
# (errors='ignore' tolerates any of these being absent from the table).
X = features.drop(columns=['podium', 'raceId', 'driverId', 'date', 'year'], errors='ignore')
y = features['podium']

# Chronological split masks. They are built BEFORE imputation so that the fill
# statistics below can be computed from the training rows only.
train_mask = features['year'] < 2023
val_mask = features['year'] == 2023
test_mask = features['year'] == 2024

# Handle categorical features: integer-encode every object-dtype column
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}  # column name -> fitted LabelEncoder, kept so the mapping can be reused
for col in categorical_cols:
    le = LabelEncoder()
    # Fill BEFORE casting: astype(str) turns NaN into the string 'nan', which
    # would make a subsequent fillna('Unknown') a no-op.
    X[col] = le.fit_transform(X[col].fillna('Unknown').astype(str))
    label_encoders[col] = le

# Fill missing values with the TRAIN median for numeric columns and the TRAIN
# mode for everything else, so validation/test rows do not influence the
# preprocessing statistics.
impute_values = {}  # column name -> value used to fill its missing entries
for col in X.columns:
    if not X[col].isna().any():
        continue  # nothing to fill in this column
    train_col = X.loc[train_mask, col]
    # Any numeric dtype (not just float64/int64) gets the median; bools are
    # excluded so they fall through to the mode branch.
    is_numeric = (
        pd.api.types.is_numeric_dtype(X[col])
        and not pd.api.types.is_bool_dtype(X[col])
    )
    if is_numeric:
        fill_value = train_col.median()
    else:
        train_modes = train_col.mode()
        fill_value = train_modes.iloc[0] if len(train_modes) > 0 else 0
    # A column with no observed training values has no usable statistic; use 0
    if pd.isna(fill_value):
        fill_value = 0
    impute_values[col] = fill_value
    X[col] = X[col].fillna(fill_value)

print(f"Features shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")
print(f"Podium rate: {y.mean():.2%}")

# Train/Val/Test split
X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

train_years = features.loc[train_mask, 'year']
print(f"\nTrain: {len(X_train):,} samples ({train_years.min()}-{train_years.max()})")
print(f"Val: {len(X_val):,} samples (2023)")
print(f"Test: {len(X_test):,} samples (2024)")


In [None]:
## 2. Model 1: LightGBM

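Train a LightGBM classifier on the label-encoded feature matrix, using the 2023 validation set for early stopping. Categorical columns are passed as integer codes; LightGBM's native categorical handling (`categorical_feature`) is not enabled here.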


In [None]:
if not LIGHTGBM_AVAILABLE:
    print("LightGBM not available - skipping")
    lgb_model = None
else:
    # Hyperparameters gathered in one place, then handed to the classifier
    lgb_params = dict(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        objective='binary',
        metric='binary_logloss',
        random_state=42,
        verbose=-1,
    )
    lgb_model = lgb.LGBMClassifier(**lgb_params)

    # The validation set is the early-stopping monitor (50 rounds);
    # log_evaluation(0) presumably silences per-iteration logging.
    lgb_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(0)]
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=lgb_callbacks)

    # Probability of the second class (column 1) for each held-out split
    y_val_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
    y_test_pred_lgb = lgb_model.predict_proba(X_test)[:, 1]

    # Same four probability metrics for both splits, driven by a scorer table
    _scorers = {
        'roc_auc': roc_auc_score,
        'pr_auc': average_precision_score,
        'brier': brier_score_loss,
        'log_loss': log_loss,
    }
    lgb_val_metrics = {
        key: score(y_val, y_val_pred_lgb) for key, score in _scorers.items()
    }
    lgb_test_metrics = {
        key: score(y_test, y_test_pred_lgb) for key, score in _scorers.items()
    }

    print("LightGBM Results:")
    print(f"  Val ROC-AUC: {lgb_val_metrics['roc_auc']:.4f}")
    print(f"  Val PR-AUC: {lgb_val_metrics['pr_auc']:.4f}")
    print(f"  Test ROC-AUC: {lgb_test_metrics['roc_auc']:.4f}")
    print(f"  Test PR-AUC: {lgb_test_metrics['pr_auc']:.4f}")

    # Rank features by the model's reported importance, highest first
    lgb_importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': lgb_model.feature_importances_,
    })
    feature_importance_lgb = lgb_importance_table.sort_values('importance', ascending=False)

    print(f"\nTop 10 Features (LightGBM):")
    print(feature_importance_lgb.head(10).to_string(index=False))



In [None]:
## 3. Model 2: CatBoost

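Train a CatBoost classifier on the same label-encoded feature matrix, using the 2023 validation set for early stopping. No `cat_features` are passed, so CatBoost's built-in categorical handling is not used here.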


In [None]:
if not CATBOOST_AVAILABLE:
    print("CatBoost not available - skipping")
    cat_model = None
else:
    # Hyperparameters gathered in one place, then handed to the classifier
    cat_params = dict(
        iterations=500,
        learning_rate=0.05,
        depth=7,
        loss_function='Logloss',
        random_state=42,
        verbose=False,
    )
    cat_model = CatBoostClassifier(**cat_params)

    # The validation set is the early-stopping monitor (50 rounds)
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

    # Probability of the second class (column 1) for each held-out split
    y_val_pred_cat = cat_model.predict_proba(X_val)[:, 1]
    y_test_pred_cat = cat_model.predict_proba(X_test)[:, 1]

    # Same four probability metrics for both splits, driven by a scorer table
    _scorers = {
        'roc_auc': roc_auc_score,
        'pr_auc': average_precision_score,
        'brier': brier_score_loss,
        'log_loss': log_loss,
    }
    cat_val_metrics = {
        key: score(y_val, y_val_pred_cat) for key, score in _scorers.items()
    }
    cat_test_metrics = {
        key: score(y_test, y_test_pred_cat) for key, score in _scorers.items()
    }

    print("CatBoost Results:")
    print(f"  Val ROC-AUC: {cat_val_metrics['roc_auc']:.4f}")
    print(f"  Val PR-AUC: {cat_val_metrics['pr_auc']:.4f}")
    print(f"  Test ROC-AUC: {cat_test_metrics['roc_auc']:.4f}")
    print(f"  Test PR-AUC: {cat_test_metrics['pr_auc']:.4f}")

    # Rank features by the model's reported importance, highest first
    cat_importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': cat_model.feature_importances_,
    })
    feature_importance_cat = cat_importance_table.sort_values('importance', ascending=False)

    print(f"\nTop 10 Features (CatBoost):")
    print(feature_importance_cat.head(10).to_string(index=False))


In [None]:
## 4. Model 3: TabNet

Train TabNet deep learning model with attention-based feature selection.


In [None]:
if not TABNET_AVAILABLE:
    print("TabNet not available - skipping")
    tabnet_model = None
else:
    import torch

    # Architecture, optimizer and LR-schedule settings gathered in one place
    tabnet_params = dict(
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.5,
        n_independent=2,
        n_shared=2,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size":50, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
        seed=42,
    )
    tabnet_model = TabNetClassifier(**tabnet_params)

    # TabNet is fed plain NumPy arrays rather than DataFrames/Series
    X_train_arr, y_train_arr = X_train.values, y_train.values
    X_val_arr, y_val_arr = X_val.values, y_val.values
    X_test_arr = X_test.values

    # Validation AUC is the monitored metric; patience=20 bounds the wait for improvement
    tabnet_model.fit(
        X_train_arr, y_train_arr,
        eval_set=[(X_val_arr, y_val_arr)],
        eval_metric=['auc'],
        max_epochs=100,
        patience=20,
        batch_size=1024,
        virtual_batch_size=128,
    )

    # Probability of the second class (column 1) for each held-out split
    y_val_pred_tab = tabnet_model.predict_proba(X_val_arr)[:, 1]
    y_test_pred_tab = tabnet_model.predict_proba(X_test_arr)[:, 1]

    # Same four probability metrics for both splits, driven by a scorer table
    _scorers = {
        'roc_auc': roc_auc_score,
        'pr_auc': average_precision_score,
        'brier': brier_score_loss,
        'log_loss': log_loss,
    }
    tabnet_val_metrics = {
        key: score(y_val, y_val_pred_tab) for key, score in _scorers.items()
    }
    tabnet_test_metrics = {
        key: score(y_test, y_test_pred_tab) for key, score in _scorers.items()
    }

    print("TabNet Results:")
    print(f"  Val ROC-AUC: {tabnet_val_metrics['roc_auc']:.4f}")
    print(f"  Val PR-AUC: {tabnet_val_metrics['pr_auc']:.4f}")
    print(f"  Test ROC-AUC: {tabnet_test_metrics['roc_auc']:.4f}")
    print(f"  Test PR-AUC: {tabnet_test_metrics['pr_auc']:.4f}")


In [None]:
## 5. Model Comparison

Compare all models and select the best one.


In [None]:
# Collect one score row per model whose library imported successfully, and
# remember the fitted estimator under the same display name.
model_comparison = []
trained_models = {}

if LIGHTGBM_AVAILABLE:
    model_comparison.append(dict(
        model='LightGBM',
        val_roc_auc=lgb_val_metrics['roc_auc'],
        val_pr_auc=lgb_val_metrics['pr_auc'],
        test_roc_auc=lgb_test_metrics['roc_auc'],
        test_pr_auc=lgb_test_metrics['pr_auc'],
    ))
    trained_models['LightGBM'] = lgb_model

if CATBOOST_AVAILABLE:
    model_comparison.append(dict(
        model='CatBoost',
        val_roc_auc=cat_val_metrics['roc_auc'],
        val_pr_auc=cat_val_metrics['pr_auc'],
        test_roc_auc=cat_test_metrics['roc_auc'],
        test_pr_auc=cat_test_metrics['pr_auc'],
    ))
    trained_models['CatBoost'] = cat_model

if TABNET_AVAILABLE:
    model_comparison.append(dict(
        model='TabNet',
        val_roc_auc=tabnet_val_metrics['roc_auc'],
        val_pr_auc=tabnet_val_metrics['pr_auc'],
        test_roc_auc=tabnet_test_metrics['roc_auc'],
        test_pr_auc=tabnet_test_metrics['pr_auc'],
    ))
    trained_models['TabNet'] = tabnet_model

if not model_comparison:
    print("No models available for comparison")
    best_model = None
    best_model_name = None
else:
    comparison_df = pd.DataFrame(model_comparison)
    print("Model Comparison (Validation Set):")
    print(comparison_df[['model', 'val_roc_auc', 'val_pr_auc']].to_string(index=False))

    # The winner is the row with the highest validation PR-AUC; test scores
    # are recorded in the table but play no part in the selection.
    best_row = comparison_df['val_pr_auc'].idxmax()
    best_model_name = comparison_df.loc[best_row, 'model']
    print(f"\nBest model (by Val PR-AUC): {best_model_name}")

    # Look up the fitted estimator by name (None if it is somehow not registered)
    best_model = trained_models.get(best_model_name)


In [None]:
## 6. Model Persistence

Save the best model for future use.


In [None]:
if best_model is None:
    print("No model to save")
else:
    import joblib

    # Serialize the selected estimator into the models directory.
    # NOTE(review): only the estimator object is written here; the label
    # encoding / imputation applied to X earlier in the notebook is not part
    # of this artifact -- confirm downstream inference reproduces it.
    model_output_path = MODELS_ROOT / "best_podium_model.pkl"
    joblib.dump(best_model, model_output_path)
    print(f"Best model saved to: {model_output_path}")

    # Write the per-model metric table alongside the processed data
    if model_comparison:
        comparison_df.to_csv(PROCESSED_ROOT / "model_comparison.csv", index=False)
        print(f"Model comparison saved to: {PROCESSED_ROOT / 'model_comparison.csv'}")


## Summary

- Trained LightGBM, CatBoost, and TabNet models on 1994-2022 data
- Validated on 2023 data, tested on 2024 data
- Selected best model based on validation PR-AUC
- Saved best model to `models/best_podium_model.pkl` for downstream inference
