# 04 - Modeling & Evaluation

Train multiple models (LightGBM, CatBoost, TabNet) to predict podium probability (top-3 finish).

**Models:**
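1. **LightGBM**: Primary tree-based ensemble (supports native categorical features; in this notebook categorical columns are label-encoded to integers first)
2. **CatBoost**: Alternative tree ensemble (has built-in categorical handling; here it receives the same label-encoded feature matrix)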
3. **TabNet**: Deep learning model with attention-based feature selection

**Train/Val/Test Split:**
- Train: 1994-2022
- Validation: 2023
- Test: 2024

**Input:** `data/processed/master_races_clean.csv` (from 03.8)  
**Output:** `models/best_podium_model.pkl`


In [17]:
from pathlib import Path
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    log_loss,
    precision_recall_fscore_support,
    classification_report
)
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

# Models
# Each modeling library is optional: a failed import only clears the matching
# *_AVAILABLE flag and prints a warning instead of raising, and the training
# cell for that model checks the flag before touching the library.
try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("Warning: LightGBM not available")

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False
    print("Warning: CatBoost not available")

try:
    from pytorch_tabnet.tab_model import TabNetClassifier
    TABNET_AVAILABLE = True
except ImportError:
    TABNET_AVAILABLE = False
    print("Warning: TabNet not available")

# Resolve the project root from the current working directory: when the
# notebook is launched from inside notebooks/, step up one level; otherwise
# the working directory itself is taken as the root.
_working_dir = Path().resolve()
PROJECT_ROOT = _working_dir.parent if _working_dir.name == 'notebooks' else _working_dir

# Input data and model artifacts live in fixed sub-folders of the root.
PROCESSED_ROOT = PROJECT_ROOT / "data" / "processed"
MODELS_ROOT = PROJECT_ROOT / "models"
MODELS_ROOT.mkdir(parents=True, exist_ok=True)

# Read the cleaned master table (output from 03.8) and parse the race date.
# Unparseable dates become NaT because of errors='coerce'.
_features_path = PROCESSED_ROOT / "master_races_clean.csv"
features = pd.read_csv(_features_path)
features['date'] = pd.to_datetime(features['date'], errors='coerce')

_first_date = features['date'].min()
_last_date = features['date'].max()
print(f"Loaded features: {features.shape}")
print(f"Date range: {_first_date} to {_last_date}")



Loaded features: (12358, 54)
Date range: 1994-03-27 00:00:00 to 2024-12-08 00:00:00


## 1. Data Preparation

Prepare train/val/test splits and handle missing values.

In [18]:
# Prepare the feature matrix X, the target y, and the chronological
# train/val/test split.
#
# Steps:
#   1. Drop the target, identifier/metadata columns and every column that is
#      only known after the race (outcome features).
#   2. Integer-encode object (string) columns.
#   3. Impute missing values using statistics taken from the TRAINING rows
#      only, so no information from 2023/2024 reaches the filled values.
#   4. Slice X / y by season into train (< 2023), val (2023) and test (2024).

# EXCLUDE race outcome features (known only after race - would cause data leakage)
outcome_features = [
    # Race results (outcomes)
    'points',                      # Race points (outcome)
    'position',                    # Final race position (outcome)
    'positionOrder',               # Final race position order (outcome)
    'milliseconds',                # Race finish time in milliseconds (outcome)
    'time',                        # Race finish time (outcome)
    'laps',                        # Number of laps completed (outcome)
    'fastestLap',                  # Fastest lap number (outcome)
    'fastestLapTime',              # Fastest lap time (outcome)
    'fastestLapSpeed',             # Fastest lap speed (outcome)
    'rank',                        # Fastest lap rank (outcome)
    'statusId',                    # Race finish status ID (outcome)
    'status_category',             # Race finish status category (outcome)
    'resultId',                    # Result ID (outcome identifier)
    'constructor_results_points',  # Constructor race points (outcome)
    # Sprint race results: the entries below are commented out, so these
    # columns (if present in the dataset) are NOT excluded and remain in X.
    #'sprint_results_positionOrder',    # Sprint race position (outcome)
    #'sprint_results_points',          # Sprint race points (outcome)
    #'sprint_results_time',            # Sprint race time (outcome)
    #'sprint_results_milliseconds',    # Sprint race time in milliseconds (outcome)
    #'sprint_results_fastestLap',      # Sprint fastest lap (outcome)
    #'sprint_results_fastestLapTime',  # Sprint fastest lap time (outcome)
    #'sprint_results_laps',            # Sprint laps completed (outcome)
    #'sprint_results_statusId',         # Sprint finish status (outcome)
    # Non-PRE_RACE standings (include current race - data leakage)
    'driver_standings_points',        # Includes current race (use PRE_RACE version instead)
    'driver_standings_position',      # Includes current race (use PRE_RACE version instead)
    'constructor_standings_points',   # Includes current race (use PRE_RACE version instead)
    'constructor_standings_position', # Includes current race (use PRE_RACE version instead)
]

# Metadata and target to exclude
exclude_cols = ['podium', 'raceId', 'driverId', 'date', 'year'] + outcome_features

# Remove outcome features and metadata (errors='ignore' tolerates columns that
# are listed above but absent from this dataset)
X = features.drop(columns=exclude_cols, errors='ignore')
y = features['podium']

print(f"Excluded {sum(c in features.columns for c in exclude_cols)} outcome/metadata features")
print(f"Remaining features: {X.shape[1]}")

# Chronological split masks. They are built BEFORE imputation so the fill
# values below can be derived from the training rows alone.
train_mask = features['year'] < 2023
val_mask = features['year'] == 2023
test_mask = features['year'] == 2024

# Handle categorical features.
# Missing values must be replaced BEFORE the string cast: once a NaN has been
# cast to str it is no longer missing, so a later fillna() has nothing to fill.
# The encoder is fitted on all rows so that categories appearing only in
# 2023/2024 still receive a code instead of raising at transform time.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].fillna('Unknown').astype(str))

# Fill missing values with median for numeric, mode for everything else.
# Statistics come from the training rows; the full column is used only as a
# fallback when the training rows yield no usable value (e.g. all missing).
for col in X.columns:
    if not X[col].isna().any():
        continue  # nothing to impute

    is_numeric = X[col].dtype in [np.float64, np.int64]
    for source in (X.loc[train_mask, col], X[col]):
        if is_numeric:
            fill_value = source.median()
        else:
            modes = source.mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 0
        if not pd.isna(fill_value):
            break

    X[col] = X[col].fillna(fill_value)

print(f"Features shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")
print(f"Podium rate: {y.mean():.2%}")

# Train/Val/Test split
X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

print(f"\nTrain: {len(X_train):,} samples ({features[train_mask]['year'].min()}-{features[train_mask]['year'].max()})")
print(f"Val: {len(X_val):,} samples (2023)")
print(f"Test: {len(X_test):,} samples (2024)")


Excluded 18 outcome/metadata features
Remaining features: 36
Features shape: (12358, 36)
Target distribution: {0: 10627, 1: 1731}
Podium rate: 14.01%

Train: 11,439 samples (1994-2022)
Val: 440 samples (2023)
Test: 479 samples (2024)


## 2. Model 1: LightGBM

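Train LightGBM on the prepared feature matrix. Categorical columns were already label-encoded to integers in Section 1, so they enter the model as ordinary numeric features.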

In [19]:
# Train and evaluate LightGBM when the library imported successfully;
# otherwise leave lgb_model as None so later cells can detect the gap.
if not LIGHTGBM_AVAILABLE:
    print("LightGBM not available - skipping")
    lgb_model = None
else:
    # Hyperparameters for the gradient-boosted tree ensemble
    _lgb_params = dict(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        objective='binary',
        metric='binary_logloss',
        random_state=42,
        verbose=-1,
    )
    lgb_model = lgb.LGBMClassifier(**_lgb_params)

    # Stop once the validation metric has not improved for 50 rounds;
    # log_evaluation(0) is passed to suppress per-iteration logging.
    _lgb_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(0)]
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=_lgb_callbacks,
    )

    # Probability of the positive (podium) class for each split
    y_val_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
    y_test_pred_lgb = lgb_model.predict_proba(X_test)[:, 1]

    # Same four probabilistic metrics on both splits, in a fixed key order
    _lgb_scorers = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
        ('brier', brier_score_loss),
        ('log_loss', log_loss),
    )
    lgb_val_metrics = {
        key: scorer(y_val, y_val_pred_lgb) for key, scorer in _lgb_scorers
    }
    lgb_test_metrics = {
        key: scorer(y_test, y_test_pred_lgb) for key, scorer in _lgb_scorers
    }

    print("LightGBM Results:")
    print(f"  Val ROC-AUC: {lgb_val_metrics['roc_auc']:.4f}")
    print(f"  Val PR-AUC: {lgb_val_metrics['pr_auc']:.4f}")
    print(f"  Test ROC-AUC: {lgb_test_metrics['roc_auc']:.4f}")
    print(f"  Test PR-AUC: {lgb_test_metrics['pr_auc']:.4f}")

    # Rank features by the importance values reported by the fitted model
    _lgb_importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': lgb_model.feature_importances_
    })
    feature_importance_lgb = _lgb_importance_table.sort_values('importance', ascending=False)

    print(f"\nTop 10 Features (LightGBM):")
    print(feature_importance_lgb.head(10).to_string(index=False))



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[149]	valid_0's binary_logloss: 0.175533
LightGBM Results:
  Val ROC-AUC: 0.9591
  Val PR-AUC: 0.8400
  Test ROC-AUC: 0.9641
  Test PR-AUC: 0.8306

Top 10 Features (LightGBM):
                           feature  importance
         driver_points_avg_last_10         404
            driver_avg_grid_last_5         382
            driver_races_completed         333
              driver_total_podiums         305
                              grid         298
        driver_avg_position_last_5         261
   constructor_podium_rate_last_15         259
constructor_podium_rate_at_circuit         252
                        driver_age         210
  driver_standings_points_PRE_RACE         203


## 3. Model 2: CatBoost

Train CatBoost on the same label-encoded feature matrix. No `cat_features` are passed, so CatBoost's built-in categorical handling is not used here.


In [20]:
# Train and evaluate CatBoost when the library imported successfully;
# otherwise leave cat_model as None so later cells can detect the gap.
if not CATBOOST_AVAILABLE:
    print("CatBoost not available - skipping")
    cat_model = None
else:
    # Hyperparameters for the CatBoost classifier
    _cat_params = dict(
        iterations=500,
        learning_rate=0.05,
        depth=7,
        loss_function='Logloss',
        random_state=42,
        verbose=False,
    )
    cat_model = CatBoostClassifier(**_cat_params)

    # The validation split drives early stopping (50 rounds of patience)
    cat_model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
    )

    # Probability of the positive (podium) class for each split
    y_val_pred_cat = cat_model.predict_proba(X_val)[:, 1]
    y_test_pred_cat = cat_model.predict_proba(X_test)[:, 1]

    # Same four probabilistic metrics on both splits, in a fixed key order
    _cat_scorers = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
        ('brier', brier_score_loss),
        ('log_loss', log_loss),
    )
    cat_val_metrics = {
        key: scorer(y_val, y_val_pred_cat) for key, scorer in _cat_scorers
    }
    cat_test_metrics = {
        key: scorer(y_test, y_test_pred_cat) for key, scorer in _cat_scorers
    }

    print("CatBoost Results:")
    print(f"  Val ROC-AUC: {cat_val_metrics['roc_auc']:.4f}")
    print(f"  Val PR-AUC: {cat_val_metrics['pr_auc']:.4f}")
    print(f"  Test ROC-AUC: {cat_test_metrics['roc_auc']:.4f}")
    print(f"  Test PR-AUC: {cat_test_metrics['pr_auc']:.4f}")

    # Rank features by the importance values reported by the fitted model
    _cat_importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': cat_model.feature_importances_
    })
    feature_importance_cat = _cat_importance_table.sort_values('importance', ascending=False)

    print(f"\nTop 10 Features (CatBoost):")
    print(feature_importance_cat.head(10).to_string(index=False))


CatBoost Results:
  Val ROC-AUC: 0.9570
  Val PR-AUC: 0.8192
  Test ROC-AUC: 0.9622
  Test PR-AUC: 0.8318

Top 10 Features (CatBoost):
                                feature  importance
                                   grid   13.415982
        constructor_podium_rate_last_15   10.313532
              driver_points_avg_last_10    9.914227
                 driver_avg_grid_last_5    7.781556
                 driver_races_completed    7.450418
                   driver_total_podiums    7.289335
constructor_standings_position_PRE_RACE    6.810534
     driver_standings_position_PRE_RACE    5.281715
             driver_avg_position_last_5    4.453291
     constructor_podium_rate_at_circuit    4.222331


## 4. Model 3: TabNet

Train TabNet deep learning model with attention-based feature selection.


In [21]:
# Train and evaluate TabNet when the library imported successfully;
# otherwise leave tabnet_model as None so later cells can detect the gap.
if not TABNET_AVAILABLE:
    print("TabNet not available - skipping")
    tabnet_model = None
else:
    import torch

    # Architecture and optimisation settings for the TabNet classifier
    _tabnet_params = dict(
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.5,
        n_independent=2,
        n_shared=2,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size":50, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
        seed=42,
    )
    tabnet_model = TabNetClassifier(**_tabnet_params)

    # The model is fed plain NumPy arrays (.values) rather than DataFrames
    _tabnet_fit_options = dict(
        eval_set=[(X_val.values, y_val.values)],
        eval_metric=['auc'],
        max_epochs=100,
        patience=20,
        batch_size=1024,
        virtual_batch_size=128,
    )
    tabnet_model.fit(X_train.values, y_train.values, **_tabnet_fit_options)

    # Probability of the positive (podium) class for each split
    y_val_pred_tab = tabnet_model.predict_proba(X_val.values)[:, 1]
    y_test_pred_tab = tabnet_model.predict_proba(X_test.values)[:, 1]

    # Same four probabilistic metrics on both splits, in a fixed key order
    _tabnet_scorers = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
        ('brier', brier_score_loss),
        ('log_loss', log_loss),
    )
    tabnet_val_metrics = {
        key: scorer(y_val, y_val_pred_tab) for key, scorer in _tabnet_scorers
    }
    tabnet_test_metrics = {
        key: scorer(y_test, y_test_pred_tab) for key, scorer in _tabnet_scorers
    }

    print("TabNet Results:")
    print(f"  Val ROC-AUC: {tabnet_val_metrics['roc_auc']:.4f}")
    print(f"  Val PR-AUC: {tabnet_val_metrics['pr_auc']:.4f}")
    print(f"  Test ROC-AUC: {tabnet_test_metrics['roc_auc']:.4f}")
    print(f"  Test PR-AUC: {tabnet_test_metrics['pr_auc']:.4f}")


epoch 0  | loss: 0.58362 | val_0_auc: 0.6665  |  0:00:02s
epoch 1  | loss: 0.3486  | val_0_auc: 0.63227 |  0:00:04s
epoch 2  | loss: 0.30232 | val_0_auc: 0.65285 |  0:00:07s
epoch 3  | loss: 0.28028 | val_0_auc: 0.71488 |  0:00:09s
epoch 4  | loss: 0.27022 | val_0_auc: 0.77564 |  0:00:11s
epoch 5  | loss: 0.25297 | val_0_auc: 0.7931  |  0:00:13s
epoch 6  | loss: 0.24855 | val_0_auc: 0.84654 |  0:00:16s
epoch 7  | loss: 0.23939 | val_0_auc: 0.829   |  0:00:18s
epoch 8  | loss: 0.24126 | val_0_auc: 0.82527 |  0:00:20s
epoch 9  | loss: 0.23681 | val_0_auc: 0.85132 |  0:00:22s
epoch 10 | loss: 0.23241 | val_0_auc: 0.84152 |  0:00:24s
epoch 11 | loss: 0.23368 | val_0_auc: 0.85602 |  0:00:26s
epoch 12 | loss: 0.22602 | val_0_auc: 0.85841 |  0:00:29s
epoch 13 | loss: 0.22427 | val_0_auc: 0.86339 |  0:00:31s
epoch 14 | loss: 0.22454 | val_0_auc: 0.85549 |  0:00:33s
epoch 15 | loss: 0.21909 | val_0_auc: 0.86939 |  0:00:35s
epoch 16 | loss: 0.22173 | val_0_auc: 0.89827 |  0:00:37s
epoch 17 | los

## 5. Model Comparison

Compare all trained models on the validation set and select the one with the highest validation PR-AUC.


In [22]:
# Compare models on validation set
# Register every model whose library was available, together with its fitted
# estimator and its validation/test metric dictionaries.
_trained_runs = []
if LIGHTGBM_AVAILABLE:
    _trained_runs.append(('LightGBM', lgb_model, lgb_val_metrics, lgb_test_metrics))
if CATBOOST_AVAILABLE:
    _trained_runs.append(('CatBoost', cat_model, cat_val_metrics, cat_test_metrics))
if TABNET_AVAILABLE:
    _trained_runs.append(('TabNet', tabnet_model, tabnet_val_metrics, tabnet_test_metrics))

# One summary row per trained model (key order fixes the column order)
model_comparison = [
    {
        'model': label,
        'val_roc_auc': val_scores['roc_auc'],
        'val_pr_auc': val_scores['pr_auc'],
        'test_roc_auc': test_scores['roc_auc'],
        'test_pr_auc': test_scores['pr_auc']
    }
    for label, _, val_scores, test_scores in _trained_runs
]

if not model_comparison:
    print("No models available for comparison")
    best_model = None
    best_model_name = None
else:
    comparison_df = pd.DataFrame(model_comparison)
    print("Model Comparison (Validation Set):")
    print(comparison_df[['model', 'val_roc_auc', 'val_pr_auc']].to_string(index=False))

    # Select best model based on validation PR-AUC
    _best_row = comparison_df['val_pr_auc'].idxmax()
    best_model_name = comparison_df.loc[_best_row, 'model']
    print(f"\nBest model (by Val PR-AUC): {best_model_name}")

    # Look the winning estimator up by name; None if the name is unknown
    _estimators_by_name = {label: estimator for label, estimator, _, _ in _trained_runs}
    best_model = _estimators_by_name.get(best_model_name)


Model Comparison (Validation Set):
   model  val_roc_auc  val_pr_auc
LightGBM     0.959123    0.840048
CatBoost     0.957017    0.819187
  TabNet     0.937814    0.778719

Best model (by Val PR-AUC): LightGBM


## 6. Model Persistence

Save the best model for future use.


In [23]:
# Persist the selected model (and the comparison table, when one exists).
if best_model is None:
    print("No model to save")
else:
    import joblib

    # Serialize the winning estimator into the models directory
    model_output_path = MODELS_ROOT / "best_podium_model.pkl"
    joblib.dump(best_model, model_output_path)
    print(f"Best model saved to: {model_output_path}")

    # Write the per-model metrics next to the processed data
    if model_comparison:
        _comparison_output_path = PROCESSED_ROOT / "model_comparison.csv"
        comparison_df.to_csv(_comparison_output_path, index=False)
        print(f"Model comparison saved to: {PROCESSED_ROOT / 'model_comparison.csv'}")


Best model saved to: C:\Users\erikv\Downloads\models\best_podium_model.pkl
Model comparison saved to: C:\Users\erikv\Downloads\F1\data\processed\model_comparison.csv


## Summary

- Trained LightGBM, CatBoost, and TabNet models on 1994-2022 data
- Validated on 2023 data, tested on 2024 data
- Selected best model based on validation PR-AUC
- Saved best model to `models/best_podium_model.pkl` for downstream inference
