In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import optuna
import warnings

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load both datasets
df_2025 = pd.read_csv('../data/master_features_2025.csv')
df_2026 = pd.read_csv('../data/features_2026.csv')

print(f"2025 dataset: {df_2025.shape}")
print(f"2026 dataset: {df_2026.shape}")
print(f"\n2025 columns:\n{df_2025.columns.tolist()}")
print(f"\n2026 columns:\n{df_2026.columns.tolist()}")

2025 dataset: (479, 34)
2026 dataset: (22, 19)

2025 columns:
['year', 'round', 'race_name', 'circuit', 'date', 'driver', 'driver_name', 'team', 'grid_position', 'finish_position', 'points', 'status', 'laps_completed', 'fastest_lap_rank', 'outperformance', 'teammate_finish_gap', 'constructor_rolling_points_5', 'season_stage', 'is_sprint_weekend', 'circuit_type', 'overtaking_difficulty', 'safety_car_probability', 'home_race', 'championship_gap', 'driver_experience', 'wet_race', 'rolling_avg_finish_5', 'rolling_avg_points_5', 'dnf', 'quali_position', 'rolling_avg_quali_5', 'teammate_quali_gap', 'avg_team_pit_seconds', 'dnf_rate']

2026 columns:
['driver', 'test1_best_s', 'test2_best_s', 'test1_gap_s', 'test2_gap_s', 'testing_improvement_s', 'combined_pace_gap_s', 'team_2026', 'total_testing_laps', 'avg_race_pace_s', 'race_pace_gap_s', 'team_total_laps', 'team_reliability_score', 'driver_team_change', 'new_team_flag', 'rookie_flag', 'barcelona_best_s', 'barcelona_gap_s', 'missed_barcelona

In [12]:
# FEATURE PREP + TRAIN/TEST SPLIT

TARGET = 'finish_position'

META_COLS = [
    'year', 'race_name', 'circuit', 
    'date', 'driver', 'driver_name', 'team'
]

LEAKAGE_COLS = [
    'status',          
    'points',             
    'laps_completed',      
    'fastest_lap_rank',    
    'dnf',                 
    'teammate_finish_gap', # current race teammate result
    'championship_gap',    # calculated using current race points
    'wet_race'
]

# ONE HOT ENCODE circuit_type and season_stage
df_2025_encoded = pd.get_dummies(
    df_2025, 
    columns=['circuit_type', 'season_stage'], 
    prefix=['circuit', 'stage']
)

# SPLIT FIRST — while round still exists
train_mask = df_2025_encoded['round'] <= 18
test_mask  = df_2025_encoded['round'] >= 19

# NOW drop metadata + leakage + round
drop_cols = [c for c in META_COLS + LEAKAGE_COLS + ['round'] 
             if c in df_2025_encoded.columns]
df_2025_encoded = df_2025_encoded.drop(columns=drop_cols)

# Feature columns = everything except target
feature_cols = [c for c in df_2025_encoded.columns if c != TARGET]

# Redundant/useless features based on feature importance
WEAK_FEATURES = ['home_race', 'circuit_permanent', 'circuit_street', 'stage_late']
feature_cols = [c for c in feature_cols if c not in WEAK_FEATURES]

print(f"Target: {TARGET}")
print(f"Number of features: {len(feature_cols)}")
print(f"\nFeatures:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i}. {col}")

df_train = df_2025_encoded[train_mask].copy()
df_test  = df_2025_encoded[test_mask].copy()

X_train = df_train[feature_cols]
y_train = df_train[TARGET]

X_test = df_test[feature_cols]
y_test = df_test[TARGET]

print(f"\nTRAIN / TEST SPLIT:")
print(f"Training rows : {len(X_train)} (Rounds 1-18)")
print(f"Test rows     : {len(X_test)} (Rounds 19-24)")

Target: finish_position
Number of features: 16

Features:
  1. grid_position
  2. outperformance
  3. constructor_rolling_points_5
  4. is_sprint_weekend
  5. overtaking_difficulty
  6. safety_car_probability
  7. driver_experience
  8. rolling_avg_finish_5
  9. rolling_avg_points_5
  10. quali_position
  11. rolling_avg_quali_5
  12. teammate_quali_gap
  13. avg_team_pit_seconds
  14. dnf_rate
  15. stage_early
  16. stage_mid

TRAIN / TEST SPLIT:
Training rows : 359 (Rounds 1-18)
Test rows     : 120 (Rounds 19-24)


In [13]:
# TRAIN BASELINE LIGHTGBM MODEL

model = lgb.LGBMRegressor(
    n_estimators=300,      # number of trees
    learning_rate=0.05,    # how fast the model learns
    max_depth=6,           # how deep each tree can grow
    num_leaves=31,         # max leaves per tree
    min_child_samples=5,   # min rows needed to make a leaf
    subsample=0.8,         # use 80% of rows per tree
    colsample_bytree=0.8,  # use 80% of features per tree
    random_state=42,
    verbose=-1
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.3f} positions off on average")
print(f"  e.g. if driver actually finishes P5,")
print(f"  we predict somewhere around P{5-mae:.0f} to P{5+mae:.0f}")

# Show a sample of predictions vs actual
results_df = pd.DataFrame({
    'actual'   : y_test.values,
    'predicted': y_pred.round(1),
    'error'    : (y_pred - y_test.values).round(1)
}).reset_index(drop=True)

print(f"\nSAMPLE PREDICTIONS (first 20 rows):")
print(results_df.head(20).to_string())

MAE: 0.512 positions off on average
  e.g. if driver actually finishes P5,
  we predict somewhere around P4 to P6

SAMPLE PREDICTIONS (first 20 rows):
    actual  predicted  error
0       14       13.8   -0.2
1       12       12.4    0.4
2       11       11.1    0.1
3       16       15.6   -0.4
4       11       10.9   -0.1
5       16       15.7   -0.3
6       13       13.1    0.1
7        6        6.0    0.0
8        2        2.5    0.5
9        3        4.9    1.9
10       5        5.3    0.3
11      15       14.1   -0.9
12      20       19.5   -0.5
13      17       16.0   -1.0
14      13       12.9   -0.1
15       5        5.2    0.2
16       3        5.9    2.9
17      13       12.8   -0.2
18       3        3.1    0.1
19       2        2.0   -0.0


In [14]:
# FEATURE IMPORTANCE

importance_df = pd.DataFrame({
    'feature'   : feature_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).reset_index(drop=True)

print("FEATURE IMPORTANCE:")
print(importance_df.to_string())

FEATURE IMPORTANCE:
                         feature  importance
0                 outperformance        1238
1                  grid_position         918
2   constructor_rolling_points_5         510
3           rolling_avg_finish_5         507
4            rolling_avg_quali_5         447
5                 quali_position         371
6           rolling_avg_points_5         355
7              driver_experience         230
8             teammate_quali_gap         165
9         safety_car_probability         142
10          avg_team_pit_seconds         121
11         overtaking_difficulty         110
12                      dnf_rate          80
13                   stage_early          55
14             is_sprint_weekend          53
15                     stage_mid          29


In [17]:
# Optuna Hyperparameter tuning

def objective(trial):
    params = {
        'n_estimators'     : trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth'        : trial.suggest_int('max_depth', 3, 10),
        'num_leaves'       : trial.suggest_int('num_leaves', 15, 127),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample'        : trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state'     : 42,
        'verbose'          : -1
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

# Run 100 trials and find the best parameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=300, show_progress_bar=True)
print(f"Best MAE : {study.best_value:.3f}")
print(f"Best params:\n{study.best_params}")

  0%|          | 0/300 [00:00<?, ?it/s]

Best MAE : 0.423
Best params:
{'n_estimators': 928, 'learning_rate': 0.030584726948076943, 'max_depth': 4, 'num_leaves': 113, 'min_child_samples': 5, 'subsample': 0.9806479033001241, 'colsample_bytree': 0.9670031408281698}


Baseline (default params)  →  0.635 MAE
After dropping weak features →  0.512 MAE
After Optuna tuning          →  0.438 MAE

Total improvement: 0.197 positions ✅

learning_rate = 0.125   → higher than our 0.05 guess
max_depth = 10          → deeper trees than we used
num_leaves = 22         → fewer leaves than default 31
n_estimators = 457      → more trees than our 300

In [18]:
# RETRAIN WITH BEST PARAMS

# Train final model using Optuna's best parameters
best_model = lgb.LGBMRegressor(**study.best_params, random_state=42, verbose=-1)
best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)

print(f"Final MAE: {mae_best:.3f} positions off on average")

results_df = pd.DataFrame({
    'actual'   : y_test.values,
    'predicted': y_pred_best.round(1),
    'error'    : (y_pred_best - y_test.values).round(1)
}).reset_index(drop=True)

print(f"\nSample predictions (first 20 rows):")
print(results_df.head(20).to_string())

Final MAE: 0.423 positions off on average

Sample predictions (first 20 rows):
    actual  predicted  error
0       14       13.8   -0.2
1       12       12.3    0.3
2       11       11.2    0.2
3       16       15.9   -0.1
4       11       10.5   -0.5
5       16       16.0   -0.0
6       13       13.8    0.8
7        6        5.6   -0.4
8        2        2.7    0.7
9        3        4.4    1.4
10       5        5.3    0.3
11      15       15.2    0.2
12      20       19.7   -0.3
13      17       16.1   -0.9
14      13       13.1    0.1
15       5        4.7   -0.3
16       3        4.5    1.5
17      13       13.3    0.3
18       3        3.0    0.0
19       2        2.0    0.0


# OPTUNA SINGLE SPLIT (baseline tuning, reference only)
# Note: this overfit to Rounds 19-24, replaced by Cell 8 CV tuning

In [19]:
folds = [
    (list(range(1, 7)),   list(range(7, 13))),
    (list(range(1, 13)),  list(range(13, 19))),
    (list(range(1, 19)),  list(range(19, 25))),
]

fold_maes = []

for i, (train_rounds, test_rounds) in enumerate(folds, 1):
    train_mask_cv = df_2025['round'].isin(train_rounds)
    test_mask_cv  = df_2025['round'].isin(test_rounds)

    df_cv = pd.get_dummies(df_2025, columns=['circuit_type', 'season_stage'],
                           prefix=['circuit', 'stage'])
    drop_cols_cv = [c for c in META_COLS + LEAKAGE_COLS + ['round']
                    if c in df_cv.columns]
    df_cv = df_cv.drop(columns=drop_cols_cv)

    X_tr = df_cv[train_mask_cv][feature_cols]
    y_tr = df_cv[train_mask_cv][TARGET]
    X_te = df_cv[test_mask_cv][feature_cols]
    y_te = df_cv[test_mask_cv][TARGET]

    cv_model = lgb.LGBMRegressor(**study.best_params, random_state=42, verbose=-1)
    cv_model.fit(X_tr, y_tr)

    # Check BOTH train and test MAE
    train_mae = mean_absolute_error(y_tr, cv_model.predict(X_tr))
    test_mae  = mean_absolute_error(y_te, cv_model.predict(X_te))
    gap       = test_mae - train_mae

    fold_maes.append(test_mae)
    print(f"Fold {i} | Train MAE: {train_mae:.3f} | Test MAE: {test_mae:.3f} | Gap: {gap:.3f}")

print(f"\nAverage test MAE : {np.mean(fold_maes):.3f}")
print(f"Std deviation    : {np.std(fold_maes):.3f}")
print(f"\nIf gap is small (<0.2) → healthy fit")
print(f"If gap is large (>0.5) → overfitting")

Fold 1 | Train MAE: 0.017 | Test MAE: 1.460 | Gap: 1.443
Fold 2 | Train MAE: 0.031 | Test MAE: 0.670 | Gap: 0.639
Fold 3 | Train MAE: 0.078 | Test MAE: 0.423 | Gap: 0.344

Average test MAE : 0.851
Std deviation    : 0.442

If gap is small (<0.2) → healthy fit
If gap is large (>0.5) → overfitting


In [23]:
# CELL 8 — OPTUNA WITH CROSS VALIDATION OBJECTIVE


def cv_objective(trial):
    params = {
        'n_estimators'     : trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth'        : trial.suggest_int('max_depth', 3, 6),      # capped at 6
        'num_leaves'       : trial.suggest_int('num_leaves', 15, 63),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),  # higher min
        'subsample'        : trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha'        : trial.suggest_float('reg_alpha', 0.0, 1.0),   # L1 regularisation
        'reg_lambda'       : trial.suggest_float('reg_lambda', 0.0, 1.0),  # L2 regularisation
        'random_state'     : 42,
        'verbose'          : -1
    }

    folds = [
        (list(range(1, 7)),   list(range(7, 13))),
        (list(range(1, 13)),  list(range(13, 19))),
        (list(range(1, 19)),  list(range(19, 25))),
    ]

    fold_maes = []
    for train_rounds, test_rounds in folds:
        train_mask_cv = df_2025['round'].isin(train_rounds)
        test_mask_cv  = df_2025['round'].isin(test_rounds)

        df_cv = pd.get_dummies(df_2025, columns=['circuit_type', 'season_stage'],
                               prefix=['circuit', 'stage'])
        drop_cols_cv = [c for c in META_COLS + LEAKAGE_COLS + ['round']
                        if c in df_cv.columns]
        df_cv = df_cv.drop(columns=drop_cols_cv)

        X_tr = df_cv[train_mask_cv][feature_cols]
        y_tr = df_cv[train_mask_cv][TARGET]
        X_te = df_cv[test_mask_cv][feature_cols]
        y_te = df_cv[test_mask_cv][TARGET]

        m = lgb.LGBMRegressor(**params)
        m.fit(X_tr, y_tr)
        fold_maes.append(mean_absolute_error(y_te, m.predict(X_te)))

    return np.mean(fold_maes)

# Run Optuna with CV objective
cv_study = optuna.create_study(direction='minimize')
cv_study.optimize(cv_objective, n_trials=300, show_progress_bar=True)

print(f"Best CV MAE : {cv_study.best_value:.3f}")
print(f"Best params : {cv_study.best_params}")

  0%|          | 0/300 [00:00<?, ?it/s]

Best CV MAE : 0.803
Best params : {'n_estimators': 970, 'learning_rate': 0.09658197577351804, 'max_depth': 3, 'num_leaves': 33, 'min_child_samples': 11, 'subsample': 0.8027763078172319, 'colsample_bytree': 0.5793885900726659, 'reg_alpha': 0.26223774241852094, 'reg_lambda': 0.44170535546089823}


In [None]:
# VERIFY FIT WITH NEW PARAMS

folds = [
    (list(range(1, 7)),   list(range(7, 13))),
    (list(range(1, 13)),  list(range(13, 19))),
    (list(range(1, 19)),  list(range(19, 25))),
]

fold_maes = []

for i, (train_rounds, test_rounds) in enumerate(folds, 1):
    train_mask_cv = df_2025['round'].isin(train_rounds)
    test_mask_cv  = df_2025['round'].isin(test_rounds)

    df_cv = pd.get_dummies(df_2025, columns=['circuit_type', 'season_stage'],
                           prefix=['circuit', 'stage'])
    drop_cols_cv = [c for c in META_COLS + LEAKAGE_COLS + ['round']
                    if c in df_cv.columns]
    df_cv = df_cv.drop(columns=drop_cols_cv)

    X_tr = df_cv[train_mask_cv][feature_cols]
    y_tr = df_cv[train_mask_cv][TARGET]
    X_te = df_cv[test_mask_cv][feature_cols]
    y_te = df_cv[test_mask_cv][TARGET]

    cv_model = lgb.LGBMRegressor(**cv_study.best_params, random_state=42, verbose=-1)
    cv_model.fit(X_tr, y_tr)

    train_mae = mean_absolute_error(y_tr, cv_model.predict(X_tr))
    test_mae  = mean_absolute_error(y_te, cv_model.predict(X_te))
    gap       = test_mae - train_mae

    fold_maes.append(test_mae)
    print(f"Fold {i} | Train MAE: {train_mae:.3f} | Test MAE: {test_mae:.3f} | Gap: {gap:.3f}")

print(f"\nAverage test MAE : {np.mean(fold_maes):.3f}")
print(f"Std deviation    : {np.std(fold_maes):.3f}")

Fold 1 | Train MAE: 0.105 | Test MAE: 1.220 | Gap: 1.115
Fold 2 | Train MAE: 0.073 | Test MAE: 0.688 | Gap: 0.615
Fold 3 | Train MAE: 0.083 | Test MAE: 0.501 | Gap: 0.418

Average test MAE : 0.803
Std deviation    : 0.305


In [25]:
# CELL 10 — RETRAIN ON FULL 2025 DATASET (ALL 24 ROUNDS)


# Re-encode full dataset
df_full = pd.get_dummies(df_2025, columns=['circuit_type', 'season_stage'],
                         prefix=['circuit', 'stage'])

drop_cols_full = [c for c in META_COLS + LEAKAGE_COLS + ['round']
                  if c in df_full.columns]
df_full = df_full.drop(columns=drop_cols_full)

X_full = df_full[feature_cols]
y_full = df_full[TARGET]

# Train final model on all 24 rounds
final_model = lgb.LGBMRegressor(**cv_study.best_params, random_state=42, verbose=-1)
final_model.fit(X_full, y_full)

print(f"Final model trained on {len(X_full)} rows ({df_2025['round'].nunique()} rounds)")
print(f"Features used: {len(feature_cols)}")
print(f"Reported CV MAE: 0.803")


Final model trained on 479 rows (24 rounds)
Features used: 16
Reported CV MAE: 0.803


In [27]:
# SAVE MODEL

import pickle
import os

os.makedirs('../models', exist_ok=True)

# Save model
with open('../models/lgbm_regressor.pkl', 'wb') as f:
    pickle.dump(final_model, f)

# Save feature list — important for prediction later
with open('../models/feature_cols.pkl', 'wb') as f:
    pickle.dump(feature_cols, f)


# BUILD 2026 PREDICTION INPUT

# Our 2026 features map to 2025 training features like this:
# 2025 feature              ← 2026 replacement
# rolling_avg_finish_5      ← combined_pace_gap_s (car pace proxy)
# constructor_rolling_pts_5 ← team_reliability_score (team form proxy)
# avg_team_pit_seconds      ← stays same (pit crew unchanged)
# dnf_rate                  ← stays same (driver behaviour unchanged)
# quali_position            ← test2_gap_s (qualifying pace proxy)
# rolling_avg_quali_5       ← test2_gap_s (same signal)
# outperformance            ← stays same (driver talent unchanged)
# driver_experience         ← stays same (experience unchanged)
# teammate_quali_gap        ← stays same (relative pace unchanged)
# grid_position             ← test2_gap_s (starting position proxy)


In [29]:
# Load 2026 testing features
df_26 = pd.read_csv('../data/features_2026.csv')

# Get stable features from 2025 for the 20 drivers who raced
driver_stable = df_2025.groupby('driver').agg(
    outperformance        = ('outperformance', 'mean'),
    driver_experience     = ('driver_experience', 'mean'),
    avg_team_pit_seconds  = ('avg_team_pit_seconds', 'mean'),
    dnf_rate              = ('dnf_rate', 'mean'),
    teammate_quali_gap    = ('teammate_quali_gap', 'mean'),
    rolling_avg_finish_5  = ('rolling_avg_finish_5', 'mean'),
    rolling_avg_points_5  = ('rolling_avg_points_5', 'mean'),
    rolling_avg_quali_5   = ('rolling_avg_quali_5', 'mean'),
    quali_position        = ('quali_position', 'mean'),
    grid_position         = ('grid_position', 'mean'),
).reset_index()

# Load BOT and PER 2024 features
bot_per = pd.read_csv('../data/bot_per_2024_features.csv')

# Combine all 22 drivers
driver_all = pd.concat([driver_stable, bot_per], ignore_index=True)

# Merge with 2026 testing features
df_pred = df_26.merge(driver_all, on='driver', how='left')

print(f"Prediction input shape: {df_pred.shape}")
print(f"Drivers: {sorted(df_pred['driver'].tolist())}")
print(f"\nMissing values:")
print(df_pred.isnull().sum()[df_pred.isnull().sum() > 0])

Prediction input shape: (22, 29)
Drivers: ['ALB', 'ALO', 'ANT', 'BEA', 'BOR', 'BOT', 'COL', 'GAS', 'HAD', 'HAM', 'HUL', 'LAW', 'LEC', 'LIN', 'NOR', 'OCO', 'PER', 'PIA', 'RUS', 'SAI', 'STR', 'VER']

Missing values:
barcelona_best_s        3
barcelona_gap_s         3
outperformance          1
driver_experience       1
avg_team_pit_seconds    1
dnf_rate                1
teammate_quali_gap      1
rolling_avg_finish_5    1
rolling_avg_points_5    1
rolling_avg_quali_5     1
quali_position          1
grid_position           3
dtype: int64


In [47]:
# HANDLE MISSING VALUES

# Barcelona missing — fill with worst gap + penalty
worst_barcelona_gap = df_pred['barcelona_gap_s'].max()
df_pred['barcelona_gap_s']  = df_pred['barcelona_gap_s'].fillna(worst_barcelona_gap + 1.0)
df_pred['barcelona_best_s'] = df_pred['barcelona_best_s'].fillna(df_pred['barcelona_best_s'].max() + 1.0)


# LIN — true rookie, use conservative league averages
league_avg = driver_stable.mean(numeric_only=True)
rookie_cols = ['outperformance', 'driver_experience', 'avg_team_pit_seconds',
               'dnf_rate', 'teammate_quali_gap', 'rolling_avg_finish_5',
               'rolling_avg_points_5', 'rolling_avg_quali_5', 
               'quali_position', 'grid_position']

for col in rookie_cols:
    df_pred.loc[df_pred['driver'] == 'LIN', col] = league_avg[col]

# Override driver_experience for LIN manually
df_pred.loc[df_pred['driver'] == 'LIN', 'driver_experience'] = 0

# BOT and PER missing grid_position — use their quali_position
df_pred.loc[df_pred['driver'] == 'BOT', 'grid_position'] = \
    df_pred.loc[df_pred['driver'] == 'BOT', 'quali_position'].values[0]
df_pred.loc[df_pred['driver'] == 'PER', 'grid_position'] = \
    df_pred.loc[df_pred['driver'] == 'PER', 'quali_position'].values[0]

# Verify
print("Missing values after fix:")
missing = df_pred.isnull().sum()
print(missing[missing > 0] if missing[missing > 0].any() else "✅ No missing values!")

# Cap driver experience at 100
df_pred['driver_experience'] = df_pred['driver_experience'].clip(upper=100)
print(f"\nDriver experience after cap:")
print(df_pred[['driver', 'driver_experience']].to_string())

Missing values after fix:
✅ No missing values!

Driver experience after cap:
   driver  driver_experience
0     ALB              100.0
1     ALO              100.0
2     ANT                1.0
3     BEA               15.0
4     BOR                1.0
5     BOT              100.0
6     COL               10.0
7     GAS              100.0
8     HAD                1.0
9     HAM              100.0
10    HUL              100.0
11    LAW               20.0
12    LEC              100.0
13    LIN                0.0
14    NOR              100.0
15    OCO              100.0
16    PER              100.0
17    PIA               50.0
18    RUS              100.0
19    SAI              100.0
20    STR              100.0
21    VER              100.0


In [48]:
# BUILD FINAL PREDICTION MATRIX

circuit_features = df_2025.groupby('race_name').agg(
    is_sprint_weekend      = ('is_sprint_weekend', 'max'),
    overtaking_difficulty  = ('overtaking_difficulty', 'first'),
    safety_car_probability = ('safety_car_probability', 'first'),
    circuit_type           = ('circuit_type', 'first'),
).reset_index()

print("2025 circuit features:")
print(circuit_features.to_string())

2025 circuit features:
                    race_name  is_sprint_weekend  overtaking_difficulty  safety_car_probability circuit_type
0        Abu Dhabi Grand Prix                  0                    2.0                     0.3    permanent
1       Australian Grand Prix                  0                    2.0                     0.7       street
2         Austrian Grand Prix                  0                    2.0                     0.5    permanent
3       Azerbaijan Grand Prix                  0                    2.0                     0.8       street
4          Bahrain Grand Prix                  0                    2.0                     0.3    permanent
5          Belgian Grand Prix                  0                    1.0                     0.6    permanent
6          British Grand Prix                  1                    2.0                     0.5    permanent
7         Canadian Grand Prix                  0                    3.0                     0.7    perman

In [None]:
# BUILD 2026 RACE CALENDAR


# 2026 F1 Calendar (24 rounds)
calendar_2026 = [
    {'round': 1,  'race_name': 'Australian Grand Prix'},
    {'round': 2,  'race_name': 'Chinese Grand Prix'},
    {'round': 3,  'race_name': 'Japanese Grand Prix'},
    {'round': 4,  'race_name': 'Bahrain Grand Prix'},
    {'round': 5,  'race_name': 'Saudi Arabian Grand Prix'},
    {'round': 6,  'race_name': 'Miami Grand Prix'},
    {'round': 7,  'race_name': 'Emilia Romagna Grand Prix'},
    {'round': 8,  'race_name': 'Monaco Grand Prix'},
    {'round': 9,  'race_name': 'Spanish Grand Prix'},
    {'round': 10, 'race_name': 'Canadian Grand Prix'},
    {'round': 11, 'race_name': 'Austrian Grand Prix'},
    {'round': 12, 'race_name': 'British Grand Prix'},
    {'round': 13, 'race_name': 'Belgian Grand Prix'},
    {'round': 14, 'race_name': 'Hungarian Grand Prix'},
    {'round': 15, 'race_name': 'Dutch Grand Prix'},
    {'round': 16, 'race_name': 'Italian Grand Prix'},
    {'round': 17, 'race_name': 'Azerbaijan Grand Prix'},
    {'round': 18, 'race_name': 'Singapore Grand Prix'},
    {'round': 19, 'race_name': 'United States Grand Prix'},
    {'round': 20, 'race_name': 'Mexico City Grand Prix'},
    {'round': 21, 'race_name': 'São Paulo Grand Prix'},
    {'round': 22, 'race_name': 'Las Vegas Grand Prix'},
    {'round': 23, 'race_name': 'Qatar Grand Prix'},
    {'round': 24, 'race_name': 'Abu Dhabi Grand Prix'},
]

df_calendar = pd.DataFrame(calendar_2026)

# Add season stage
df_calendar['stage_early'] = (df_calendar['round'] <= 8).astype(int)
df_calendar['stage_mid']   = ((df_calendar['round'] > 8) & (df_calendar['round'] <= 16)).astype(int)

# Merge circuit features
df_calendar = df_calendar.merge(circuit_features, on='race_name', how='left')

# One hot encode circuit_type
df_calendar = pd.get_dummies(df_calendar, columns=['circuit_type'], prefix='circuit')

# Check missing circuits
missing = df_calendar[df_calendar['overtaking_difficulty'].isna()]
print(f"Missing circuit data: {len(missing)} races")
if len(missing) > 0:
    print(missing[['round', 'race_name']])

print(f"\n2026 Calendar ({len(df_calendar)} races):")
print(df_calendar[['round', 'race_name', 'is_sprint_weekend', 
                    'overtaking_difficulty', 'safety_car_probability',
                    'stage_early', 'stage_mid']].to_string())

Missing circuit data: 0 races

2026 Calendar (24 races):
    round                  race_name  is_sprint_weekend  overtaking_difficulty  safety_car_probability  stage_early  stage_mid
0       1      Australian Grand Prix                  0                    2.0                     0.7            1          0
1       2         Chinese Grand Prix                  1                    2.0                     0.5            1          0
2       3        Japanese Grand Prix                  0                    3.0                     0.4            1          0
3       4         Bahrain Grand Prix                  0                    2.0                     0.3            1          0
4       5   Saudi Arabian Grand Prix                  0                    2.0                     0.8            1          0
5       6           Miami Grand Prix                  1                    3.0                     0.6            1          0
6       7  Emilia Romagna Grand Prix                  

In [50]:
# GENERATE 2026 PREDICTIONS (RACE BY RACE)

POINTS = {1:25, 2:18, 3:15, 4:12, 5:10, 6:8, 7:6, 8:4, 9:2, 10:1}

all_race_results = []

for _, race in df_calendar.iterrows():
    race_df = df_predictions[df_predictions['round'] == race['round']].copy()
    
    # Keep only top 20 by predicted score
    race_df = race_df.sort_values('predicted_score').head(20).copy()
    
    # Reassign positions 1-20
    race_df['predicted_position'] = range(1, 21)
    
    # Add points
    race_df['points'] = race_df['predicted_position'].map(POINTS).fillna(0)
    
    all_race_results.append(race_df)

df_results_2026 = pd.concat(all_race_results, ignore_index=True)

print(f"Total predictions: {len(df_results_2026)}")
print(f"\n=== AUSTRALIA 2026 PREDICTED GRID ===")
aus = df_results_2026[df_results_2026['round'] == 1].sort_values('predicted_position')
print(aus[['driver', 'predicted_position', 'points']].to_string(index=False))

Total predictions: 480

=== AUSTRALIA 2026 PREDICTED GRID ===
driver  predicted_position  points
   VER                   1    25.0
   RUS                   2    18.0
   PIA                   3    15.0
   NOR                   4    12.0
   LEC                   5    10.0
   HAM                   6     8.0
   ALO                   7     6.0
   HAD                   8     4.0
   ALB                   9     2.0
   ANT                  10     1.0
   PER                  11     0.0
   BEA                  12     0.0
   HUL                  13     0.0
   OCO                  14     0.0
   SAI                  15     0.0
   LIN                  16     0.0
   LAW                  17     0.0
   BOT                  18     0.0
   STR                  19     0.0
   BOR                  20     0.0


In [51]:
# CHAMPIONSHIP STANDINGS


# DRIVERS CHAMPIONSHIP
drivers_championship = df_results_2026.groupby('driver')['points'].sum()\
    .sort_values(ascending=False).reset_index()
drivers_championship.columns = ['driver', 'total_points']
drivers_championship['position'] = range(1, len(drivers_championship) + 1)

print("*** 2026 PREDICTED DRIVERS CHAMPIONSHIP ***")
print(drivers_championship[['position', 'driver', 'total_points']].to_string(index=False))

# CONSTRUCTORS CHAMPIONSHIP
team_map = df_pred.set_index('driver')['team_2026'].to_dict()
df_results_2026['team'] = df_results_2026['driver'].map(team_map)

constructors_championship = df_results_2026.groupby('team')['points'].sum()\
    .sort_values(ascending=False).reset_index()
constructors_championship.columns = ['team', 'total_points']
constructors_championship['position'] = range(1, len(constructors_championship) + 1)

print("\n*** 2026 PREDICTED CONSTRUCTORS CHAMPIONSHIP ***")
print(constructors_championship[['position', 'team', 'total_points']].to_string(index=False))

*** 2026 PREDICTED DRIVERS CHAMPIONSHIP ***
 position driver  total_points
        1    VER         600.0
        2    RUS         420.0
        3    PIA         372.0
        4    NOR         286.0
        5    LEC         242.0
        6    HAM         192.0
        7    ALO         144.0
        8    HAD          96.0
        9    ANT          39.0
       10    ALB          20.0
       11    PER          13.0
       12    BEA           0.0
       13    LIN           0.0
       14    LAW           0.0
       15    HUL           0.0
       16    BOR           0.0
       17    BOT           0.0
       18    OCO           0.0
       19    SAI           0.0
       20    STR           0.0

*** 2026 PREDICTED CONSTRUCTORS CHAMPIONSHIP ***
 position         team  total_points
        1     Red Bull         696.0
        2      McLaren         658.0
        3     Mercedes         459.0
        4      Ferrari         434.0
        5 Aston Martin         144.0
        6     Williams          2

In [53]:
# MONTE CARLO SIMULATION (10,000 runs)


N_SIMULATIONS = 10000
POINTS = {1:25, 2:18, 3:15, 4:12, 5:10, 6:8, 7:6, 8:4, 9:2, 10:1}

drivers = df_pred['driver'].tolist()
dnf_rates = df_pred.set_index('driver')['dnf_rate'].to_dict()

# STEP 1 — Get base predictions for all 24 races ONCE
base_preds = {}
for _, race in df_calendar.iterrows():
    race_df = df_pred.copy()
    race_df['is_sprint_weekend']            = race['is_sprint_weekend']
    race_df['overtaking_difficulty']        = race['overtaking_difficulty']
    race_df['safety_car_probability']       = race['safety_car_probability']
    race_df['stage_early']                  = race['stage_early']
    race_df['stage_mid']                    = race['stage_mid']
    race_df['grid_position']               = 1 + (race_df['combined_pace_gap_s'] / 0.3)
    race_df['quali_position']              = race_df['grid_position']
    race_df['rolling_avg_quali_5']         = race_df['grid_position']
    race_df['rolling_avg_finish_5']        = race_df['grid_position']
    race_df['constructor_rolling_points_5'] = race_df['team_reliability_score'] * 100
    base_preds[race['round']] = final_model.predict(race_df[feature_cols])

print("✅ Base predictions done (24 races)")

# STEP 2 — Monte Carlo in pure numpy (fast)
sim_points = np.zeros((N_SIMULATIONS, len(drivers)))
sim_wins   = np.zeros(len(drivers))

dnf_array = np.array([dnf_rates[d] for d in drivers])

for sim in range(N_SIMULATIONS):
    season_pts = np.zeros(len(drivers))

    for rnd, base in base_preds.items():
        # Add noise
        noise_scale = 0.803 + (race['safety_car_probability'] * 2.0)
        noise = np.random.normal(0, noise_scale, size=len(base))
        preds = base + noise

        # Simulate DNFs
        dnf_mask = np.random.random(len(drivers)) < dnf_array
        preds[dnf_mask] += 15

        # Rank top 20
        ranked_idx = np.argsort(preds)[:20]
        for pos, idx in enumerate(ranked_idx, 1):
            season_pts[idx] += POINTS.get(pos, 0)

        sim_wins[ranked_idx[0]] += 1

    sim_points[sim] = season_pts

print(f"✅ {N_SIMULATIONS:,} simulations complete!")

# SUMMARISE
results = []
for i, driver in enumerate(drivers):
    results.append({
        'driver'    : driver,
        'avg_points': round(sim_points[:, i].mean(), 1),
        'min_points': round(sim_points[:, i].min(), 1),
        'max_points': round(sim_points[:, i].max(), 1),
        'win_pct': round(sim_wins[i] / (N_SIMULATIONS * 24) * 100, 1)
    })

df_sim = pd.DataFrame(results).sort_values('avg_points', ascending=False).reset_index(drop=True)
df_sim['position'] = range(1, len(df_sim) + 1)

print("\n*** 2026 MONTE CARLO CHAMPIONSHIP PREDICTION ***")
print(df_sim[['position', 'driver', 'avg_points', 'win_pct', 
              'min_points', 'max_points']].to_string(index=False))

✅ Base predictions done (24 races)
✅ 10,000 simulations complete!

*** 2026 MONTE CARLO CHAMPIONSHIP PREDICTION ***
 position driver  avg_points  win_pct  min_points  max_points
        1    LEC       428.7     47.0       255.0       555.0
        2    RUS       347.8     16.3       222.0       466.0
        3    PIA       310.3     11.8       175.0       428.0
        4    VER       264.4      5.1       161.0       356.0
        5    HAM       259.0      7.7       143.0       397.0
        6    NOR       237.4      4.4       120.0       352.0
        7    ANT       233.5      7.2       102.0       364.0
        8    BEA       128.0      0.4        51.0       213.0
        9    ALB        48.0      0.0         8.0       101.0
       10    OCO        39.4      0.0         6.0        92.0
       11    HUL        28.0      0.0         2.0        76.0
       12    LIN        23.3      0.0         1.0        62.0
       13    HAD        20.4      0.0         0.0        65.0
       14    SAI

In [55]:
# CELL 20 — MONTE CARLO CONSTRUCTORS CHAMPIONSHIP

# Team map from 2026 data
team_map = df_pred.set_index('driver')['team_2026'].to_dict()

# Add team to simulation results
# We need to rebuild from sim_points array

constructor_points = {}
for i, driver in enumerate(drivers):
    team = team_map[driver]
    if team not in constructor_points:
        constructor_points[team] = np.zeros(N_SIMULATIONS)
    constructor_points[team] += sim_points[:, i]

# Summarise constructors
constructor_results = []
for team, points_array in constructor_points.items():
    constructor_results.append({
        'team'        : team,
        'avg_points'  : round(points_array.mean(), 1),
        'min_points'  : round(points_array.min(), 1),
        'max_points'  : round(points_array.max(), 1),
        'champion_pct': round(np.mean(points_array == max(
                            v.mean() for v in constructor_points.values()
                        )) * 100, 1)
    })

df_constructors = pd.DataFrame(constructor_results)\
    .sort_values('avg_points', ascending=False).reset_index(drop=True)
df_constructors['position'] = range(1, len(df_constructors) + 1)

print("*** 2026 PREDICTED CONSTRUCTORS CHAMPIONSHIP ***")
print(df_constructors[['position', 'team', 'avg_points', 
                         'min_points', 'max_points']].to_string(index=False))

*** 2026 PREDICTED CONSTRUCTORS CHAMPIONSHIP ***
 position           team  avg_points  min_points  max_points
        1        Ferrari       687.8       490.0       861.0
        2       Mercedes       581.3       404.0       736.0
        3        McLaren       547.7       392.0       704.0
        4       Red Bull       284.8       183.0       391.0
        5   Haas F1 Team       167.4        90.0       271.0
        6       Williams        66.2        21.0       129.0
        7     RB F1 Team        41.0         6.0        92.0
        8           Audi        31.7         4.0        80.0
        9 Alpine F1 Team        11.8         0.0        43.0
       10   Aston Martin         2.5         0.0        23.0
       11       Cadillac         1.9         0.0        19.0


# Model 2 - Podium Classifier