In [3]:
# notebooks/06_train_and_predict_per_driver.ipynb
# Title: Train & Predict Per-Driver Quali Lap (dynamic race number)
# Run this in the project venv kernel.

import os
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

DATA_DIR = Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# ---------- Helper: load best available data ----------
def load_best_data():
    # Prefer enriched file if exists, else processed, else all_quali
    enriched = DATA_DIR / "enriched_quali_2024.csv"
    processed = DATA_DIR / "processed_quali_2024.csv"
    all_quali = DATA_DIR / "all_quali_2024.csv"
    
    if enriched.exists():
        print("Loading enriched data:", enriched)
        df = pd.read_csv(enriched, parse_dates=["date_start"])
    elif processed.exists():
        print("Enriched file not found. Loading processed file:", processed)
        df = pd.read_csv(processed, parse_dates=["date_start"])
        # best-effort: try to enrich with drivers/teams/tyres if endpoints are available
        # but we do not attempt heavy enrichment here; assume earlier step created mapping
    elif all_quali.exists():
        print("Only raw all_quali_2024.csv found. Loading it:", all_quali)
        df = pd.read_csv(all_quali, parse_dates=["date_start"])
    else:
        raise FileNotFoundError("No data files found in ../data. Please run earlier notebooks (fetch & process).")
    return df

df = load_best_data()
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
df.head()

# ---------- Ask user which race number to predict ----------
# We use meeting_key ordering to define race number ordering
meetings = sorted(df["meeting_key"].unique())
print(f"Found {len(meetings)} meetings (ordered). Example keys: {meetings[:8]}")
print("Enter the race number you want to predict (1-based index). E.g., 5 to predict 5th race using 1-4 for training.")

# For notebook convenience use input(); in VS Code Jupyter you can set race_number variable manually
try:
    race_to_predict = int(input("Race number to predict (1-based): ").strip())
except:
    # fallback: default to 5 if running non-interactively
    race_to_predict = 5
    print("Using default race_to_predict =", race_to_predict)

if not (1 <= race_to_predict <= len(meetings)):
    raise ValueError(f"race_to_predict must be between 1 and {len(meetings)}")

train_meetings = meetings[:race_to_predict - 1]
test_meeting = meetings[race_to_predict - 1]
print("Training meetings (keys):", train_meetings)
print("Testing meeting (key):", test_meeting)

# ---------- Filter train/test ----------
train_df = df[df["meeting_key"].isin(train_meetings)].copy()
test_df = df[df["meeting_key"] == test_meeting].copy()

print("Train rows:", train_df.shape)
print("Test rows :", test_df.shape)

# ---------- Minimal cleaning & ensure required columns exist ----------
required_cols = ["meeting_key","session_key","driver_number","lap_number","lap_duration",
                 "duration_sector_1","duration_sector_2","duration_sector_3",
                 "i1_speed","i2_speed","st_speed","date_start"]
for c in required_cols:
    if c not in df.columns:
        raise KeyError(f"Required column missing: {c}. Please check processed/enriched dataset.")

# Convert numeric columns
num_cols = ["lap_number","lap_duration","duration_sector_1","duration_sector_2","duration_sector_3",
            "i1_speed","i2_speed","st_speed"]
for c in num_cols:
    train_df[c] = pd.to_numeric(train_df[c], errors="coerce")
    test_df[c] = pd.to_numeric(test_df[c], errors="coerce")

# If driver_name/team_name exist, good â€” else we'll aggregate by driver_number/team mapping must exist
has_driver_name = "driver_name" in df.columns
has_team_name = "team_name" in df.columns

if not has_driver_name:
    print("WARNING: driver_name not present. Predictions will use driver_number as identifier.")
else:
    print("driver_name column available.")

if not has_team_name:
    print("WARNING: team_name not present. Consider creating driver-team mapping if you want team features.")
else:
    print("team_name column available.")

# ---------- Build driver-level & team-level aggregated features from training set ----------
# Aggregations we want (per driver_number and per team_name if available)
# If team_name missing, create team column from driver mapping file if available
if not has_team_name:
    mapping_path = DATA_DIR / "driver_team_mapping.csv"
    if mapping_path.exists():
        mapping = pd.read_csv(mapping_path)
        # expect columns: driver_number, team
        if "team" in mapping.columns:
            train_df = train_df.merge(mapping[["driver_number","team"]], on="driver_number", how="left")
            test_df = test_df.merge(mapping[["driver_number","team"]], on="driver_number", how="left")
            has_team_name = True
            train_df = train_df.rename(columns={"team":"team_name"})
            test_df = test_df.rename(columns={"team":"team_name"})
            print("Loaded driver_team_mapping.csv and merged team info.")
        else:
            print("driver_team_mapping.csv exists but lacks 'team' column. Skipping team features.")
    else:
        print("No driver_team_mapping.csv found. Team-level features will be skipped.")

# Create driver aggregates
driver_aggs = train_df.groupby("driver_number").agg(
    driver_avg_lap = ("lap_duration","mean"),
    driver_best_lap = ("lap_duration","min"),
    driver_median_lap = ("lap_duration","median"),
    driver_std_lap = ("lap_duration","std"),
    driver_avg_s1 = ("duration_sector_1","mean"),
    driver_avg_s2 = ("duration_sector_2","mean"),
    driver_avg_s3 = ("duration_sector_3","mean"),
    driver_avg_i1 = ("i1_speed","mean"),
    driver_avg_i2 = ("i2_speed","mean"),
    driver_avg_st = ("st_speed","mean"),
    driver_total_laps = ("lap_duration","count")
).reset_index()

driver_aggs["driver_consistency"] = 1 / (driver_aggs["driver_std_lap"].replace(0, np.nan))
driver_aggs = driver_aggs.fillna(0)

# Create team aggregates (if available)
if has_team_name:
    team_aggs = train_df.groupby("team_name").agg(
        team_avg_lap = ("lap_duration","mean"),
        team_best_lap = ("lap_duration","min"),
        team_avg_s1 = ("duration_sector_1","mean"),
        team_avg_s2 = ("duration_sector_2","mean"),
        team_avg_s3 = ("duration_sector_3","mean"),
        team_avg_i1 = ("i1_speed","mean"),
        team_avg_i2 = ("i2_speed","mean"),
        team_avg_st = ("st_speed","mean"),
        team_total_laps = ("lap_duration","count")
    ).reset_index()
else:
    team_aggs = None

print("Driver aggregates:", driver_aggs.shape)
if team_aggs is not None:
    print("Team aggregates:", team_aggs.shape)

# Merge aggregates into train and test
train_df = train_df.merge(driver_aggs, on="driver_number", how="left")
test_df = test_df.merge(driver_aggs, on="driver_number", how="left")  # driver's historical stats (from earlier races)

if team_aggs is not None:
    train_df = train_df.merge(team_aggs, on="team_name", how="left")
    test_df = test_df.merge(team_aggs, on="team_name", how="left")

# ---------- Optionally add simple track-type features ----------
# If you have a mapping file of meeting_key -> track_type ('Street'/'Permanent'/'HighSpeed'), load it.
track_meta_path = DATA_DIR / "track_metadata.csv"
if track_meta_path.exists():
    track_meta = pd.read_csv(track_meta_path)
    # must contain columns: meeting_key, track_type
    if "meeting_key" in track_meta.columns and "track_type" in track_meta.columns:
        train_df = train_df.merge(track_meta[["meeting_key","track_type"]], on="meeting_key", how="left")
        test_df = test_df.merge(track_meta[["meeting_key","track_type"]], on="meeting_key", how="left")
        print("Merged track_type metadata.")
    else:
        print("track_metadata.csv present but missing required cols. Skipping track_type.")
else:
    print("No track_metadata.csv found. You can add: meeting_key, track_type to improve model (Street/Perm/HighSpeed).")

# ---------- Define features to use ----------
# Base lap-level features (per-row)
base_features = [
    "lap_number",
    "duration_sector_1","duration_sector_2","duration_sector_3",
    "i1_speed","i2_speed","st_speed"
]

# Add driver/team aggregates
driver_features = [
    "driver_avg_lap","driver_best_lap","driver_median_lap","driver_consistency","driver_total_laps",
    "driver_avg_s1","driver_avg_s2","driver_avg_s3","driver_avg_i1","driver_avg_i2","driver_avg_st"
]

team_features = []
if team_aggs is not None:
    team_features = [
        "team_avg_lap","team_best_lap","team_total_laps",
        "team_avg_s1","team_avg_s2","team_avg_s3","team_avg_i1","team_avg_i2","team_avg_st"
    ]

# track_type if present (categorical)
has_track_type = "track_type" in train_df.columns

feature_cols = base_features + driver_features + team_features
if has_track_type:
    feature_cols.append("track_type")

print("Final feature columns used:", feature_cols)

# Drop rows with NaN in critical features
train_df = train_df.dropna(subset=base_features + ["lap_duration"])
test_df = test_df.dropna(subset=base_features + ["lap_duration"])

# ---------- Prepare X/y ----------
X_train = train_df[feature_cols].copy()
y_train = train_df["lap_duration"].copy()
X_test = test_df[feature_cols].copy()
y_test = test_df["lap_duration"].copy()

# ---------- Build preprocessing + model pipeline ----------
# Categorical: track_type if present
categorical_cols = ["track_type"] if has_track_type else []
numeric_cols = [c for c in feature_cols if c not in categorical_cols]

# Simple pipeline: scale not strictly required for RandomForest, but keep one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ],
    remainder="passthrough"
)

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

pipeline = Pipeline([
    ("pre", preprocessor),
    ("model", rf)
])

# ---------- Hyperparameter tuning (randomized, fast) ----------
param_dist = {
    "model__n_estimators": [100,200,400],
    "model__max_depth": [6,10,15, None],
    "model__min_samples_split": [2,5,10],
    "model__min_samples_leaf": [1,2,4]
}

print("Starting RandomizedSearchCV (this may take a little while)...")
rs = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                        n_iter=12, cv=3, scoring="neg_mean_absolute_error", verbose=1, n_jobs=-1, random_state=42)

rs.fit(X_train, y_train)

print("Best params:", rs.best_params_)
best_model = rs.best_estimator_

# ---------- Predict on test laps ----------
preds = best_model.predict(X_test)

# ---------- Evaluate per-lap ----------
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Per-lap MAE: {mae:.4f} s, RMSE: {rmse:.4f} s")

# Save model
model_out = DATA_DIR / f"rf_model_up_to_race_{race_to_predict-1}_predict_race_{race_to_predict}.pkl"
joblib.dump(best_model, model_out)
print("Saved model to:", model_out)

# ---------- Reduce to per-driver predicted best lap (qualifying best) ----------
test_df = test_df.copy()
test_df["pred_lap"] = preds
# For actual best lap per driver:
actual_best = test_df.groupby("driver_number")["lap_duration"].min().reset_index().rename(columns={"lap_duration":"actual_best"})
pred_best = test_df.groupby("driver_number")["pred_lap"].min().reset_index().rename(columns={"pred_lap":"predicted_best"})
comparison = actual_best.merge(pred_best, on="driver_number", how="inner")

# If driver_name exists add it
if "driver_name" in test_df.columns:
    driver_names = test_df[["driver_number","driver_name"]].drop_duplicates("driver_number")
    comparison = comparison.merge(driver_names, on="driver_number", how="left")

# Add delta & rank orders
comparison["error"] = comparison["predicted_best"] - comparison["actual_best"]
comparison["abs_error"] = comparison["error"].abs()
comparison["actual_rank"] = comparison["actual_best"].rank(method="min").astype(int)
comparison["pred_rank"] = comparison["predicted_best"].rank(method="min").astype(int)
comparison["pos_error"] = (comparison["pred_rank"] - comparison["actual_rank"]).abs()

# Sort by actual rank
comparison = comparison.sort_values("actual_rank")
comparison.reset_index(drop=True, inplace=True)

print("\nPer-driver best-lap comparison (top rows):")
display_cols = ["driver_number","driver_name","actual_best","predicted_best","error","abs_error","actual_rank","pred_rank","pos_error"] \
               if "driver_name" in comparison.columns else ["driver_number","actual_best","predicted_best","error","abs_error","actual_rank","pred_rank","pos_error"]
display(comparison[display_cols].head(20))

# ---------- Metrics on per-driver predictions ----------
mae_driver = comparison["abs_error"].mean()
pos_mae = comparison["pos_error"].mean()
percent_correct_top1 = (comparison["pos_error"]==0).sum() / len(comparison)

print(f"\nPer-driver MAE (sec): {mae_driver:.3f}")
print(f"Mean absolute position error: {pos_mae:.3f} (positions)")
print(f"Percent exact position predicted: {percent_correct_top1*100:.1f}%")

# ---------- Save results and plots ----------
out_csv = DATA_DIR / f"predictions_race_{race_to_predict}.csv"
comparison.to_csv(out_csv, index=False)
print("Saved per-driver predictions to:", out_csv)

# Plot predicted vs actual best lap scatter
plt.figure(figsize=(7,6))
plt.scatter(comparison["actual_best"], comparison["predicted_best"])
plt.plot([comparison["actual_best"].min(), comparison["actual_best"].max()],
         [comparison["actual_best"].min(), comparison["actual_best"].max()], 'r--')
plt.xlabel("Actual best lap (s)")
plt.ylabel("Predicted best lap (s)")
plt.title(f"Predicted vs Actual best lap (Race {race_to_predict})")
plt.show()

# Plot error histogram
plt.figure(figsize=(7,4))
sns.histplot(comparison["error"], bins=20, kde=True)
plt.title("Distribution of per-driver prediction errors (s)")
plt.xlabel("Predicted - Actual (s)")
plt.show()

# ---------- Done ----------
print("Done. Inspect 'predictions_race_{race_to_predict}.csv' and the saved model to reuse for GUI or further analysis.")


Enriched file not found. Loading processed file: ..\data\processed_quali_2024.csv
Rows: 917
Columns: ['meeting_key', 'session_key', 'driver_number', 'lap_number', 'date_start', 'duration_sector_1', 'duration_sector_2', 'duration_sector_3', 'i1_speed', 'i2_speed', 'is_pit_out_lap', 'lap_duration', 'segments_sector_1', 'segments_sector_2', 'segments_sector_3', 'st_speed', 'sector_sum', 'avg_speed_est']
Found 5 meetings (ordered). Example keys: [np.int64(1254), np.int64(1255), np.int64(1256), np.int64(1257), np.int64(1258)]
Enter the race number you want to predict (1-based index). E.g., 5 to predict 5th race using 1-4 for training.


ValueError: race_to_predict must be between 1 and 5