In [4]:
from sklearn.metrics import accuracy_score, log_loss
import numpy as np

import fastf1
import pandas as pd
import datetime as dt
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# ── 0. cache ──────────────────────────────────────────────────
CACHE_DIR = Path("f1_cache")
CACHE_DIR.mkdir(exist_ok=True)
fastf1.Cache.enable_cache(CACHE_DIR)

# ── 1. helper: next race on the calendar ──────────────────────
def next_race_event(today: dt.date | None = None) -> dict:
    today = today or dt.date.today()
    for yr in (today.year, today.year + 1): # handle winter break
        sched = fastf1.get_event_schedule(yr, include_testing=False)
        future = sched[sched["EventDate"].dt.date > today]
        if not future.empty:
            row = future.iloc[0]
            return dict(
                year=yr,
                round=int(row["RoundNumber"]),
                name=row["EventName"],
                date=row["EventDate"].date(),
            )
    raise RuntimeError("No upcoming F1 race found")

def last_completed_race(today: dt.date | None = None) -> dict:
    """Return dict(year, round, name, date) for the most-recent GP that has run."""
    today = today or dt.date.today()
    # walk backwards through the calendar until we hit a race date < today
    for yr in (today.year, today.year - 1):
        sched = fastf1.get_event_schedule(yr, include_testing=False)
        past = sched[sched["EventDate"].dt.date < today]
        if not past.empty:
            row = past.iloc[-1]                      # last completed event
            return dict(
                year=yr,
                round=int(row["RoundNumber"]),
                name=row["EventName"],
                date=row["EventDate"].date(),
            )
    raise RuntimeError("Could not locate a completed race")

# Choose next, or previous race
# nr = next_race_event()
# print(f"→ Next race: {nr['name']}  on  {nr['date']}  (Round {nr['round']})")
nr = last_completed_race()
print(f"→ Predicting last race: {nr['name']}  ({nr['date']})")

# ── 2. collect race results up to (not incl.) next race ───────
rows: list[pd.DataFrame] = []
for yr in (2024, 2025):
    sched = fastf1.get_event_schedule(yr, include_testing=False)
    for rnd, ev_date in sched[["RoundNumber", "EventDate"]].itertuples(index=False):
        if ev_date.date() >= nr["date"]:
            break                                # stop at next GP
        sess = fastf1.get_session(yr, rnd, "R")
        sess.load()
        res = sess.results.assign(
            season=yr,
            round=rnd,
            circuit=sess.event["Location"],
            date=ev_date.date(),
        )
        for seg in ("Q1", "Q2", "Q3"):
            res[seg] = pd.to_timedelta(res[seg]).dt.total_seconds()
        res["bestQual"] = res[["Q1", "Q2", "Q3"]].min(axis=1)
        rows.append(res)

df = pd.concat(rows, ignore_index=True)
df["win"] = (df["Position"] == 1).astype(int)

feat_num = ["GridPosition", "bestQual"]
feat_cat = ["DriverId", "TeamId", "circuit"]

races = df.sort_values(["season", "round"]).groupby(["season", "round"])
model_hits, pole_hits = [], []

pre = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), feat_num),
        ("cat",
         Pipeline([
             ("imp", SimpleImputer(strategy="most_frequent")),
             ("enc", OneHotEncoder(handle_unknown="ignore")),
         ]),
         feat_cat),
    ]
)

model = Pipeline([("pre", pre), ("gb", GradientBoostingClassifier(random_state=0))])
model.fit(df[feat_num + feat_cat], df["win"])
print(f"Trained on {len(df)} driver-results (2024 → {nr['date'] - dt.timedelta(days=1)})")

for (yr, rnd), race_df in races:
    X_this = race_df[feat_num + feat_cat]
    y_this = race_df["win"]

    # train on everything *before* this race
    train_df = df[(df["season"] < yr) | ((df["season"] == yr) & (df["round"] < rnd))]
    if train_df.empty:                  # skip first race of 2024 (no train data yet)
        continue
    model.fit(train_df[feat_num + feat_cat], train_df["win"])

    proba = model.predict_proba(X_this)[:, 1]
    pred_winner = race_df.iloc[proba.argmax(), "Abbreviation"]
    real_winner = race_df.loc[y_this.idxmax(), "Abbreviation"]

    pole_winner = race_df.loc[race_df["GridPosition"] == 1, "Abbreviation"].iloc[0]

    model_hits.append(pred_winner == real_winner)
    pole_hits.append(pole_winner == real_winner)

print("Model hit-rate :", np.mean(model_hits))
print("P1  baseline  :", np.mean(pole_hits))


→ Predicting last race: Austrian Grand Prix  (2025-06-29)


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']
core           INFO 	Loading data for Saudi Arabian Grand Prix

Trained on 678 driver-results (2024 → 2025-06-28)


KeyError: np.int64(0)