In [1]:
# Goal: predict lap times with an XGBoost model.
# TRAINING DATA: all data ingested from the 2024 season

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Gathering/Preprocessing

In [None]:
# Load the two source tables from CSV; both paths are relative to the
# notebook's working directory.
#   lap_data     -> lap-time rows
#   weather_data -> race-condition rows
lap_data = pd.read_csv(filepath_or_buffer="../data/lap_time_df.csv")
weather_data = pd.read_csv(filepath_or_buffer="../data/race_conditions_df.csv")

Unnamed: 0,race_id,year,round,track,driver,team,race_lap,stint,compound,tire_age,lap_time_sec,is_inlap,is_outlap,circuit_length(km)
0,2024_01_Bahrain_Grand_Prix,2024,1,Bahrain Grand Prix,VER,Red Bull Racing,1.0,1.0,SOFT,4.0,97.284,False,False,5.412
1,2024_01_Bahrain_Grand_Prix,2024,1,Bahrain Grand Prix,LEC,Ferrari,1.0,1.0,SOFT,4.0,98.271,False,False,5.412
2,2024_01_Bahrain_Grand_Prix,2024,1,Bahrain Grand Prix,RUS,Mercedes,1.0,1.0,SOFT,4.0,99.228,False,False,5.412
3,2024_01_Bahrain_Grand_Prix,2024,1,Bahrain Grand Prix,NOR,McLaren,1.0,1.0,SOFT,4.0,102.168,False,False,5.412
4,2024_01_Bahrain_Grand_Prix,2024,1,Bahrain Grand Prix,HAM,Mercedes,1.0,1.0,SOFT,4.0,103.122,False,False,5.412


In [None]:
# Attach the race conditions to every lap row (left join on race_id, so laps
# are kept even when no matching conditions row exists), then derive a 0/1
# integer rain flag from the `rain` column.
data = pd.merge(lap_data, weather_data, how="left", on="race_id").assign(
    is_rain=lambda merged: merged["rain"].astype(int)
)

# Categorical columns (compound, track)
# compound -> one-hot encoding, producing one compound_<NAME> column per value
compound_ohe = pd.get_dummies(data["compound"], prefix="compound")
final_df = pd.concat([data, compound_ohe], axis="columns")

# track -> target encoding (laps at a given track cluster around that track's typical lap time)
# Target encoding replaces a categorical column with a numerical value derived from the target for each category.
# To avoid data leakage, split train/test FIRST and compute the encoding from the training rows only.
# Example: track x always has lap times around 0-1 while track y always has lap times around 1-2.


In [17]:
# Inspect the column names of the merged, one-hot-encoded frame.
final_df.columns

Index(['race_id', 'year', 'round', 'track', 'driver', 'team', 'race_lap',
       'stint', 'compound', 'tire_age', 'lap_time_sec', 'is_inlap',
       'is_outlap', 'circuit_length(km)', 'track_temp', 'air_temp', 'rain',
       'is_rain', 'compound_HARD', 'compound_INTERMEDIATE', 'compound_MEDIUM',
       'compound_SOFT', 'compound_WET'],
      dtype='object')

In [None]:
# Train/test split used for the track target encoding.
# The split happens BEFORE encoding so the encoding cannot see test targets,
# and it is done on whole races rather than on rows: a row-level split
# (train_test_split(final_df, ...)) would place laps of the same race on both
# sides, so the model would be evaluated on laps whose race context it has
# already seen instead of on unseen races.
unique_races = final_df["race_id"].unique()
train_races, test_races = train_test_split(
    unique_races, test_size=0.2, random_state=44, shuffle=True
)

# Independent copies, so adding columns later does not touch final_df.
train_df = final_df.loc[final_df["race_id"].isin(train_races)].copy()
test_df = final_df.loc[final_df["race_id"].isin(test_races)].copy()


# kfold target encoding helper function
def kfold_target_encode(df, cat_col, target_col, n_splits=5, random_state=44):
    """Out-of-fold mean target encoding of a categorical column.

    The rows of ``df`` are split into ``n_splits`` shuffled folds. Each row is
    encoded with the mean of ``target_col`` for its ``cat_col`` value, computed
    only from the rows of the *other* folds, so a row's own target never
    contributes to its encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``cat_col`` and ``target_col``. It is not modified.
    cat_col : str
        Name of the categorical column to encode.
    target_col : str
        Name of the numeric target column.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, default 44
        Seed passed to ``KFold`` for the shuffled split.

    Returns
    -------
    pandas.Series
        Float series with the same index as ``df`` holding the encoded values.
        Rows whose category does not occur in the other folds receive the
        mean of ``target_col`` over the whole of ``df``.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Fallback value for categories that are missing from the fitting folds.
    overall_mean = df[target_col].mean()
    out_of_fold = pd.Series(index=df.index, dtype=float)

    for fit_pos, holdout_pos in splitter.split(df):
        # Per-category target means computed without the held-out rows.
        category_means = df.iloc[fit_pos].groupby(cat_col)[target_col].mean()
        holdout_categories = df.iloc[holdout_pos][cat_col]
        out_of_fold.iloc[holdout_pos] = holdout_categories.map(category_means)

    return out_of_fold.fillna(overall_mean)


# Out-of-fold target encoding of `track` for the training rows.
# The target column is `lap_time_sec` (as listed in final_df.columns).
train_df["track_te"] = kfold_target_encode(
    train_df,
    "track",
    "lap_time_sec"
)

# Encode the test rows using statistics from the training rows only, so no
# test lap time leaks into the feature.
track_means = train_df.groupby("track")["lap_time_sec"].mean()
global_mean = train_df["lap_time_sec"].mean()

# Tracks that never occur in the training rows fall back to the training mean.
# NOTE(review): train/test are split per race_id. If every track belongs to a
# single race_id (which the 2024-only sample rows suggest), then every test
# track is unseen here and track_te is the constant global_mean for the whole
# test set -- confirm, and consider encoding a feature that is shared across races.
test_df["track_te"] = (
    test_df["track"]
    .map(track_means)
    .fillna(global_mean)
)

# Drop the raw columns that now have an encoded replacement
# (rain -> is_rain, track -> track_te, compound -> compound_* dummies).
train_df = train_df.drop(columns=["rain", "track", "compound"])
test_df = test_df.drop(columns=["rain", "track", "compound"])

In [None]:
# NOTE: consecutive rows belong together (e.g. row 1 = lap 3 of Bahrain and
# row 2 = lap 4), so splitting rows at random would hurt the analysis; the
# train/test split is therefore made per race.

# Model inputs: lap/stint context, conditions, the target-encoded track and
# the one-hot compound columns.
FEATURE_COLUMNS = [
    "race_lap",
    "tire_age",
    "stint",
    "track_temp",
    "air_temp",
    "is_rain",
    "track_te",
    "compound_HARD",
    "compound_INTERMEDIATE",
    "compound_MEDIUM",
    "compound_SOFT",
    "compound_WET",
    "circuit_length(km)",
]

train_X = train_df[FEATURE_COLUMNS]
# Target column is `lap_time_sec` (as listed in final_df.columns).
train_y = train_df["lap_time_sec"]

## Data Visualizations

In [None]:
# plot tire age vs lap time (sec), colored by compound
# GOAL: degradation should still be visible in the plot (if there is none, then I over-encoded)

In [None]:
# pairplot of all X columns

## Model Building

## Model Optimization