# Model to predict laptime from car and telemetry data

### Relevant imports

Using pandas and numpy to build and operate the dataframe, sklearn to encode features, XGBoost to create gradient-boosted DT ensemble, and pickle to load/store model

In [102]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import pickle
import os
from tabulate import tabulate
import seaborn as sns

### Configuration variables

To change year and track range, simply modify the years and tracks variables. Tracks can be found on the FastF1 API, or just use the full name of a Grand Prix from Wikipedia. Currently excludes 2025 data as it represents an incomplete season.

In [79]:
YEARS = list(range(2018, 2025))
TRACKS = [
    "Monza", "Silverstone", "Spa", "Baku", "Monaco", "Suzuka", "Interlagos", "Red Bull Ring", "Hungaroring", "Singapore", "Zandvoort"
]
SESSION_CODE = "R"  
MODEL_PATH = "models/laptime_xgb_model.pkl"
PIPELINE_PATH = "models/laptime_preprocessing_pipeline.pkl"

### Build an example of a modified DF for a single datapoint (Monaco 2023)

Load the session

In [80]:
fastf1.Cache.enable_cache('../cache')
session = fastf1.get_session(2023, "Monza", SESSION_CODE)
session.load()

core           INFO 	Loading data for Italian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '44', '23', '4', '14', '77', '40', '81', '2', '24', '10', '18', '27', '20', '31', '22']


Set up the OneHotEncoder

In [81]:
encoder = OneHotEncoder(handle_unknown = "ignore", sparse_output = False)

Extract laps and weather data and combine the tables:

In [82]:
drivers = session.drivers

laps = session.laps.reset_index(drop=True)
weather_data = session.laps.get_weather_data().reset_index(drop=True)

# Join w/ weather data, excluding time
laps = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis = 1)

# Starting lap for a new stint after pit
laps["StintStartLap"] = laps.groupby("Stint")["LapNumber"].transform("min")
# Race age of current tire (excl. usage in previous sessions)
laps["TireAge"] = laps["LapNumber"] - laps["StintStartLap"]
# Lap time in seconds
laps["LapTimeSeconds"] = laps["LapTime"].dt.total_seconds()


Extracting relevant features/label

In [83]:
laps = laps[["LapNumber", "Stint", "TireAge", "Compound", "TrackStatus", "AirTemp", "TrackTemp", "Rainfall", "Driver", "Team", "LapTimeSeconds"]]
laps.head(5)

Unnamed: 0,LapNumber,Stint,TireAge,Compound,TrackStatus,AirTemp,TrackTemp,Rainfall,Driver,Team,LapTimeSeconds
0,1.0,1.0,0.0,MEDIUM,1,29.2,42.5,False,VER,Red Bull Racing,87.905
1,2.0,1.0,1.0,MEDIUM,1,29.2,43.1,False,VER,Red Bull Racing,86.163
2,3.0,1.0,2.0,MEDIUM,1,29.3,43.2,False,VER,Red Bull Racing,85.702
3,4.0,1.0,3.0,MEDIUM,1,29.5,43.1,False,VER,Red Bull Racing,86.05
4,5.0,1.0,4.0,MEDIUM,1,29.5,43.1,False,VER,Red Bull Racing,86.176


### Extracting lap data for the entire range of data, defined by config variables

Same logic as above, just for all years and drivers

In [None]:
all_laps = []
for year in YEARS:
    for event in TRACKS:
        try:
            session = fastf1.get_session(year, event, SESSION_CODE)
            session.load()
            laps = session.laps.reset_index(drop=True)
            weather_data = session.laps.get_weather_data().reset_index(drop=True)

            # Join w/ weather data, excluding time
            laps = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis = 1)

            # Starting lap for a new stint after pit
            laps["StintStartLap"] = laps.groupby("Stint")["LapNumber"].transform("min")
            # Race age of current tire (excl. usage in previous sessions)
            laps["TireAge"] = laps["LapNumber"] - laps["StintStartLap"]
            # Lap time in seconds
            laps["LapTimeSeconds"] = laps["LapTime"].dt.total_seconds()
            laps = laps[["LapNumber", "Stint", "TireAge", "Compound", "TrackStatus", 
                                "AirTemp", "TrackTemp", "Rainfall", "Driver", "Team", "LapTimeSeconds"]].copy()
            laps["Year"] = year
            laps["Track"] = event
            all_laps.append(laps)
            print(f"Loaded {event} {year} ({len(laps)} laps)")
        except Exception as e:
            print(f"Skipping {event} {year}: {e}")
if not all_laps:
    raise RuntimeError("No lap data extracted. Check your years/tracks list or FastF1 setup.")
df = pd.concat(all_laps, ignore_index=True)


### Preprocessing the data

Drop any missing data (no reasonable estimate) or fill with mean value , define features/target, and perform test/train split

In [122]:
# drop any missing lap time values
df = df.dropna(subset = ["LapTimeSeconds"])
df = df.dropna()
df = df.reset_index(drop=True) 

# convert rainfall boolean to integer 1/0
df["Rainfall"] = df["Rainfall"].astype(int)
    

Define features/label

In [123]:
X = df[[
        "Compound", "TireAge", "LapNumber", "Stint", "TrackStatus",
        "AirTemp", "TrackTemp", "Driver", "Team"
    ]]
y = df["LapTimeSeconds"]

Categorical/numerical preprocessing

In [124]:
categorical = ["Compound", "TrackStatus", "Driver", "Team"]
numerical = ["TireAge", "LapNumber", "Stint", "AirTemp", "TrackTemp", "Rainfall"]

preprocessor = ColumnTransformer([
        ("cat", encoder, categorical),
        ("num", SimpleImputer(strategy="mean"), numerical)
    ])

# add "missing" to any missing categorical data
for col in categorical:
    X[col] = X[col].fillna("missing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna("missing")


Train/test/val split

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.111111, random_state=1) # 0.11 x 0.9 = 0.1