***
# MAKE PREDICTIONS USING THE FASTF1 API
***

#### _Disclaimer_: We'll make extensive use of custom-made Python functions (residing in the fastf1_helper.py file). Those functions do the tedious work of loading the data from each race and aggregating columns (average lap times etc.), and they're not strictly necessary to understand the ML workflow.

- #### Import the necessary libraries (all imports will be done here)

In [1]:
import fastf1
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from fastf1_helper import get_race, get_season

- #### Create a path for cached data, so that it doesn't constantly re-download the same files

In [2]:
# Directory where FastF1 keeps its cached session data.
# NOTE(review): this is a machine-specific absolute path - point it to a folder on your own machine.
my_path = r'C:\Users\apost\miniconda3\envs\fastF1_cache'
# Turn on the on-disk cache so repeated runs reuse the files instead of re-downloading them:
fastf1.Cache.enable_cache(my_path)

- #### Load the seasons of 2023, 2024 and 2025

In [3]:
# Load and aggregate the race data of the 2023 season (helper from fastf1_helper.py):
stats_2023 = get_season(2023)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data

In [4]:
# Load and aggregate the race data of the 2024 season (helper from fastf1_helper.py):
stats_2024 = get_season(2024)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data 

In [5]:
# Load and aggregate the race data of the 2025 season (helper from fastf1_helper.py):
stats_2025 = get_season(2025)

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for 

In [6]:
# Number of missing values per column, one Series per season (2023, 2024, 2025):
tuple(season.isna().sum() for season in (stats_2023, stats_2024, stats_2025))

(Driver            0
 lapsCompleted     0
 avgLapTime_s      8
 stdLapTime_s     10
 GridPosition      0
 Position          0
 raceID            0
 Year              0
 dtype: int64,
 Driver            0
 lapsCompleted     0
 avgLapTime_s     13
 stdLapTime_s     15
 GridPosition      0
 Position          0
 raceID            0
 Year              0
 dtype: int64,
 Driver            0
 lapsCompleted     0
 avgLapTime_s     12
 stdLapTime_s     13
 GridPosition      0
 Position          0
 raceID            0
 Year              0
 dtype: int64)

In [7]:
# Replace the missing lap-time statistics with the median of the same column
# within the same season (each season is imputed independently).
# NOTE(review): the 2025 season is later used as the test set and is imputed with
# its own median here - confirm this is acceptable for the evaluation.
for season_stats in (stats_2023, stats_2024, stats_2025):
    for col in ('avgLapTime_s', 'stdLapTime_s'):
        season_median = season_stats[col].median()
        season_stats[col] = season_stats[col].fillna(season_median)

- #### We now concat all the DataFrames:

In [8]:
# Stack the three seasons into one big DataFrame, order the rows chronologically
# (season first, then race) and renumber the index from 0.
# NOTE(review): assumes raceID increases with the race order inside a season - TODO confirm.
all_stats = (
    pd.concat([stats_2023, stats_2024, stats_2025], ignore_index = True)
    .sort_values(by = ['Year', 'raceID'])
    .reset_index(drop = True)
)

In [9]:
# Columns for which we compute each driver's historical (pre-race) mean:
target_cols = ['avgLapTime_s', 'stdLapTime_s', 'GridPosition', 'Position']

for col in target_cols:

    # Group the data by driver (all_stats is already sorted chronologically):
    driver_stats = all_stats.groupby('Driver')[col]

    # Expanding mean over the driver's races so far. Pandas' .shift(1) then moves
    # every value one row down, so each row only contains the mean of the races
    # *before* the current one. This keeps the current race out of the predictors
    # table (X) and avoids data leakage. A driver's first race has no history,
    # so it ends up as NaN.
    all_stats[f'Prev_Avg_{col}'] = driver_stats.transform(
        lambda x: x.expanding(min_periods = 1).mean().shift(1)
    )

# Rename the columns for simplicity ('Prev_Avg_Position' keeps its name):
all_stats.rename(columns={
    'Prev_Avg_GridPosition': 'Prev_Avg_Grid',
    'Prev_Avg_avgLapTime_s': 'Prev_Avg_Lap_s',
    'Prev_Std_stdLapTime_s': 'Prev_Std_Lap_s'
    } if False else {
    'Prev_Avg_GridPosition': 'Prev_Avg_Grid',
    'Prev_Avg_avgLapTime_s': 'Prev_Avg_Lap_s',
    'Prev_Avg_stdLapTime_s': 'Prev_Std_Lap_s'
    }, inplace = True)

# Drop the rows without history (new null values created by the shift).
# Every engineered feature that is later used in X is checked, including
# 'Prev_Avg_Position', so no NaN can reach the model.
all_stats.dropna(
    subset = ['Prev_Avg_Grid', 'Prev_Avg_Lap_s', 'Prev_Std_Lap_s', 'Prev_Avg_Position'],
    inplace = True
)

In [10]:
# Inspect the available columns after adding the historical-average features:
all_stats.columns

Index(['Driver', 'lapsCompleted', 'avgLapTime_s', 'stdLapTime_s',
       'GridPosition', 'Position', 'raceID', 'Year', 'Prev_Avg_Lap_s',
       'Prev_Std_Lap_s', 'Prev_Avg_Grid', 'Prev_Avg_Position'],
      dtype='object')

- #### We can now construct our X dataset, using the columns of interest:
  #### ('GridPosition', 'Prev_Avg_Lap_s', 'Prev_Std_Lap_s', 'Prev_Avg_Grid', 'Prev_Avg_Position')

In [11]:
# Predictors: the starting grid slot of the current race plus the driver's
# historical averages (all computed from previous races only).
X_cols = ['GridPosition', 'Prev_Avg_Lap_s', 'Prev_Std_Lap_s', 'Prev_Avg_Grid', 'Prev_Avg_Position']
# Work on a copy so later changes to X do not touch all_stats:
X = all_stats[X_cols].copy()
# Preview the first rows:
X.head()

Unnamed: 0,GridPosition,Prev_Avg_Lap_s,Prev_Std_Lap_s,Prev_Avg_Grid,Prev_Avg_Position
20,17.0,100.465088,5.821969,15.0,10.0
21,2.0,99.567947,5.448373,5.0,3.0
22,14.0,100.164614,4.719029,12.0,8.0
23,18.0,101.058196,5.017134,19.0,14.0
24,9.0,100.184018,6.135426,20.0,9.0


- #### Create some ID columns that will be used for identification purposes

In [12]:
# Identifier columns, kept apart from the features. They are taken from the same
# rows of all_stats as X, so ID and X are aligned row by row.
ID_cols = ['Driver', 'Year', 'raceID']
ID = all_stats[ID_cols].copy() 

- #### And finally, our target variable

In [13]:
# Binary target: 1 when the driver finished the race in first position, 0 otherwise.
is_winner = all_stats['Position'] == 1
all_stats['Winner'] = is_winner.astype(int)
# Independent copy of the target column:
y = all_stats['Winner'].copy()

- #### We shouldn't shuffle our data, so we can choose as X_train the whole 2023 and 2024 seasons (2025 becomes the test set):

In [14]:
# Time-based split (no shuffling): seasons before 2025 are used for training,
# the 2025 season is held out for testing.
# The indices are row positions, which is why they are applied to the NumPy arrays.
train_idx = np.flatnonzero(ID['Year'] < 2025)
test_idx = np.flatnonzero(ID['Year'] == 2025)

X_arr, y_arr = X.values, y.values
X_train, y_train = X_arr[train_idx], y_arr[train_idx]
X_test, y_test = X_arr[test_idx], y_arr[test_idx]

- #### Create a simple logistic regression model:

In [15]:
# Logistic regression classifier. class_weight = 'balanced' is used because the
# positive (winner) class is very rare; random_state is fixed for reproducibility.
lr = LogisticRegression(class_weight = 'balanced', random_state = 42)
# Fit on the training seasons (rows with Year < 2025):
lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,100


- #### We used class_weight as 'balanced' because our classes are not at all evenly distributed. As a matter of fact, our positive class is very rare.

In [16]:
# Class balance of the target (0 = did not win, 1 = winner):
y.value_counts()

Winner
0    1184
1      63
Name: count, dtype: int64

In [17]:
# Accuracy on the held-out 2025 season:
lr.score(X_test, y_test)

0.8174157303370787

In [18]:
# Accuracy on the training seasons (for comparison with the test accuracy):
lr.score(X_train, y_train)

0.877665544332211

- #### Our accuracy looks high, but accuracy is misleading here because the Negative ("0") class is very easy to predict. If you predict every single driver as a loser, you'll still get 1184/1247 ≈ 0.9495 accuracy!!

In [19]:
# Predict the class of every row of the 2025 test set:
y_pred = lr.predict(X_test)
# Per-class precision, recall and F1 - more informative than accuracy for such a rare positive class:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.81      0.89       338
           1       0.22      1.00      0.36        18

    accuracy                           0.82       356
   macro avg       0.61      0.90      0.63       356
weighted avg       0.96      0.82      0.87       356

