***
# MAKE PREDICTIONS USING THE FASTF1 API
***

#### _Disclaimer_: We'll make extensive use of custom made Python functions (residing in the fastf1_helper.py file). Those functions will do the tedious work of loading the data from each race, aggregating columns (average tap times etc), and they're not strictly necessary to understand the ML workflow.

- #### Import the necessary libraries (all imports will be done here)

In [1]:
import fastf1
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
from fastf1_helper import get_race, get_season

- #### Create a path for cached data, so that it doesn't constantly re-download the same files

In [2]:
my_path = r'C:\Users\apost\miniconda3\envs\fastF1_cache'
fastf1.Cache.enable_cache(my_path)

- #### Load the seasons of 2022, 2023, 2024, 2024

In [3]:
stats_2022 = get_season(2022)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data 

In [4]:
stats_2023 = get_season(2023)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data

In [5]:
stats_2024 = get_season(2024)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data 

In [6]:
stats_2025 = get_season(2025)

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for 

In [7]:
stats_2022.isna(), stats_2023.isna().sum(), stats_2024.isna().sum(), stats_2025.isna().sum()

(     Driver  lapsCompleted   Team  CircuitName  avgLapTime_s  stdLapTime_s  \
 0     False          False  False        False         False         False   
 1     False          False  False        False         False         False   
 2     False          False  False        False         False         False   
 3     False          False  False        False         False         False   
 4     False          False  False        False         False         False   
 ..      ...            ...    ...          ...           ...           ...   
 433   False          False  False        False         False         False   
 434   False          False  False        False         False         False   
 435   False          False  False        False         False         False   
 436   False          False  False        False         False         False   
 437   False          False  False        False         False         False   
 
      GridPosition  Position  isDNF  raceID   Year

In [8]:
for col in ['avgLapTime_s', 'stdLapTime_s']:
    stats_2022[col] = stats_2022[col].fillna(stats_2022[col].median())
    stats_2023[col] = stats_2023[col].fillna(stats_2023[col].median())
    stats_2024[col] = stats_2024[col].fillna(stats_2024[col].median())
    stats_2025[col] = stats_2025[col].fillna(stats_2025[col].median())

- #### We now concat all the DataFrames:

In [9]:
# Use Pandas concat to make a big DataFrame with all stats:
all_stats = pd.concat([stats_2022, stats_2023, stats_2024, stats_2025], ignore_index = True)
# Sort the dataframe chronologically:
all_stats = all_stats.sort_values(by = ['Year', 'raceID']).reset_index(drop = True)

- #### We'll find the minimum average time per race, so we can normalize our values per driver

In [10]:
# Get the minimum average lap time per race
all_stats['perRaceMinAvgLapTime'] = all_stats.groupby(['Year', 'raceID'])['avgLapTime_s'].transform('min')
# Calculate the percentage difference for each entry, 
all_stats['avgLapTime_s_norm'] = (
    (all_stats['avgLapTime_s'] - all_stats['perRaceMinAvgLapTime']) / all_stats['perRaceMinAvgLapTime']
)
# Calculate the normalized standard deviation
all_stats['stdLapTime_s_norm'] = (
    all_stats['stdLapTime_s'] / all_stats['perRaceMinAvgLapTime']
)

In [11]:
# !!!!!!!!! CURRENT WINDOW SIZE !!!!!!!!!!
# !!!!!!!!! MAY NEED ADJUSTMENT !!!!!!!!!!
WINDOW_SIZE = 5

In [12]:
# Columns we want to calculate the historical mean for:
target_cols = ['avgLapTime_s_norm', 'stdLapTime_s_norm', 'GridPosition', 'Position']

for col in target_cols:

    # Group the data by driver:
    driver_stats = all_stats.groupby('Driver')[col]

    # Calculate the expanding mean up to the previous race:
    all_stats[f'Prev_Avg_{col}'] = driver_stats.transform(
        # Calculate the expanding mean. Shift one position to the left so that we do not include the current race
        # to the predictors table (X) and avoid data leakage. We use Pandas' .shift(1) for that.
        lambda x: x.expanding(min_periods = 1).mean().shift(1)
    )

    # Calculate the rolling mean (for a given window of races) up to the previous race:
    all_stats[f'Rolling_Prev_Avg_{col}'] = driver_stats.transform(
        # Calculate the rolling mean. Shift one position to the left so that we do not include the current race
        # to the predictors table (X) and avoid data leakage. We use Pandas' .shift(1) for that.
        lambda x: x.rolling(window = WINDOW_SIZE, min_periods = 1).mean().shift(1)
    )

# Drop potential new null values:
all_stats.dropna(
    subset = ['Prev_Avg_avgLapTime_s_norm', 'Prev_Avg_stdLapTime_s_norm', 'Prev_Avg_GridPosition', 'Prev_Avg_Position',
             'Rolling_Prev_Avg_avgLapTime_s_norm', 'Rolling_Prev_Avg_stdLapTime_s_norm',
              'Rolling_Prev_Avg_GridPosition', 'Rolling_Prev_Avg_Position'],
    inplace = True
)

- #### We'll do the same for the average (expanding and rolling) Final Position that a driver historically has on a circuit
  ##### We don't do it in our loop because we need to group them differently.

In [13]:
all_stats['Prev_Avg_Finish_Track'] = all_stats.groupby(['Driver', 'CircuitName'])['Position'].transform(
    lambda x: x.expanding(min_periods = 1).mean().shift(1)
)

all_stats['Rolling_Prev_Avg_Finish_Track'] = all_stats.groupby(['Driver', 'CircuitName'])['Position'].transform(
    lambda x: x.rolling(window = WINDOW_SIZE, min_periods = 1).mean().shift(1)
)

all_stats.dropna(
    subset = ['Prev_Avg_Finish_Track', 'Rolling_Prev_Avg_Finish_Track'],
    inplace = True
)

- #### Aggregate stats per Team:

In [14]:
# Find out how the team as a whole is doing in recent races when it comes to average lap times
all_stats['Rolling_Prev_Avg_TeamPace'] = all_stats.groupby('Team')['avgLapTime_s_norm'].transform(
    lambda x: x.rolling(window = WINDOW_SIZE, min_periods = 1).mean().shift(1)
)

# Create a "per_team" df for summed "Position"s
per_team_stats = all_stats.groupby(['Year', 'raceID', 'Team'])['Position'].mean().reset_index()
# Merge the new summed Positions (will rename them) with the original df
per_team_stats.rename(columns = {'Position': 'perRace_Team_Avg_Pos'}, inplace=True)
all_stats = all_stats.merge(
    per_team_stats[['Year', 'raceID', 'Team', 'perRace_Team_Avg_Pos']],
    on=['Year', 'raceID', 'Team'],
    how='left'
)

all_stats['Rolling_Prev_Avg_TeamFinalPos'] = all_stats.groupby('Team')['perRace_Team_Avg_Pos'].transform(
    lambda x: x.rolling(window = WINDOW_SIZE, min_periods = 1).mean().shift(1)
)

all_stats.dropna(
    subset = ['Rolling_Prev_Avg_TeamPace', 'Rolling_Prev_Avg_TeamFinalPos'],
    inplace = True
)

- #### Add a "bad result" column, to penalize drivers that consistently finish below 10th

In [15]:
all_stats['BadResult'] = (all_stats['Position'] > 10).astype(int)

- #### Add a rolling DNF status to penalize drivers that DNF a lot

In [16]:
all_stats['Rolling_Prev_DNF_Status'] = all_stats.groupby('Driver')['isDNF'].transform(
    lambda x: x.rolling(window = WINDOW_SIZE, min_periods = 1).sum().shift(1)
)
# Fill NA with zeros, aka not DNF
all_stats['Rolling_Prev_DNF_Status'] = all_stats['Rolling_Prev_DNF_Status'].fillna(0)

In [17]:
all_stats.columns

Index(['Driver', 'lapsCompleted', 'Team', 'CircuitName', 'avgLapTime_s',
       'stdLapTime_s', 'GridPosition', 'Position', 'isDNF', 'raceID', 'Year',
       'perRaceMinAvgLapTime', 'avgLapTime_s_norm', 'stdLapTime_s_norm',
       'Prev_Avg_avgLapTime_s_norm', 'Rolling_Prev_Avg_avgLapTime_s_norm',
       'Prev_Avg_stdLapTime_s_norm', 'Rolling_Prev_Avg_stdLapTime_s_norm',
       'Prev_Avg_GridPosition', 'Rolling_Prev_Avg_GridPosition',
       'Prev_Avg_Position', 'Rolling_Prev_Avg_Position',
       'Prev_Avg_Finish_Track', 'Rolling_Prev_Avg_Finish_Track',
       'Rolling_Prev_Avg_TeamPace', 'perRace_Team_Avg_Pos',
       'Rolling_Prev_Avg_TeamFinalPos', 'BadResult',
       'Rolling_Prev_DNF_Status'],
      dtype='object')

- #### We can now construct our X dataset, uring the columns of interest:
  #### ('GridPosition', 'Prev_Avg_Lap_s', 'Prev_Std_Lap_s', 'Prev_Avg_Grid', 'Prev_Avg_Position')

In [18]:
X_cols = ['Prev_Avg_GridPosition', 'Prev_Avg_Position', 'Prev_Avg_avgLapTime_s_norm', 'Prev_Avg_stdLapTime_s_norm',
         'Rolling_Prev_Avg_avgLapTime_s_norm', 'Rolling_Prev_Avg_stdLapTime_s_norm',
         'Rolling_Prev_Avg_GridPosition', 'Rolling_Prev_Avg_Position', 'Prev_Avg_Finish_Track', 'Rolling_Prev_Avg_Finish_Track',
         'Rolling_Prev_Avg_TeamPace', 'Rolling_Prev_Avg_TeamFinalPos', 'BadResult']
X = all_stats[X_cols].copy()
X.head()

Unnamed: 0,Prev_Avg_GridPosition,Prev_Avg_Position,Prev_Avg_avgLapTime_s_norm,Prev_Avg_stdLapTime_s_norm,Rolling_Prev_Avg_avgLapTime_s_norm,Rolling_Prev_Avg_stdLapTime_s_norm,Rolling_Prev_Avg_GridPosition,Rolling_Prev_Avg_Position,Prev_Avg_Finish_Track,Rolling_Prev_Avg_Finish_Track,Rolling_Prev_Avg_TeamPace,Rolling_Prev_Avg_TeamFinalPos,BadResult
7,13.130435,14.086957,0.062654,0.106296,0.046935,0.070294,14.6,15.0,9.0,9.0,0.049495,11.0,0
9,10.652174,9.26087,0.049156,0.112053,0.025523,0.090305,8.6,10.4,6.0,6.0,0.034864,8.5,0
11,5.478261,5.086957,0.03349,0.104527,0.009092,0.082539,3.8,4.4,5.0,5.0,0.01484,4.5,0
12,4.695652,7.695652,0.033115,0.096373,0.029591,0.081996,4.2,7.2,3.0,3.0,0.03181,6.5,0
13,13.956522,11.782609,0.048723,0.102614,0.022956,0.082438,12.4,11.4,13.0,13.0,0.006269,11.5,1


- #### Create some ID columns that will be used for identification purposes

In [19]:
ID_cols = ['Driver', 'Year', 'raceID']
ID = all_stats[ID_cols].copy() 

- #### And finally, our target variable

In [20]:
all_stats['Winner'] = (all_stats['Position'] == 1).astype(int) 
y = all_stats['Winner'].copy()

- #### We shouldn't shuffle our data, so we can choose as X_train the whole 2023 and 2023 seasons:

In [21]:
train_idx = np.where(ID['Year'] < 2025)[0]
test_idx = np.where(ID['Year'] == 2025)[0]
X_train, X_test = X.values[train_idx], X.values[test_idx]
y_train, y_test = y.values[train_idx], y.values[test_idx]

- #### Scale our data using Sklearn's StandardScaler:

In [22]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

- #### Create a simple logistic regression model:

In [23]:
lr = LogisticRegression(class_weight = 'balanced', random_state = 42)
lr.fit(X_train_sc, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,100


- #### We used class_weight as 'balanced' because our classes are not at all evenly distributed. As a matter of fact, our positive class is very rare.

In [24]:
y.value_counts()

Winner
0    926
1     59
Name: count, dtype: int64

In [25]:
lr.score(X_test_sc, y_test)

0.78125

In [26]:
lr.score(X_train_sc, y_train)

0.8641975308641975

- #### Our accuracy is a bit too high. This could be because the Negative ("0") class is too easy to predict. If you predict every single driver as a loser, you'll still have 1.184/1.247 = 0,901363 accuracy!!

In [27]:
y_pred = lr.predict(X_test_sc)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.77      0.87       238
           1       0.23      0.89      0.36        18

    accuracy                           0.78       256
   macro avg       0.61      0.83      0.62       256
weighted avg       0.94      0.78      0.83       256



- #### Move on to a __Random Forest__ model

Random Forests are sensitive to several parameters. That's why we'll do a Grid Search first.

In [28]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, 20, 50, 80, 150, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 5],
    'class_weight': ['balanced']
}

grid = GridSearchCV(RandomForestClassifier(), param_grid = param_grid, scoring = 'roc_auc', cv = 5)
grid.fit(X_train, y_train)
grid.best_params_

{'class_weight': 'balanced',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [29]:
rf = grid.best_estimator_
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,5
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
rf.score(X_test, y_test)

0.89453125

In [31]:
rf.score(X_train, y_train)

0.9519890260631001

In [32]:
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       238
           1       0.30      0.39      0.34        18

    accuracy                           0.89       256
   macro avg       0.63      0.66      0.64       256
weighted avg       0.91      0.89      0.90       256



In [33]:
neg_count = sum(y_train == 0)
pos_count = sum(y_train == 1)
scale_weight = neg_count / pos_count

xgbc = xgb.XGBClassifier(
    objective = 'binary:logistic',
    random_state = 42,
    eval_metric = 'logloss',
    scale_pos_weight = scale_weight
)

xgbc.fit(X_train_sc, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [34]:
y_pred_xgb = xgbc.predict(X_test_sc)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       238
           1       0.33      0.22      0.27        18

    accuracy                           0.91       256
   macro avg       0.64      0.59      0.61       256
weighted avg       0.90      0.91      0.91       256



- #### There's a possible overfitting. Let's try and tune our XGB Model

In [35]:
param_dist = {
    'learning_rate': stats.uniform(loc = 0.01, scale = 0.1),
    'n_estimators': stats.randint(100, 1000),
    'scale_pos_weight': [scale_weight, scale_weight * 1.1, scale_weight * 0.9],
    'max_depth': stats.randint(1, 20),
    'min_child_weight': stats.randint(3, 7),
    'gamma': stats.uniform(loc = 0, scale = 0.5)
}

random_search = RandomizedSearchCV(
    estimator = xgb.XGBClassifier(objective='binary:logistic', eval_metric = 'logloss', seed = 42),
    n_iter = 100, param_distributions = param_dist, cv = 5, scoring = 'roc_auc', verbose = 1, random_state = 42)

random_search.fit(X_train, y_train)
random_search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


{'gamma': np.float64(0.2469468575917173),
 'learning_rate': np.float64(0.027882270922132885),
 'max_depth': 8,
 'min_child_weight': 5,
 'n_estimators': 294,
 'scale_pos_weight': np.float64(16.78048780487805)}

In [36]:
better_xgbc = random_search.best_estimator_
better_xgbc.gamma = 0.05
better_xgbc.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [37]:
y_pred_better_xgb = better_xgbc.predict(X_test)
print(classification_report(y_test, y_pred_better_xgb))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       238
           1       0.47      0.50      0.49        18

    accuracy                           0.93       256
   macro avg       0.72      0.73      0.72       256
weighted avg       0.93      0.93      0.93       256

