In [1]:
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import numpy as np
import fastf1 as ff1
import fastf1.plotting

In [None]:
def gpData(year, country, session):
    """
    parameteres:
        year: int
        country: str
        session: str
    
    returns:
        race: DataFrame
    """

    session = ff1.get_session(year, country, session)
    session.load()
    laps = session.laps
    weather = laps.get_weather_data()
    race = pd.DataFrame({
        'LapNumber': laps['LapNumber'].values,
        'Position': laps['Position'].values,
        'Stint': laps['Stint'].values,
        'Compound': laps['Compound'].values,
        'TyreLife': laps['TyreLife'].values,
        'TrackTemp': weather['TrackTemp'].values,
        'Rainfall': weather['Rainfall'].values
    })

    race['RoundNumber'] = session.event.RoundNumber
    race['TotalLaps'] = laps['LapNumber'].max()

    pit_stop_condition = (
    (laps['PitInTime'].notna()) &
    (laps['TyreLife'] >= laps['TyreLife'].shift(-1)) &
    (laps['PitOutTime'].shift(-1).notna())
    )

    # series for when the car pits
    pitting = pd.Series(np.where(pit_stop_condition, 1, 0))


    return race, pitting

other features to add later for pit/stay out agent: distance_to_driver_ahead, vsc, sc, pit_time_cost, pit_position_cost


for pit + tyre suggestion: tyres available, 

Dataset will contain lap data from the Canadian GP, British GP, Italien GP, Bahrain GP,and Saudi Arabia GP from 2023 and 2024

In [251]:
canada23, canada23_target= gpData(2023, 'Canada', 'r')
canada24, canada24_target= gpData(2024, 'Canada', 'r')

british23, british23_target = gpData(2023, 'British', 'r')
british24, british24_target = gpData(2024, 'British', 'r')

italian23, italian23_target = gpData(2023, 'Italian', 'r')
italian24, italian24_target = gpData(2024, 'Italian', 'r')

bahrain23, bahrain23_target = gpData(2023, 'Bahrain', 'r')
bahrain24, bahrain24_target = gpData(2024, 'Bahrain', 'r')

saudi23, saudi23_target = gpData(2023, 'Saudi Arabia', 'r')
saudi24, saudi24_target = gpData(2024, 'Saudi Arabia', 'r')

core           INFO 	Loading data for Canadian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '14', '44', '16', '55', '11', '23', '31', '18', '77', '81', '10', '4', '22', '27', '24', '20', '21', '63', '2']
core           INFO 	Loading data for Canadian Grand Prix - 

In [146]:
print(f'canada 2023 pitstorps: {canada23_target.value_counts()}')
print(f'canada 2024 pitstorps: {canada24_target.value_counts()}')
print(f'british 2023 pitstorps: {british23_target.value_counts()}')
print(f'british 2024 pitstorps: {british24_target.value_counts()}')
print(f'italian 2023 pitstorps: {italian23_target.value_counts()}')
print(f'italian 2024 pitstorps: {italian24_target.value_counts()}')
print(f'bahrain 2023 pitstorps: {bahrain23_target.value_counts()}')
print(f'bahrain 2024 pitstorps: {bahrain24_target.value_counts()}')
print(f'saudi 2023 pitstorps: {saudi23_target.value_counts()}')
print(f'saudi 2024 pitstorps: {saudi24_target.value_counts()}')


canada 2023 pitstorps: 0    1285
1      32
Name: count, dtype: int64
canada 2024 pitstorps: 0    1230
1      42
Name: count, dtype: int64
british 2023 pitstorps: 0    948
1     23
Name: count, dtype: int64
british 2024 pitstorps: 0    916
1     45
Name: count, dtype: int64
italian 2023 pitstorps: 0    933
1     25
Name: count, dtype: int64
italian 2024 pitstorps: 0    978
1     30
Name: count, dtype: int64
bahrain 2023 pitstorps: 0    1007
1      49
Name: count, dtype: int64
bahrain 2024 pitstorps: 0    1086
1      43
Name: count, dtype: int64
saudi 2023 pitstorps: 0    919
1     24
Name: count, dtype: int64
saudi 2024 pitstorps: 0    882
1     19
Name: count, dtype: int64


After double checking the values with the offcial pitstop summary i found discrepencies for 3 races, in Canada GP 2023, British GP 2023 and Bahrain GP 2023. 

Now I ll have a look to see where they are coming rom and make the necesary changes.

This is not surprising as it is not unusual to be mistakes in the data.

In [226]:
session = ff1.get_session(2023, 'Canada', 'r')
session.load()
laps = session.laps

core           INFO 	Loading data for Canadian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '14', '44', '16', '55', '11', '23', '31', '18', '77', '81', '10', '4', '22', '27', '24', '20', '21', '63', '2']


In [None]:
pits = laps[['PitInTime', 'PitOutTime', 'LapNumber', 'Driver', 'TyreLife', 'Compound', 'Stint']]

pits.iloc[canada23_target[canada23_target == 1].index].sort_values(by=['LapNumber','PitInTime'])

Unnamed: 0,PitInTime,PitOutTime,LapNumber,Driver,TyreLife,Compound,Stint
565,0 days 01:03:32.868000,NaT,1.0,TSU,1.0,MEDIUM,1.0
79,0 days 01:15:36.573000,NaT,10.0,GAS,10.0,SOFT,1.0
784,0 days 01:16:51.275000,NaT,11.0,HUL,11.0,MEDIUM,1.0
360,0 days 01:16:54.275000,NaT,11.0,STR,11.0,MEDIUM,1.0
506,0 days 01:16:56.447000,NaT,11.0,DEV,11.0,MEDIUM,1.0
11,0 days 01:17:50.274000,NaT,12.0,VER,12.0,MEDIUM,1.0
994,0 days 01:18:00.165000,NaT,12.0,HAM,12.0,MEDIUM,1.0
221,0 days 01:18:00.977000,NaT,12.0,ALO,12.0,MEDIUM,1.0
854,0 days 01:18:13.164000,NaT,12.0,OCO,12.0,MEDIUM,1.0
1258,0 days 01:18:16.805000,NaT,12.0,PIA,12.0,MEDIUM,1.0


We are missing Tsunoda's pit stop on lap 34.

I ll look to see why and make modiffications accordingly.

In [None]:
col_to_drop =['Driver','Sector1Time','Sector2Time','Sector3Time', 'IsPersonalBest', 'DriverNumber', 'Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'LapStartTime', 'LapStartDate', 'Position', 'DeletedReason', 'FastF1Generated', 'IsAccurate']

In [None]:
driver = laps.pick_drivers("TSU").drop(columns= col_to_drop)

driver[driver['LapNumber']>=34].head()

Unnamed: 0,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Compound,TyreLife,FreshTyre,Team,TrackStatus,Deleted
475,0 days 00:02:07.037000,34.0,3.0,NaT,NaT,SOFT,2.0,True,AlphaTauri,4,False
476,0 days 00:01:54.219000,35.0,3.0,NaT,NaT,SOFT,3.0,True,AlphaTauri,4,False
477,0 days 00:02:24.840000,36.0,3.0,NaT,NaT,SOFT,4.0,True,AlphaTauri,4,False
478,0 days 00:02:23.518000,37.0,3.0,NaT,NaT,SOFT,5.0,True,AlphaTauri,4,False
479,0 days 00:02:10.750000,38.0,3.0,NaT,NaT,SOFT,6.0,True,AlphaTauri,41,False


Ok seems that the problem is that the TyreLife was not reseted after he tyres were changed, so that will habe to be fixed.

After looking at the whole column, another mistake was found after the first pit stop so that was will also be adjusted.

The Stint column also have the wrong values in some parts so taht will be cahnged too.

The last 2 mistake happend in the same place after the first pitstop.

In [None]:
canada23.loc[567:598, 'Stint'] = 2
canada23.loc[567:598, 'TyreLife'] += 1
canada23.loc[599:634, 'TyreLife'] = range(1, 37)
canada23_target.iloc[598] = 1

In [238]:
canada23.iloc[598:605]

Unnamed: 0,LapNumber,Position,Stint,Compound,TyreLife,TrackTemp,Rainfall,RoundNumber,TotalLaps
598,34.0,17.0,2,HARD,33.0,31.3,0,8,70.0
599,35.0,16.0,3,HARD,1.0,32.3,0,8,70.0
600,36.0,16.0,3,HARD,2.0,32.3,0,8,70.0
601,37.0,16.0,3,HARD,3.0,32.3,0,8,70.0
602,38.0,16.0,3,HARD,4.0,32.3,0,8,70.0
603,39.0,16.0,3,HARD,5.0,31.6,0,8,70.0
604,40.0,16.0,3,HARD,6.0,31.6,0,8,70.0


In [None]:
session = ff1.get_session(2023, 'British', 'r')
session.load()
laps = session.laps
pits = laps[['PitInTime', 'PitOutTime', 'LapNumber', 'Driver', 'TyreLife', 'Compound', 'Stint']]


core           INFO 	Loading data for British Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '44', '81', '63', '11', '14', '23', '16', '55', '2', '77', '27', '18', '24', '22', '21', '10', '20', '31']


In [235]:
pits.iloc[british23_target[british23_target==1].index].sort_values(by=['LapNumber','PitInTime'])


Unnamed: 0,PitInTime,PitOutTime,LapNumber,Driver,TyreLife,Compound,Stint
604,0 days 01:13:16.312000,NaT,7.0,HUL,7.0,HARD,1.0
455,0 days 01:24:14.879000,NaT,14.0,TSU,14.0,SOFT,1.0
219,0 days 01:30:08.076000,NaT,18.0,LEC,18.0,MEDIUM,1.0
569,0 days 01:39:56.003000,NaT,24.0,ZHO,24.0,MEDIUM,1.0
788,0 days 01:42:35.441000,NaT,26.0,SAI,26.0,MEDIUM,1.0
416,0 days 01:44:33.924000,NaT,27.0,DEV,27.0,SOFT,1.0
842,0 days 01:45:37.282000,NaT,28.0,RUS,31.0,SOFT,1.0
125,0 days 01:45:49.766000,NaT,28.0,PER,28.0,MEDIUM,1.0
947,0 days 01:47:04.827000,NaT,29.0,PIA,29.0,MEDIUM,1.0
334,0 days 01:47:37.014000,NaT,29.0,SAR,29.0,MEDIUM,1.0


In [240]:
driver = laps.pick_drivers("ZHO").drop(columns=col_to_drop)
driver[driver['LapNumber']>=36].head()


Unnamed: 0,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Compound,TyreLife,FreshTyre,Team,TrackStatus,Deleted
581,0 days 00:02:20.527000,36.0,3.0,NaT,0 days 02:01:38.499000,SOFT,3.0,True,Alfa Romeo,4,False
582,0 days 00:02:28.721000,37.0,4.0,0 days 02:02:12.858000,NaT,SOFT,4.0,False,Alfa Romeo,4,False
583,0 days 00:02:07.914000,38.0,4.0,NaT,NaT,SOFT,5.0,False,Alfa Romeo,41,False
584,0 days 00:01:33.188000,39.0,4.0,NaT,NaT,SOFT,6.0,False,Alfa Romeo,1,False
585,0 days 00:01:33.310000,40.0,4.0,NaT,NaT,SOFT,7.0,False,Alfa Romeo,1,False


From this we are missing ZHO' s last pit stop which was not a tyre change was a brack check, which is not relevenat for the task at hand so no changes will be made

In [241]:
session = ff1.get_session(2023, 'bahrain', 'r')
session.load()
laps = session.laps
pits = laps[['PitInTime', 'PitOutTime', 'LapNumber', 'Driver', 'TyreLife', 'Compound', 'Stint']]
pits.iloc[bahrain23_target[bahrain23_target==1].index].sort_values(by=['LapNumber','PitInTime'])

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']


Unnamed: 0,PitInTime,PitOutTime,LapNumber,Driver,TyreLife,Compound,Stint
65,0 days 01:17:55.198000,NaT,9.0,GAS,9.0,SOFT,1.0
769,0 days 01:19:30.368000,NaT,10.0,NOR,10.0,SOFT,1.0
502,0 days 01:19:32.587000,NaT,10.0,TSU,10.0,SOFT,1.0
996,0 days 01:21:04.976000,NaT,11.0,BOT,11.0,SOFT,1.0
560,0 days 01:21:10.913000,NaT,11.0,ALB,11.0,SOFT,1.0
673,0 days 01:21:19.757000,NaT,11.0,HUL,14.0,SOFT,1.0
447,0 days 01:21:20.585000,NaT,11.0,DEV,11.0,SOFT,1.0
826,0 days 01:22:38.865000,NaT,12.0,HAM,15.0,SOFT,1.0
730,0 days 01:22:49.224000,NaT,12.0,OCO,15.0,SOFT,1.0
336,0 days 01:22:54.364000,NaT,12.0,SAR,12.0,SOFT,1.0


In [243]:
driver = laps.pick_drivers("OCO").drop(columns=col_to_drop)
driver[driver['LapNumber']>=10].head(10)

Unnamed: 0,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Compound,TyreLife,FreshTyre,Team,TrackStatus,Deleted
728,0 days 00:01:40.858000,10.0,1.0,NaT,NaT,SOFT,13.0,False,Alpine,1,False
729,0 days 00:01:41.035000,11.0,1.0,NaT,NaT,SOFT,14.0,False,Alpine,1,False
730,0 days 00:01:43.787000,12.0,1.0,NaT,0 days 01:22:49.224000,SOFT,15.0,False,Alpine,1,False
731,0 days 00:02:00.381000,13.0,2.0,0 days 01:23:14.208000,NaT,HARD,1.0,True,Alpine,1,False
732,0 days 00:01:38.649000,14.0,2.0,NaT,NaT,HARD,2.0,True,Alpine,1,False
733,0 days 00:01:41.863000,15.0,2.0,NaT,0 days 01:28:10.124000,HARD,3.0,True,Alpine,1,False
734,0 days 00:02:15.708000,16.0,3.0,0 days 01:28:51.608000,NaT,HARD,4.0,False,Alpine,1,False
735,0 days 00:01:38.780000,17.0,3.0,NaT,NaT,HARD,5.0,False,Alpine,1,False
736,0 days 00:01:38.886000,18.0,3.0,NaT,NaT,HARD,6.0,False,Alpine,1,False
737,0 days 00:01:38.817000,19.0,3.0,NaT,NaT,HARD,7.0,False,Alpine,1,False


OCO' s pitstop in lap 15 is missing, but it s afront wing change so it's not relevant to the project so no change will be made.

In [267]:
lapData = pd.concat([canada23, canada24, british23, british24, italian23, italian24, bahrain23, bahrain24, saudi23, saudi24], ignore_index=True)
target = pd.concat([canada23_target, canada24_target, british23_target, british24_target, italian23_target, italian24_target, bahrain23_target, bahrain24_target, saudi23_target, saudi24_target], ignore_index=True)

In [268]:
# make bools into ints
lapData['Rainfall'] = lapData['Rainfall'].astype(int)

# encode compound types
lapData['Compound'] = lapData['Compound'].astype('category').cat.codes

# make column category types
lapData['Position'] = lapData['Position'].astype('category')
lapData['Stint'] = lapData['Stint'].astype('category')
lapData['Compound'] = lapData['Compound'].astype('category')
lapData['Rainfall'] = lapData['Rainfall'].astype('category')
lapData['RoundNumber'] = lapData['RoundNumber'].astype('category')
lapData.dtypes

LapNumber       float64
Position       category
Stint          category
Compound       category
TyreLife        float64
TrackTemp       float64
Rainfall       category
RoundNumber    category
TotalLaps       float64
dtype: object

In [269]:
lapData.shape, target.shape

((10516, 9), (10516,))

In [270]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize model
model = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation
scores = cross_val_score(model, lapData, target, cv=5)

# Print the average score
print("Average R² score:", scores.mean())

Average R² score: 0.9488379356667733


In [272]:
model = RandomForestClassifier(random_state=42)

# split data intro training and testing
from sklearn.model_selection import train_test_split
lapData_train, lapData_test, target_train, target_test = train_test_split(lapData, target, test_size=0.2, random_state=42)

# Fit model
model.fit(lapData_train, target_train)

# Evaluate model
from sklearn.metrics import accuracy_score, classification_report
predictions = model.predict(lapData_test)
accuracy = accuracy_score(target_test, predictions)
print("Accuracy: ", accuracy)

importances = model.feature_importances_
print("Fearure importance: ",importances)

Accuracy:  0.9667300380228137
Fearure importance:  [0.18305769 0.27547053 0.02574537 0.04741302 0.18368626 0.19740162
 0.0069101  0.04630907 0.03400633]
