We will approach each problem like a problem statement.

We are going to try to predict the race times based on last year's conditions and results.

In [1]:
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import plotly.express as px
import plotly.graph_objects as go

In [2]:
import fastf1


cache_path = "../f1_cache"

# Create the cache directory when it is missing. `exist_ok=True` does this in
# one call instead of a separate exists-check followed by a create.
os.makedirs(cache_path, exist_ok=True)

# Enable FastF1 caching in that directory
fastf1.Cache.enable_cache(cache_path)



In [3]:
# Fetch and load the race ('R') session of the Australian GP for 2023 and 2024,
# one season at a time.
race_sessions = {}
for season in (2023, 2024):
    race_sessions[season] = fastf1.get_session(season, 'Australian GP', 'R')
    race_sessions[season].load()

australian_gp_2023 = race_sessions[2023]
australian_gp_2024 = race_sessions[2024]

core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '14', '18', '11', '4', '27', '81', '24', '22', '77', '55', '10', '31', '21', '2', '20', '63', '23', '16']
core           INFO 	Loading data for Australian Grand Pri

In [4]:
# Session-wide weather summary: one mean per metric, computed separately per year.
weather_metrics = ("AirTemp", "Humidity", "WindSpeed", "Rainfall")

weather_data_2023 = australian_gp_2023.weather_data
weather_data_2024 = australian_gp_2024.weather_data

# The unpacking order follows the order of `weather_metrics`.
avg_temp_2023, avg_humidity_2023, avg_wind_speed_2023, avg_rain_2023 = (
    weather_data_2023[metric].mean() for metric in weather_metrics
)
avg_temp_2024, avg_humidity_2024, avg_wind_speed_2024, avg_rain_2024 = (
    weather_data_2024[metric].mean() for metric in weather_metrics
)

In [5]:
# Lap-level columns taken from each race session
feature_columns = [
    "Time",
    "Driver",
    "LapTime",
    "Sector1Time",
    "Sector2Time",
    "Sector3Time",
    "Compound",
    "TrackStatus",
]
# Columns that hold durations and get converted to seconds below
timing_columns = ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]

# Per year: pick the columns, tag the season, drop laps with any missing value,
# then attach that session's average weather as constant columns.
laps_2023 = (
    australian_gp_2023.laps[feature_columns]
    .copy()
    .assign(Year=2023)
    .dropna()
    .assign(
        AirTemp=avg_temp_2023,
        Humidity=avg_humidity_2023,
        WindSpeed=avg_wind_speed_2023,
        Rainfall=avg_rain_2023,
    )
)
laps_2024 = (
    australian_gp_2024.laps[feature_columns]
    .copy()
    .assign(Year=2024)
    .dropna()
    .assign(
        AirTemp=avg_temp_2024,
        Humidity=avg_humidity_2024,
        WindSpeed=avg_wind_speed_2024,
        Rainfall=avg_rain_2024,
    )
)

# Replace the timedelta values of the timing columns with float seconds
for yearly_laps in (laps_2023, laps_2024):
    for timing_col in timing_columns:
        yearly_laps[timing_col] = yearly_laps[timing_col].dt.total_seconds()

# Stack both seasons under a fresh 0..n-1 index and fail loudly if nothing is left
laps_combined = pd.concat([laps_2023, laps_2024], ignore_index=True)
if len(laps_combined) == 0:
    raise ValueError("No laps data available after filtering.")
laps_combined.head()

Unnamed: 0,Time,Driver,LapTime,Sector1Time,Sector2Time,Sector3Time,Compound,TrackStatus,Year,AirTemp,Humidity,WindSpeed,Rainfall
0,0 days 01:06:10.776000,VER,132.105,48.846,31.614,51.645,MEDIUM,4,2023,17.44955,54.157658,1.127027,0.0
1,0 days 01:08:21.433000,VER,130.657,46.836,29.178,54.643,MEDIUM,41,2023,17.44955,54.157658,1.127027,0.0
2,0 days 01:09:44.824000,VER,83.391,28.9,18.326,36.165,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0
3,0 days 01:11:07.928000,VER,83.104,28.935,18.347,35.822,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0
4,0 days 01:12:30.771000,VER,82.843,28.986,17.951,35.906,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0


In [6]:
# Frequency of each TrackStatus value and each Compound value in the combined laps
display(laps_combined["TrackStatus"].value_counts())
laps_combined["Compound"].value_counts()

TrackStatus
1       1677
671       24
4         21
124       21
41        19
126       19
12        18
16        17
14        12
6         10
1267       9
71         4
26         4
214        1
125        1
2671       1
Name: count, dtype: int64

Compound
HARD      1527
MEDIUM     301
SOFT        30
Name: count, dtype: int64

In [7]:
# Try to add the 2025 Australian GP laps to the combined data set.
# NOTE(review): this requests the "Q" session, whereas the 2023/2024 sessions
# were requested with 'R', so the combined frame mixes both session types.
# NOTE(review): confirm that the exception classes named below exist in the
# installed fastf1 version; an except clause naming a missing attribute would
# itself raise AttributeError while the original error is being handled.
try:
    australian_gp_2025 = fastf1.get_session(2025, "Australian GP", "Q")
    australian_gp_2025.load()
    laps_2025 = australian_gp_2025.laps[feature_columns].copy()
    laps_2025 = laps_2025.dropna()
    laps_2025["Year"] = 2025

    # Attach the session-average weather as constant columns
    weather_data_2025 = australian_gp_2025.weather_data
    laps_2025["AirTemp"] = weather_data_2025["AirTemp"].mean()
    laps_2025["Humidity"] = weather_data_2025["Humidity"].mean()
    laps_2025["WindSpeed"] = weather_data_2025["WindSpeed"].mean()
    laps_2025["Rainfall"] = weather_data_2025["Rainfall"].mean()

    # Convert timing columns to seconds
    for col in timing_columns:
        laps_2025[col] = laps_2025[col].dt.total_seconds()

    # Rebuild the combined frame from the per-year frames instead of appending
    # to laps_combined: the result is the same on the first run, and re-running
    # this cell no longer appends the 2025 laps a second time.
    laps_combined = pd.concat([laps_2023, laps_2024, laps_2025], ignore_index=True)
except fastf1.exceptions.NoDataError:
    print("No data for 2025 available.")
except fastf1.exceptions.SessionNotFoundError:
    print("Session not found for 2025.")
except fastf1.exceptions.SessionNotLoadedError:
    print("Session not loaded for 2025.")
except Exception as e:
    # Best effort: report the problem and continue with the data already combined
    print(f"An error occurred while loading 2025 data: {e}")

core           INFO 	Loading data for Australian Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '81', '1', '63', '22', '23', '16', '44', '10', '55', '6', '14', '18', '7', '5', '12', '27', '30', '31', '87']


In [8]:
# Display the combined laps frame to inspect the result of the previous steps
laps_combined

Unnamed: 0,Time,Driver,LapTime,Sector1Time,Sector2Time,Sector3Time,Compound,TrackStatus,Year,AirTemp,Humidity,WindSpeed,Rainfall
0,0 days 01:06:10.776000,VER,132.105,48.846,31.614,51.645,MEDIUM,4,2023,17.44955,54.157658,1.127027,0.0
1,0 days 01:08:21.433000,VER,130.657,46.836,29.178,54.643,MEDIUM,41,2023,17.44955,54.157658,1.127027,0.0
2,0 days 01:09:44.824000,VER,83.391,28.900,18.326,36.165,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0
3,0 days 01:11:07.928000,VER,83.104,28.935,18.347,35.822,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0
4,0 days 01:12:30.771000,VER,82.843,28.986,17.951,35.906,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056,0 days 00:15:26.930000,OCO,77.517,27.066,17.337,33.114,SOFT,1,2025,30.33600,46.146667,0.772000,0.0
2057,0 days 00:21:08.278000,OCO,129.804,64.408,19.093,46.303,SOFT,1,2025,30.33600,46.146667,0.772000,0.0
2058,0 days 00:22:25.425000,OCO,77.147,26.686,17.206,33.255,SOFT,1,2025,30.33600,46.146667,0.772000,0.0
2059,0 days 00:29:15.473000,OCO,130.240,66.648,21.854,41.738,SOFT,1,2025,30.33600,46.146667,0.772000,0.0


In [9]:
from utils import driver_mapping, driver_number_mapping

# 2025 qualifying results keyed by full driver name. Each name is stored next
# to its time in one record so the two can never drift out of alignment.
qualifying_records = [
    ("Lando Norris", 75.096),
    ("Oscar Piastri", 75.180),
    ("Max Verstappen", 75.481),
    ("George Russell", 75.546),
    ("Yuki Tsunoda", 75.670),
    ("Alexander Albon", 75.737),
    ("Charles Leclerc", 75.755),
    ("Lewis Hamilton", 75.973),
    ("Pierre Gasly", 75.980),
    ("Carlos Sainz", 76.062),
    ("Fernando Alonso", 76.4),
    ("Lance Stroll", 76.5),
]
qualifying_2025 = pd.DataFrame(
    qualifying_records, columns=["DriverName", "QualifyingTime"]
)

# Translate full names into driver codes through driver_mapping
qualifying_2025['Driver'] = qualifying_2025['DriverName'].map(driver_mapping)

# Series.map yields NaN for names that are not keys of driver_mapping. Report
# those names, because a row without a driver code cannot be matched by code.
unmapped_names = qualifying_2025.loc[
    qualifying_2025["Driver"].isna(), "DriverName"
].tolist()
if unmapped_names:
    print(
        "Warning: no driver code found in driver_mapping for: "
        + ", ".join(unmapped_names)
    )
qualifying_2025

Unnamed: 0,DriverName,QualifyingTime,Driver
0,Lando Norris,75.096,NOR
1,Oscar Piastri,75.18,PIA
2,Max Verstappen,75.481,VER
3,George Russell,75.546,RUS
4,Yuki Tsunoda,75.67,TSU
5,Alexander Albon,75.737,ALB
6,Charles Leclerc,75.755,LEC
7,Lewis Hamilton,75.973,HAM
8,Pierre Gasly,75.98,GAS
9,Carlos Sainz,76.062,


In [10]:
# Left-join every lap with its driver's qualifying time; laps of drivers that
# are absent from the qualifying table get NaN in QualifyingTime.
quali_times = qualifying_2025.loc[:, ["Driver", "QualifyingTime"]]
merged_dat = pd.merge(laps_combined, quali_times, how="left", on="Driver")
merged_dat

Unnamed: 0,Time,Driver,LapTime,Sector1Time,Sector2Time,Sector3Time,Compound,TrackStatus,Year,AirTemp,Humidity,WindSpeed,Rainfall,QualifyingTime
0,0 days 01:06:10.776000,VER,132.105,48.846,31.614,51.645,MEDIUM,4,2023,17.44955,54.157658,1.127027,0.0,75.481
1,0 days 01:08:21.433000,VER,130.657,46.836,29.178,54.643,MEDIUM,41,2023,17.44955,54.157658,1.127027,0.0,75.481
2,0 days 01:09:44.824000,VER,83.391,28.900,18.326,36.165,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0,75.481
3,0 days 01:11:07.928000,VER,83.104,28.935,18.347,35.822,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0,75.481
4,0 days 01:12:30.771000,VER,82.843,28.986,17.951,35.906,MEDIUM,1,2023,17.44955,54.157658,1.127027,0.0,75.481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056,0 days 00:15:26.930000,OCO,77.517,27.066,17.337,33.114,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,
2057,0 days 00:21:08.278000,OCO,129.804,64.408,19.093,46.303,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,
2058,0 days 00:22:25.425000,OCO,77.147,26.686,17.206,33.255,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,
2059,0 days 00:29:15.473000,OCO,130.240,66.648,21.854,41.738,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,


In [11]:
# Laps that did not receive a qualifying time in the merge
missing_quali_mask = merged_dat["QualifyingTime"].isna()
merged_dat[missing_quali_mask]

Unnamed: 0,Time,Driver,LapTime,Sector1Time,Sector2Time,Sector3Time,Compound,TrackStatus,Year,AirTemp,Humidity,WindSpeed,Rainfall,QualifyingTime
102,0 days 01:06:40.985000,PER,131.789,41.480,27.181,63.128,MEDIUM,4,2023,17.44955,54.157658,1.127027,0.0,
103,0 days 01:08:31.634000,PER,110.649,36.623,27.351,46.675,HARD,41,2023,17.44955,54.157658,1.127027,0.0,
104,0 days 01:09:56.742000,PER,85.108,30.170,18.498,36.440,HARD,1,2023,17.44955,54.157658,1.127027,0.0,
105,0 days 01:11:21.243000,PER,84.501,29.362,18.331,36.808,HARD,1,2023,17.44955,54.157658,1.127027,0.0,
106,0 days 01:12:44.827000,PER,83.584,29.470,18.053,36.061,HARD,1,2023,17.44955,54.157658,1.127027,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056,0 days 00:15:26.930000,OCO,77.517,27.066,17.337,33.114,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,
2057,0 days 00:21:08.278000,OCO,129.804,64.408,19.093,46.303,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,
2058,0 days 00:22:25.425000,OCO,77.147,26.686,17.206,33.255,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,
2059,0 days 00:29:15.473000,OCO,130.240,66.648,21.854,41.738,SOFT,1,2025,30.33600,46.146667,0.772000,0.0,


In [12]:
# Remove (in place) the laps without a qualifying time, then count the missing
# values that remain in each column.
merged_dat.dropna(subset=["QualifyingTime"], inplace=True)
remaining_nulls = merged_dat.isna().sum()
remaining_nulls

Time              0
Driver            0
LapTime           0
Sector1Time       0
Sector2Time       0
Sector3Time       0
Compound          0
TrackStatus       0
Year              0
AirTemp           0
Humidity          0
WindSpeed         0
Rainfall          0
QualifyingTime    0
dtype: int64

In [13]:
# Print row count, column dtypes and non-null counts of the filtered frame
merged_dat.info()

<class 'fastf1.core.Laps'>
Index: 1092 entries, 0 to 2018
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   Time            1092 non-null   timedelta64[ns]
 1   Driver          1092 non-null   object         
 2   LapTime         1092 non-null   float64        
 3   Sector1Time     1092 non-null   float64        
 4   Sector2Time     1092 non-null   float64        
 5   Sector3Time     1092 non-null   float64        
 6   Compound        1092 non-null   object         
 7   TrackStatus     1092 non-null   object         
 8   Year            1092 non-null   int64          
 9   AirTemp         1092 non-null   float64        
 10  Humidity        1092 non-null   float64        
 11  WindSpeed       1092 non-null   float64        
 12  Rainfall        1092 non-null   float64        
 13  QualifyingTime  1092 non-null   float64        
dtypes: float64(9), int64(1), object(3), timedelta64[ns](1)
m

In [14]:
# NOTE(review): in the rows displayed earlier, LapTime equals
# Sector1Time + Sector2Time + Sector3Time, so the sector features nearly
# determine the target. Confirm that this is intended.
numeric_columns = [
    "QualifyingTime",
    "Sector1Time",
    "Sector2Time",
    "Sector3Time",
    "AirTemp",
    "Humidity",
    "WindSpeed",
]
categorical_columns = ["Driver", "Compound", "TrackStatus", "Rainfall"]

# Replace each categorical column by integer category codes.
for col in categorical_columns:
    # Translate the driver column through driver_number_mapping only while it is
    # still non-numeric. After the first run it holds integer codes, and mapping
    # those again would no longer match the mapping's keys, silently corrupting
    # the column when the cell is re-executed.
    if col == "Driver" and not pd.api.types.is_numeric_dtype(merged_dat[col]):
        merged_dat[col] = merged_dat[col].map(driver_number_mapping)
    # From here on the column holds category codes, not the original values
    # (for Driver: not the values of driver_number_mapping either).
    merged_dat[col] = pd.Categorical(merged_dat[col]).codes

# Feature matrix and target
X = merged_dat[numeric_columns + categorical_columns]
y = merged_dat["LapTime"]

print("X.shape" + str(X.shape))
print("y.shape" + str(y.shape))

X.shape(1092, 11)
y.shape(1092,)


In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("X_train.shape", X_train.shape, sep="")
print("y_train.shape", y_train.shape, sep="")
print("X_test.shape", X_test.shape, sep="")
print("y_test.shape", y_test.shape, sep="")

X_train.shape(873, 11)
y_train.shape(873,)
X_test.shape(219, 11)
y_test.shape(219,)


In [16]:
# Fit a gradient-boosting regressor on the training split
gbr_params = {
    "n_estimators": 150,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f} seconds")

# Feature importances, most important first
feature_names = numeric_columns + categorical_columns
importances = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)
feature_importance_df

Mean Absolute Error: 0.54 seconds


Unnamed: 0,Feature,Importance
1,Sector1Time,0.765524
3,Sector3Time,0.175768
2,Sector2Time,0.057104
9,TrackStatus,0.000948
6,WindSpeed,0.000389
0,QualifyingTime,0.0001
7,Driver,7.8e-05
4,AirTemp,6.7e-05
8,Compound,1.8e-05
5,Humidity,2e-06


In [17]:
# Horizontal bar chart of the feature importances, coloured by importance
importance_bar_options = dict(
    x="Importance",
    y="Feature",
    title="Feature Importance",
    orientation="h",
    text_auto=True,
    color="Importance",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig = px.bar(feature_importance_df, **importance_bar_options)

# Axis labels and a centred title
importance_layout = dict(
    xaxis_title="Importance",
    yaxis_title="Feature",
    title_x=0.5,
)
fig.update_layout(**importance_layout)
fig.show()

In [18]:
# Actual vs predicted lap times on the held-out rows
prediction_points = go.Scatter(
    x=y_test,
    y=y_pred,
    mode="markers",
    name="Predicted vs Actual",
    marker={"color": "blue", "size": 5},
)
# Reference line y = x: a point on it is a perfect prediction
identity_line = go.Scatter(
    x=y_test,
    y=y_test,
    mode="lines",
    name="Perfect Prediction",
    line={"color": "red", "dash": "dash"},
)

fig = go.Figure(data=[prediction_points, identity_line])
fig.update_layout(
    title="Actual vs Predicted Lap Times",
    xaxis_title="Actual Lap Time (seconds)",
    yaxis_title="Predicted Lap Time (seconds)",
    showlegend=True,
)
fig.show()

In [19]:
# Make predictions for the 2025 rows.
# NOTE(review): X (and therefore the train/test split) was built from all rows
# of merged_dat, 2025 included, so part of these predictions is in-sample.
model_features = numeric_columns + categorical_columns
X_2025 = merged_dat.loc[merged_dat["Year"] == 2025, model_features].copy()

# A single ordered comparison covers missing, extra and re-ordered columns,
# which a set comparison would not (sets ignore order).
if list(X_2025.columns) != list(X_train.columns):
    raise ValueError("Feature columns do not match training data columns.")

# Drop rows with NaN values
X_2025 = X_2025.dropna()

# Fail with a clear message when there is nothing to predict, e.g. because the
# 2025 session could not be loaded.
if X_2025.empty:
    raise ValueError("No 2025 rows available for prediction.")

# Now make predictions
predictions = model.predict(X_2025)

In [20]:
# Rows of merged_dat that belong to 2025 (the rows the predictions refer to)
rows_2025 = merged_dat[merged_dat["Year"] == 2025]

# merged_dat["Driver"] holds category codes by now, so it cannot be translated
# back through driver_number_mapping. Recover the original driver abbreviations
# from laps_combined instead: merged_dat came from a left merge of
# laps_combined, so its row labels should point at the same laps.
original_laps = laps_combined.loc[rows_2025.index]
# Guard that assumption: the lap times must agree row by row.
if not (original_laps["LapTime"].to_numpy() == rows_2025["LapTime"].to_numpy()).all():
    raise ValueError(
        "merged_dat rows are not aligned with laps_combined; cannot recover driver names."
    )

# One row per 2025 lap: driver abbreviation and predicted lap time, fastest first
predictions_df = pd.DataFrame(
    {
        "Driver": original_laps["Driver"],
        "PredictedLapTime": predictions,
    }
)
predictions_df = predictions_df.sort_values(by="PredictedLapTime").reset_index(drop=True)

# NOTE(review): predictions_df contains several laps per driver, so each driver
# gets several bar segments here (presumably stacked by plotly's default bar
# mode) - consider aggregating per driver before plotting.
fig = px.bar(
    predictions_df,
    x="Driver",
    y="PredictedLapTime",
    title="Predicted Lap Times for 2025 Australian GP",
    labels={"PredictedLapTime": "Predicted Lap Time (seconds)"},
)
fig.update_layout(
    xaxis_title="Driver",
    yaxis_title="Predicted Lap Time (seconds)",
    title_x=0.5,
)
fig.show()