### Importing libraries and CSV files

In [56]:
import pandas as pd

In [57]:
# Read the constructors table from disk and preview its first five rows.
constructors = pd.read_csv(filepath_or_buffer='constructors.csv')
constructors.head(5)

Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso


In [58]:
# Read the drivers table from disk and preview its first five rows.
# The raw 'number' column contains the literal text \N for some drivers.
drivers = pd.read_csv(filepath_or_buffer='drivers.csv')
drivers.head(5)

Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


In [59]:
# Read the per-lap timing table from disk and preview its first five rows.
lap = pd.read_csv(filepath_or_buffer='lap_times.csv')
lap.head(5)

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds
0,841,20,1,1,1:38.109,98109
1,841,20,2,1,1:33.006,93006
2,841,20,3,1,1:32.713,92713
3,841,20,4,1,1:32.803,92803
4,841,20,5,1,1:32.342,92342


In [60]:
# Read the pit-stop table from disk and preview its first five rows.
pit = pd.read_csv(filepath_or_buffer='pit_stops.csv')
pit.head(5)

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,17:05:23,26.898,26898
1,841,30,1,1,17:05:52,25.021,25021
2,841,17,1,11,17:20:48,23.426,23426
3,841,4,1,12,17:22:34,23.251,23251
4,841,13,1,13,17:24:10,23.842,23842


In [61]:
# Read the race calendar table from disk and preview its first five rows.
races = pd.read_csv(filepath_or_buffer='races.csv')
races.head(5)

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


In [62]:
# Read the per-driver race results table from disk and preview its first five rows.
results = pd.read_csv(filepath_or_buffer='results.csv')
results.head(5)

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


### Removing unnecessary columns and cleaning the dataset

In [63]:
# The Wikipedia link is not needed for the analysis, so discard it.
constructors = constructors.drop('url', axis=1)

In [64]:
# Step 1: parse 'number' as numeric; anything unparseable (e.g. \N) becomes NaN.
drivers['number'] = pd.to_numeric(drivers['number'], errors='coerce')
# Step 2: keep drivers whose number is missing or at most 15, then discard
# the url column.
drivers = drivers.loc[
    drivers['number'].isna() | drivers['number'].le(15)
].drop(columns=['url'])

In [65]:
# Restrict the calendar to Canadian Grand Prix rows and keep only the
# identifying and scheduling columns.
races = races.loc[
    races['name'] == 'Canadian Grand Prix',
    ['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time'],
]

In [66]:
# Attach driver details to each result row (inner join on driverId).
# Both frames carry a 'number' column, so pandas suffixes them _x / _y.
joined_df = pd.merge(results, drivers, how='inner', on='driverId')

In [67]:
# Attach constructor details (inner join on constructorId).
joined_df = pd.merge(joined_df, constructors, how='inner', on='constructorId')

In [68]:
# Inner join against the filtered races frame, so only results whose raceId
# appears in `races` survive.
df = pd.merge(joined_df, races, how='inner', on='raceId')

In [69]:
# Preview the first five rows of the merged frame.
df.head(5)

Unnamed: 0,resultId,raceId,driverId,constructorId,number_x,grid,position,positionText,positionOrder,points,laps,time_x,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,driverRef,number_y,code,forename,surname,dob,nationality_x,constructorRef,name_x,nationality_y,year,round,circuitId,name_y,date,time_y
0,130,24,2,2,3,8,2,2,2,8.0,70,16.495,5800722,25,2,1:17.430,202.758,1,heidfeld,,HEI,Nick,Heidfeld,1977-05-10,German,bmw_sauber,BMW Sauber,German,2008,7,7,Canadian Grand Prix,2008-06-08,17:00:00
1,131,24,14,9,9,13,3,3,3,6.0,70,23.352,5807579,36,7,1:18.085,201.057,1,coulthard,,COU,David,Coulthard,1971-03-27,British,red_bull,Red Bull,Austrian,2008,7,7,Canadian Grand Prix,2008-06-08,17:00:00
2,132,24,10,7,12,11,4,4,4,5.0,70,42.627,5826854,31,17,1:19.087,198.51,1,glock,,GLO,Timo,Glock,1982-03-18,German,toyota,Toyota,Japanese,2008,7,7,Canadian Grand Prix,2008-06-08,17:00:00
3,134,24,15,7,11,14,6,6,6,3.0,70,47.775,5832002,38,15,1:18.870,199.056,1,trulli,,TRU,Jarno,Trulli,1974-07-13,Italian,toyota,Toyota,Japanese,2008,7,7,Canadian Grand Prix,2008-06-08,17:00:00
4,135,24,22,11,17,9,7,7,7,2.0,70,53.597,5837824,35,10,1:18.301,200.503,1,barrichello,,BAR,Rubens,Barrichello,1972-05-23,Brazilian,honda,Honda,Japanese,2008,7,7,Canadian Grand Prix,2008-06-08,17:00:00


In [70]:
# Persist the merged frame without the row index.
df.to_csv(path_or_buf='f1_dataset.csv', index=False)

In [71]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
resultId,0
raceId,0
driverId,0
constructorId,0
number_x,0
grid,0
position,0
positionText,0
positionOrder,0
points,0


In [72]:
# (rows, columns) of the merged frame.
df.shape

(1131, 34)

In [73]:
# The previous code assigned `Series.dropna` without calling it, which stored
# the method object itself in 'number_y' and made isna() report no gaps.
# Calling dropna() and assigning the result back would not help either:
# pandas re-aligns on the index, so the missing entries would come straight back.
# Missing driver numbers are kept on purpose by the drivers filter, so no rows
# are dropped here; the column stays numeric with gaps as <NA>.
# If the intent was to discard those rows, use
# `df = df.dropna(subset=['number_y'])` instead.
df['number_y'] = df['number_y'].astype('Int64')

In [74]:
# Re-count missing values per column after the 'number_y' step.
df.isnull().sum()

Unnamed: 0,0
resultId,0
raceId,0
driverId,0
constructorId,0
number_x,0
grid,0
position,0
positionText,0
positionOrder,0
points,0


In [75]:
# (rows, columns) again, to compare with the shape before the 'number_y' step.
df.shape

(1131, 34)

In [76]:
# List the merged column names; the _x / _y suffixes come from the merges.
df.columns

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number_x', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time_x',
       'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId', 'driverRef', 'number_y', 'code',
       'forename', 'surname', 'dob', 'nationality_x', 'constructorRef',
       'name_x', 'nationality_y', 'year', 'round', 'circuitId', 'name_y',
       'date', 'time_y'],
      dtype='object')

### Constructor Predictions

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [78]:
from sklearn.preprocessing import LabelEncoder

In [79]:
# Binary target: True for the row that finished first in its race.
df['is_winner'] = df['positionOrder'].eq(1)

In [80]:
# Model inputs plus the target; rows with any missing value are discarded.
# NOTE(review): 'points', 'fastestLap', 'rank' and 'statusId' come from
# results.csv, i.e. they are recorded for the same race whose winner is being
# predicted. The results preview shows points falling with finishing position,
# so 'points' in particular likely leaks the target. Confirm before trusting
# the evaluation scores.
features = ['grid', 'points', 'fastestLap', 'rank', 'statusId', 'constructorRef']
df = df.loc[:, [*features, 'is_winner']].dropna()

In [81]:
# Treat lap number and fastest-lap rank as text so they can be label-encoded.
df['rank'] = df['rank'].astype(str)
df['fastestLap'] = df['fastestLap'].astype(str)
# Map each constructor reference to an integer code.
le_constructor = LabelEncoder()
le_constructor.fit(df['constructorRef'])
df['constructor_encoded'] = le_constructor.transform(df['constructorRef'])

In [82]:
# One encoder per text column, fitted on the training frame and kept so the
# same mapping can be applied to the 2025 rows later.
le_fastest = LabelEncoder().fit(df['fastestLap'])
le_rank = LabelEncoder().fit(df['rank'])
df['fastest_encoded'] = le_fastest.transform(df['fastestLap'])
df['rank_encoded'] = le_rank.transform(df['rank'])

In [83]:
# Target vector and numeric feature matrix for the classifier.
y = df['is_winner']
x = df.loc[:, [
    'grid', 'points', 'statusId',
    'constructor_encoded', 'fastest_encoded', 'rank_encoded',
]]

In [84]:
# Hold out 20% of the rows, stratified on the target so both splits keep the
# same winner / non-winner ratio; fixed seeds make the run repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
model = RandomForestClassifier(random_state=42)
model.fit(X=x_train, y=y_train)

In [85]:
# Hand-written 2025 scenario: one row per constructor, listed in assumed grid
# order. Grid slot and rank both count up from 1, fastestLap counts down from
# 45, and every row has zero points and statusId 1.
_grid_order_2025 = [
    'red_bull', 'ferrari', 'mclaren', 'mercedes', 'aston_martin',
    'rb', 'alpine', 'williams', 'kick_sauber', 'haas',
]
constructor2025 = pd.DataFrame([
    {
        'grid': slot,
        'points': 0,
        'fastestLap': str(46 - slot),
        'rank': str(slot),
        'statusId': 1,
        'constructorRef': ref,
    }
    for slot, ref in enumerate(_grid_order_2025, start=1)
])

In [86]:
# Keep only constructors the encoder was fitted on; LabelEncoder.transform
# raises on labels it has not seen. Constructors missing from the training
# data are therefore silently left out of the prediction table.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning and lost
# writes).
constructor2025 = constructor2025[constructor2025['constructorRef'].isin(le_constructor.classes_)].copy()
constructor2025['constructor_encoded'] = le_constructor.transform(constructor2025['constructorRef'])
# NOTE(review): 'fastestLap' and 'rank' are not filtered against their
# encoders' classes_, so a value absent from training will raise here.
constructor2025['fastest_encoded'] = le_fastest.transform(constructor2025['fastestLap'].astype(str))
constructor2025['rank_encoded'] = le_rank.transform(constructor2025['rank'].astype(str))

In [87]:
# Same column order as the training matrix `x`.
X_2025 = constructor2025.loc[:, [
    'grid', 'points', 'statusId',
    'constructor_encoded', 'fastest_encoded', 'rank_encoded',
]]

In [88]:
# Score the 2025 rows and keep the second probability column
# (presumably the is_winner == True class; verify against model.classes_).
_class_probabilities = model.predict_proba(X_2025)
win_probabilities = _class_probabilities[:, 1]
constructor2025['win_probability'] = win_probabilities

In [89]:
# Pick the row with the highest predicted win probability and report it.
winner = constructor2025.loc[constructor2025['win_probability'].idxmax(), :]
print("Most likely constructor to win 2025 Canada GP:")
print(f"{winner['constructorRef']} with probability {winner['win_probability']:.2f}")

Most likely constructor to win 2025 Canada GP:
red_bull with probability 0.05


In [90]:
# Show every scored constructor alongside its predicted win probability.
print("\nFull prediction table:")
print(constructor2025.loc[:, ['constructorRef', 'win_probability']])


Full prediction table:
  constructorRef  win_probability
0       red_bull             0.05
1        ferrari             0.01
2        mclaren             0.02
3       mercedes             0.02
4   aston_martin             0.00
5             rb             0.00
6         alpine             0.00
7       williams             0.00
9           haas             0.00


### Evaluation

In [94]:
# Score the fitted classifier on the held-out split.
# NOTE(review): the recorded result is a perfect 1.0 and the inputs include
# 'points'. Check for target leakage before relying on this number.
accuracy = model.score(X=x_test, y=y_test)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [96]:
# Recall and precision for the winner class on the held-out split.
from sklearn.metrics import recall_score, precision_score
y_pred = model.predict(x_test)
recall = recall_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Recall:", recall)
print("Precision:", precision)

Recall: 1.0
Precision: 1.0
