<a href="https://colab.research.google.com/github/annmarie520/DATASCI112/blob/main/Formula_1_Data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fast F1 API: Data Collection

In [None]:
import pandas as pd
import numpy as np
import datetime

In [None]:
!pip install fastf1

In [None]:
import os

import fastf1

# FastF1 expects the cache directory to already exist when enable_cache() is
# called, so create it first; otherwise this cell fails on a fresh runtime.
os.makedirs('/content/f1cache', exist_ok=True)
fastf1.Cache.enable_cache('/content/f1cache')

cols = ['round','air_temp','humidity','pressure','track_temp','wind_direction','wind_speed','rainfall']
lst = []

# Race weather: one row per 2021 round (1-22) holding the mean of every weather
# channel over the rows FastF1 returns for the race ('R') session.
# The loop variable is called round_number so the built-in round() is not shadowed.
for round_number in range(1, 23):
  session = fastf1.get_session(2021, round_number, 'R')
  # Only weather is needed, so telemetry and lap data are not loaded.
  session.load(telemetry=False, laps=False, weather=True)
  weather_data = session.weather_data
  # Value order must match `cols`.
  lst.append([
    round_number,
    weather_data['AirTemp'].mean(),
    weather_data['Humidity'].mean(),
    weather_data['Pressure'].mean(),
    weather_data['TrackTemp'].mean(),
    weather_data['WindDirection'].mean(),
    weather_data['WindSpeed'].mean(),
    weather_data['Rainfall'].mean(),
  ])

df_race_weather = pd.DataFrame(lst, columns=cols)

In [None]:
# Qualifying weather: one row per 2021 round (1-22) holding the mean of every
# weather channel over the rows FastF1 returns for the qualifying ('Q') session.
cols = ['round','air_temp','humidity','pressure','track_temp','wind_direction','wind_speed','rainfall']
lst = []

# The loop variable is called round_number so the built-in round() is not shadowed.
for round_number in range(1, 23):
  session = fastf1.get_session(2021, round_number, 'Q')
  # Only weather is needed, so telemetry and lap data are not loaded.
  session.load(telemetry=False, laps=False, weather=True)
  weather_data = session.weather_data
  # Value order must match `cols`.
  lst.append([
    round_number,
    weather_data['AirTemp'].mean(),
    weather_data['Humidity'].mean(),
    weather_data['Pressure'].mean(),
    weather_data['TrackTemp'].mean(),
    weather_data['WindDirection'].mean(),
    weather_data['WindSpeed'].mean(),
    weather_data['Rainfall'].mean(),
  ])

df_qualifying_weather = pd.DataFrame(lst, columns=cols)

## Kaggle csv: data collection

Read in csv files

In [None]:
# Load the race calendar and discard the URL and per-session date/time columns.
races = pd.read_csv('/content/races.csv').drop(
    columns=[
        'time', 'url',
        'fp1_date', 'fp1_time',
        'fp2_date', 'fp2_time',
        'fp3_date', 'fp3_time',
        'quali_date', 'quali_time',
        'sprint_date', 'sprint_time',
    ]
)

In [None]:
# Load the pit-stop table (all columns kept).
pit_stops = pd.read_csv(filepath_or_buffer='/content/pit_stops.csv')

In [None]:
# Load the constructors (teams) table without its nationality and URL columns.
constructors = pd.read_csv('/content/constructors.csv').drop(
    columns=['nationality', 'url']
)

In [None]:
# Load the qualifying results table (all columns kept).
qualifying = pd.read_csv(filepath_or_buffer='/content/qualifying.csv')

In [None]:
# Load the drivers table without the name, birth date, nationality and URL columns.
drivers = pd.read_csv('/content/drivers.csv').drop(
    columns=['forename', 'surname', 'dob', 'nationality', 'url']
)

In [None]:
# Load the per-lap timing table (all columns kept).
lap_times = pd.read_csv(filepath_or_buffer='/content/lap_times.csv')

Merge csv files together

In [None]:
# Outer-join qualifying results with drivers on driverId; columns present in
# both tables get the "_car_number" (qualifying) / "_driver" (drivers) suffix.
qualifying_with_drivers = pd.merge(
    qualifying,
    drivers,
    how="outer",
    on=["driverId"],
    suffixes=("_car_number", "_driver"),
)

In [None]:
# Outer-join the race calendar onto the qualifying/driver rows by raceId.
qualifying_drivers_races = pd.merge(
    qualifying_with_drivers,
    races,
    how="outer",
    on=["raceId"],
)

In [None]:
# Outer-join constructors by constructorId; columns present on both sides get
# the "_GP" (left) / "_team" (constructors) suffix.
qualifying_drivers_races_constructors = pd.merge(
    qualifying_drivers_races,
    constructors,
    how="outer",
    on=["constructorId"],
    suffixes=("_GP", "_team"),
)

In [None]:
# Keep only rows from the 2021 season, ordered by round number.
races_2021 = (
    qualifying_drivers_races_constructors
    .loc[lambda frame: frame["year"] == 2021.0]
    .sort_values(by='round')
)

In [None]:
# Attach the per-round race-session weather means to every 2021 row.
races_2021_with_weather = pd.merge(
    races_2021,
    df_race_weather,
    how="outer",
    on=["round"],
)

In [None]:
# Attach race and qualifying weather to the 2021 rows. The result is rebuilt
# from `races_2021` rather than from the previous `races_2021_with_weather`,
# so re-running this cell always yields the same columns: each weather field
# appears once with a "_race" and once with a "_qualifying" suffix.
races_2021_with_weather = (
    races_2021
    .merge(df_race_weather, on=["round"], how="outer")
    .merge(df_qualifying_weather, on=["round"], how="outer", suffixes=("_race", "_qualifying"))
)

In [None]:
# Find the fastest lap per driver per race.
# Ids are kept under their own names so the `races` / `drivers` DataFrames
# loaded earlier are not overwritten.
race_ids = lap_times['raceId'].sort_values().unique()
driver_ids = lap_times['driverId'].sort_values().unique()

# Compare laps by duration, not by text: as strings '10:05.000' sorts before
# '1:34.932', so a plain min() over the 'time' column reports a 10-19 minute
# lap as the "fastest". Parse '[H:]M:SS.mmm' into seconds instead; laps whose
# time does not match that pattern are ignored.
time_parts = lap_times['time'].str.extract(r'^(?:(\d+):)?(\d+):(\d+(?:\.\d+)?)$')
lap_seconds = (
    pd.to_numeric(time_parts[0]).fillna(0) * 3600
    + pd.to_numeric(time_parts[1]) * 60
    + pd.to_numeric(time_parts[2])
)
timed_laps = lap_times.assign(lap_seconds=lap_seconds).dropna(subset=['lap_seconds'])

# One row per (raceId, driverId): the lap with the smallest duration, reported
# using its original time string.
fastest_rows = timed_laps.loc[
    timed_laps.groupby(['raceId', 'driverId'])['lap_seconds'].idxmin()
]
fastest_by_pair = fastest_rows.set_index(['raceId', 'driverId'])['time']

# Emit a row for every race x driver combination (as the nested loops did);
# combinations with no lap get the '/N' placeholder.
# NOTE(review): the data's own missing marker is '\N'; '/N' may be a typo but
# is kept unchanged here so the output values stay the same.
all_pairs = pd.MultiIndex.from_product(
    [race_ids, driver_ids], names=['raceId', 'driverId']
)
df_fastest_lap = (
    fastest_by_pair
    .reindex(all_pairs)
    .rename('fastest_lap')
    .reset_index()
    .fillna('/N')
)

In [None]:
# Find the highest pit-stop number per driver per race.
# Ids are kept under their own names so the `races` / `drivers` DataFrames
# loaded earlier are not overwritten.
race_ids = pit_stops['raceId'].sort_values().unique()
driver_ids = pit_stops['driverId'].sort_values().unique()

# A single groupby replaces filtering the whole table once per race/driver pair.
max_stop_by_pair = pit_stops.groupby(['raceId', 'driverId'])['stop'].max()

# Emit a row for every race x driver combination (as the nested loops did);
# combinations with no recorded stop get the '/N' placeholder.
# NOTE(review): the data's own missing marker is '\N'; '/N' may be a typo but
# is kept unchanged here so the output values stay the same.
all_pairs = pd.MultiIndex.from_product(
    [race_ids, driver_ids], names=['raceId', 'driverId']
)
df_pit_stops = (
    max_stop_by_pair
    .reindex(all_pairs)
    .reset_index()
    .fillna('/N')
)

Merge fastest lap times and pit stops with main dataframe

In [None]:
# Left-join each (raceId, driverId) row with that driver's fastest lap in the race.
f1_races_2021 = pd.merge(
    races_2021_with_weather,
    df_fastest_lap,
    how="left",
    on=["raceId", "driverId"],
    suffixes=("_qualifying", "_final"),
)

In [None]:
# Left-join the per-driver pit-stop count. Any `stop` column left over from an
# earlier run of this cell (including the `stop_x` / `stop_y` pair a repeated
# merge creates) is dropped first, so re-running the cell always produces
# exactly one `stop` column.
f1_races_2021 = (
    f1_races_2021
    .drop(columns=['stop', 'stop_x', 'stop_y'], errors='ignore')
    .merge(df_pit_stops, on=["raceId", "driverId"], how="left")
)
f1_races_2021

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number_car_number,position,q1,q2,q3,driverRef,...,air_temp_qualifying,humidity_qualifying,pressure_qualifying,track_temp_qualifying,wind_direction_qualifying,wind_speed_qualifying,rainfall_qualifying,fastest_lap,stop_x,stop_y
0,8740.0,1052.0,817.0,1.0,3.0,6.0,1:30.795,1:30.222,1:29.927,ricciardo,...,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:34.932,2.0,2.0
1,8736.0,1052.0,1.0,131.0,44.0,2.0,1:30.617,1:30.085,1:29.385,hamilton,...,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:34.015,2.0,2.0
2,8737.0,1052.0,822.0,131.0,77.0,3.0,1:31.200,1:30.186,1:29.586,bottas,...,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:32.090,3.0,3.0
3,8750.0,1052.0,839.0,214.0,31.0,16.0,1:31.724,\N,\N,ocon,...,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:35.250,2.0,2.0
4,8743.0,1052.0,4.0,214.0,14.0,9.0,1:30.863,1:30.595,1:30.249,alonso,...,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:36.063,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,9169.0,1073.0,841.0,51.0,99.0,14.0,1:24.118,1:24.251,\N,giovinazzi,...,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:29.442,1.0,1.0
435,9167.0,1073.0,842.0,213.0,10.0,12.0,1:23.489,1:24.043,\N,gasly,...,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:27.342,2.0,2.0
436,9163.0,1073.0,852.0,213.0,22.0,8.0,1:23.428,1:23.404,1:23.220,tsunoda,...,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:27.496,2.0,2.0
437,9157.0,1073.0,1.0,131.0,44.0,2.0,1:22.845,1:23.145,1:22.480,hamilton,...,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:26.615,1.0,1.0


There were issues with the fastest-lap calculation for the Saudi Arabia race, so the fastest laps for that race are calculated separately here.

In [None]:
# Saudi Arabia race data, loaded under its own name so the full-season
# `lap_times` table (and the `races` / `drivers` DataFrames) are not replaced.
saudi_lap_times = pd.read_csv('/content/saudi_arabia.csv')

saudi_race_ids = saudi_lap_times['raceId'].sort_values().unique()
saudi_driver_ids = saudi_lap_times['driverId'].sort_values().unique()

# Compare laps by duration, not by text: as strings '10:05.000' sorts before
# '1:34.932', so a plain min() over the 'time' column reports a 10-19 minute
# lap as the "fastest". Parse '[H:]M:SS.mmm' into seconds instead; laps whose
# time does not match that pattern are ignored.
saudi_time_parts = saudi_lap_times['time'].str.extract(
    r'^(?:(\d+):)?(\d+):(\d+(?:\.\d+)?)$'
)
saudi_lap_seconds = (
    pd.to_numeric(saudi_time_parts[0]).fillna(0) * 3600
    + pd.to_numeric(saudi_time_parts[1]) * 60
    + pd.to_numeric(saudi_time_parts[2])
)
saudi_timed_laps = (
    saudi_lap_times
    .assign(lap_seconds=saudi_lap_seconds)
    .dropna(subset=['lap_seconds'])
)

# One row per (raceId, driverId): the lap with the smallest duration, reported
# using its original time string.
saudi_fastest_rows = saudi_timed_laps.loc[
    saudi_timed_laps.groupby(['raceId', 'driverId'])['lap_seconds'].idxmin()
]
saudi_fastest_by_pair = saudi_fastest_rows.set_index(['raceId', 'driverId'])['time']

# Emit a row for every race x driver combination (as the nested loops did);
# combinations with no lap get the '/N' placeholder.
saudi_pairs = pd.MultiIndex.from_product(
    [saudi_race_ids, saudi_driver_ids], names=['raceId', 'driverId']
)
df_fastest_lap_saudi = (
    saudi_fastest_by_pair
    .reindex(saudi_pairs)
    .rename('fastest_lap')
    .reset_index()
    .fillna('/N')
)
df_fastest_lap_saudi
#manually added to f1_data_final csv

In [None]:
# Export the merged 2021 dataset and download it from the Colab runtime.
df_model = pd.DataFrame(f1_races_2021)
# index=False: the row index is only a running counter; writing it would add
# an unnamed column that comes back as 'Unnamed: 0' when the file is re-read.
df_model.to_csv("f1_data_final.csv", index=False)

from google.colab import files
files.download("f1_data_final.csv")

## Further data cleaning: convert string objects to float

In [None]:
# Reload the exported dataset from disk and display it.
df_f1 = pd.read_csv(filepath_or_buffer='/content/f1_data_final.csv')
df_f1

Unnamed: 0.1,Unnamed: 0,qualifyId,raceId,driverId,constructorId,number_car_number,position,q1,q2,q3,...,rainfall_race,air_temp_qualifying,humidity_qualifying,pressure_qualifying,track_temp_qualifying,wind_direction_qualifying,wind_speed_qualifying,rainfall_qualifying,fastest_lap,stop
0,0,8740.0,1052.0,817.0,1.0,3.0,6.0,1:30.795,1:30.222,1:29.927,...,0.0,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:34.932,2.0
1,1,8736.0,1052.0,1.0,131.0,44.0,2.0,1:30.617,1:30.085,1:29.385,...,0.0,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:34.015,2.0
2,2,8737.0,1052.0,822.0,131.0,77.0,3.0,1:31.200,1:30.186,1:29.586,...,0.0,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:32.090,3.0
3,3,8750.0,1052.0,839.0,214.0,31.0,16.0,1:31.724,\N,\N,...,0.0,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:35.250,2.0
4,4,8743.0,1052.0,4.0,214.0,14.0,9.0,1:30.863,1:30.595,1:30.249,...,0.0,29.060256,46.544872,1009.650000,33.84359,174.666667,0.966667,0.0,1:36.063,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,434,9169.0,1073.0,841.0,51.0,99.0,14.0,1:24.118,1:24.251,\N,...,0.0,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:29.442,1.0
435,435,9167.0,1073.0,842.0,213.0,10.0,12.0,1:23.489,1:24.043,\N,...,0.0,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:27.342,2.0
436,436,9163.0,1073.0,852.0,213.0,22.0,8.0,1:23.428,1:23.404,1:23.220,...,0.0,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:27.496,2.0
437,437,9157.0,1073.0,1.0,131.0,44.0,2.0,1:22.845,1:23.145,1:22.480,...,0.0,25.129630,57.358025,1018.630864,29.17284,297.024691,0.316049,0.0,1:26.615,1.0


In [None]:
# Convert the 'M:SS.mmm' fastest-lap strings to numbers.
fastest_lap_text = df_f1['fastest_lap']

# Existing encoding, kept so this column holds the same values as before:
# remove the '.', then turn ':' into '.', e.g. '1:34.932' -> 1.34932.
# regex=False makes '.' a literal dot on every pandas version (as a regular
# expression it would match any character). Anything that is not a number
# afterwards, including the '\N' and '/N' missing markers, becomes NaN.
df_f1['fastest_lap_float'] = pd.to_numeric(
    fastest_lap_text
    .str.replace('.', '', regex=False)
    .str.replace(':', '.', regex=False),
    errors='coerce',
)

# Real duration in seconds ('1:34.932' -> 94.932). Unlike the encoding above
# this is linear in time, so differences and averages are meaningful.
fastest_lap_parts = fastest_lap_text.str.extract(r'^(\d+):(\d+(?:\.\d+)?)$')
df_f1['fastest_lap_seconds'] = (
    pd.to_numeric(fastest_lap_parts[0]) * 60 + pd.to_numeric(fastest_lap_parts[1])
)

In [None]:
# Convert the 'M:SS.mmm' Q1 time strings to numbers.
q1_text = df_f1['q1']

# Existing encoding, kept so this column holds the same values as before:
# remove the '.', then turn ':' into '.', e.g. '1:30.795' -> 1.30795.
# regex=False makes '.' a literal dot on every pandas version (as a regular
# expression it would match any character). Anything that is not a number
# afterwards, including the '\N' missing marker, becomes NaN.
df_f1['q1_float'] = pd.to_numeric(
    q1_text
    .str.replace('.', '', regex=False)
    .str.replace(':', '.', regex=False),
    errors='coerce',
)

# Real duration in seconds ('1:30.795' -> 90.795). Unlike the encoding above
# this is linear in time, so differences and averages are meaningful.
q1_parts = q1_text.str.extract(r'^(\d+):(\d+(?:\.\d+)?)$')
df_f1['q1_seconds'] = pd.to_numeric(q1_parts[0]) * 60 + pd.to_numeric(q1_parts[1])

In [None]:
# Convert the 'M:SS.mmm' Q2 time strings to numbers.
q2_text = df_f1['q2']

# Existing encoding, kept so this column holds the same values as before:
# remove the '.', then turn ':' into '.', e.g. '1:30.222' -> 1.30222.
# regex=False makes '.' a literal dot on every pandas version (as a regular
# expression it would match any character). Anything that is not a number
# afterwards, including the '\N' missing marker, becomes NaN.
df_f1['q2_float'] = pd.to_numeric(
    q2_text
    .str.replace('.', '', regex=False)
    .str.replace(':', '.', regex=False),
    errors='coerce',
)

# Real duration in seconds ('1:30.222' -> 90.222). Unlike the encoding above
# this is linear in time, so differences and averages are meaningful.
q2_parts = q2_text.str.extract(r'^(\d+):(\d+(?:\.\d+)?)$')
df_f1['q2_seconds'] = pd.to_numeric(q2_parts[0]) * 60 + pd.to_numeric(q2_parts[1])

In [None]:
# Convert the 'M:SS.mmm' Q3 time strings to numbers.
q3_text = df_f1['q3']

# Existing encoding, kept so this column holds the same values as before:
# remove the '.', then turn ':' into '.', e.g. '1:29.927' -> 1.29927.
# regex=False makes '.' a literal dot on every pandas version (as a regular
# expression it would match any character). Anything that is not a number
# afterwards, including the '\N' missing marker, becomes NaN.
df_f1['q3_float'] = pd.to_numeric(
    q3_text
    .str.replace('.', '', regex=False)
    .str.replace(':', '.', regex=False),
    errors='coerce',
)

# Real duration in seconds ('1:29.927' -> 89.927). Unlike the encoding above
# this is linear in time, so differences and averages are meaningful.
q3_parts = q3_text.str.extract(r'^(\d+):(\d+(?:\.\d+)?)$')
df_f1['q3_seconds'] = pd.to_numeric(q3_parts[0]) * 60 + pd.to_numeric(q3_parts[1])

In [None]:
# Export the dataset with the numeric time columns and download it from the
# Colab runtime.
df_model = pd.DataFrame(df_f1)
# index=False: the row index is only a running counter; writing it would add
# yet another unnamed column ('Unnamed: 0', 'Unnamed: 0.1', ...) on re-read.
df_model.to_csv("f1_data_with_float.csv", index=False)

from google.colab import files
files.download("f1_data_with_float.csv")