# Updated Predictions Notebook

## Prepare Notebook and Load Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import datetime as dt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
import copy

np.set_printoptions(precision=4)

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('use_inf_as_na', True)

In [2]:
# Import Data
races = pd.read_csv('./f1db_csv/races.csv')
results = pd.read_csv('./f1db_csv/results.csv')
qualifying = pd.read_csv('./f1db_csv/qualifying.csv')
driver_standings = pd.read_csv('./f1db_csv/driver_standings.csv')
constructor_standings = pd.read_csv('./f1db_csv/constructor_standings.csv')
constructors = pd.read_csv('./f1db_csv/constructors.csv')
drivers = pd.read_csv('./f1db_csv/drivers.csv')
driver_history = pd.read_csv('./f1db_csv/driver_history.csv')
lap_times = pd.read_csv('./f1db_csv/lap_times.csv')

### Get Weather Data from Wikipedia

In [3]:
# Scrape weather
weather = races.iloc[:,[0,1,2]]

# Scrape data for weather
info = []

for link in races.url:
    try:
        df = pd.read_html(link)[0]
        if 'Weather' in list(df.iloc[:,0]):
            n = list(df.iloc[:,0]).index('Weather')
            info.append(df.iloc[n,1])
        else:
            df = pd.read_html(link)[1]
            if 'Weather' in list(df.iloc[:,0]):
                n = list(df.iloc[:,0]).index('Weather')
                info.append(df.iloc[n,1])
            else:
                df = pd.read_html(link)[2]
                if 'Weather' in list(df.iloc[:,0]):
                    n = list(df.iloc[:,0]).index('Weather')
                    info.append(df.iloc[n,1])
                else:
                    df = pd.read_html(link)[3]
                    if 'Weather' in list(df.iloc[:,0]):
                        n = list(df.iloc[:,0]).index('Weather')
                        info.append(df.iloc[n,1])
                    else:
                        driver = webdriver.Chrome()
                        driver.get(link)

                        # click language button
                        button = driver.find_element_by_link_text('Italiano')
                        button.click()
                        
                        clima = driver.find_element_by_xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
                        info.append(clima) 
                                
    except:
        info.append('not found')

In [4]:
# Create a df just for weather and clean it up
weather['weather'] = info
weather["weather"].iloc[1021] = "Warm, Sunny"
weather["weather"].iloc[1022] = "Warm, Sunny"
weather["weather"].iloc[1023] = "Warm, Sunny"
weather["weather"].iloc[1024] = "Rain, Cloudy"
weather["weather"].iloc[1025] = "Warm, Sunny"
weather["weather"].iloc[1026] = "Warm, Sunny"

weather_dict = {'weather_warm': ['soleggiato', 'clear', 'warm', 'hot', 'sunny', 'fine', 'mild', 'sereno'],
               'weather_cold': ['cold', 'fresh', 'chilly', 'cool'],
               'weather_dry': ['dry', 'asciutto'],
               'weather_wet': ['showers', 'wet', 'rain', 'pioggia', 'damp', 'thunderstorms', 'rainy'],
               'weather_cloudy': ['overcast', 'nuvoloso', 'clouds', 'cloudy', 'grey', 'coperto']}

weather_df = pd.DataFrame(columns = weather_dict.keys())

for col in weather_df:
    weather_df[col] = weather['weather'].map(lambda x: 1 if any(i in weather_dict[col] for i in x.lower().split()) else 0)

weather_info = pd.concat([weather, weather_df], axis = 1)
weather_info
weather_info.tail(15)

Unnamed: 0,raceId,year,round,weather,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
1020,1033,2020,3,Partly cloudy,0,0,0,0,1
1021,1034,2020,4,"Warm, Sunny",1,0,0,0,0
1022,1035,2020,5,"Warm, Sunny",1,0,0,0,0
1023,1036,2020,6,"Warm, Sunny",1,0,0,0,0
1024,1037,2020,7,"Rain, Cloudy",0,0,0,0,1
1025,1038,2020,8,"Warm, Sunny",1,0,0,0,0
1026,1039,2020,9,"Warm, Sunny",1,0,0,0,0
1027,1040,2020,10,not found,0,0,0,0,0
1028,1041,2020,11,not found,0,0,0,0,0
1029,1042,2020,12,not found,0,0,0,0,0


In [5]:
# Write this to a csv to be used later
weather_info.to_csv("./f1db_csv/weather.csv")

In [7]:
weather = pd.read_csv("./f1db_csv/weather.csv")

## Create Tables for Predictions

In [57]:
# Merge together to create final df for predictions
df1 = pd.merge(races, weather[["raceId", "weather_warm", "weather_cold", "weather_dry", "weather_wet", "weather_cloudy"]], on=['raceId'], how = "left")
df2 = pd.merge(df1, results[["raceId", "driverId", "constructorId", "grid", "position", "points", "milliseconds", "laps", "fastestLapSpeed"]], on=['raceId'], how = "left")
df2 = df2.rename({'position' : 'finishingPosition', 'points' : 'pointsGained'}, axis = 1)
df3 = pd.merge(df2, driver_standings[["raceId", "driverId", "points", "position", "wins"]], on=['raceId', 'driverId'], how = "left")
df3 = df3.rename({'position' : 'driverStanding', 'points' : 'cumulativePoints', 'wins' : 'driver_wins'}, axis = 1)
df4 = pd.merge(df3, constructor_standings[["raceId", "constructorId", "points", "position", "wins"]], how='left', on=['raceId', 'constructorId']) #from 1958
df4 = df4.rename({'position' : 'constructorStanding', 'points' : 'constructorPoints', 'wins' : 'constructorWins', 'name' : 'raceName'}, axis = 1)
df5 = pd.merge(df4, constructors[["constructorId", "name"]], on = "constructorId", how = "left")
df5 = df5.rename({'name' : 'constructorName'}, axis = 1)

# Match drivers in driver history table to their driver IDs
drivers["driverName"] = drivers["forename"].str.cat(drivers["surname"],sep = " ")
df6 = pd.merge(driver_history, drivers[["driverName", "driverId"]], on = "driverName", how = "left")

final_df = pd.merge(df5, df6[["raceId", "driverId", "minQualifyingTime", "fastestLapTime"]], on=['raceId', 'driverId'], how = "left")
final_df = final_df.drop(columns = ["round", "circuitId", "time", "url", "constructorId"])

# Rearrange the columns so identifying information is at the beginning
key = lambda x: (x != 'raceId', x != 'date', x != 'year', x != 'raceName', x != 'driverId', x != 'finishingPosition')
final_df = final_df[sorted(final_df, key=key)]

In [58]:
# Get dummies for categorical variables
df_dum = pd.get_dummies(final_df, columns = ['constructorName'])
# Convert date to datetime
df_dum["date"] = pd.to_datetime(df_dum["date"])
df_dum = df_dum.sort_values("date", ascending = True)

In [63]:
# Merge lap times data
total_lap_times = pd.merge(lap_times, races[["raceId", "year"]], on = "raceId")
total_lap_times = total_lap_times[["raceId", "driverId", "milliseconds"]]
total_lap_times = total_lap_times.groupby(['raceId', 'driverId']).sum()
total_lap_times["seconds"] = total_lap_times.milliseconds / 1000

# Merge in the seconds for each race
df_dum_merged = pd.merge(df_dum, total_lap_times, on = ["raceId", "driverId"])
df_dum_merged = df_dum_merged.drop(columns = ["milliseconds_x", "milliseconds_y"])

# Look only at 2020 data right now
df_2020 = df_dum_merged[df_dum_merged.year == 2020]

# Drop teams that didn't exist in 2020
for col in df_2020.columns:
    if 'constructorName' in col and df_2020[col].sum() < 1:
        df_2020.drop(col, axis = 1, inplace = True)

df_2020 = df_2020.sort_values(["date", "driverId"], ascending = True)
df_2020

Unnamed: 0,raceId,date,year,raceName,driverId,finishingPosition,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,grid,pointsGained,laps,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,minQualifyingTime,fastestLapTime,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,seconds
8973,1031,2020-07-05,2020,Austrian Grand Prix,1.0,4,1,0,0,0,0,5.0,12.0,71.0,229.572,12.0,4.0,0.0,37.0,1.0,1.0,62.951,67.712,0,0,0,0,0,1,0,0,0,0,5456.428
8960,1031,2020-07-05,2020,Austrian Grand Prix,8.0,\N,1,0,0,0,0,19.0,0.0,53.0,225.185,0.0,14.0,0.0,2.0,7.0,0.0,65.224,69.031,1,0,0,0,0,0,0,0,0,0,3980.856
8974,1031,2020-07-05,2020,Austrian Grand Prix,20.0,10,1,0,0,0,0,11.0,1.0,71.0,226.524,1.0,10.0,0.0,19.0,3.0,0.0,64.206,68.623,0,0,1,0,0,0,0,0,0,0,5480.284
8966,1031,2020-07-05,2020,Austrian Grand Prix,154.0,\N,1,0,0,0,0,15.0,0.0,49.0,221.347,0.0,16.0,0.0,0.0,10.0,0.0,64.691,70.228,0,0,0,1,0,0,0,0,0,0,3695.277
8975,1031,2020-07-05,2020,Austrian Grand Prix,815.0,6,1,0,0,0,0,6.0,8.0,71.0,227.579,8.0,6.0,0.0,8.0,4.0,0.0,63.860,68.305,0,0,0,0,0,0,1,0,0,0,5465.831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9103,1038,2020-09-06,2020,Italian Grand Prix,844.0,\N,1,0,0,0,0,13.0,0.0,23.0,242.424,45.0,7.0,0.0,61.0,6.0,0.0,80.273,86.026,0,0,1,0,0,0,0,0,0,0,2127.678
9112,1038,2020-09-06,2020,Italian Grand Prix,846.0,4,1,0,0,0,0,6.0,12.0,53.0,247.587,57.0,5.0,0.0,98.0,3.0,0.0,79.820,84.232,0,0,0,0,1,0,0,0,0,0,6432.056
9099,1038,2020-09-06,2020,Italian Grand Prix,847.0,14,1,0,0,0,0,19.0,0.0,53.0,247.033,0.0,21.0,0.0,0.0,10.0,0.0,81.587,84.421,0,0,0,0,0,0,0,0,0,1,6462.649
9100,1038,2020-09-06,2020,Italian Grand Prix,848.0,15,1,0,0,0,0,9.0,0.0,53.0,245.564,48.0,6.0,0.0,158.0,2.0,1.0,80.064,84.926,0,0,0,0,0,0,0,1,0,0,6463.589


In [79]:
df_2020[df_2020.raceId == 1038]

Unnamed: 0,raceId,date,year,raceName,driverId,finishingPosition,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,grid,pointsGained,laps,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,minQualifyingTime,fastestLapTime,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,seconds
9109,1038,2020-09-06,2020,Italian Grand Prix,1.0,7,1,0,0,0,0,1.0,7.0,53.0,252.033,164.0,1.0,5.0,281.0,1.0,6.0,78.887,82.746,0,0,0,0,0,1,0,0,0,0,6443.301
9098,1038,2020-09-06,2020,Italian Grand Prix,8.0,13,1,0,0,0,0,14.0,0.0,53.0,245.827,0.0,19.0,0.0,2.0,8.0,0.0,80.926,84.835,1,0,0,0,0,0,0,0,0,0,6462.368
9104,1038,2020-09-06,2020,Italian Grand Prix,20.0,\N,1,0,0,0,0,17.0,0.0,6.0,239.415,16.0,13.0,0.0,61.0,6.0,0.0,81.151,87.107,0,0,1,0,0,0,0,0,0,0,588.85
9097,1038,2020-09-06,2020,Italian Grand Prix,154.0,12,1,0,0,0,0,16.0,0.0,53.0,245.972,0.0,20.0,0.0,1.0,9.0,0.0,81.139,84.785,0,0,0,1,0,0,0,0,0,0,6461.22
9116,1038,2020-09-06,2020,Italian Grand Prix,815.0,10,1,0,0,0,0,4.0,1.0,53.0,247.282,34.0,11.0,0.0,82.0,4.0,0.0,79.718,84.336,0,0,0,0,0,0,1,0,0,0,6449.28
9110,1038,2020-09-06,2020,Italian Grand Prix,817.0,6,1,0,0,0,0,7.0,8.0,53.0,248.573,41.0,10.0,0.0,71.0,5.0,0.0,79.864,83.898,0,0,0,0,0,0,0,0,1,0,6434.447
9111,1038,2020-09-06,2020,Italian Grand Prix,822.0,5,1,0,0,0,0,2.0,10.0,53.0,248.386,117.0,2.0,1.0,281.0,1.0,6.0,78.952,83.961,0,0,0,0,0,1,0,0,0,0,6433.164
9101,1038,2020-09-06,2020,Italian Grand Prix,825.0,\N,1,0,0,0,0,15.0,0.0,17.0,243.099,1.0,17.0,0.0,1.0,9.0,0.0,80.869,85.787,0,0,0,1,0,0,0,0,0,0,1515.852
9107,1038,2020-09-06,2020,Italian Grand Prix,826.0,9,1,0,0,0,0,11.0,2.0,53.0,246.863,4.0,15.0,0.0,47.0,7.0,1.0,80.169,84.479,0,1,0,0,0,0,0,0,0,0,6448.264
9102,1038,2020-09-06,2020,Italian Grand Prix,830.0,\N,1,0,0,0,0,5.0,0.0,30.0,243.804,110.0,3.0,1.0,158.0,2.0,1.0,79.78,85.539,0,0,0,0,0,0,0,1,0,0,4499.415


In [102]:
# Reformat the data so that it's taking info from the previous race
# Separate the data from the current race and the previous race
current_df = df_2020[["raceId", "date", "year", "raceName", "driverId", "finishingPosition", "seconds", "weather_warm", "weather_cold", 
                      "weather_dry", "weather_wet", "weather_cloudy", "grid", "minQualifyingTime", 
                      "constructorName_Alfa Romeo", "constructorName_AlphaTauri", "constructorName_Ferrari", 
                      "constructorName_Haas F1 Team", "constructorName_McLaren", "constructorName_Mercedes", 
                      "constructorName_Racing Point", "constructorName_Red Bull", "constructorName_Renault", 
                      "constructorName_Williams"]]
current_df = current_df.rename({'finishingPosition' : 'currentFinishingPosition', 'grid' : 'qualiPosition'}, axis = 1)
past_df = df_2020[["raceId", "date", "year", "raceName", "driverId", "finishingPosition", "laps", "pointsGained", 
                   "fastestLapSpeed", "cumulativePoints", "driverStanding", "driver_wins", "constructorPoints", "constructorStanding",
                   "constructorWins", "fastestLapTime"]]
past_df = past_df.rename({'finishingPosition' : 'pastFinishingPosition', 'pointsGained' : 'pastPointsGained'}, axis = 1)

# Change raceId for the past races so that info from the previous race gets matched to the current race for preds
past_df["raceId"] = past_df["raceId"] + 1

# Merge the two together for predictions
full_df = pd.merge(current_df, past_df[["raceId", "driverId", "laps", "pastFinishingPosition", "pastPointsGained",
                                        "fastestLapSpeed", "cumulativePoints", "driverStanding", "driver_wins", 
                                        "constructorPoints", "constructorStanding", "constructorWins", "fastestLapTime"]],
                   on = ["raceId", "driverId"], how = "left")

# Create variable for the difference between qualifying position and finishing position
full_df = full_df.replace({r"\N": None, "NaN" : None})
# full_df["finishingPosition"] = pd.to_numeric(full_df["finishingPosition"])
# full_df["quali_final_delta"] = full_df["finishingPosition"] - full_df["grid"]
full_df[["driverId", "currentFinishingPosition", "pastFinishingPosition"]][full_df.raceId == 1038].sort_values("currentFinishingPosition")

Unnamed: 0,driverId,currentFinishingPosition,pastFinishingPosition
152,842.0,1.0,8.0
142,815.0,10.0,10.0
157,849.0,11.0,16.0
141,154.0,12.0,15.0
139,8.0,13.0,12.0
155,847.0,14.0,
156,848.0,15.0,6.0
151,841.0,16.0,
148,832.0,2.0,
150,840.0,3.0,9.0


In [103]:
full_df

Unnamed: 0,raceId,date,year,raceName,driverId,currentFinishingPosition,seconds,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,qualiPosition,minQualifyingTime,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,laps,pastFinishingPosition,pastPointsGained,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,fastestLapTime
0,1031,2020-07-05,2020,Austrian Grand Prix,1.0,4,5456.428,1,0,0,0,0,5.0,62.951,0,0,0,0,0,1,0,0,0,0,,,,,,,,,,,
1,1031,2020-07-05,2020,Austrian Grand Prix,8.0,,3980.856,1,0,0,0,0,19.0,65.224,1,0,0,0,0,0,0,0,0,0,,,,,,,,,,,
2,1031,2020-07-05,2020,Austrian Grand Prix,20.0,10,5480.284,1,0,0,0,0,11.0,64.206,0,0,1,0,0,0,0,0,0,0,,,,,,,,,,,
3,1031,2020-07-05,2020,Austrian Grand Prix,154.0,,3695.277,1,0,0,0,0,15.0,64.691,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,
4,1031,2020-07-05,2020,Austrian Grand Prix,815.0,6,5465.831,1,0,0,0,0,6.0,63.860,0,0,0,0,0,0,1,0,0,0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1038,2020-09-06,2020,Italian Grand Prix,844.0,,2127.678,1,0,0,0,0,13.0,80.273,0,0,1,0,0,0,0,0,0,0,44.0,14,0.0,233.813,45.0,5.0,0.0,61.0,5.0,0.0,107.840
154,1038,2020-09-06,2020,Italian Grand Prix,846.0,4,6432.056,1,0,0,0,0,6.0,79.820,0,0,0,0,1,0,0,0,0,0,44.0,7,6.0,232.279,45.0,6.0,0.0,68.0,3.0,0.0,108.552
155,1038,2020-09-06,2020,Italian Grand Prix,847.0,14,6462.649,1,0,0,0,0,19.0,81.587,0,0,0,0,0,0,0,0,0,1,9.0,,0.0,225.624,0.0,20.0,0.0,0.0,10.0,0.0,111.754
156,1038,2020-09-06,2020,Italian Grand Prix,848.0,15,6463.589,1,0,0,0,0,9.0,80.064,0,0,0,0,0,0,0,1,0,0,44.0,6,8.0,231.886,48.0,4.0,0.0,158.0,2.0,1.0,108.736


In [105]:
# Find the time for the fastest driver at every race
# Fastest Lap
fastest_time_df = full_df[full_df.pastFinishingPosition == "1"]
fastest_time_df = fastest_time_df[["raceId", "seconds", "laps"]]
fastest_time_df["avg_lap_time"] = fastest_time_df["seconds"] / fastest_time_df["laps"]
fastest_time_df

Unnamed: 0,raceId,seconds,laps,avg_lap_time
26,1032,4984.402,71.0,70.202845
40,1033,5772.473,71.0,81.302437
60,1034,5281.283,70.0,75.4469
79,1035,4793.319,52.0,92.179212
108,1036,5529.456,52.0,106.335692
119,1037,5048.761,66.0,76.496379
138,1038,6443.301,44.0,146.438659


In [119]:
# Delete rows where there are nans
# full_df = full_df.replace({r"\N": None})
# full_df = full_df.dropna()

# Create ratios for all the variables in terms of seconds and then create a rolling average for them over the season
min_df = full_df.groupby("raceId", as_index = False)
min_lap_time = min_df.fastestLapTime.min()
temp1 = pd.merge(full_df, min_lap_time, on = "raceId")  
temp1["fastestLapRatio"] = temp1["fastestLapTime_x"] / temp1["fastestLapTime_y"]
temp1 = temp1.drop(columns = ["fastestLapTime_x", "fastestLapTime_y"])
min_quali_time = min_df.minQualifyingTime.min()
temp2 = pd.merge(temp1, min_quali_time, on = "raceId")
temp2["minQualiRatio"] = temp2["minQualifyingTime_x"] / temp2["minQualifyingTime_y"]
temp2 = temp2.drop(columns = ["minQualifyingTime_x", "minQualifyingTime_y"])
temp2["avg_lap_time"] = temp2["seconds"] / temp2["laps"]
temp_min = temp2.groupby("raceId", as_index = False)
min_avg_lap_time = temp_min.avg_lap_time.min()
temp_min1 = pd.merge(temp2, min_avg_lap_time, on = "raceId")
temp_min1["avgLapRatio"] = temp_min1["avg_lap_time_x"] / temp_min1["avg_lap_time_y"]
temp_min1 = temp_min1.drop(columns = ["avg_lap_time_x", "avg_lap_time_y"])
temp3 = temp_min1.groupby("driverId", as_index = True)
rolling_fastLapRatio = pd.DataFrame(temp3["fastestLapRatio"].rolling(window = 50, min_periods = 1).mean())
rolling_fastLapRatio = rolling_fastLapRatio.reset_index()
rolling_fastLapRatio = rolling_fastLapRatio.rename({"level_1" : "index", "fastestLapRatio" : "rolling_fastestLap"}, axis = 1)
rolling_minQualiRatio = pd.DataFrame(temp3["minQualiRatio"].rolling(window = 50, min_periods = 1).mean())
rolling_minQualiRatio = rolling_minQualiRatio.reset_index()
rolling_minQualiRatio = rolling_minQualiRatio.rename({"level_1" : "index", "minQualiRatio" : "rolling_minQualiRatio"}, axis = 1)
rolling_avgLapRatio = pd.DataFrame(temp3["avgLapRatio"].rolling(window = 50, min_periods = 1).mean())
rolling_avgLapRatio = rolling_avgLapRatio.reset_index()
rolling_avgLapRatio = rolling_avgLapRatio.rename({"level_1" : "index", "avgLapRatio" : "rolling_avgLapTime"}, axis = 1)
temp_min1 = temp_min1.reset_index()
temp4 = pd.merge(temp_min1, rolling_fastLapRatio, on = ["driverId", "index"])
temp5 = pd.merge(temp4, rolling_minQualiRatio, on = ["driverId", "index"])
temp6 = pd.merge(temp5, rolling_avgLapRatio, on = ["driverId", "index"])
temp6 = temp6.drop(columns = ["fastestLapRatio", "minQualiRatio", "avgLapRatio", "seconds", "laps"])

key = lambda x: (x != 'index', x != 'raceId', x != 'date', x != 'year', x != 'raceName', x != 'driverId', x != 'rolling_avgLapTime')
sorted_df = temp6[sorted(temp6, key=key)]
sorted_df["rolling_avgLapTime"] = sorted_df["rolling_avgLapTime"] * 100
sorted_df["rolling_fastestLap"] = sorted_df["rolling_fastestLap"] * 100
sorted_df["rolling_minQualiRatio"] = sorted_df["rolling_minQualiRatio"] * 100
sorted_df = sorted_df.drop(columns = "rolling_avgLapTime")
sorted_df[sorted_df.raceId == 1038]

Unnamed: 0,index,raceId,date,year,raceName,driverId,currentFinishingPosition,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,qualiPosition,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,pastFinishingPosition,pastPointsGained,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,rolling_fastestLap,rolling_minQualiRatio
138,138,1038,2020-09-06,2020,Italian Grand Prix,1.0,7.0,1,0,0,0,0,1.0,0,0,0,0,0,1,0,0,0,0,1.0,25.0,233.99,157.0,1.0,5.0,264.0,1.0,6.0,100.976855,100.011631
139,139,1038,2020-09-06,2020,Italian Grand Prix,8.0,13.0,1,0,0,0,0,14.0,1,0,0,0,0,0,0,0,0,0,12.0,0.0,229.593,0.0,18.0,0.0,2.0,8.0,0.0,103.718908,103.398184
140,140,1038,2020-09-06,2020,Italian Grand Prix,20.0,,1,0,0,0,0,17.0,0,0,1,0,0,0,0,0,0,0,13.0,0.0,229.309,16.0,13.0,0.0,61.0,5.0,0.0,103.542403,102.206774
141,141,1038,2020-09-06,2020,Italian Grand Prix,154.0,12.0,1,0,0,0,0,16.0,0,0,0,1,0,0,0,0,0,0,15.0,0.0,229.184,0.0,21.0,0.0,1.0,9.0,0.0,103.345861,103.137817
142,142,1038,2020-09-06,2020,Italian Grand Prix,815.0,10.0,1,0,0,0,0,4.0,0,0,0,0,0,0,1,0,0,0,10.0,1.0,232.628,33.0,9.0,0.0,66.0,4.0,0.0,102.501233,101.88726
143,143,1038,2020-09-06,2020,Italian Grand Prix,817.0,6.0,1,0,0,0,0,7.0,0,0,0,0,0,0,0,0,1,0,4.0,13.0,234.589,33.0,8.0,0.0,59.0,6.0,0.0,103.299351,101.759943
144,144,1038,2020-09-06,2020,Italian Grand Prix,822.0,5.0,1,0,0,0,0,2.0,0,0,0,0,0,1,0,0,0,0,2.0,18.0,233.503,107.0,3.0,1.0,264.0,1.0,6.0,101.283235,100.281396
145,145,1038,2020-09-06,2020,Italian Grand Prix,825.0,,1,0,0,0,0,15.0,0,0,0,1,0,0,0,0,0,0,17.0,0.0,233.287,1.0,17.0,0.0,1.0,9.0,0.0,103.881753,103.237648
146,146,1038,2020-09-06,2020,Italian Grand Prix,826.0,9.0,1,0,0,0,0,11.0,0,1,0,0,0,0,0,0,0,0,11.0,0.0,229.572,2.0,16.0,0.0,20.0,7.0,0.0,104.074585,102.483344
147,147,1038,2020-09-06,2020,Italian Grand Prix,830.0,,1,0,0,0,0,5.0,0,0,0,0,0,0,0,1,0,0,3.0,15.0,232.809,110.0,2.0,1.0,158.0,2.0,1.0,101.863313,100.988691


## Build the Model

In [123]:
# Split x and y in training data
sorted_df["fastestLapSpeed"] = pd.to_numeric(sorted_df["fastestLapSpeed"])
sorted_df["currentFinishingPosition"] = pd.to_numeric(sorted_df["currentFinishingPosition"])
sorted_df["pastFinishingPosition"] = pd.to_numeric(sorted_df["pastFinishingPosition"])

# Remove any NAs
sorted_df = sorted_df.replace({r"\N": None})
sorted_df = sorted_df.dropna()

x = sorted_df.iloc[:, 7:]
y = sorted_df.iloc[:, 6]

In [124]:
xgb_model = xgb.XGBRegressor(n_estimators = 300, max_depth = 9)
xgb_model.fit(x, y)
xgb_model.score(x, y)

0.999999976622024

In [126]:
# Doing the same for the most recent race, pulled in from Raw Predictive Modeling ipynb
# Merge lap times data
total_lap_times = pd.merge(lap_times, races[["raceId", "year"]], on = "raceId")
total_lap_times = total_lap_times[["raceId", "driverId", "milliseconds"]]
total_lap_times = total_lap_times.groupby(['raceId', 'driverId']).sum()
total_lap_times["seconds"] = total_lap_times.milliseconds / 1000
total_lap_info = pd.merge(total_lap_times, df_dum[["raceId", "driverId", "laps"]], on = ["raceId", "driverId"])
total_lap_info["avg_lap_time"] = total_lap_info.seconds / total_lap_info.laps
total_lap_info

test_df = pd.read_csv("./predictions/tu_2020_raw_table").iloc[:, 1:]
past_race_lap_info = total_lap_info[total_lap_info.raceId == 1038]
past_race_lap_info["raceId"] = past_race_lap_info["raceId"] + 1
past_race_lap_info
test_merged_df = pd.merge(test_df, past_race_lap_info[["raceId", "driverId", "seconds"]], on = ["raceId", "driverId"])
test_merged_df

Unnamed: 0,raceId,driverId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,grid,minQualifyingTime,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,finishingPosition,pointsGained,laps,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,fastestLapTime,seconds
0,1039,1,1,0,0,0,0,1,75.144,0,0,0,0,0,1,0,0,0,0,7.0,7.0,53,252.033,164.0,1,5,281.0,1.0,6.0,82.746,6443.301
1,1039,822,1,0,0,0,0,2,75.203,0,0,0,0,0,1,0,0,0,0,5.0,10.0,53,248.386,117.0,2,1,281.0,1.0,6.0,83.961,6433.164
2,1039,830,1,0,0,0,0,3,75.471,0,0,0,0,0,0,0,1,0,0,,0.0,30,243.804,110.0,3,1,158.0,2.0,1.0,85.539,4499.415
3,1039,848,1,0,0,0,0,4,75.914,0,0,0,0,0,0,0,1,0,0,15.0,0.0,53,245.564,48.0,6,0,158.0,2.0,1.0,84.926,6463.589
4,1039,844,1,0,0,0,0,5,76.27,0,0,1,0,0,0,0,0,0,0,,0.0,23,242.424,45.0,7,0,61.0,6.0,0.0,86.026,2127.678
5,1039,20,1,0,0,0,0,14,76.858,0,0,1,0,0,0,0,0,0,0,,0.0,6,239.415,16.0,13,0,61.0,6.0,0.0,87.107,588.85
6,1039,815,1,0,0,0,0,6,76.311,0,0,0,0,0,0,1,0,0,0,10.0,1.0,53,247.282,34.0,11,0,82.0,4.0,0.0,84.336,6449.28
7,1039,840,1,0,0,0,0,7,76.271,0,0,0,0,0,0,1,0,0,0,3.0,15.0,53,248.576,57.0,4,0,82.0,4.0,0.0,83.897,6429.414
8,1039,817,1,0,0,0,0,8,76.243,0,0,0,0,0,0,0,0,1,0,6.0,8.0,53,248.573,41.0,10,0,71.0,5.0,0.0,83.898,6434.447
9,1039,839,1,0,0,0,0,10,76.297,0,0,0,0,0,0,0,0,1,0,8.0,4.0,53,246.831,30.0,12,0,71.0,5.0,0.0,84.49,6444.747


In [143]:
current_min_df = test_merged_df.groupby("raceId", as_index = False)
current_min_lap_time = current_min_df.fastestLapTime.min()
current_temp1 = pd.merge(test_merged_df, current_min_lap_time, on = "raceId")  
current_temp1["fastestLapRatio"] = current_temp1["fastestLapTime_x"] / current_temp1["fastestLapTime_y"]
current_temp1 = current_temp1.drop(columns = ["fastestLapTime_x", "fastestLapTime_y"])
current_min_quali_time = current_min_df.minQualifyingTime.min()
current_temp2 = pd.merge(current_temp1, current_min_quali_time, on = "raceId")
current_temp2["minQualiRatio"] = current_temp2["minQualifyingTime_x"] / current_temp2["minQualifyingTime_y"]
current_temp2 = current_temp2.drop(columns = ["minQualifyingTime_x", "minQualifyingTime_y"])
current_temp2["avg_lap_time"] = current_temp2["seconds"] / current_temp2["laps"]
current_temp_min = current_temp2.groupby("raceId", as_index = False)
current_min_avg_lap_time = current_temp_min.avg_lap_time.min()
current_temp_min1 = pd.merge(current_temp2, current_min_avg_lap_time, on = "raceId")
current_temp_min1["avgLapRatio"] = current_temp_min1["avg_lap_time_x"] / current_temp_min1["avg_lap_time_y"]
current_temp_min1 = current_temp_min1.drop(columns = ["avg_lap_time_x", "avg_lap_time_y"])
current_temp3 = current_temp_min1.groupby("driverId", as_index = True)
current_rolling_fastLapRatio = pd.DataFrame(current_temp3["fastestLapRatio"].rolling(window = 50, min_periods = 1).mean())
current_rolling_fastLapRatio = current_rolling_fastLapRatio.reset_index()
current_rolling_fastLapRatio = current_rolling_fastLapRatio.rename({"level_1" : "index", "fastestLapRatio" : "rolling_fastestLap"}, axis = 1)
current_rolling_minQualiRatio = pd.DataFrame(current_temp3["minQualiRatio"].rolling(window = 50, min_periods = 1).mean())
current_rolling_minQualiRatio = current_rolling_minQualiRatio.reset_index()
current_rolling_minQualiRatio = current_rolling_minQualiRatio.rename({"level_1" : "index", "minQualiRatio" : "rolling_minQualiRatio"}, axis = 1)
current_rolling_avgLapRatio = pd.DataFrame(current_temp3["avgLapRatio"].rolling(window = 50, min_periods = 1).mean())
current_rolling_avgLapRatio = current_rolling_avgLapRatio.reset_index()
current_rolling_avgLapRatio = current_rolling_avgLapRatio.rename({"level_1" : "index", "avgLapRatio" : "rolling_avgLapTime"}, axis = 1)
current_temp_min1 = current_temp_min1.reset_index()
current_temp4 = pd.merge(current_temp_min1, current_rolling_fastLapRatio, on = ["driverId", "index"])
current_temp5 = pd.merge(current_temp4, current_rolling_minQualiRatio, on = ["driverId", "index"])
current_temp6 = pd.merge(current_temp5, current_rolling_avgLapRatio, on = ["driverId", "index"])
current_temp6 = current_temp6.drop(columns = ["fastestLapRatio", "minQualiRatio", "avgLapRatio", "seconds", "laps"])

key = lambda x: (x != 'index', x != 'raceId', x != 'date', x != 'year', x != 'raceName', x != 'driverId', x != 'rolling_avgLapTime')
current_sorted_df = current_temp6[sorted(current_temp6, key=key)]
current_sorted_df["rolling_avgLapTime"] = current_sorted_df["rolling_avgLapTime"] * 100
current_sorted_df["rolling_fastestLap"] = current_sorted_df["rolling_fastestLap"] * 100
current_sorted_df["rolling_minQualiRatio"] = current_sorted_df["rolling_minQualiRatio"] * 100
# current_sorted_df = current_sorted_df.dropna()
current_sorted_df = current_sorted_df.drop(columns = "rolling_avgLapTime")
current_sorted_df = current_sorted_df.rename({'finishingPosition' : 'pastFinishingPosition', 'grid' : 'qualiPosition', 'pointsGained' : 'pastPointsGained'}, axis = 1)
current_sorted_df

Unnamed: 0,index,raceId,driverId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,qualiPosition,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,pastFinishingPosition,pastPointsGained,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,rolling_fastestLap,rolling_minQualiRatio
0,0,1039,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,7.0,7.0,252.033,164.0,1,5,281.0,1.0,6.0,100.0,100.0
1,1,1039,822,1,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,5.0,10.0,248.386,117.0,2,1,281.0,1.0,6.0,101.468349,100.078516
2,2,1039,830,1,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,,0.0,243.804,110.0,3,1,158.0,2.0,1.0,103.37539,100.435164
3,3,1039,848,1,0,0,0,0,4,0,0,0,0,0,0,0,1,0,0,15.0,0.0,245.564,48.0,6,0,158.0,2.0,1.0,102.634568,101.024699
4,4,1039,844,1,0,0,0,0,5,0,0,1,0,0,0,0,0,0,0,,0.0,242.424,45.0,7,0,61.0,6.0,0.0,103.963938,101.498456
5,5,1039,20,1,0,0,0,0,14,0,0,1,0,0,0,0,0,0,0,,0.0,239.415,16.0,13,0,61.0,6.0,0.0,105.270345,102.280954
6,6,1039,815,1,0,0,0,0,6,0,0,0,0,0,0,1,0,0,0,10.0,1.0,247.282,34.0,11,0,82.0,4.0,0.0,101.921543,101.553018
7,7,1039,840,1,0,0,0,0,7,0,0,0,0,0,0,1,0,0,0,3.0,15.0,248.576,57.0,4,0,82.0,4.0,0.0,101.391004,101.499787
8,8,1039,817,1,0,0,0,0,8,0,0,0,0,0,0,0,0,1,0,6.0,8.0,248.573,41.0,10,0,71.0,5.0,0.0,101.392212,101.462525
9,9,1039,839,1,0,0,0,0,10,0,0,0,0,0,0,0,0,1,0,8.0,4.0,246.831,30.0,12,0,71.0,5.0,0.0,102.107655,101.534387


In [144]:
x_test = current_sorted_df.iloc[:, 3:]

In [145]:
x_test

Unnamed: 0,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,qualiPosition,constructorName_Alfa Romeo,constructorName_AlphaTauri,constructorName_Ferrari,constructorName_Haas F1 Team,constructorName_McLaren,constructorName_Mercedes,constructorName_Racing Point,constructorName_Red Bull,constructorName_Renault,constructorName_Williams,pastFinishingPosition,pastPointsGained,fastestLapSpeed,cumulativePoints,driverStanding,driver_wins,constructorPoints,constructorStanding,constructorWins,rolling_fastestLap,rolling_minQualiRatio
0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,7.0,7.0,252.033,164.0,1,5,281.0,1.0,6.0,100.0,100.0
1,1,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,5.0,10.0,248.386,117.0,2,1,281.0,1.0,6.0,101.468349,100.078516
2,1,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,,0.0,243.804,110.0,3,1,158.0,2.0,1.0,103.37539,100.435164
3,1,0,0,0,0,4,0,0,0,0,0,0,0,1,0,0,15.0,0.0,245.564,48.0,6,0,158.0,2.0,1.0,102.634568,101.024699
4,1,0,0,0,0,5,0,0,1,0,0,0,0,0,0,0,,0.0,242.424,45.0,7,0,61.0,6.0,0.0,103.963938,101.498456
5,1,0,0,0,0,14,0,0,1,0,0,0,0,0,0,0,,0.0,239.415,16.0,13,0,61.0,6.0,0.0,105.270345,102.280954
6,1,0,0,0,0,6,0,0,0,0,0,0,1,0,0,0,10.0,1.0,247.282,34.0,11,0,82.0,4.0,0.0,101.921543,101.553018
7,1,0,0,0,0,7,0,0,0,0,0,0,1,0,0,0,3.0,15.0,248.576,57.0,4,0,82.0,4.0,0.0,101.391004,101.499787
8,1,0,0,0,0,8,0,0,0,0,0,0,0,0,1,0,6.0,8.0,248.573,41.0,10,0,71.0,5.0,0.0,101.392212,101.462525
9,1,0,0,0,0,10,0,0,0,0,0,0,0,0,1,0,8.0,4.0,246.831,30.0,12,0,71.0,5.0,0.0,102.107655,101.534387


In [146]:
xgb_preds = xgb_model.predict(x_test)
xgb_preds_df = pd.DataFrame({'Predictions' : xgb_preds})
merged_xgb_df = pd.merge(xgb_preds_df, current_sorted_df[["raceId", "driverId"]], left_index = True, right_index = True)
full_xgb_df = pd.merge(merged_xgb_df, drivers[["driverName", "driverId"]], on = "driverId")
full_xgb_df = full_xgb_df.sort_values("Predictions")
full_xgb_df = full_xgb_df.reset_index(drop = True)
full_xgb_df["Predicted Position"] = full_xgb_df.index + 1
full_xgb_df.sort_values("Predictions")

Unnamed: 0,Predictions,raceId,driverId,driverName,Predicted Position
0,1.652876,1039,822,Valtteri Bottas,1
1,1.765163,1039,848,Alexander Albon,2
2,1.803732,1039,830,Max Verstappen,3
3,2.893155,1039,1,Lewis Hamilton,4
4,4.898154,1039,839,Esteban Ocon,5
5,5.178482,1039,826,Daniil Kvyat,6
6,5.93435,1039,815,Sergio Pérez,7
7,6.853724,1039,844,Charles Leclerc,8
8,6.854187,1039,832,Carlos Sainz,9
9,6.912608,1039,846,Lando Norris,10
