**Use Linear Regression to predict 2020 cut-off based on historical cut-off**

- The only feature we can use here is the year, because there is only one cut-off point per circuit per year
- A lot of data has to be taken out due to red flags and rain
- Linear regression only goes ahead for a circuit if two or three years of usable cut-off data remain

In [44]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import itertools
from collections import defaultdict
import matplotlib.pyplot as plt

import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [94]:
## Getting a list of all event files in the right format for the read data function ##

# NOTE(review): machine-specific absolute path - point this at your own copy of the CSV folder.
DATA_DIR = '/Users/chanelbrown/Desktop/Notebooks/All CSV Files - without fuel adjusted lap times/'

# quick fix because we don't need these two 2020 events here anyway
# (any other 2020 event found in the folder is still kept)
EXCLUDED_EVENTS = {'07_12A1r_Qu_2020', '08_09Sil_Qu_2020'}

files = os.listdir(DATA_DIR)

events = []

for file in files:
    # Event name = file name without its extension, which is what read_data expects.
    event, extension = os.path.splitext(file)

    # Skip anything that is not a CSV (e.g. macOS '.DS_Store'); filtering instead of
    # list.remove() means the cell no longer fails when such an entry is absent.
    if extension.lower() != '.csv':
        continue

    if event in EXCLUDED_EVENTS:
        continue

    events.append(event)

In [95]:
## Function reads data into a Pandas DataFrame from a CSV file ##

def read_data(filename, data_dir='/Users/chanelbrown/Desktop/Notebooks/All CSV Files - without fuel adjusted lap times/'):
    """Load one event's CSV and keep only complete rows of the required columns.

    Parameters
    ----------
    filename : str
        Event name without the '.csv' extension.
    data_dir : str, optional
        Folder containing the event CSV files. Defaults to the original
        hard-coded location so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The selected columns with every row containing a missing value removed.
        The original row labels are kept (the index is not reset).
    """
    df = pd.read_csv(os.path.join(data_dir, '%s.csv' % (filename)))

    # select required features
    required = ['Season', 'Circuit', 'Session', 'Driver Short Name', 'Team', 'Outing Number', 'Lap number',
                'Sector 1', 'Sector 2', 'Sector 3', 'Full Lap (no fuel adjustment)']

    # Some sectors hold a single blank space instead of a time (probably where the
    # car has been taken off); turn those into NaN so dropna() removes the row.
    return df[required].replace(' ', np.nan).dropna()

In [4]:
## Function finds which drivers made it into each sub-session ##

def find_drivers(data, session):
    """Return the sorted, de-duplicated driver short names that have rows in `session`."""
    in_session = data['Session'] == session
    names = data.loc[in_session, 'Driver Short Name']

    # np.unique both removes duplicates and sorts the names
    return list(np.unique(names))

In [5]:
## Function finds the fastest actual lap for each driver ##

def fastest_actual_lap(data, driver, session):
    """Find a driver's fastest lap in one qualifying session.

    Parameters
    ----------
    data : pandas.DataFrame
        Lap data with at least the columns 'Season', 'Circuit', 'Session',
        'Driver Short Name', 'Team' and 'Full Lap (no fuel adjustment)'.
    driver : str
        Driver short name to look up.
    session : str
        Session label to look up (e.g. 'Q1').

    Returns
    -------
    tuple
        (season, circuit, team, driver, session, fastest lap time).

    Raises
    ------
    ValueError
        If the driver has no laps in the requested session.
    """
    lap_col = 'Full Lap (no fuel adjustment)'

    selected = (data['Driver Short Name'] == driver) & (data['Session'] == session)
    driver_sesh = data.loc[selected, :]

    if driver_sesh.empty:
        raise ValueError('No laps found for driver %r in session %r' % (driver, session))

    fastest = driver_sesh[lap_col].min()

    # Read season / circuit / team from the fastest-lap row itself, by column name.
    # Looking them up via a fixed row label (data.loc[1, ...]) fails as soon as that
    # row has been dropped, and a positional column index breaks if columns move.
    fastest_row = driver_sesh.loc[driver_sesh[lap_col] == fastest].iloc[0]

    season = fastest_row['Season']
    circuit = fastest_row['Circuit']
    team = fastest_row['Team']

    return season, circuit, team, driver, session, fastest

In [6]:
## Function puts the fastest actual lap per driver into a dictionary where the qualifying session is the key ##

def create_qualifier_dict(data, sessions):
    """Build one table of fastest laps per qualifying session.

    Parameters
    ----------
    data : pandas.DataFrame
        Lap data for a single event, as returned by read_data.
    sessions : iterable of str
        Session labels to process (e.g. ['Q1', 'Q2']).

    Returns
    -------
    (dict, circuit)
        The dict maps each session label to a DataFrame with one row per driver
        (columns 'Season', 'Circuit', 'Team', 'Driver', 'Session',
        'Fastest Actual Lap Time'), sorted fastest first. `circuit` is the
        circuit reported for the last driver processed.

    Raises
    ------
    ValueError
        If no driver is found in any of the requested sessions, so the circuit
        cannot be determined.
    """
    columns = ['Season', 'Circuit', 'Team', 'Driver', 'Session', 'Fastest Actual Lap Time']

    qualifiers = {} # {'Q1' : df, 'Q2': df, 'Q3': df}
    circuit = None

    for s in sessions:

        # Collect the rows first and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and re-copied the whole frame on every call.
        records = []

        for d in find_drivers(data, s):
            season, circuit, team, driver, session, fastest = fastest_actual_lap(data, driver = d, session = s)
            records.append([season, circuit, team, driver, session, fastest])

        df = pd.DataFrame(records, columns = columns)

        # adds each set of results into a dictionary that has {qualifying round: data frame of fastest laps}
        qualifiers[s] = df.sort_values(by = ['Fastest Actual Lap Time'])

    if circuit is None:
        raise ValueError('No drivers found in sessions %r - cannot determine the circuit' % (list(sessions),))

    return qualifiers, circuit

In [7]:
## Runs through all events available and works out actual laps for each qualifying session specified ##

sessions = ['Q1', 'Q2']

q = []  # one {session: fastest-lap DataFrame} dictionary per event
c = []  # circuit of each event, in the same order as q

for event in events:
    data = read_data(event)
    qualifiers, circuit = create_qualifier_dict(data, sessions)
    q.append(qualifiers)
    c.append(circuit)

assert len(q) == len(c)

# Group the per-event dictionaries by circuit:
# keys are circuits, values are lists of qualifying dictionaries (one per year found).
circuit_fastest_laps = defaultdict(list)

for circuit_name, event_qualifiers in zip(c, q):
    circuit_fastest_laps[circuit_name].append(event_qualifiers)

print(circuit_fastest_laps.keys())

dict_keys(['Barcelona', 'Bahrain', 'Sochi', 'Monza', 'Suzuka', 'A1-Ring', 'Melbourne', 'Sepang', 'Hockenheim', 'Mexico City', 'Montreal', 'Spa', 'Baku', 'Monaco', 'Austin', 'Interlagos', 'Silverstone', 'Budapest', 'Nurburgring', 'Shanghai', 'Abu Dhabi', 'Mugello', 'Singapore', 'Istanbul', 'Paul Ricard', 'Sakhir', 'Portimao', 'Imola'])


In [8]:
## Returns dictionary containing the laps for Q1 and Q2 when track and year specified ##

def return_dict(year, track, res):
    """Return the Q1/Q2 fastest-lap tables of `track` for the given `year`.

    Parameters
    ----------
    year : int
        Season to look for (compared against the 'Season' column, so not a string).
    track : str
        Circuit name, a key of `res`.
    res : dict
        Maps circuit name to a list of {'Q1': DataFrame, 'Q2': DataFrame}
        dictionaries, one per event held at that circuit.

    Returns
    -------
    dict
        {'Q1': DataFrame, 'Q2': DataFrame} for the matching event.

    Raises
    ------
    LookupError
        If the track is unknown or has no event in the requested year.
    """
    # Scan every event stored for the track rather than probing fixed positions
    # 0-3: tracks may have fewer (or more) events than that. `.get` also avoids
    # inserting an empty entry when `res` is a defaultdict.
    for event in res.get(track, []):
        q1 = event['Q1']

        if q1.empty:
            continue

        # every row of one event carries the same season, so the first row is enough
        if q1['Season'].iloc[0] == year:
            return {'Q1': event['Q1'], 'Q2': event['Q2']}

    raise LookupError('No %r event found for track %r (year must be a number, track a string)' % (year, track))

In [9]:
## Finds cut-off points ##

def find_cutoff(input_data, q1_place=16, q2_place=11):
    """Return the Q1 and Q2 cut-off lap times of one event.

    The cut-off of a session is the lap time at a given finishing place, taken
    from the last column of that session's table.

    Parameters
    ----------
    input_data : dict
        {'Q1': DataFrame, 'Q2': DataFrame}; the last column of each frame must
        hold the numeric lap times.
    q1_place : int, optional
        Place whose time is the Q1 cut-off (default 16).
    q2_place : int, optional
        Place whose time is the Q2 cut-off (default 11).

    Returns
    -------
    tuple
        (Q1 cut-off, Q2 cut-off). If a session has fewer drivers than the
        requested place, the slowest available time is returned.
    """
    def _time_at_place(session_frame, place):
        # nsmallest keeps the `place` fastest laps in ascending order; the last one is the cut-off
        return session_frame.iloc[:, -1].nsmallest(place).iloc[-1]

    Q1 = input_data['Q1']
    Q2 = input_data['Q2']

    Q1_cut = _time_at_place(Q1, q1_place)
    Q2_cut = _time_at_place(Q2, q2_place)

    return Q1_cut, Q2_cut

In [10]:
## Find the Q1 and Q2 cut-off points for all years for all tracks - return info in DF ##

cutoff_years = [2017, 2018, 2019]
cutoff_columns = ['Circuit'] + ['Cut-off %d' % year for year in cutoff_years]

Q1_rows = []
Q2_rows = []

for track in circuit_fastest_laps.keys():

    Q1_row = [track]
    Q2_row = [track]

    for year in cutoff_years:

        try:
            Q1_cut, Q2_cut = find_cutoff(return_dict(year, track, circuit_fastest_laps))

        # Only "no usable event for this year" is tolerated: a failed lookup
        # (IndexError / KeyError are LookupErrors) or a missing event dictionary
        # (TypeError). Anything else is a real error and should surface.
        except (LookupError, TypeError):
            Q1_cut, Q2_cut = np.nan, np.nan

        Q1_row.append(Q1_cut)
        Q2_row.append(Q2_cut)

    Q1_rows.append(Q1_row)
    Q2_rows.append(Q2_row)

# Build each table once from the collected rows (DataFrame.append was removed in pandas 2.0).
Q1_cutoff = pd.DataFrame(Q1_rows, columns = cutoff_columns)
Q2_cutoff = pd.DataFrame(Q2_rows, columns = cutoff_columns)
    

In [11]:
# Q1 cut-off time per circuit for 2017-2019 (empty where no cut-off could be computed)
Q1_cutoff

Unnamed: 0,Circuit,Cut-off 2017,Cut-off 2018,Cut-off 2019
0,Barcelona,82.332,78.923,78.404
1,Bahrain,92.118,90.53,90.026
2,Sochi,96.462,95.037,94.84
3,Monza,100.489,81.888,80.784
4,Suzuka,90.849,90.361,89.822
5,A1-Ring,66.345,65.271,64.789
6,Melbourne,86.419,84.532,83.017
7,Sepang,93.308,,
8,Hockenheim,,73.72,73.333
9,Mexico City,79.176,76.911,78.065


In [12]:
# Q2 cut-off time per circuit for 2017-2019 (empty where no cut-off could be computed)
Q2_cutoff

Unnamed: 0,Circuit,Cut-off 2017,Cut-off 2018,Cut-off 2019
0,Barcelona,81.329,78.323,77.338
1,Bahrain,90.923,90.105,89.488
2,Sochi,95.948,93.995,93.95
3,Monza,97.582,81.669,80.517
4,Suzuka,89.778,89.864,89.254
5,A1-Ring,65.597,64.845,64.49
6,Melbourne,85.081,83.692,82.562
7,Sepang,92.034,,
8,Hockenheim,,73.657,72.786
9,Mexico City,78.099,76.844,76.687


In [41]:
## Data that has to be taken out due to rain and red flags ##
# Each circuit maps to three flags, one per year position; remove_bad_data reads
# position 0 as 2017, position 1 as 2018 and position 2 as 2019.
# 1 means the data is good to keep for that year, 0 means it is bad.
# NOTE: despite the "_bad_data" names, a 1 marks GOOD data.

Q1_bad_data = {'Sochi': [1, 1, 0], 'Monza': [0, 1, 0], 'Suzuka': [0, 0, 0], 'A1-Ring': [1, 1, 1],
              'Melbourne': [1, 1, 1], 'Hockenheim': [0, 1, 1], 'Mexico City': [1, 1, 1], 'Bahrain': [1, 0, 1], 
               'Montreal': [1, 1, 1], 'Spa': [1, 1, 0], 'Baku': [1, 1, 0], 'Monaco': [1, 1, 1],
               'Silverstone': [0, 0, 1], 'Shanghai': [1, 1, 1], 'Abu Dhabi': [1, 1, 1], 'Barcelona': [1, 1, 1],
               'Singapore': [1, 1, 1], 'Paul Ricard': [0, 1, 1], 'Interlagos': [0, 1, 1], 'Budapest': [1, 0, 1], 
               'Austin': [1, 1, 1], 'Portimao': [0, 0, 0], 'Nurburgring': [0, 0, 0], 
               'Mugello': [0, 0, 0], 'Imola': [0, 0, 0], 'Istanbul': [0, 0, 0], 'Sakhir': [0, 0, 0]}

Q2_bad_data = {'Sochi': [1, 1, 1], 'Monza': [0, 1, 1], 'Suzuka': [1, 1, 1], 'A1-Ring': [1, 1, 1],
              'Melbourne': [1, 1, 1], 'Hockenheim': [0, 0, 1], 'Mexico City': [1, 1, 1], 'Bahrain': [1, 1, 1], 
               'Montreal': [1, 1, 1], 'Spa': [1, 1, 1], 'Baku': [1, 1, 0], 'Monaco': [1, 1, 1],
               'Silverstone': [0, 1, 1], 'Shanghai': [1, 1, 1], 'Abu Dhabi': [1, 1, 1], 'Barcelona': [1, 1, 1],
               'Singapore': [1, 1, 1], 'Paul Ricard': [0, 1, 1], 'Interlagos': [0, 1, 1], 'Budapest': [1, 0, 1], 
               'Austin': [1, 1, 1], 'Portimao': [0, 0, 0], 'Nurburgring': [0, 0, 0], 
               'Mugello': [0, 0, 0], 'Imola': [0, 0, 0], 'Istanbul': [0, 0, 0], 'Sakhir': [0, 0, 0]}

In [69]:
def remove_bad_data(bad_data, cutoff):
    """Keep only the cut-off times flagged as good; everything else becomes 0.

    Parameters
    ----------
    bad_data : dict
        {track: [flag 2017, flag 2018, flag 2019]} where 1 means "keep this
        year's cut-off" and any other value means "discard it".
    cutoff : pandas.DataFrame
        Table with a 'Circuit' column and the columns 'Cut-off 2017',
        'Cut-off 2018' and 'Cut-off 2019'.

    Returns
    -------
    dict
        {track: [2017 cutoff, 2018 cutoff, 2019 cutoff]} with 0 standing for
        "no usable value".

    Raises
    ------
    ValueError
        If a year is flagged as good but the track does not appear exactly once
        in `cutoff` (raised by `.item()`).
    """
    year_columns = ['Cut-off 2017', 'Cut-off 2018', 'Cut-off 2019']

    clean_data = {}

    for track, flags in bad_data.items():
        track_rows = cutoff[cutoff['Circuit'] == track]
        values = []

        for position, column in enumerate(year_columns):
            value = 0

            if flags[position] == 1:
                value = track_rows[column].item()

                # A year flagged as good but with no recorded cut-off (NaN / None)
                # is still unusable: store the 0 sentinel so it is never fed to
                # the regression as a lap time.
                if pd.isna(value):
                    value = 0

            values.append(value)

        clean_data[track] = values

    return clean_data

In [42]:
# Apply the keep/discard flags to the cut-off tables; discarded years become 0
Q1_data = remove_bad_data(Q1_bad_data, Q1_cutoff)
Q2_data = remove_bad_data(Q2_bad_data, Q2_cutoff)

In [70]:
Q1_data # {track: [2017 cutoff, 2018 cutoff, 2019 cutoff]}; 0 marks a year that was discarded

{'Sochi': [96.462, 95.037, 0],
 'Monza': [0, 81.888, 0],
 'Suzuka': [0, 0, 0],
 'A1-Ring': [66.345, 65.271, 64.789],
 'Melbourne': [86.419, 84.532, 83.017],
 'Hockenheim': [0, 73.72, 73.333],
 'Mexico City': [79.176, 76.911, 78.065],
 'Bahrain': [92.118, 0, 90.026],
 'Montreal': [74.182, 73.047, 72.197],
 'Spa': [106.028, 104.489, 0],
 'Baku': [104.334, 104.489, 0],
 'Monaco': [74.101, 73.179, 72.149],
 'Silverstone': [0, 0, 86.662],
 'Shanghai': [95.023, 94.062, 94.292],
 'Abu Dhabi': [99.516, 97.994, 98.051],
 'Barcelona': [82.332, 78.923, 78.404],
 'Singapore': [104.014, 99.644, 99.957],
 'Paul Ricard': [0, 92.976, 91.564],
 'Interlagos': [0, 69.269, 69.32],
 'Budapest': [79.095, 0, 77.031],
 'Austin': [96.842, 95.294, 94.226],
 'Portimao': [0, 0, 0],
 'Nurburgring': [0, 0, 0],
 'Mugello': [0, 0, 0],
 'Imola': [0, 0, 0],
 'Istanbul': [0, 0, 0],
 'Sakhir': [0, 0, 0]}

In [43]:
Q2_data # {track: [2017 cutoff, 2018 cutoff, 2019 cutoff]}; 0 marks a year that was discarded

{'Sochi': [95.948, 93.995, 93.95],
 'Monza': [0, 81.669, 80.517],
 'Suzuka': [89.778, 89.864, 89.254],
 'A1-Ring': [65.597, 64.845, 64.49],
 'Melbourne': [85.081, 83.692, 82.562],
 'Hockenheim': [0, 0, 72.786],
 'Mexico City': [78.099, 76.844, 76.687],
 'Bahrain': [90.923, 90.105, 89.488],
 'Montreal': [73.69, 72.606, 71.8],
 'Spa': [105.09, 103.844, 104.797],
 'Baku': [103.186, 103.585, 0],
 'Monaco': [73.516, 72.411, 71.67],
 'Silverstone': [0, 87.901, 86.519],
 'Shanghai': [94.15, 92.986, 93.236],
 'Abu Dhabi': [98.636, 96.982, 97.055],
 'Barcelona': [81.329, 78.323, 77.338],
 'Singapore': [102.246, 98.641, 98.62],
 'Paul Ricard': [0, 92.075, 90.461],
 'Interlagos': [0, 68.659, 68.868],
 'Budapest': [78.415, 0, 76.565],
 'Austin': [95.155, 94.566, 93.785],
 'Portimao': [0, 0, 0],
 'Nurburgring': [0, 0, 0],
 'Mugello': [0, 0, 0],
 'Imola': [0, 0, 0],
 'Istanbul': [0, 0, 0],
 'Sakhir': [0, 0, 0]}

In [83]:
## Linear Regression - made so that it matches up the years data available with the correct feature value i.e the year ##
def LR(data, track, years=(2017, 2018, 2019), target_year=2020):
    """Predict a track's cut-off for `target_year` from its usable historical cut-offs.

    Parameters
    ----------
    data : dict
        {track: [cut-off per year]} aligned with `years`; a value of 0 marks a
        year with no usable data.
    track : str
        Key of `data` to predict for.
    years : sequence of int, optional
        Year belonging to each position of the cut-off list.
    target_year : int, optional
        Year to predict (default 2020).

    Returns
    -------
    numpy.ndarray
        Prediction of shape (1, 1).

    Raises
    ------
    ValueError
        If fewer than two years of usable data are available.
    """
    # Pair every usable cut-off with its year. Checking all positions (instead of
    # stopping at the first 0) guarantees a 0 placeholder is never used as a lap time.
    usable = [(year, cutoff) for year, cutoff in zip(years, data[track]) if cutoff != 0]

    if len(usable) < 2:
        raise ValueError('Need at least two years of usable data for %r, got %d' % (track, len(usable)))

    X_train = np.array([year for year, _ in usable]).reshape(-1, 1)
    y_train = np.array([cutoff for _, cutoff in usable]).reshape(-1, 1)
    X_test = np.array(target_year).reshape(-1, 1)

    model = LinearRegression()
    fit = model.fit(X_train, y_train)
    y_pred = fit.predict(X_test)

    return y_pred

In [92]:
## Q1 Predictions ##

for track, cutoffs in Q1_data.items():

    # the regression needs at least two usable years, i.e. at most one 0 placeholder
    enough_points = cutoffs.count(0) < 2
    y_pred = LR(Q1_data, track) if enough_points else 'NA'

    print(track, y_pred)

Sochi [[92.187]]
Monza NA
Suzuka NA
A1-Ring [[63.91233333]]
Melbourne [[81.254]]
Hockenheim [[72.946]]
Mexico City [[76.93966667]]
Bahrain [[88.98]]
Montreal [[71.157]]
Spa [[101.411]]
Baku [[104.799]]
Monaco [[71.191]]
Silverstone NA
Shanghai [[93.728]]
Abu Dhabi [[97.05533333]]
Barcelona [[75.95833333]]
Singapore [[97.148]]
Paul Ricard [[90.152]]
Interlagos [[69.371]]
Budapest [[75.999]]
Austin [[92.838]]
Portimao NA
Nurburgring NA
Mugello NA
Imola NA
Istanbul NA
Sakhir NA


In [93]:
## Q2 Predictions ##

for track, cutoffs in Q2_data.items():

    # the regression needs at least two usable years, i.e. at most one 0 placeholder
    enough_points = cutoffs.count(0) < 2
    y_pred = LR(Q2_data, track) if enough_points else 'NA'

    print(track, y_pred)

Sochi [[92.633]]
Monza [[79.365]]
Suzuka [[89.108]]
A1-Ring [[63.87033333]]
Melbourne [[81.25933333]]
Hockenheim NA
Mexico City [[75.798]]
Bahrain [[88.737]]
Montreal [[70.80866667]]
Spa [[104.284]]
Baku [[104.383]]
Monaco [[70.68633333]]
Silverstone [[85.137]]
Shanghai [[92.54333333]]
Abu Dhabi [[95.97666667]]
Barcelona [[75.00566667]]
Singapore [[96.20966667]]
Paul Ricard [[88.847]]
Interlagos [[69.077]]
Budapest [[75.64]]
Austin [[93.132]]
Portimao NA
Nurburgring NA
Mugello NA
Imola NA
Istanbul NA
Sakhir NA
