In [39]:
import numpy as np
import pandas as pd
import os
import itertools
from collections import defaultdict
import matplotlib.pyplot as plt
import math

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, label_binarize
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap

from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_squared_error, classification_report, accuracy_score, confusion_matrix, roc_curve, auc, matthews_corrcoef, f1_score
from sklearn.inspection import permutation_importance

In [40]:
## Getting a list of all event files in the right format for the read data function ##

files = os.listdir('/Users/chanelbrown/Desktop/Notebooks/CSV Files/')

# Keep only the CSV exports and strip the extension, because read_data appends
# '.csv' to the name it is given. Filtering on the extension also skips macOS
# '.DS_Store' entries without failing when no such file exists (list.remove
# raises ValueError for a missing entry). Sorting makes the order repeatable,
# since os.listdir returns entries in arbitrary order.
events = sorted(os.path.splitext(file)[0] for file in files if file.endswith('.csv'))

In [41]:
## Function reads data into a Pandas DataFrame from a CSV file ##

def read_data(filename, data_dir='/Users/chanelbrown/Desktop/Notebooks/CSV Files/'):
    """Load one event CSV and return the complete rows of the required columns.

    Parameters
    ----------
    filename : str
        Event file name without the '.csv' extension, e.g. '03_17Mel_Qu_2019'.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        Only the columns listed below, with every row dropped that contains a
        missing value or a single-space placeholder in any of those columns.
        The index of the surviving rows is left as read from the file.
    """
    df = pd.read_csv(os.path.join(data_dir, '%s.csv' % (filename)))

    # select required features
    required_columns = ['Season', 'Circuit', 'Session', 'Time of Day (seconds)',
                        'Driver Short Name', 'Team', 'Outing Number', 'Lap number',
                        'Sector 1', 'Sector 2', 'Sector 3', 'Full Lap',
                        'Lap DRS (seconds)']

    # Some sectors have missing values stored as a single space (probably where
    # the car has been taken off). Convert those to NaN and drop the rows.
    # replace() is used without inplace=True so that a new frame is produced
    # instead of writing into a slice of df, which is what raised
    # SettingWithCopyWarning before.
    data = df[required_columns].replace(' ', np.nan).dropna()

    return data

In [42]:
## Remove outlier laps from the dataset (outlier is considered + 2 seconds difference) ##

def _laps_within_threshold(event, session, threshold):
    """Return the laps of one session that are at most `threshold` slower than its fastest lap."""
    session_laps = event[event['Session'] == session]

    # Series.min() on the single column yields a scalar, so float() is applied to
    # a number rather than to a one-element Series (that conversion is deprecated
    # in recent pandas). An empty session gives NaN, which keeps no rows.
    fastest_lap = float(session_laps['Full Lap'].min())

    return session_laps[session_laps['Full Lap'] <= fastest_lap + threshold]


def clean_data(event, threshold=2):
    """Drop outlier laps from the Q1 and Q2 sessions of one event.

    A lap is kept when its 'Full Lap' value is less than or equal to the
    fastest 'Full Lap' of the same session plus `threshold`. Rows from any
    other session are not included in the result.

    Parameters
    ----------
    event : pandas.DataFrame
        Lap data with at least the columns 'Session' and 'Full Lap'.
    threshold : float, optional
        Maximum allowed gap to the session's fastest lap, in the units of
        'Full Lap'. Defaults to 2, the value the notebook originally used.

    Returns
    -------
    tuple of pandas.DataFrame
        (data, Q1_data, Q2_data): the kept Q1 and Q2 laps stacked with a fresh
        0..n-1 index, followed by the kept Q1 laps and the kept Q2 laps, each
        with their original index.
    """
    Q1_data = _laps_within_threshold(event, 'Q1', threshold)
    Q2_data = _laps_within_threshold(event, 'Q2', threshold)

    # stack Q1_data and Q2_data
    data = pd.concat([Q1_data, Q2_data], ignore_index = True)

    return data, Q1_data, Q2_data

In [43]:
# Load the 2019 Melbourne qualifying laps and remove outlier laps.
# clean_data returns the stacked Q1+Q2 laps first, then the Q1-only and
# Q2-only frames.
data = read_data('03_17Mel_Qu_2019')
# Alternative event that was tried here: '07_14Sil_Qu_2019'.
Mel_2019, Q1_data, Q2_data  = clean_data(data)

# To inspect Q1 in chronological order, sort Q1_data by "Time of Day (seconds)".


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [44]:
# Read the three Melbourne qualifying files (2019, 2018, 2017 in that order).
data, data1, data2 = [
    read_data(event_name)
    for event_name in ('03_17Mel_Qu_2019', '03_25Mel_Qu_2018', '03_26Mel_Qu_2017')
]

# Each clean_data call yields (Q1+Q2 stacked, Q1 only, Q2 only) for one season.
(
    (Mel_2019, Q1_data, Q2_data),
    (Mel_2018, Q1_data1, Q2_data1),
    (Mel_2017, Q1_data2, Q2_data2),
) = [clean_data(season_laps) for season_laps in (data, data1, data2)]

In [45]:
# Stack the cleaned 2019, 2018 and 2017 Melbourne laps into one frame;
# ignore_index=True renumbers the rows 0..n-1 across the three seasons.
Mel = pd.concat([Mel_2019, Mel_2018, Mel_2017], ignore_index = True)
Mel

Unnamed: 0,Season,Circuit,Session,Time of Day (seconds),Driver Short Name,Team,Outing Number,Lap number,Sector 1,Sector 2,Sector 3,Full Lap,Lap DRS (seconds)
0,2019,Melbourne,Q1,61739.570,HAM,Mercedes AMG,1,3,27.392,22.574,32.404,82.370,18.958
1,2019,Melbourne,Q1,62044.566,HAM,Mercedes AMG,1,6,27.355,22.259,32.305,81.919,18.884
2,2019,Melbourne,Q1,61746.529,BOT,Mercedes AMG,1,3,27.423,22.363,32.499,82.285,17.318
3,2019,Melbourne,Q1,61955.682,BOT,Mercedes AMG,1,5,27.465,22.291,32.372,82.128,18.445
4,2019,Melbourne,Q1,62154.203,BOT,Mercedes AMG,1,7,27.309,22.419,32.515,82.243,18.084
...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,2017,Melbourne,Q2,63635.727,KVY,Toro Rosso,4,13,27.931,22.773,34.037,84.742,12.111
151,2017,Melbourne,Q2,63080.021,SAI,Toro Rosso,3,10,27.994,22.983,33.897,84.875,12.743
152,2017,Melbourne,Q2,63617.543,SAI,Toro Rosso,4,13,28.089,22.882,33.952,84.924,12.709
153,2017,Melbourne,Q2,63631.997,GRO,Haas,4,14,27.947,22.949,33.699,84.596,12.883


Add the actual cut-off time to each data point (i.e., the final cut-off time of the session in which the lap was set).

Use linear regression to predict this cut-off time.

In [48]:
# Every lap in the stacked Melbourne data whose driver code is 'HAM'.
HAM = Mel.loc[Mel['Driver Short Name'] == 'HAM']

# The same laps split by qualifying session.
HAM_Q1 = HAM.loc[HAM['Session'] == 'Q1']
HAM_Q2 = HAM.loc[HAM['Session'] == 'Q2']

In [49]:
# Display all 'HAM' laps (both sessions, all seasons) as the cell output.
HAM

Unnamed: 0,Season,Circuit,Session,Time of Day (seconds),Driver Short Name,Team,Outing Number,Lap number,Sector 1,Sector 2,Sector 3,Full Lap,Lap DRS (seconds)
0,2019,Melbourne,Q1,61739.57,HAM,Mercedes AMG,1,3,27.392,22.574,32.404,82.37,18.958
1,2019,Melbourne,Q1,62044.566,HAM,Mercedes AMG,1,6,27.355,22.259,32.305,81.919,18.884
38,2019,Melbourne,Q2,62923.621,HAM,Mercedes AMG,2,9,26.926,22.359,32.452,81.737,18.544
39,2019,Melbourne,Q2,63603.101,HAM,Mercedes AMG,3,12,26.743,22.013,32.134,80.89,18.53
63,2018,Melbourne,Q1,61573.803,HAM,Mercedes AMG,1,3,27.55,22.788,33.109,83.448,18.115
64,2018,Melbourne,Q1,61769.58,HAM,Mercedes AMG,1,5,27.332,22.747,32.947,83.027,17.939
65,2018,Melbourne,Q1,61971.227,HAM,Mercedes AMG,1,7,27.339,22.569,32.798,82.706,17.871
98,2018,Melbourne,Q2,63059.136,HAM,Mercedes AMG,2,10,26.865,22.29,32.778,81.933,17.084
115,2017,Melbourne,Q1,61615.935,HAM,Mercedes AMG,1,3,27.704,22.877,33.71,84.29,12.182
116,2017,Melbourne,Q1,61818.272,HAM,Mercedes AMG,1,5,27.677,22.835,33.556,84.069,12.259


In [141]:
# Regression target: the full lap time of each 'HAM' Q1 lap. Double brackets
# keep it as a one-column DataFrame rather than a Series.
Target = HAM_Q1[['Full Lap']]
# Predictors: the first two sector times and the DRS time; 'Sector 3' is left out.
Features = HAM_Q1[['Sector 1', 'Sector 2', 'Lap DRS (seconds)']]

In [142]:
# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(Features, Target, test_size = 0.25, random_state = 0)

In [143]:
# Fit an ordinary least-squares model on the training rows and predict the
# held-out rows. LinearRegression.fit returns the fitted estimator, so `fit`
# and `model` refer to the same object.
model = LinearRegression()
fit = model.fit(X_train, y_train)
y_pred = fit.predict(X_test)

In [144]:
# Root-mean-squared error of the held-out predictions, expressed in the same
# units as 'Full Lap'.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.5008569557023571

In [145]:
# Print each held-out prediction next to the actual value, rounded to 3 decimals.
# Both inputs are flattened to 1-D once up front, so float() receives a scalar
# on every iteration; calling float() on a one-element array row is deprecated
# in NumPy >= 1.25. This also avoids rebuilding the y_test array per iteration.
for raw_pred, raw_actual in zip(np.ravel(y_pred), np.ravel(y_test)):
    pred = round(float(raw_pred),3)
    actual = round(float(raw_actual),3)
    print(f'Prediction: {pred} --- Actual: {actual} --- Difference: {round(pred - actual, 3)}')

Prediction: 84.232 --- Actual: 84.069 --- Difference: 0.163
Prediction: 82.759 --- Actual: 83.448 --- Difference: -0.689
