In [1]:
# %%
import numpy as np 
import pandas as pd 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.decomposition import PCA

from sklearn.impute import SimpleImputer, KNNImputer

# Show every column when a DataFrame is displayed (None disables the column cap).
pd.set_option("display.max_columns", None)


In [2]:
def flattenColumnIndex(columns: list) -> list:
    """Collapse (possibly multi-level) column labels into flat names.

    Parameters
    ----------
    columns : iterable of labels
        Typically ``DataFrame.columns``. Each label is either a tuple (one
        entry per MultiIndex level) or an already-flat label such as a string.

    Returns
    -------
    list
        One name per input label. Tuple labels become their non-empty levels
        joined with ``'_'`` (e.g. ``('lap_time_seconds', 'min')`` ->
        ``'lap_time_seconds_min'`` and ``('year', '')`` -> ``'year'``).
        Non-tuple labels are returned unchanged.
    """
    flat_names = []
    for col in columns:
        if not isinstance(col, tuple):
            # Already flat: indexing/joining a plain string would mangle it
            # character by character, so pass it through untouched.
            flat_names.append(col)
            continue
        # Stringify so non-string levels (e.g. ints) can be joined, and drop
        # empty levels so names never end in a dangling underscore.
        levels = [str(level).strip() for level in col]
        flat_names.append('_'.join(level for level in levels if level != ''))
    return flat_names

In [3]:
%%html
<style>
.dataframe td {
    white-space: nowrap;
}
</style>

In [4]:
# Load the MotoGP 2021 CSV into the base frame `df`.
csv_path = 'MotoGP_2021.csv'
df = pd.read_csv(csv_path)

# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(42741, 33)


Unnamed: 0,data,year,event,session,rider_position,rider_number,rider,nation,team,motorcycle,rider_classification,total_laps,full_laps,run_number,front_tire,rear_tire,front_tire_age,rear_tire_age,lap_invalidated,lap_unfinished,lap_number,lap_type,lap_time,T1,T2,T3,T4,speed,invalidated_T1,invalidated_T2,invalidated_T3,invalidated_T4,lap_time_seconds
0,1 2'29.580 42.632 35.061 30.943 33.122 106.4,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,0,0,False,False,1,Out,2'29.580,42.632,35.061,30.943,33.122,106.4,,,,,149.58
1,2 1'57.714 26.068 30.602 29.011 32.033 339.6,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,1,1,False,False,2,Speed,1'57.714,26.068,30.602,29.011,32.033,339.6,,,,,117.714
2,3 1'56.337 * 25.578 30.349 28.571 31.839* 339.6,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,2,2,True,False,3,Speed,1'56.337,25.578,30.349,28.571,31.839,339.6,,,,True,116.337
3,4 1'56.618 25.431 30.567 28.788 31.832 339.6,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,3,3,False,False,4,Speed,1'56.618,25.431,30.567,28.788,31.832,339.6,,,,,116.618
4,5 1'55.518 25.254 30.048 28.554 31.662 341.7,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,4,4,False,False,5,Speed,1'55.518,25.254,30.048,28.554,31.662,341.7,,,,,115.518


In [5]:
## Each row of the modelling table should be one race weekend.
## Features to start with: per-session fastest lap plus summary statistics.

# Keep only the fields required from the base data frame, restricted to
# laps whose lap_type is 'Speed' from every session except the race ('RAC').
pruned = df.loc[
    (df['session'] != 'RAC') & (df['lap_type'] == 'Speed'),
    ['year', 'event', 'session', 'rider_position', 'lap_time_seconds'],
]

# Qualifying and practice sessions need different aggregations, so they are
# handled separately.

# Target: fastest qualifying lap per (year, event), stored in column 'y'.
qualifying = (
    pruned.loc[pruned['session'] == 'Q']
    .groupby(['year', 'event'])['lap_time_seconds']
    .min()
    .to_frame('y')
)

# Features: min / mean / median / std of lap time per non-qualifying session.
# NOTE(review): any 'WUP' session is treated as a practice feature here; if
# warm-up takes place after qualifying this leaks later information — confirm.
practices = (
    pruned.loc[pruned['session'] != 'Q']
    .groupby(['year', 'event', 'session'])
    .agg({'lap_time_seconds': ['min', 'mean', 'median', 'std']})
    .reset_index()
)
# reset_index leaves two-level column labels; collapse them to flat names.
practices.columns = flattenColumnIndex(practices.columns)

# The intermediate frame is no longer needed; release it.
del pruned

In [6]:
# Reshape the per-session practice statistics to one row per (year, event),
# with one column per (statistic, session) pair.
x = (
    practices
    .pivot(
        index=['year', 'event'],
        columns='session',
        values=[
            'lap_time_seconds_min',
            'lap_time_seconds_mean',
            'lap_time_seconds_median',
            'lap_time_seconds_std'
        ]
    )
)
x.columns = flattenColumnIndex(x.columns)

# Both frames are indexed by (year, event). Join on the full key: matching on
# 'event' alone would pair the same circuit across different seasons as soon
# as more than one year is present. `validate` makes a duplicated weekend fail
# loudly instead of silently multiplying rows.
x = x.merge(qualifying, left_index=True, right_index=True, validate='one_to_one')

# Features are every column except the qualifying target 'y'.
X = x.drop(columns='y').to_numpy()
y = x['y'].to_numpy()

# Fixed seed so the split (and everything fitted on it) is reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# Ordinary least-squares fit of the qualifying target on the training features
# (fit returns the estimator itself, so construction and fitting can be chained).
lr_model = LinearRegression().fit(x_train, y_train)

In [8]:
# Predict the target for the held-out test rows.
y_pred = lr_model.predict(x_test)

In [9]:
# Display the predictions for comparison with y_test.
y_pred 

array([113.7277616 ,  90.22541436,  78.51197922, 106.41849404,
        91.75486106])

In [10]:
# Display the actual held-out target values.
y_test

array([113.106,  89.936,  80.236, 106.322,  91.814])

In [11]:
# Display the fitted coefficients, one per feature column of X.
lr_model.coef_

array([ 0.05133559, -0.21684817,  0.12223001,  0.4567169 , -0.23906976,
       -0.08080112,  0.13435423, -0.46535457, -0.21738532,  0.61148072,
        0.03620637,  0.2443977 ,  0.42051681,  0.35482181, -0.17399021,
        0.0131986 , -0.02450839, -0.60605139, -0.36664494,  0.01879836])

In [19]:
# Print each feature name next to its fitted coefficient.
# x.columns also contains the trailing target column 'y'; zip stops at the
# shorter sequence (the coefficients), so the target is not printed.
for feature_name, weight in zip(x.columns, lr_model.coef_):
    print(feature_name, ': ', weight)

lap_time_seconds_min_FP1 :  0.051335593416905134
lap_time_seconds_min_FP2 :  -0.21684817495742575
lap_time_seconds_min_FP3 :  0.12223001022972875
lap_time_seconds_min_FP4 :  0.4567168997395745
lap_time_seconds_min_WUP :  -0.23906975980846554
lap_time_seconds_mean_FP1 :  -0.0808011161710141
lap_time_seconds_mean_FP2 :  0.13435423201828237
lap_time_seconds_mean_FP3 :  -0.46535457318742945
lap_time_seconds_mean_FP4 :  -0.21738531762553126
lap_time_seconds_mean_WUP :  0.6114807201582979
lap_time_seconds_median_FP1 :  0.03620637315093626
lap_time_seconds_median_FP2 :  0.24439770047473677
lap_time_seconds_median_FP3 :  0.4205168097140321
lap_time_seconds_median_FP4 :  0.35482181126479245
lap_time_seconds_median_WUP :  -0.17399020993397005
lap_time_seconds_std_FP1 :  0.013198597322419075
lap_time_seconds_std_FP2 :  -0.024508394520953273
lap_time_seconds_std_FP3 :  -0.6060513850570998
lap_time_seconds_std_FP4 :  -0.3666449438546875
lap_time_seconds_std_WUP :  0.01879835546495785
