In [44]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import typing

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [2]:
def flattenColumnIndex(columns: typing.Iterable) -> list:
    """Collapse (possibly multi-level) column labels into flat labels.

    Each tuple label has its non-empty levels joined with ``'_'``, e.g.
    ``('T1', 'mean') -> 'T1_mean'`` and ``('year', '') -> 'year'``.

    Args:
        columns: Iterable of column labels, typically a pandas ``MultiIndex``
            (which iterates as tuples) or a list of tuples. Labels that are
            not tuples are returned unchanged.

    Returns:
        A list with one flat label per input label, in the same order.
    """
    flat = []
    for col in columns:
        if not isinstance(col, tuple):
            # Already flat: indexing into a plain string would split it
            # into characters, so pass it through untouched.
            flat.append(col)
            continue
        # Empty levels (at any depth) are skipped so no dangling or doubled
        # underscores appear; str() lets non-string levels (ints, ...) be joined.
        parts = [text for text in map(str, col) if text]
        flat.append('_'.join(parts).strip())
    return flat

In [3]:
# Read the MotoGP 2021 CSV straight into the `data` frame.
data = pd.read_csv('MotoGP_2021.csv')

# Quick sanity check: dimensions first, then a preview of the first rows.
print(data.shape)
data.head()

(42741, 34)


Unnamed: 0.1,Unnamed: 0,data,year,event,session,rider_position,rider_number,rider,nation,team,motorcycle,rider_classification,total_laps,full_laps,run_number,front_tire,rear_tire,front_tire_age,rear_tire_age,lap_invalidated,lap_unfinished,lap_number,lap_type,lap_time,T1,T2,T3,T4,speed,invalidated_T1,invalidated_T2,invalidated_T3,invalidated_T4,lap_time_seconds
0,1,1 2'29.580 42.632 35.061 30.943 33.122 106.4,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,0,0,False,False,1,Out,2'29.580,42.632,35.061,30.943,33.122,106.4,,,,,149.58
1,2,2 1'57.714 26.068 30.602 29.011 32.033 339.6,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,1,1,False,False,2,Speed,1'57.714,26.068,30.602,29.011,32.033,339.6,,,,,117.714
2,3,3 1'56.337 * 25.578 30.349 28.571 31.839* 339.6,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,2,2,True,False,3,Speed,1'56.337,25.578,30.349,28.571,31.839,339.6,,,,True,116.337
3,4,4 1'56.618 25.431 30.567 28.788 31.832 339.6,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,3,3,False,False,4,Speed,1'56.618,25.431,30.567,28.788,31.832,339.6,,,,,116.618
4,5,5 1'55.518 25.254 30.048 28.554 31.662 341.7,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,4,4,False,False,5,Speed,1'55.518,25.254,30.048,28.554,31.662,341.7,,,,,115.518


In [4]:
agg_funcs = ['mean', 'std']
agg_dict = {'T{}'.format(i+1): agg_funcs for i in range(4)}

# Columns identifying one session of one event; used both for grouping and
# as the explicit merge keys below.
session_keys = ['year', 'event', 'session']

# One row per rider and event holding rider_position from the 'RAC' session
# (presumably the race result -- confirm); merged in later as the target.
target_df = data.loc[
        data.session == 'RAC',
        ['year','event','rider','rider_number','rider_position']
    ] \
    .drop_duplicates()

# Estimating population parameters for scaling T1:T4 based on year, event, and session.
# Only non-'RAC' rows with lap_type 'Speed' contribute to the mean/std.
pop_params = data[
        (data.session != 'RAC') &
        (data.lap_type == 'Speed')
    ] \
    .groupby(session_keys) \
    .agg(agg_dict) \
    .reset_index()

pop_params.columns = flattenColumnIndex(pop_params.columns)

# Names of the statistic columns this cell adds to `data` (T1_mean, T1_std, ...).
stat_cols = [col for col in pop_params.columns if col not in session_keys]

# Drop any statistic columns left over from a previous run of this cell so that
# re-running it is idempotent, then attach the statistics with explicit keys.
# validate= guards against accidental row duplication: pop_params has exactly
# one row per session key because it comes out of the groupby above.
data = data \
    .drop(columns=stat_cols, errors='ignore') \
    .merge(pop_params, how='left', on=session_keys, validate='many_to_one')

In [5]:
## Standardise T1:T4 using the per-session mean/std columns merged into `data`

sector_cols = [f'T{n}' for n in range(1, 5)]

# assign() returns a new frame, so `data` keeps the raw sector times while
# `df` carries the z-scored values in the same column positions.
df = data.assign(**{
    sector: (data[sector] - data[sector + '_mean']) / data[sector + '_std']
    for sector in sector_cols
})


In [6]:
# Preview the identifying columns next to the scaled sector times.
df.loc[
    :,
    ['year','event','session','rider_position','rider_number','rider','T1','T2','T3','T4']
].head()

Unnamed: 0,year,event,session,rider_position,rider_number,rider,T1,T2,T3,T4
0,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),11.330726,4.36316,2.053887,3.667228
1,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),0.03214,-0.082059,-0.020058,0.433055
2,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),-0.302097,-0.334277,-0.492385,-0.143096
3,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),-0.402368,-0.116951,-0.259442,-0.163885
4,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),-0.523103,-0.634347,-0.510634,-0.668761


In [28]:
# Columns carried through the melt / first aggregation.
id_cols = ['year','event','session','rider_position','rider_number','rider']
# Columns identifying one rider at one event: the row key of the feature
# table and the explicit merge key against target_df.
rider_keys = ['year','event','rider_number','rider']

# Long format (one row per lap and sector), then mean/std of the scaled
# sector time per rider, session and sector.
calced = df \
    .melt(
        id_vars=id_cols,
        value_vars=['T1','T2','T3','T4'],
        var_name='sector',
        value_name='time'
    ) \
    .groupby(id_cols + ['sector']) \
    .agg({
        'time':['mean','std']
    })

calced.columns = flattenColumnIndex(calced.columns)

# Back to wide format: one row per rider/event, one column per
# (statistic, sector, session) combination. The group keys are turned into
# ordinary columns first so pivot_table refers to columns, not index levels.
calced = calced \
    .reset_index() \
    .pivot_table(
        index=rider_keys,
        columns=['sector','session'],
        values=['time_mean','time_std']
    )

calced.columns = flattenColumnIndex(calced.columns)

calced = calced.reset_index()

# NOTE(review): `data` is rebound from the lap-level frame to the feature
# table here (kept for compatibility); earlier cells that read `data` cannot
# be re-run after this cell without reloading the CSV.
data = calced

# Inner join on the rider/event keys attaches the rider_position target.
X = data.merge(target_df, on=rider_keys)

# Remove identifiers and the target by name rather than by position, so a
# change in column order cannot drop a feature or leak the target into x.
feature_frame = X.drop(columns=rider_keys + ['rider_position'])

# nan_to_num replaces NaN with 0 (and +/-inf with large finite values).
x = np.nan_to_num(feature_frame.to_numpy())

y = X.rider_position.to_numpy()



In [45]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [46]:
# Fit a linear regression on the training split.
# fit() is left as the last expression so the estimator repr is displayed.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

LinearRegression()

In [51]:
# Predict on the held-out rows.
y_pred = model.predict(X=x_test)

# Display the true test targets.
y_test

array([ 7,  1, 16, 20, 14, 13, 19,  4,  9,  5, 20,  2,  7, 21, 20, 14, 15,
       16, 20, 14, 17, 17,  7, 13, 13,  2, 16, 15, 18, 20, 18, 13, 12, 17,
        3, 19,  1, 10, 15,  3,  1,  7,  5,  6,  7, 18,  9,  8,  2, 17,  4,
       18, 19,  3, 13,  1, 13, 15, 13, 17, 19, 23, 15, 15,  2,  5,  3,  8,
        3, 16,  8,  3, 18,  6, 15, 15, 12, 14,  8, 17, 21,  6, 21, 10, 10,
       19,  6, 15, 19, 15, 18, 20,  2, 11,  1,  3, 16,  1, 14, 21, 13, 10,
       13, 14,  2,  6, 17, 12,  1,  5, 22, 14, 11,  8], dtype=int64)