In [82]:
import numpy as np 
import pandas as pd 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

pd.set_option("display.max_columns", None)

In [3]:
def flattenColumnIndex(columns: list) -> list:
   return ['_'.join(col).strip() if col[1] != '' else col[0] for col in columns]

In [4]:
file = 'MotoGP_2021.csv'
data = pd.read_csv(file).drop('data', axis=1)

print(data.shape)
data.head()

(42741, 32)


Unnamed: 0,year,event,session,rider_position,rider_number,rider,nation,team,motorcycle,rider_classification,total_laps,full_laps,run_number,front_tire,rear_tire,front_tire_age,rear_tire_age,lap_invalidated,lap_unfinished,lap_number,lap_type,lap_time,T1,T2,T3,T4,speed,invalidated_T1,invalidated_T2,invalidated_T3,invalidated_T4,lap_time_seconds
0,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,0,0,False,False,1,Out,2'29.580,42.632,35.061,30.943,33.122,106.4,,,,,149.58
1,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,1,1,False,False,2,Speed,1'57.714,26.068,30.602,29.011,32.033,339.6,,,,,117.714
2,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,2,2,True,False,3,Speed,1'56.337,25.578,30.349,28.571,31.839,339.6,,,,True,116.337
3,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,3,3,False,False,4,Speed,1'56.618,25.431,30.567,28.788,31.832,339.6,,,,,116.618
4,2021,QAT,FP1,1,21,MORBIDELLI Franco (FMor),ITA,Petronas Yamaha SRT,YAMAHA,*,17,11,1,Slick-Hard,Slick-Hard,4,4,False,False,5,Speed,1'55.518,25.254,30.048,28.554,31.662,341.7,,,,,115.518


In [92]:

timed_features = ['T1','T2','T3','T4','lap_time_seconds']
categorical_features = None

scalerFunc = lambda X: (X - X.min()) / (X.max() - X.min()) * (1 - 0) + 0


def attachTarget(X):
    """
        Taking each race weekend, and for each record, attaching
        the final race position for that weekend for that rider    
    """
    
    target = X[X.session == 'RAC'] \
        [['year','event','rider','rider_number','rider_position']] \
        .drop_duplicates() \
        .rename({'rider_position': 'y'}, axis='columns')
    
    X = X[X.session != 'RAC'] \
        .merge(target, on=['year','event','rider','rider_number'])

    return X



def scale(X):
    """
        Scales the timed features such as sector times
        and lap times to the session and the track since
        those distances/times are specific to the race weekend
        and session
    """
    
    X = X.dropna(subset=timed_features, how='any')

    scaled = X.groupby(['year','event','session'])[timed_features] \
            .transform(lambda X: scalerFunc(X)) \
            .reset_index() \
            .drop('index', axis=1)

    columns_wanted_back = set(list(X.columns)) - set(list(scaled.columns))

    X = pd.concat([X[columns_wanted_back], scaled], axis=1)
    return X


def tabulateAcrossSessions(X):
    calced = X \
        .melt(
            id_vars=['y','year','event','session','rider_position','rider_number','rider'],
            value_vars=timed_features,
            var_name='sector',
            value_name='time'
        ) \
        .groupby([
            'y','year','event','session','rider_position','rider_number','rider','sector'
        ]) \
        .agg({
            'time':['mean','std']
        })

    calced.columns = flattenColumnIndex(calced.columns)

    calced = calced \
        .pivot_table(
            index=['y','year','event','rider_number','rider'],
            columns=['sector','session'],
            values=['time_mean','time_std']
        )

    calced.columns = flattenColumnIndex(calced.columns)

    calced.reset_index(inplace=True)


    return calced 


def split_data(data):
    
    dropCols = ['year','event','rider_number','rider','y']
    X = data.drop(dropCols, axis=1)
    y = data.y

    return X, y

attach_target = FunctionTransformer(attachTarget)
scale_data = FunctionTransformer(scale)
widen_data = FunctionTransformer(tabulateAcrossSessions)
split = FunctionTransformer(split_data)

preProcessPipeline = make_pipeline(
    attach_target,
    scale_data,
    widen_data,
    split
)


X, y = preProcessPipeline.fit_transform(data)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = Pipeline([
    ('ImputeNans', SimpleImputer()),
    ('LinearModel', LogisticRegression())
])


In [95]:

model = model.fit(x_train, y_train)

model.score(x_test, y_test)

model.predict(x_test)



array([ 2., 11.,  3., 19., 15.,  9., 18.,  6., 10.,  8., 18.,  2.,  6.,
       12.,  4.,  2.,  2., 14.,  7.,  3.,  2.,  2.,  4.,  2.,  9.,  2.,
        2., 13.,  4.,  2.,  8.,  9.,  9., 10., 10.,  2.,  9., 18.,  8.,
       19., 13.,  7.,  8.,  9., 15., 10., 11.,  6.,  2., 13.,  8.,  4.,
        9.,  5.,  2.,  3.,  2., 18.,  9.,  5., 13.,  9.,  6.,  7.,  8.,
       18., 10., 11.,  3.,  9.,  2.,  8.,  5.,  4., 11.])

In [96]:
y_test

55      3.0
327    19.0
104     6.0
261    15.0
215    13.0
       ... 
246    14.0
344    20.0
369    22.0
353    21.0
308    18.0
Name: y, Length: 75, dtype: float64