# This notebook show the preparation of the data for the machine learning prediction 

The objective here is to obtain a data representing the full grand Prix week end, for every drivers.
Now we have those informations in the data depending on the session name. 
In  our case we need to have them in the same row.

### Import the data and the tools

In [1]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
data = pd.read_csv(r'..\Data\allData.csv')

## Checking the data

In [3]:
data.shape

(6625, 25)

In [4]:
data.columns

Index(['number', 'position', 'Q1', 'Q2', 'Q3', 'positionText', 'points',
       'grid', 'laps', 'status', 'year', 'gpName', 'gpNumber', 'sessionName',
       'driverId', 'code', 'DriverNationality', 'constructorId',
       'constructorNationality', 'fastestLapNumber', 'fastestLapRank',
       'fastestLapAvgSpeed', 'fastestLapTime', 'totalTime', 'TimeInterval'],
      dtype='object')

In [5]:
data['gpNumber']

0        1
1        1
2        1
3        1
4        1
        ..
6620    22
6621    22
6622    22
6623    22
6624    22
Name: gpNumber, Length: 6625, dtype: int64

## Spliting the data into the practice, the race and qualifying 

### The meaning of this step is to be able to merge the data 

In [6]:
df_race = data[data['sessionName'] == 'Race']

In [7]:
df_practice1 = data[data['sessionName'] == 'Practice 1'][['fastestLapRank','fastestLapTime','year','gpName','driverId']]

In [8]:
df_practice2 = data[data['sessionName'] == 'Practice 2'][['fastestLapRank','fastestLapTime','year','gpName','driverId']]

In [9]:
df_practice3 = data[data['sessionName'] == 'Practice 3'][['fastestLapRank','fastestLapTime','year','gpName','driverId']]

In [10]:
df_practice1 = df_practice1.rename(columns={"fastestLapRank": "fastestLapRankP1","fastestLapTime": "fastestLapTimeP1"})
df_practice2 = df_practice2.rename(columns={"fastestLapRank": "fastestLapRankP2","fastestLapTime": "fastestLapTimeP2"})
df_practice3 = df_practice3.rename(columns={"fastestLapRank": "fastestLapRankP3","fastestLapTime": "fastestLapTimeP3"})

In [11]:
df_quali = data[data['sessionName'] == 'Qualifying'][['Q1','Q2','Q3','year','gpName','driverId']]

#### Data manipulation on race and qualifying to merge them 

In [12]:
df_race.describe()

Unnamed: 0,number,position,points,grid,laps,year,gpNumber,fastestLapNumber,fastestLapRank,fastestLapAvgSpeed,totalTime
count,1619.0,1599.0,1599.0,1599.0,1599.0,1619.0,1619.0,1526.0,1526.0,1526.0,784.0
mean,28.008647,10.494059,5.065979,10.151345,54.482802,2019.492897,10.70908,46.992136,10.185452,210.311455,5668426.0
std,25.666323,5.764989,7.217079,5.819209,17.902728,1.145386,5.981119,15.20099,5.613831,19.50075,1275282.0
min,2.0,1.0,0.0,0.0,0.0,2018.0,1.0,2.0,1.0,140.802,207071.0
25%,8.0,5.5,0.0,5.0,52.0,2018.0,6.0,40.0,5.0,200.2445,5280748.0
50%,18.0,10.0,1.0,10.0,56.0,2019.0,11.0,49.0,10.0,209.8255,5591682.0
75%,44.0,15.0,8.0,15.0,67.0,2021.0,16.0,57.0,15.0,224.209,5884478.0
max,99.0,20.0,27.0,20.0,87.0,2021.0,22.0,85.0,20.0,255.014,10810320.0


In [13]:
df_race.isna().sum()

number                       0
position                    20
Q1                        1619
Q2                        1619
Q3                        1619
positionText                20
points                      20
grid                        20
laps                        20
status                      20
year                         0
gpName                       0
gpNumber                     0
sessionName                  0
driverId                    20
code                         0
DriverNationality           20
constructorId               20
constructorNationality      20
fastestLapNumber            93
fastestLapRank              93
fastestLapAvgSpeed          93
fastestLapTime              93
totalTime                  835
TimeInterval               835
dtype: int64

In [14]:
df_race = df_race.drop(['Q1','Q2','Q3'], axis=1)

In [15]:
df_race['totalTime'].describe()

count    7.840000e+02
mean     5.668426e+06
std      1.275282e+06
min      2.070710e+05
25%      5.280748e+06
50%      5.591682e+06
75%      5.884478e+06
max      1.081032e+07
Name: totalTime, dtype: float64

In [16]:
df_race['totalTime'] = df_race['totalTime'].fillna(df_race['totalTime'].max())

In [17]:
df_race.isna().sum()

number                      0
position                   20
positionText               20
points                     20
grid                       20
laps                       20
status                     20
year                        0
gpName                      0
gpNumber                    0
sessionName                 0
driverId                   20
code                        0
DriverNationality          20
constructorId              20
constructorNationality     20
fastestLapNumber           93
fastestLapRank             93
fastestLapAvgSpeed         93
fastestLapTime             93
totalTime                   0
TimeInterval              835
dtype: int64

first step of our data creation, the race is combined to the qualifying for each drivers 

In [18]:
race_quali = pd.merge(df_race, df_quali, how="left", on=['year','gpName','driverId'])

In [19]:
race_quali.shape 

(1619, 25)

In [20]:
race_quali.columns

Index(['number', 'position', 'positionText', 'points', 'grid', 'laps',
       'status', 'year', 'gpName', 'gpNumber', 'sessionName', 'driverId',
       'code', 'DriverNationality', 'constructorId', 'constructorNationality',
       'fastestLapNumber', 'fastestLapRank', 'fastestLapAvgSpeed',
       'fastestLapTime', 'totalTime', 'TimeInterval', 'Q1', 'Q2', 'Q3'],
      dtype='object')

Second step is to combined the practices data with 

In [22]:
race_quali_1 = pd.merge(race_quali, df_practice1, how="left", on=['year','gpName','driverId'])

In [24]:
race_quali_12 = pd.merge(race_quali_1, df_practice2, how="left", on=['year','gpName','driverId'])

In [34]:
data_complete = pd.merge(race_quali_12, df_practice3, how="left", on=['year','gpName','driverId'])

In [35]:
data_complete.columns

Index(['number', 'position', 'positionText', 'points', 'grid', 'laps',
       'status', 'year', 'gpName', 'gpNumber', 'sessionName', 'driverId',
       'code', 'DriverNationality', 'constructorId', 'constructorNationality',
       'fastestLapNumber', 'fastestLapRank', 'fastestLapAvgSpeed',
       'fastestLapTime', 'totalTime', 'TimeInterval', 'Q1', 'Q2', 'Q3',
       'fastestLapRankP1', 'fastestLapTimeP1', 'fastestLapRankP2',
       'fastestLapTimeP2', 'fastestLapRankP3', 'fastestLapTimeP3'],
      dtype='object')

In [36]:
data_complete.shape

(1619, 31)

We have now finaly a data that represent the entire week end (Grand prix) for a driver 

## Now we have to make the data suitable for the predictions techniques 

In [21]:
fastest  = []
for i in df_race['fastestLapTime']:
    if i != '192.074':
        a = list(map(float,i.split(':')))
        b = int((a[0]*60 + a[1])*1000)
    else : 
        b = 92074
    fastest.append(b)
df_race['fastestLapTime'] = fastest

AttributeError: 'float' object has no attribute 'split'

In [None]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor() 

In [None]:
col = [ 'positionText',
       'status', 'year', 'gpName', 'gpNumber', 'sessionName', 'driverId', 'DriverNationality', 'constructorId', 'constructorNationality']
for elmt in col:
    df_race[col] = df_race[col].astype("category")

In [None]:
df_features = pd.get_dummies(df_race, columns = ['positionText',
       'status', 'year', 'gpName', 'gpNumber', 'sessionName', 'driverId',
       'DriverNationality', 'constructorId', 'constructorNationality'])

In [None]:
df_features.columns

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_features, test_size=0.2)
X_train = train.drop(['position'], axis=1)
y_train = train['position']
X_test = test.drop(['position'], axis=1)
y_test = test['position']

In [None]:
model_rd = random_forest.fit(X_train, y_train)

In [None]:
model_rd.score(X_test,y_test)

In [None]:
y_pred = random_forest.predict(X_test) 

In [None]:
erreurs = abs(y_pred - y_test)
print('Mean Absolute Error:', round(np.mean(erreurs), 2))
mape = 100 * (erreurs / y_test)
print('Mean Absolute Percentage Error :', round(np.mean(mape), 2), '%.')

In [None]:
importances = model_rd.feature_importances_
indices = np.argsort(importances)
# style du graphique 
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [X_test.columns[i] for i in indices])
plt.xlabel('Relative Importance')

In [None]:
# example of training a final regression model
from sklearn.linear_model import LinearRegression


model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

erreurs = abs(y_pred - y_test)
print('Mean Absolute Error:', round(np.mean(erreurs), 2))
mape = 100 * (erreurs / y_test)
print('Mean Absolute Percentage Error :', round(np.mean(mape), 2), '%.')