In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Formula One Race Results
Build a neural network to predict podium finishes (1, 2, 3)

In [2]:
results_raw = pd.read_csv('results.csv')
results_raw.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22.0,1,1.0,1,1,10.0,58,34:50.6,5690616.0,39.0,2.0,01:27.5,218.3,1
1,2,18,2,2,3.0,5,2.0,2,2,8.0,58,5.478,5696094.0,41.0,3.0,01:27.7,217.586,1
2,3,18,3,3,7.0,7,3.0,3,3,6.0,58,8.163,5698779.0,41.0,5.0,01:28.1,216.719,1
3,4,18,4,4,5.0,11,4.0,4,4,5.0,58,17.181,5707797.0,58.0,7.0,01:28.6,215.464,1
4,5,18,5,1,23.0,3,5.0,5,5,4.0,58,18.014,5708630.0,43.0,1.0,01:27.4,218.385,1


In [3]:
results_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23777 entries, 0 to 23776
Data columns (total 18 columns):
resultId           23777 non-null int64
raceId             23777 non-null int64
driverId           23777 non-null int64
constructorId      23777 non-null int64
number             23771 non-null float64
grid               23777 non-null int64
position           13227 non-null float64
positionText       23777 non-null object
positionOrder      23777 non-null int64
points             23777 non-null float64
laps               23777 non-null int64
time               6004 non-null object
milliseconds       6003 non-null float64
fastestLap         5383 non-null float64
rank               5531 non-null float64
fastestLapTime     5383 non-null object
fastestLapSpeed    5383 non-null object
statusId           23777 non-null int64
dtypes: float64(6), int64(8), object(4)
memory usage: 3.3+ MB


## Cleaning
#### Drop
- resultId: just another index
- number: driver's number, long story number assignment methods have varied and could be a significant source of error
    - maybe try adding back in later
- position, positionText: nuanced versions of final position, best representation for this task is positionOrder
- points: points awarded for final position, has changed a lot during history and also could be a dead giveaway
- time, milliseconds: also dead giveaways (shortest three times are podium finishes)

#### Other
- statusId indicates the conditions under which a car finished or not
- only keep statusId's for drivers who finished

#### Missingness
- Fastest lap info variables could explain some variance but was not available until the mid 2000s
- Dropping for now until I come up with a better way to handle this

In [4]:
results = results_raw.copy()
results = results.drop(['resultId', 'number', 'position', 'positionText',
                        'points', 'time', 'milliseconds'], 1)

In [5]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23777 entries, 0 to 23776
Data columns (total 11 columns):
raceId             23777 non-null int64
driverId           23777 non-null int64
constructorId      23777 non-null int64
grid               23777 non-null int64
positionOrder      23777 non-null int64
laps               23777 non-null int64
fastestLap         5383 non-null float64
rank               5531 non-null float64
fastestLapTime     5383 non-null object
fastestLapSpeed    5383 non-null object
statusId           23777 non-null int64
dtypes: float64(2), int64(7), object(2)
memory usage: 2.0+ MB


In [6]:
finish_ids = [1,11,12,13,14,15,16,17,18,19,45,50,128,53,55,58,88,111,112,
              113,114,115,116,117,118,119,120,122,123,124,125,127,133,134]
results = results.loc[results['statusId'].isin(finish_ids)]

In [7]:
results = results.drop(['fastestLap', 'rank',
                        'fastestLapTime', 'fastestLapSpeed'], 1)
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12637 entries, 0 to 23774
Data columns (total 7 columns):
raceId           12637 non-null int64
driverId         12637 non-null int64
constructorId    12637 non-null int64
grid             12637 non-null int64
positionOrder    12637 non-null int64
laps             12637 non-null int64
statusId         12637 non-null int64
dtypes: int64(7)
memory usage: 789.8 KB


In [8]:
#create classes for podium finish (1) or not (0)
results['positionOrder'] = results['positionOrder'].apply(
    lambda x: 1 if x <= 3 else 0)

#probably gonna have class imbalance problem
print(results['positionOrder'].value_counts())

0    9694
1    2943
Name: positionOrder, dtype: int64


In [9]:
#convert other id variables to cat and create dummies
results['raceId'] = results['raceId'].astype(str)
results['driverId'] = results['driverId'].astype(str)
results['constructorId'] = results['constructorId'].astype(str)
results['statusId'] = results['statusId'].astype(str)

raceId = pd.get_dummies(results.raceId)
driverId = pd.get_dummies(results.driverId)
constructorId = pd.get_dummies(results.constructorId)
statusId = pd.get_dummies(results.statusId)

results_no_dummies = results.drop(['raceId', 'driverId', 'constructorId', 'statusId'], 1)

In [31]:
#put driverId, constructorId back in
results_ = pd.concat([results_no_dummies, driverId, constructorId], axis=1)

In [32]:
#separate majority and minority classes for even train/test split
podium_0 = results_.loc[results_['positionOrder'] == 0]
podium_1 = results_.loc[results_['positionOrder'] == 1]

X_0 = podium_0.drop(['positionOrder'], 1)
y_0 = podium_0['positionOrder']
X_1 = podium_1.drop(['positionOrder'], 1)
y_1 = podium_1['positionOrder']

In [33]:
print(len(X_0) == len(y_0))
print(len(X_1) == len(y_1))

True
True


In [34]:
from sklearn.model_selection import train_test_split
#majority class
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_0,
                                                        y_0,
                                                        test_size=0.2)
#minority class
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_1,
                                                        y_1,
                                                        test_size=0.2)

#combine to create class proportional train & test sets
X_train = pd.concat([X_train0, X_train1])
X_test = pd.concat([X_test0, X_test1]) 
y_train = pd.concat([y_train0, y_train1])
y_test = pd.concat([y_test0, y_test1])

In [35]:
print(len(X_train) == len(y_train))
print(len(X_test) == len(y_test))

True
True


In [36]:
import imblearn
from imblearn.over_sampling import RandomOverSampler

#random oversample train & test sets
X_train_ros, y_train_ros = RandomOverSampler().fit_sample(X_train, y_train)
X_test_ros, y_test_ros = RandomOverSampler().fit_sample(X_test, y_test)

#make sure it worked
print('train class balance: {}%'.format(len(y_train_ros[y_train_ros == 1]) / len(y_train_ros) * 100))
print('test class balance: {}%'.format(len(y_test_ros[y_test_ros == 1]) / len(y_test_ros) * 100))

train class balance: 50.0%
test class balance: 50.0%


## Multi-layer perceptron neural net

In [37]:
from sklearn.neural_network import MLPClassifier
import time

start_time = time.clock()
mlp = MLPClassifier(hidden_layer_sizes=(1000,)).fit(X_train_ros, y_train_ros)
print(mlp.score(X_train_ros, y_train_ros))
print('{} seconds'.format(time.clock() - start_time))

0.8936814958091553
223.48893600000002 seconds


In [38]:
from sklearn.model_selection import cross_val_score
start_time = time.clock()
print('train set cv=5')
print(cross_val_score(mlp, X_train_ros, y_train_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

train set cv=5
[0.84300451 0.84945197 0.85686654 0.85815603 0.85525467]
455.99673999999993 seconds


In [39]:
start_time = time.clock()
print('test set cv=5')
print(cross_val_score(mlp, X_test_ros, y_test_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

test set cv=5
[0.8492268  0.86469072 0.8685567  0.86726804 0.88113695]
114.90104899999983 seconds


In [42]:
start_time = time.clock()
mlp = MLPClassifier(hidden_layer_sizes=(500,250,250,)
                   ).fit(X_train_ros, y_train_ros)
print(mlp.score(X_train_ros, y_train_ros))
print('{} seconds'.format(time.clock() - start_time))

0.9190844616376531
208.12137800000028 seconds


In [43]:
start_time = time.clock()
print('train set cv=5')
print(cross_val_score(mlp, X_train_ros, y_train_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

train set cv=5
[0.85041908 0.85557705 0.85299807 0.8136686  0.8549323 ]
446.36512900000025 seconds


In [44]:
start_time = time.clock()
print('test set cv=5')
print(cross_val_score(mlp, X_test_ros, y_test_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

test set cv=5
[0.85051546 0.86340206 0.82603093 0.87113402 0.87855297]
57.57107499999984 seconds


## Random Forest

In [45]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

start_times = time.clock()
rfc = RandomForestClassifier().fit(X_train_ros, y_train_ros)
print('train set cv=5')
print(cross_val_score(rfc, X_train_ros, y_train_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

train set cv=5
[0.90651193 0.9174726  0.93068988 0.92617666 0.93197937]
62.7291009999999 seconds


In [46]:
start_time = time.clock()
print('test set cv=5')
print(cross_val_score(rfc, X_test_ros, y_test_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

test set cv=5
[0.90979381 0.91752577 0.94201031 0.93298969 0.95348837]
0.5986440000001494 seconds


## Gradient Boosting Classifier

In [47]:
from sklearn.ensemble import GradientBoostingClassifier

start_time = time.clock()
gbc = ensemble.GradientBoostingClassifier().fit(X_train_ros, y_train_ros)
print('train set cv=5')
print(cross_val_score(gbc, X_train_ros, y_train_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

train set cv=5
[0.83687943 0.84332689 0.83333333 0.84364926 0.8349452 ]
210.767785 seconds


In [48]:
start_time = time.clock()
print('test set cv=5')
print(cross_val_score(gbc, X_test_ros, y_test_ros, cv=5))
print('{} seconds'.format(time.clock() - start_time))

test set cv=5
[0.84407216 0.85438144 0.85309278 0.85438144 0.88501292]
23.579212000000098 seconds


## Results
Considering all they have to train on is driver, constructor, and grid position information, all models performed much better than I expected them to. Random forest however is the best performing model for this classification task, and by a significant margin. It is also by far the fastest model to train and run.

Compared to earlier rounds missing constructor & driver data, NN sees the most improvement in scores from having the additional data, but also a significant increase in time to train. Considering the quality of results I got from only including what I could from this file (dataset has a few other files to draw from), I would like to spend more time on predicting race results as a portfolio project or just for fun.