In [421]:
from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from models import *
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

predict = 'podium'

def filter_races_by_years(races, num_of_years = 5):
    """
    Filters the races dataframe to keep only races from the last 5 years.

    Parameters:
        races (DataFrame): DataFrame containing races information.

    Returns:
        DataFrame: Filtered DataFrame containing only races from the last 5 years.
    """
    current_year = datetime.now().year
    start_year = current_year - num_of_years
    return races[(races['year'] >= start_year) & (races['year'] <= current_year)]


# Load Datasets + Preprocessing

In [422]:
drivers = Drivers()
drivers.df.drop(['url', 'dob', 'number', 'driverRef'], axis=1, inplace=True)

ps = Pit_Stops()
ps.df.drop(['time', 'duration'], axis=1, inplace=True)

# qual = Qualifying()
# qual.df.drop(['q1', 'q2', 'q3'])

results = Results()
results.df.drop(['milliseconds', 'time', 'fastestLapTime', 'fastestLapSpeed', 'position', 'positionText', 'rank', 'number', 'fastestLap'], axis=1, inplace=True)
print(results.df.dtypes)

# results.df['milliseconds'].apply(lambda x: )
# df['column_name'].replace(r'\\N', -1, inplace=True)

races = Races()
races.df = filter_races_by_years(races.df, 10)
races.df.drop(['year', 'time', 'url','date', 'fp1_date', 'fp1_time','fp2_date', 'fp2_time','fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time'], axis=1, inplace=True)
races.df = races.df.merge(results.df, on='raceId', how='inner')
if predict == 'podium':
    races.df[predict] = races.df['positionOrder'].apply(lambda x: 1 if x <= 3 else 0)
elif predict == 'pointFinish':
    races.df[predict] = races.df['positionOrder'].apply(lambda x: 1 if x <= 10 else 0)
elif predict == 'winner':
    races.df[predict] = races.df['positionOrder'].apply(lambda x: 1 if x == 1 else 0)
constructors = Contructors()
constructors.df.drop(['constructorRef', 'url'], axis=1, inplace=True)

races.df.head()


resultId           int64
raceId             int64
driverId           int64
constructorId      int64
grid               int64
positionOrder      int64
points           float64
laps               int64
statusId           int64
dtype: object


Unnamed: 0,raceId,round,circuitId,name,resultId,driverId,constructorId,grid,positionOrder,points,laps,statusId,podium
0,900,1,1,Australian Grand Prix,22130,3,131,3,1,25.0,57,1,1
1,900,1,1,Australian Grand Prix,22131,825,1,4,2,18.0,57,1,1
2,900,1,1,Australian Grand Prix,22132,18,1,10,3,15.0,57,1,1
3,900,1,1,Australian Grand Prix,22133,4,6,5,4,12.0,57,1,0
4,900,1,1,Australian Grand Prix,22134,822,3,15,5,10.0,57,1,0


# Encoding Values to Ints

In [423]:
label_encoder = LabelEncoder()
races.df['name'] = label_encoder.fit_transform(races.df['name'])
races.df.head()

Unnamed: 0,raceId,round,circuitId,name,resultId,driverId,constructorId,grid,positionOrder,points,laps,statusId,podium
0,900,1,1,2,22130,3,131,3,1,25.0,57,1,1
1,900,1,1,2,22131,825,1,4,2,18.0,57,1,1
2,900,1,1,2,22132,18,1,10,3,15.0,57,1,1
3,900,1,1,2,22133,4,6,5,4,12.0,57,1,0
4,900,1,1,2,22134,822,3,15,5,10.0,57,1,0


In [424]:
# Dropping features that I don't want to train on
races.df.drop(['raceId', 'points', 'resultId', 'positionOrder', 'round'], axis=1, inplace=True)
races.df.head()


Unnamed: 0,circuitId,name,driverId,constructorId,grid,laps,statusId,podium
0,1,2,3,131,3,57,1,1
1,1,2,825,1,4,57,1,1
2,1,2,18,1,10,57,1,1
3,1,2,4,6,5,57,1,0
4,1,2,822,3,15,57,1,0


# ML Models

In [425]:
X = races.df.drop(predict, axis=1, inplace=False)
y = races.df[predict]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

## Naive Bayes Training

In [426]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)


## NBC Testing

In [427]:
y_pred = nb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.72
Confusion Matrix:
[[1753  863]
 [  11  444]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.67      0.80      2616
           1       0.34      0.98      0.50       455

    accuracy                           0.72      3071
   macro avg       0.67      0.82      0.65      3071
weighted avg       0.90      0.72      0.76      3071



## LR Training

In [447]:
model2 = LogisticRegression(max_iter=1000)
model2.fit(X_train, y_train)

## LR Testing

In [448]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)


y_pred = model2.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)


Accuracy: 0.84
Confusion Matrix:
[[2151  465]
 [  35  420]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.82      0.90      2616
           1       0.47      0.92      0.63       455

    accuracy                           0.84      3071
   macro avg       0.73      0.87      0.76      3071
weighted avg       0.91      0.84      0.86      3071

Accuracy: 0.92
Confusion Matrix:
[[2498  118]
 [ 139  316]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2616
           1       0.73      0.69      0.71       455

    accuracy                           0.92      3071
   macro avg       0.84      0.82      0.83      3071
weighted avg       0.91      0.92      0.92      3071



## Random Forest Training

In [449]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

## Random Forest Testing

In [431]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)


Accuracy: 0.92
Confusion Matrix:
[[2492  124]
 [ 133  322]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2616
           1       0.72      0.71      0.71       455

    accuracy                           0.92      3071
   macro avg       0.84      0.83      0.83      3071
weighted avg       0.92      0.92      0.92      3071



## SVM Training

In [455]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

## SVM Testing

In [456]:
y_pred = model.predict(X_test)
print(y_pred)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

[0 0 0 ... 1 0 0]
Accuracy: 0.92
Confusion Matrix:
[[2499  117]
 [ 137  318]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      2616
           1       0.73      0.70      0.71       455

    accuracy                           0.92      3071
   macro avg       0.84      0.83      0.83      3071
weighted avg       0.92      0.92      0.92      3071



## KNN Training

In [462]:
models = []
for k in [1, 3, 5, 9, 11]:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    models.append(model)

## KNN Testing

In [463]:
for model in models:
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

    conf_matrix = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:')
    print(conf_matrix)

    class_report = classification_report(y_test, y_pred)
    print('Classification Report:')
    print(class_report)

Accuracy: 0.87
Confusion Matrix:
[[2397  219]
 [ 194  261]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      2616
           1       0.54      0.57      0.56       455

    accuracy                           0.87      3071
   macro avg       0.73      0.74      0.74      3071
weighted avg       0.87      0.87      0.87      3071

Accuracy: 0.86
Confusion Matrix:
[[2418  198]
 [ 219  236]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      2616
           1       0.54      0.52      0.53       455

    accuracy                           0.86      3071
   macro avg       0.73      0.72      0.73      3071
weighted avg       0.86      0.86      0.86      3071

Accuracy: 0.87
Confusion Matrix:
[[2469  147]
 [ 247  208]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.9