In [76]:
from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import sklearn.metrics

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from models import *
# Display settings: never truncate rows or columns when a DataFrame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Prediction target and its cut-off. `top_n` is the number of finishing
# positions treated as the positive class: 3 for 'podium', 1 for 'winner',
# and 10 for any other target (e.g. 'pointFinish').
predict = 'podium'
top_n = {'podium': 3, 'winner': 1}.get(predict, 10)

def filter_races_by_years(races, num_of_years = 5, current_year = None):
    """
    Filters the races dataframe to keep only races from the last `num_of_years` years.

    The window is inclusive on both ends: a race is kept when
    current_year - num_of_years <= year <= current_year.

    Parameters:
        races (DataFrame): DataFrame containing races information. Must have a
            'year' column that can be compared with integers.
        num_of_years (int): How many years back from `current_year` the window
            reaches. Defaults to 5.
        current_year (int, optional): Last year of the window. Defaults to the
            calendar year at call time (datetime.now().year). Pass a fixed
            value to get the same result no matter when the notebook is run.

    Returns:
        DataFrame: Independent copy of the rows inside the window. Returning a
            copy lets callers mutate the result (e.g. drop(..., inplace=True))
            without pandas treating it as a slice of `races`.
    """
    if current_year is None:
        current_year = datetime.now().year
    start_year = current_year - num_of_years
    # Series.between is inclusive on both bounds by default.
    in_window = races['year'].between(start_year, current_year)
    return races[in_window].copy()


# Load Datasets + Preprocessing

In [77]:
# Dataset wrappers (Drivers, Pit_Stops, Results, Races, Contructors) are not
# defined in this notebook; presumably they come from `from models import *`.
# Each exposes its table as a pandas DataFrame in `.df`.

# Drivers: drop columns that are not used in this notebook.
drivers = Drivers()
drivers.df.drop(['url', 'dob', 'number', 'driverRef'], axis=1, inplace=True)

# Pit stops: drop the time/duration columns.
ps = Pit_Stops()
ps.df.drop(['time', 'duration'], axis=1, inplace=True)

# TODO: qualifying data (Qualifying()) is not loaded yet.

# Results: keep ids, grid, finishing order, points, laps and status only.
results = Results()
results.df.drop(['milliseconds', 'time', 'fastestLapTime', 'fastestLapSpeed', 'position', 'positionText', 'rank', 'number', 'fastestLap'], axis=1, inplace=True)
print(results.df.dtypes)

# Races: restrict to the last 10 years, drop scheduling/url columns and attach
# one row per result. The chain builds a new frame at every step, so nothing is
# mutated in place on the filtered slice (avoids SettingWithCopyWarning).
races = Races()
races.df = (
    filter_races_by_years(races.df, 10)
    .drop(columns=['time', 'url','date', 'fp1_date', 'fp1_time','fp2_date', 'fp2_time','fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time'])
    .merge(results.df, on='raceId', how='inner')
)

# Binary target column named after `predict`:
#   'winner'                 -> 1 when positionOrder equals top_n
#   'podium' / 'pointFinish' -> 1 when positionOrder is within the top_n
if predict == 'winner':
    races.df[predict] = (races.df['positionOrder'] == top_n).astype(int)
elif predict in ('podium', 'pointFinish'):
    races.df[predict] = (races.df['positionOrder'] <= top_n).astype(int)
elif predict not in races.df.columns:
    # Fail here rather than with a KeyError when the target is selected later.
    raise ValueError(
        f"Unknown prediction target {predict!r}: expected 'podium', "
        "'pointFinish', 'winner' or an existing column name."
    )

# Constructors (class name spelled `Contructors` where it is defined).
constructors = Contructors()
constructors.df.drop(['constructorRef', 'url'], axis=1, inplace=True)

races.df.head()


resultId           int64
raceId             int64
driverId           int64
constructorId      int64
grid               int64
positionOrder      int64
points           float64
laps               int64
statusId           int64
dtype: object


Unnamed: 0,raceId,year,round,circuitId,name,resultId,driverId,constructorId,grid,positionOrder,points,laps,statusId,podium
0,900,2014,1,1,Australian Grand Prix,22130,3,131,3,1,25.0,57,1,1
1,900,2014,1,1,Australian Grand Prix,22131,825,1,4,2,18.0,57,1,1
2,900,2014,1,1,Australian Grand Prix,22132,18,1,10,3,15.0,57,1,1
3,900,2014,1,1,Australian Grand Prix,22133,4,6,5,4,12.0,57,1,0
4,900,2014,1,1,Australian Grand Prix,22134,822,3,15,5,10.0,57,1,0


# Encoding Values to Ints

In [78]:
# Replace the race name and the season year with integer codes.
# Each column keeps its own fitted encoder so the codes can be mapped back to
# the original values later with `<encoder>.inverse_transform(...)`.
name_encoder = LabelEncoder()
races.df['name'] = name_encoder.fit_transform(races.df['name'])

year_encoder = LabelEncoder()
races.df['year'] = year_encoder.fit_transform(races.df['year'])

# Kept for any other cell that still refers to `label_encoder`; it is the
# encoder fitted on 'year'.
label_encoder = year_encoder

races.df.head()

Unnamed: 0,raceId,year,round,circuitId,name,resultId,driverId,constructorId,grid,positionOrder,points,laps,statusId,podium
0,900,0,1,1,2,22130,3,131,3,1,25.0,57,1,1
1,900,0,1,1,2,22131,825,1,4,2,18.0,57,1,1
2,900,0,1,1,2,22132,18,1,10,3,15.0,57,1,1
3,900,0,1,1,2,22133,4,6,5,4,12.0,57,1,0
4,900,0,1,1,2,22134,822,3,15,5,10.0,57,1,0


In [79]:
# Remove columns that should not be used as training features.
# `positionOrder` is the column the target label was derived from, so keeping
# it would hand the answer to the model.
# NOTE(review): `points` looks outcome-derived as well (see the head() output
# above) — confirm. `laps` and `statusId` stay in the frame; verify they are
# known before the race, otherwise they leak the result too.
races.df.drop(columns=['points', 'resultId', 'positionOrder', 'round'], inplace=True)
races.df.head()


Unnamed: 0,raceId,year,circuitId,name,driverId,constructorId,grid,laps,statusId,podium
0,900,0,1,2,3,131,3,57,1,1
1,900,0,1,2,825,1,4,57,1,1
2,900,0,1,2,18,1,10,57,1,1
3,900,0,1,2,4,6,5,57,1,0
4,900,0,1,2,822,3,15,57,1,0


# ML Models

In [80]:
# Features are every column except the target; the target is the `predict` column.
X = races.df.drop(columns=predict)
y = races.df[predict]

# Split on race ids rather than on rows, so that all entrants of one race end
# up on the same side of the split.
# NOTE(review): test_size=0.7 puts 70% of the races in the test set and only
# 30% in training — confirm this is intended and not a swapped 0.3.
race_ids = X["raceId"].unique()
train_race_ids, test_race_ids = train_test_split(race_ids, test_size=0.7, random_state=42)

# Row-level membership masks, computed once and reused for X and y.
in_train = X["raceId"].isin(train_race_ids)
in_test = X["raceId"].isin(test_race_ids)

X_train, y_train = X[in_train], y[in_train]
X_test, y_test = X[in_test], y[in_test]



In [None]:
# Candidate classifiers. The two estimators with internal randomness
# (RandomForestClassifier, and SVC when probability=True) are seeded so that a
# Restart & Run All reproduces the same scores.
models = [
    GaussianNB(),
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(n_estimators=100, random_state=42),
    SVC(kernel='linear', probability=True, random_state=42),
    KNeighborsClassifier(n_neighbors=9),
]
verbose = False

# `raceId` is only used to group rows per race; it is not a training feature.
# The training frame is the same for every model, so build it once.
X_train_features = X_train.drop("raceId", axis=1)

for model in models:
    print(f'{model}')
    # Training
    model.fit(X_train_features, y_train)

    # Testing: evaluate race by race. Within each race the `top_n` entrants
    # with the highest positive-class probability are labelled 1, all others 0.
    metrics_per_race = {}
    correct_predictions = 0
    for race_id in test_race_ids:
        race_mask = X_test["raceId"] == race_id
        X_test_subset = X_test[race_mask].drop("raceId", axis=1)
        y_test_subset = y_test[race_mask]

        # Column 1 is the probability of class 1 (labels are 0/1).
        y_proba = model.predict_proba(X_test_subset)

        # argsort is ascending, so the last `top_n` positions are the most likely.
        predicted_idx = np.argsort(y_proba[:, 1])[-top_n:]

        y_pred = np.zeros(len(y_test_subset), dtype=int)
        y_pred[predicted_idx] = 1

        # Position (within this race's rows) of the first row labelled 1.
        actual_winner_idx = np.argmax(y_test_subset.values)
        # predicted_idx[-1] is the single most likely entrant; compare scalars
        # instead of relying on the truthiness of a one-element array.
        if predict == 'winner' and predicted_idx[-1] == actual_winner_idx:
            correct_predictions += 1

        # Metric functions are taken from the explicitly imported
        # `sklearn.metrics` module rather than from names that happen to be in
        # the namespace.
        accuracy = sklearn.metrics.accuracy_score(y_test_subset, y_pred)
        precision = sklearn.metrics.precision_score(y_test_subset, y_pred, zero_division=0)
        recall = sklearn.metrics.recall_score(y_test_subset, y_pred, zero_division=0)
        f1 = sklearn.metrics.f1_score(y_test_subset, y_pred, zero_division=0)
        # labels=[0, 1] keeps the matrix 2x2 even if a race lacks one class.
        conf_matrix = sklearn.metrics.confusion_matrix(y_test_subset, y_pred, labels=[0, 1])

        metrics_per_race[race_id] = {
            "accuracy": accuracy,
            "confusion_matrix": conf_matrix,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        }

    # Overall insights
    if predict == 'winner' and verbose:
        print(f"Correct Prediction rate: {correct_predictions / len(test_race_ids)}")
    if verbose:
        print("Performance Metrics Per Race:")
        for race_id, race_metrics in metrics_per_race.items():
            print(f"Race ID: {race_id}")
            print(f"Accuracy: {race_metrics['accuracy']:.4f}")
            print("Confusion Matrix:")
            print(race_metrics["confusion_matrix"])
            print(f"Precision: {race_metrics['precision']:.4f}")
            print(f"Recall: {race_metrics['recall']:.4f}")
            print(f"F1 Score: {race_metrics['f1_score']:.4f}")
            print("-" * 30)

    # Aggregate insights: unweighted mean of the per-race metrics.
    # Exactly `top_n` rows are predicted positive per race, so precision and
    # recall coincide whenever a race also has exactly `top_n` actual positives.
    overall_accuracy = np.mean([race_metrics["accuracy"] for race_metrics in metrics_per_race.values()])
    overall_precision = np.mean([race_metrics["precision"] for race_metrics in metrics_per_race.values()])
    overall_recall = np.mean([race_metrics["recall"] for race_metrics in metrics_per_race.values()])
    overall_f1 = np.mean([race_metrics["f1_score"] for race_metrics in metrics_per_race.values()])

    print(f"Overall Accuracy across races: {overall_accuracy:.4f}")
    print(f"Overall Precision: {overall_precision:.4f}")
    print(f"Overall Recall: {overall_recall:.4f}")
    print(f"Overall F1 Score: {overall_f1:.4f}")
    print("-" * 60)
    


GaussianNB()
Overall Accuracy across races: 0.9125
Overall Precision: 0.7039
Overall Recall: 0.7039
Overall F1 Score: 0.7039
------------------------------------------------------------
LogisticRegression(max_iter=1000)
Overall Accuracy across races: 0.9183
Overall Precision: 0.7237
Overall Recall: 0.7237
Overall F1 Score: 0.7237
------------------------------------------------------------
RandomForestClassifier()
Overall Accuracy across races: 0.9234
Overall Precision: 0.7412
Overall Recall: 0.7412
Overall F1 Score: 0.7412
------------------------------------------------------------
SVC(kernel='linear', probability=True)
Overall Accuracy across races: 0.9202
Overall Precision: 0.7303
Overall Recall: 0.7303
Overall F1 Score: 0.7303
------------------------------------------------------------
KNeighborsClassifier(n_neighbors=9)
Overall Accuracy across races: 0.8856
Overall Precision: 0.6140
Overall Recall: 0.6140
Overall F1 Score: 0.6140
-------------------------------------------------