In [65]:
import pandas as pd
import numpy as np
# from matplotlib import pyplot as plt
# %matplotlib inline
# import seaborn as sns

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, RandomOverSampler


# Data preparation

In [95]:
def create_dataset(train_start_year=2019, train_end_year=2023, test_start_year=2024, test_end_year=2024):
    """
    Create dataset used in paper
    """
    races = pd.read_csv("database/races.csv")
    results = pd.read_csv("database/results.csv")

    # Get races within specified years
    races = races[(races["year"] >= min(train_start_year, test_start_year)) & (races["year"] <= max(train_end_year, test_end_year))]
    df = pd.merge(races, results, on="raceId", how="inner")
    df["podium"] = df["positionOrder"].apply(lambda x: 1 if x <= 3 else 0)
    df["win"] = df["positionOrder"].apply(lambda x: 1 if x == 1 else 0)

    # Get the number of wins and podiums in the last 5 races for each driver
    driver_win = {}
    driver_podium = {}
    for d in pd.unique(df["driverId"]):
        df_driver = df[(df["driverId"] == d) & (df["year"] <= train_end_year)].tail(5)
        driver_win[d] = df_driver["win"].sum()
        driver_podium[d] = df_driver["podium"].sum()

    df["driver_win"] = df["driverId"].map(driver_win)
    df["driver_podium"] = df["driverId"].map(driver_podium)
    cols = ["raceId", "year", "grid", "driver_win", "driver_podium", "circuitId", "win"]
    df = df[cols]

    # One hot encoding of circuitId
    df = pd.get_dummies(df, columns=['circuitId'], prefix='circuit', dtype=int)

    # Split the dataset based on year
    df_train = df[(df["year"] >= train_start_year) & (df["year"] <= train_end_year)].drop(["year"], axis=1)
    df_test = df[(df["year"] >= test_start_year) & (df["year"] <= test_end_year)].drop(["year"], axis=1)

    # Random oversample to make a balanced dataset
    oversampler = SMOTE(sampling_strategy='auto', random_state=42)
    # oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X = df_train.drop(columns=['win'])
    y = df_train['win']
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    df_train = pd.concat([X_resampled, y_resampled], axis=1)

    print(len(pd.unique(df_train["raceId"])))
    print(len(pd.unique(df_test["raceId"])))
    df_train.to_csv("database/qi_train.csv", index=False)
    df_test.to_csv("database/qi_test.csv", index=False)


create_dataset(2010, 2014, 2020, 2024)

97
95


# Model training & testing

In [96]:
# Get data
df_train = pd.read_csv("database/qi_train.csv")
df_test = pd.read_csv("database/qi_test.csv")
X_train = df_train.drop(["raceId", "win"], axis=1, inplace=False)
y_train = df_train["win"]

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Test model
y_pred = []
for ri in pd.unique(df_test["raceId"]):
    df_slice = df_test[df_test["raceId"] == ri]
    X_test = df_slice.drop(["raceId", "win"], axis=1, inplace=False)
    logprobs = model.predict_log_proba(X_test)[:, 1]
    y_pred.append(np.argmax(logprobs))

print(f"Correct: {y_pred.count(0)}/{len(y_pred)}\nAccuracy: {y_pred.count(0)/len(y_pred):.4f}")

Correct: 37/95
Accuracy: 0.3895
