In [92]:
import pandas as pd
import numpy as np
# from matplotlib import pyplot as plt
# %matplotlib inline
# import seaborn as sns

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE


# Data preparation

In [105]:
def create_dataset(start_year=2019, end_year=2024, num_testyears=1):
    """
    Create dataset used in paper

    Reads ``database/races.csv`` and ``database/results.csv``, builds one row
    per race result for the seasons ``start_year``..``end_year`` (inclusive),
    and writes a SMOTE-balanced training set and an un-resampled test set to
    ``database/qi_train.csv`` and ``database/qi_test.csv``. Both frames are
    also printed.

    Parameters
    ----------
    start_year : int
        First season (``year`` value) to include.
    end_year : int
        Last season (``year`` value) to include.
    num_testyears : int
        Number of trailing seasons held out for testing: rows with
        ``year > end_year - num_testyears`` go to the test set, all other
        rows go to the training set.

    Raises
    ------
    ValueError
        If no rows fall into the training seasons.
    """
    races = pd.read_csv("database/races.csv")
    results = pd.read_csv("database/results.csv")

    # Keep only races in [start_year, end_year] and attach their results
    in_range = races["year"].between(start_year, end_year)
    df = pd.merge(races[in_range], results, on="raceId", how="inner")
    df["podium"] = (df["positionOrder"] <= 3).astype("int64")
    df["win"] = (df["positionOrder"] == 1).astype("int64")

    last_train_year = end_year - num_testyears

    # Wins and podiums over each driver's last 5 result rows of the training
    # period. The history is stably sorted by year so "last 5" does not depend
    # on the seasons appearing in order in the CSV; the order of rows within a
    # season is still taken from the file.
    # NOTE(review): this yields ONE value per driver, attached to every row of
    # that driver - including the training rows whose own `win`/`podium` are
    # counted in it. Confirm this is the intended feature definition.
    history = df[df["year"] <= last_train_year].sort_values("year", kind="stable")
    recent_form = (
        history.groupby("driverId").tail(5)
        .groupby("driverId")[["win", "podium"]]
        .sum()
    )
    # Drivers with no training-period rows are absent from recent_form -> 0
    df["driver_win"] = df["driverId"].map(recent_form["win"]).fillna(0).astype("int64")
    df["driver_podium"] = df["driverId"].map(recent_form["podium"]).fillna(0).astype("int64")

    cols = ["raceId", "year", "grid", "driver_win", "driver_podium", "circuitId", "win"]
    df = df[cols]

    # One hot encoding of circuitId (done before the split so train and test
    # end up with the same circuit_* columns)
    df = pd.get_dummies(df, columns=['circuitId'], prefix='circuit', dtype=int)

    # Split the dataset based on year
    df_train = df[df["year"] <= last_train_year].drop(columns=["year"])
    df_test = df[df["year"] > last_train_year].drop(columns=["year"])
    if df_train.empty:
        raise ValueError(
            f"No training rows for years {start_year}..{last_train_year}; "
            "check start_year, end_year and num_testyears"
        )

    # Oversample the training set with SMOTE (synthetic minority samples, not
    # plain random duplication) to balance the `win` classes. Only the training
    # set is resampled. raceId is still part of X here, so it is resampled too.
    X = df_train.drop(columns=['win'])
    y = df_train['win']
    oversampler = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    df_train = pd.concat([X_resampled, y_resampled], axis=1)

    print(df_train)
    print(df_test)
    df_train.to_csv("database/qi_train.csv", index=False)
    df_test.to_csv("database/qi_test.csv", index=False)


# Build the CSVs from the 2015-2020 seasons; with num_testyears=1 the rows
# with year > 2019 (i.e. 2020) form the test set.
create_dataset(2015, 2020, 1)

      raceId  grid  driver_win  driver_podium  circuit_1  circuit_2  \
0        931     2           1              5          0          0   
1        931     3           0              2          0          0   
2        931     1           2              4          0          0   
3        931     5           0              0          0          0   
4        931     4           0              0          0          0   
...      ...   ...         ...            ...        ...        ...   
3951     928     1           0              3          0          0   
3952     929     1           1              4          0          0   
3953     959     2           1              4          0          0   
3954     958     1           2              4          0          0   
3955    1026     3           2              3          0          0   

      circuit_3  circuit_4  circuit_5  circuit_6  ...  circuit_24  circuit_32  \
0             0          0          0          1  ...           0 

# Model training & testing

In [106]:
# Get data
df_train = pd.read_csv("database/qi_train.csv")
df_test = pd.read_csv("database/qi_test.csv")
# raceId is only used to group test rows by race; it is not a model feature
X_train = df_train.drop(columns=["raceId", "win"])
y_train = df_train["win"]

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Test model: within each race, the entry with the highest log-probability of
# the positive class is the predicted winner.
y_pred = []      # position (within the race's rows) of the predicted winner
race_hits = []   # whether that row is actually labelled win == 1
for ri, df_slice in df_test.groupby("raceId", sort=False):
    X_test = df_slice.drop(columns=["raceId", "win"])
    logprobs = model.predict_log_proba(X_test)[:, 1]
    best = int(np.argmax(logprobs))
    y_pred.append(best)
    # Check the label of the chosen row instead of assuming the winner is
    # always the first row of the race in the CSV.
    race_hits.append(bool(df_slice["win"].iloc[best] == 1))

n_correct = sum(race_hits)
n_races = len(race_hits)
# Guard against an empty test set (would otherwise divide by zero)
accuracy = n_correct / n_races if n_races else float("nan")
print(f"Correct: {n_correct}/{n_races}\nAccuracy: {accuracy:.2f}")

Correct: 12/17
Accuracy: 0.71
