In [1]:
import pandas as pd, numpy as np

In [3]:
# Load the input CSV into the working DataFrame and preview its first rows.
# NOTE(review): the preview below shows an `Unnamed: 0` column, presumably a
# saved index from an earlier export; it is carried through unchanged here.
# Confirm whether downstream steps expect it before dropping it.
input_path = "f1_2024_cleaned.csv"
df = pd.read_csv(input_path)

df.head(3)

Unnamed: 0,season,year,round,race_name,driver,driverId,constructor,grid_position,finishing_position,points,...,session_type,made_Q2,made_Q3,eliminated_in_Q1,eliminated_in_Q2,eliminated_in_Q3,qualifying_round_reached,pitted,multi_stop,one_stop
0,2024,2024,1,Bahrain Grand Prix,VER,max_verstappen,Red Bull Racing,1,1,26,...,race,True,True,False,False,True,3,True,True,False
1,2024,2024,1,Bahrain Grand Prix,PER,perez,Red Bull Racing,5,2,18,...,race,True,True,False,False,True,3,True,True,False
2,2024,2024,1,Bahrain Grand Prix,SAI,sainz,Ferrari,4,3,15,...,race,True,True,False,False,True,3,True,True,False


In [4]:
# convert M:SS.sss to seconds
def time_to_seconds(t):
    """Convert a lap-time value to seconds.

    Accepts ``M:SS.sss`` strings (and, more generally, any colon-separated
    form such as ``H:MM:SS.sss``), plain second strings such as ``"83.456"``,
    and values that are already numeric.

    Parameters
    ----------
    t : str, int, float or missing
        Lap time to convert.

    Returns
    -------
    float or None
        The time in seconds, or ``None`` when ``t`` is missing according to
        ``pd.isna``.

    Raises
    ------
    ValueError
        If a string component cannot be parsed as a number.
    """
    if pd.isna(t):
        return None
    # A column that holds only plain-second values is parsed by pandas as
    # numeric, so guard the string handling below: `":" in t` would raise
    # TypeError on a float.
    if not isinstance(t, str):
        return float(t)
    # Fold left to right: every colon multiplies the running total by 60.
    # For "M:SS.sss" this evaluates to float(M) * 60 + float(SS.sss), and a
    # string without a colon is simply parsed as seconds.
    seconds = 0.0
    for part in t.split(":"):
        seconds = seconds * 60 + float(part)
    return seconds

# Qualifying features are built one stage at a time so the new columns are
# appended in the order: all *_sec, then all *_sec_filled, then all *_gap.
quali_sessions = ("Q1", "Q2", "Q3")

# Stage 1: parse each raw qualifying time into seconds.
for session in quali_sessions:
    df[f"{session}_sec"] = df[f"{session}_time"].apply(time_to_seconds)

# Stage 2: replace missing times with a penalty value, 5 seconds slower than
# the slowest time seen for that session across the whole dataset.
for session in quali_sessions:
    parsed = df[f"{session}_sec"]
    df[f"{session}_sec_filled"] = parsed.fillna(parsed.max() + 5)

# Stage 3: gap between each (filled) time and the fastest (filled) time for
# that session across the whole dataset.
for session in quali_sessions:
    filled = df[f"{session}_sec_filled"]
    df[f"{session}_gap"] = filled - filled.min()


# Remove the raw qualifying time columns (their parsed versions were derived
# above), then discard the few rows that have no grid position.
raw_quali_columns = ['Q1_time', 'Q2_time', 'Q3_time']
df = (
    df
    .drop(columns=raw_quali_columns)
    .dropna(subset=['grid_position'])
)


# Missing lap counts are treated as zero laps.
for lap_col in ("laps_total", "laps_count"):
    df[lap_col] = df[lap_col].fillna(0)

# Missing pit durations get a penalty value: 5 more than the largest value
# observed in that same column.
for duration_col in (
    "pit_avg_duration_sec",
    "pit_min_duration_sec",
    "pit_total_duration_sec",
):
    worst_seen = df[duration_col].max()
    df[duration_col] = df[duration_col].fillna(worst_seen + 5)

# Pit-lap columns rely on laps_total having been filled above
# (0 for no race, real laps otherwise):
#   - rows with zero laps get the special code 999 (no race / no pit);
#   - otherwise a missing pit lap becomes laps_total + 1.
no_race = df["laps_total"] == 0
lap_after_finish = df["laps_total"] + 1
for pit_lap_col in ("pit_first_lap", "pit_last_lap"):
    df[pit_lap_col] = np.where(
        no_race,
        999,
        df[pit_lap_col].fillna(lap_after_finish),
    )

# Classification targets: each flag is the string '1' when finishing_position
# is at or inside the cutoff, and the string '0' otherwise.
target_cutoffs = {
    'flg_podium': 3,  # binary podium
    'flg_top5': 5,    # top-5 positions
}
for flag_name, worst_position in target_cutoffs.items():
    df[flag_name] = np.where(df['finishing_position'] <= worst_position, '1', '0')

In [5]:
# Write the recipe output: the prepared DataFrame as CSV, without the index.
# (Per the original note, this dataset was renamed from f1_2022_2023_train
# to f1_2024_test.)
output_path = "f1_2024_test.csv"
df.to_csv(output_path, index=False)