In [26]:
import pandas as pd
import plotly.express as px
import os

In [27]:
# Required load_data function
def load_race_results_data(file_path=None):
    """Load the race results CSV and normalise its constructor identifiers.

    Args:
        file_path: Optional path of the race results CSV. When omitted, the
            file is read from ``../Datasets/race_results.csv``, resolved
            against the current working directory.

    Returns:
        pandas.DataFrame: The CSV contents with ``'alphatauri'`` relabelled as
        ``'rb'`` in ``constructorId`` and with the rows whose ``constructorId``
        is ``'toro_rosso'``, ``'force_india'`` or ``'racing_point'`` removed.
        The original row index of the surviving rows is kept.
    """
    if file_path is None:
        # A module name is not a path, so prefixing os.path.dirname(__name__)
        # contributes nothing: the default is, and always was, relative to the
        # working directory. Spell that out instead of hiding it.
        file_path = os.path.join('..', 'Datasets', 'race_results.csv')

    race_results = pd.read_csv(file_path)

    # Treat AlphaTauri results as belonging to the 'rb' constructor id.
    race_results['constructorId'] = race_results['constructorId'].replace({'alphatauri': 'rb'})

    discontinued_teams = ['toro_rosso', 'force_india', 'racing_point']
    return race_results[~race_results['constructorId'].isin(discontinued_teams)]

def load_race_schedule_data(file_path=None):
    """Load the race schedule CSV.

    Args:
        file_path: Optional path of the schedule CSV. When omitted, the file
            is read from ``../Datasets/race_schedule.csv``, resolved against
            the current working directory.

    Returns:
        pandas.DataFrame: The CSV contents, unmodified.
    """
    if file_path is None:
        # os.path.dirname(__name__) is always '' (a module name has no
        # directory part), so the default path is simply cwd-relative.
        file_path = os.path.join('..', 'Datasets', 'race_schedule.csv')
    return pd.read_csv(file_path)

def load_lap_times_data(start_year=2017, end_year=2024, data_dir=None):
    """Load the per-season lap time CSV files and stack them into one frame.

    One file named ``lap_times_<year>.csv`` is read for every year from
    ``start_year`` to ``end_year`` (both inclusive), in ascending year order.

    Args:
        start_year: First season to load. Defaults to 2017.
        end_year: Last season to load (inclusive). Defaults to 2024.
        data_dir: Directory holding the CSV files. When omitted,
            ``../Datasets`` relative to the current working directory is used.

    Returns:
        pandas.DataFrame: All files concatenated in year order with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.
        FileNotFoundError: If the file of any requested year is missing
            (raised by ``pandas.read_csv``).
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})."
        )
    if data_dir is None:
        data_dir = os.path.join('..', 'Datasets')

    file_paths = [
        os.path.join(data_dir, f'lap_times_{year}.csv')
        for year in range(start_year, end_year + 1)
    ]
    yearly_frames = [pd.read_csv(path) for path in file_paths]
    return pd.concat(yearly_frames, ignore_index=True)

def convert_time_to_seconds(time_str):
    """Convert a lap-time string to a number of seconds.

    Supported layouts are ``'SS.mmm'``, ``'M:SS.mmm'`` and ``'H:MM:SS.mmm'``;
    surrounding whitespace is ignored.

    Args:
        time_str: The lap time as text, or a missing value (``None`` or a
            float NaN, which is what pandas yields for an empty CSV cell).

    Returns:
        float: The time in seconds, or NaN when ``time_str`` is missing.

    Raises:
        ValueError: If the text has more than three colon-separated fields or
            a field is not numeric.
    """
    # Missing cells arrive as None / float NaN; propagate them as NaN so a
    # single gap does not abort a whole ``Series.apply``.
    if time_str is None or (isinstance(time_str, float) and time_str != time_str):
        return float('nan')

    fields = time_str.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Unrecognised lap time format: {time_str!r}")

    *leading_fields, seconds_field = fields
    total_seconds = float(seconds_field)

    # Walk outwards from the seconds field: minutes weigh 60, hours 3600.
    unit_seconds = 60
    for field in reversed(leading_fields):
        total_seconds = int(field) * unit_seconds + total_seconds
        unit_seconds *= 60
    return total_seconds

In [28]:
def impute_lap_times_mean_median(driver_id, season, round_num, df_lap_times, race_schedule, method='mean'):
    """Fill a driver's missing laps in one race with a single typical lap time.

    The typical lap time is the mean or median of the driver's lap times at
    the same Grand Prix (matched by ``raceName``) in seasons before
    ``season``, after discarding values outside the 1st-99th percentile. When
    that history yields no value, the same statistic of the laps the driver
    did record in the target race is used instead.

    A lap counts as missing when its number lies between 1 and the highest lap
    number recorded by any driver in the target race and the driver has no row
    for it.

    Args:
        driver_id: Value to match in the ``driverId`` column.
        season: Season of the target race.
        round_num: Round of the target race within ``season``.
        df_lap_times: Lap table with the columns ``season``, ``round``,
            ``lap``, ``driverId`` and ``time`` (a lap-time string).
        race_schedule: Schedule table with the columns ``season``, ``round``
            and ``raceName``.
        method: ``'mean'`` or ``'median'``.

    Returns:
        pandas.DataFrame: The driver's recorded laps of the target race with
        an added ``lap_time_seconds`` column, followed by one row per missing
        lap. Imputed rows carry ``season``, ``round``, ``lap``, ``driverId``,
        ``lap_time_seconds`` and ``position`` set to ``None``; their other
        columns are NaN. The index is 0..n-1.

    Raises:
        ValueError: If ``method`` is neither ``'mean'`` nor ``'median'``.
        IndexError: If ``race_schedule`` has no row for the target race.
    """
    # Fail fast: a typo in `method` should not cost a merge over the lap table.
    if method not in ('mean', 'median'):
        raise ValueError("Invalid method specified. Use 'mean' or 'median'.")

    def _typical_lap_time(lap_seconds):
        """Mean/median of ``lap_seconds`` restricted to its 1st-99th percentile."""
        lap_seconds = lap_seconds.astype(float)
        lower_limit = lap_seconds.quantile(0.01)
        upper_limit = lap_seconds.quantile(0.99)
        trimmed = lap_seconds[(lap_seconds >= lower_limit) & (lap_seconds <= upper_limit)]
        return trimmed.mean() if method == 'mean' else trimmed.median()

    # Resolve the race name so the same Grand Prix can be matched across seasons.
    schedule_row = race_schedule[(race_schedule['season'] == season) &
                                 (race_schedule['round'] == round_num)]
    if schedule_row.empty:
        raise IndexError(
            f"No race found in race_schedule for season {season}, round {round_num}."
        )
    race_name = schedule_row['raceName'].values[0]

    # Laps recorded in the target race: by everyone, and by this driver.
    race_laps = df_lap_times[(df_lap_times['season'] == season) &
                             (df_lap_times['round'] == round_num)]
    driver_lap_times = race_laps[race_laps['driverId'] == driver_id].copy()
    driver_lap_times['lap_time_seconds'] = driver_lap_times['time'].apply(convert_time_to_seconds)

    # Without any lap for the race its length is unknown, so nothing can be imputed.
    if race_laps.empty:
        return driver_lap_times.reset_index(drop=True)

    max_laps_in_race = int(race_laps['lap'].max())
    missing_laps = sorted(set(range(1, max_laps_in_race + 1)) - set(driver_lap_times['lap'].unique()))
    if not missing_laps:
        return driver_lap_times.reset_index(drop=True)

    # Narrow to this driver's earlier seasons before merging; the result is the
    # same as merging the full table first, but the merge stays small.
    driver_history = df_lap_times[(df_lap_times['driverId'] == driver_id) &
                                  (df_lap_times['season'] < season)]
    driver_history = driver_history.merge(race_schedule[['season', 'round', 'raceName']],
                                          on=['season', 'round'])
    historical_data = driver_history[driver_history['raceName'] == race_name]

    impute_value = _typical_lap_time(historical_data['time'].apply(convert_time_to_seconds))
    if pd.isna(impute_value):
        # No usable history at this Grand Prix: fall back to the driver's own
        # pace in the target race rather than imputing NaN for every lap.
        impute_value = _typical_lap_time(driver_lap_times['lap_time_seconds'])

    imputed_laps = pd.DataFrame([{
        'season': season,
        'round': round_num,
        'lap': lap,
        'position': None,  # Position unknown for imputed laps
        'driverId': driver_id,
        'lap_time_seconds': impute_value
    } for lap in missing_laps])

    # Recorded laps first, imputed laps appended after them.
    return pd.concat([driver_lap_times, imputed_laps], ignore_index=True)

In [29]:
# Example: median imputation for driver 'stroll' in season 2024, round 2.
driver_id = 'stroll'
season = 2024
round_num = 2
# Load the inputs with the loader functions defined earlier in this notebook.
df_lap_times = load_lap_times_data()
# NOTE(review): df_race_results is loaded here but is not passed to the call below.
df_race_results = load_race_results_data()
race_schedule = load_race_schedule_data()
# `df` receives the frame returned by impute_lap_times_mean_median for that race.
df = impute_lap_times_mean_median(driver_id, season, round_num, df_lap_times, race_schedule, method='median')

In [30]:
from statsmodels.tsa.arima.model import ARIMA

def impute_lap_times_arima(driver_id, season, round_num, df_lap_times, df_race_results, race_schedule):
    """Forecast the laps a driver did not complete in a race with an ARIMA model.

    The driver's laps at the same Grand Prix (matched by ``raceName``) from
    every season up to and including ``season`` are ordered by season, round
    and lap, so the series ends with the laps recorded in the target race.
    Values outside the 1st-99th percentile of that series are removed, an
    ARIMA(10, 1, 0) model is fitted on what remains, and one value is forecast
    for every lap after the driver's last recorded lap up to the highest lap
    number recorded by any driver in the target race.

    Nothing is forecast when the driver has no lap in the target race, when
    the driver already reached the highest lap number, or when the filtered
    series is empty.

    Args:
        driver_id: Value to match in the ``driverId`` column.
        season: Season of the target race.
        round_num: Round of the target race within ``season``.
        df_lap_times: Lap table with the columns ``season``, ``round``,
            ``lap``, ``driverId`` and ``time`` (a lap-time string).
        df_race_results: Not used; kept so existing callers keep working.
        race_schedule: Schedule table with the columns ``season``, ``round``
            and ``raceName``.

    Returns:
        pandas.DataFrame: The driver's (unfiltered) lap history at this Grand
        Prix, including the target race, with the added columns ``raceName``
        and ``lap_time_seconds``; followed, when a forecast was made, by one
        row per forecast lap carrying ``season``, ``round``, ``lap``,
        ``driverId``, ``lap_time_seconds`` and ``position`` set to ``None``.
        The index is 0..n-1.

    Raises:
        IndexError: If ``race_schedule`` has no row for the target race.
    """
    # Laps recorded in the target race: by everyone, and by this driver.
    in_target_race = (df_lap_times['season'] == season) & (df_lap_times['round'] == round_num)
    driver_lap_times = df_lap_times[in_target_race & (df_lap_times['driverId'] == driver_id)]

    max_laps_in_race = df_lap_times.loc[in_target_race, 'lap'].max()
    max_laps_for_driver = driver_lap_times['lap'].max()

    # Ensure the same Grand Prix is used across seasons.
    current_race_name = race_schedule[(race_schedule['season'] == season) &
                                      (race_schedule['round'] == round_num)]['raceName'].values[0]

    # Narrow to this driver and the relevant seasons before merging so the
    # merge stays small; the rows selected are the same as merging first.
    driver_laps = df_lap_times[(df_lap_times['driverId'] == driver_id) &
                               (df_lap_times['season'] <= season)]
    lap_history = driver_laps.merge(race_schedule[['season', 'round', 'raceName']],
                                    on=['season', 'round'])
    # Sort explicitly: the model below treats row order as time order.
    lap_history = (lap_history[lap_history['raceName'] == current_race_name]
                   .sort_values(['season', 'round', 'lap'])
                   .reset_index(drop=True))
    lap_history['lap_time_seconds'] = lap_history['time'].apply(convert_time_to_seconds)

    # Drop extreme lap times (outside the 1st-99th percentile) from the
    # training series only; the returned history stays unfiltered.
    lower_percentile = 1
    upper_percentile = 99
    lap_seconds = lap_history['lap_time_seconds'].astype(float)
    lower_limit = lap_seconds.quantile(lower_percentile / 100)
    upper_limit = lap_seconds.quantile(upper_percentile / 100)
    training_series = lap_seconds[(lap_seconds >= lower_limit) &
                                  (lap_seconds <= upper_limit)].reset_index(drop=True)

    # max() of an empty selection is NaN; treat "no laps at all" as "nothing to extend".
    driver_stopped_early = pd.notna(max_laps_for_driver) and max_laps_for_driver < max_laps_in_race
    if not driver_stopped_early or training_series.empty:
        # The target-race laps are already part of lap_history, so they are
        # not appended a second time.
        return lap_history

    # A forecast continues the series past its end, so only the laps after the
    # driver's last recorded lap are filled; this keeps the number of
    # forecasts and the lap numbers they are assigned to in step.
    missing_laps = list(range(int(max_laps_for_driver) + 1, int(max_laps_in_race) + 1))

    # Train the ARIMA model; (p, d, q) needs tuning based on the data.
    # NOTE(review): no check is made that the series is long enough for p=10.
    model_fit = ARIMA(training_series, order=(10, 1, 0)).fit()
    forecasted_laps = model_fit.forecast(steps=len(missing_laps))

    imputed_laps = pd.DataFrame([{
        'season': season,
        'round': round_num,
        'lap': lap,
        'position': None,  # Position unknown for imputed laps
        'driverId': driver_id,
        'lap_time_seconds': forecast
    } for lap, forecast in zip(missing_laps, forecasted_laps)])

    return pd.concat([lap_history, imputed_laps], ignore_index=True)

In [31]:
# Example: ARIMA-based imputation for driver 'stroll' in season 2024, round 2,
# followed by a scatter plot of the resulting lap times.
driver_id = 'stroll'
season = 2024
round_num = 2
# The datasets are loaded again here with the same loader calls as the
# mean/median example cell above.
df_lap_times = load_lap_times_data()
df_race_results = load_race_results_data()
race_schedule = load_race_schedule_data()
# `df` receives the frame returned by impute_lap_times_arima, which stacks the
# driver's laps at this Grand Prix from several races plus any forecast laps.
df = impute_lap_times_arima(driver_id, season, round_num, df_lap_times, df_race_results, race_schedule)

# Percentile bounds (in percent) used to trim extreme lap times before plotting.
lower_percentile=1
upper_percentile=99

lower_limit = df['lap_time_seconds'].quantile(lower_percentile / 100)
upper_limit = df['lap_time_seconds'].quantile(upper_percentile / 100)

# Keep only rows inside the percentile band. Rows whose lap_time_seconds is NaN
# fail both comparisons and are therefore dropped as well.
# Note that `df` is overwritten, so re-running only this part filters again.
df = df[
        (df['lap_time_seconds'] >= lower_limit) &
        (df['lap_time_seconds'] <= upper_limit)
    ]

# x is the row position within the filtered frame (0..len(df)-1), not the 'lap'
# column.
# NOTE(review): the axis is labelled 'Lap Number' although it is a running row
# position across all stacked races — confirm the intended label.
x_values = pd.Series(range(len(df)))

fig = px.scatter(df, x=x_values, y='lap_time_seconds',
                 title='Lap Times Scatter Plot', labels={'x': 'Lap Number', 'lap_time_seconds': 'Lap Time (seconds)'})
fig.show()