In [1]:
import os

import fastf1 as ff1
import numpy as np
import pandas as pd

from utils import *
import dict_data


c:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
c:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
def timedelta_to_seconds(td):
    """Express a timedelta value as a number of seconds.

    The value is divided by a one-second ``np.timedelta64``; whatever that
    division yields for ``td`` is returned unchanged.
    """
    one_second = np.timedelta64(1, 's')
    return td / one_second

def get_lap_times(session):
    """Collect every recorded lap time of every driver in ``session``.

    Returns a list of dictionaries with the keys ``'driver'``, ``'lap'`` and
    ``'lap_time'``. ``'lap'`` is the zero-based position of the lap within
    that driver's laps. Positions whose lap time is null (NaT) are left out
    of the list but still advance the position counter.
    """
    records = []

    for code in pd.unique(session.laps['Driver']):
        driver_lap_times = session.laps.pick_driver(code)['LapTime']

        # enumerate keeps counting across skipped (null) laps, so 'lap'
        # always reflects the position in the driver's lap table.
        records.extend(
            {'driver': code, 'lap': position, 'lap_time': value}
            for position, value in enumerate(driver_lap_times)
            if pd.notnull(value)
        )

    return records

def get_data(driver, session):
    """Build a per-lap table of timing, weather and tyre data for one driver.

    Parameters
    ----------
    driver
        Driver identifier passed to ``session.laps.pick_driver``.
    session
        Loaded session exposing ``laps`` and ``event``.

    Returns
    -------
    pandas.DataFrame
        One row per lap with the columns ``Driver``, ``Race``, ``Lap``,
        ``Sector 1 Time``, ``Sector 2 Time``, ``Sector 3 Time``, ``Lap Time``
        (all times in seconds as floats), ``Rainfall`` (1 where the weather
        record says True, otherwise 0), ``Track Temp`` and ``Compound``.

    Notes
    -----
    Weather is looked up for this driver's own laps. It used to be read from
    the laps of the whole session and then zipped positionally with the
    driver's laps, which paired each lap with the weather of an unrelated
    lap (the first rows of the session-wide lap table).
    """
    session_driver = session.laps.pick_driver(driver)
    one_second = np.timedelta64(1, 's')

    driver_lap_number = session_driver['LapNumber']  # Driver's lap
    driver_sector1_time = (session_driver['Sector1Time'] / one_second).astype(float)
    driver_sector2_time = (session_driver['Sector2Time'] / one_second).astype(float)
    driver_sector3_time = (session_driver['Sector3Time'] / one_second).astype(float)
    driver_lap_time = (session_driver['LapTime'] / one_second).astype(float)

    # One weather lookup, restricted to this driver's laps so that row i of
    # the weather table describes row i of the driver's lap table.
    weather = session_driver.get_weather_data()
    weather_rainfall = np.where(weather['Rainfall'] == True, 1, 0)
    weather_track_temperature = weather['TrackTemp']  # Track temperature [°C]

    lap_count = len(driver_lap_number)
    driver_list = [driver] * lap_count
    grand_prix_list = [session.event['Location']] * lap_count

    compound = session_driver['Compound']

    # zip pairs the columns positionally and stops at the shortest one.
    list_of_tuples = list(zip(
        driver_list, grand_prix_list, driver_lap_number,
        driver_sector1_time, driver_sector2_time, driver_sector3_time,
        driver_lap_time, weather_rainfall, weather_track_temperature, compound,
    ))

    return pd.DataFrame(
        list_of_tuples,
        columns=['Driver', 'Race', 'Lap', 'Sector 1 Time', 'Sector 2 Time',
                 'Sector 3 Time', 'Lap Time', 'Rainfall', 'Track Temp', 'Compound'],
    )

def get_data_for_time(lap_data_dict, t):
    """Return the entries of ``lap_data_dict`` whose ``"lap"`` value equals ``t``.

    ``lap_data_dict`` is a list of per-lap dictionaries; the matching entries
    are returned in their original order as a new list.
    """
    return [record for record in lap_data_dict if record["lap"] == t]

def get_best_driver(lap_data_dict):
    """Return the ``'driver'`` of the entry with the smallest ``'lap_time'``.

    ``lap_data_dict`` is a list of per-lap dictionaries. ``None`` is returned
    when it is empty. When several entries share the smallest lap time, the
    one that appears first wins.
    """
    if not lap_data_dict:
        return None

    # min() keeps the earliest of equally small items, like a strict '<' scan.
    quickest = min(lap_data_dict, key=lambda record: record['lap_time'])
    return quickest['driver']

def get_best_driver_for_time(session, t):
    """Return the driver with the best lap time at lap position ``t``.

    All lap times of ``session`` are gathered, narrowed down to the entries
    recorded at position ``t``, and the driver of the quickest one is
    returned (``None`` when no entry exists for ``t``).
    """
    laps_at_t = get_data_for_time(get_lap_times(session), t)
    return get_best_driver(laps_at_t)

def get_race_list(year):
    """Return the ``Location`` of every championship event scheduled in ``year``.

    Parameters
    ----------
    year : int
        Season to look up.

    Returns
    -------
    list
        Locations in schedule order, without pre-season testing events.

    Notes
    -----
    Testing events are excluded by asking the schedule for
    ``include_testing=False``. They used to be removed afterwards by
    hard-coded location names for 2020-2022 only: that raised ``ValueError``
    whenever an expected name was missing from the schedule and left every
    other season unfiltered.
    """
    schedule = ff1.get_event_schedule(year, include_testing=False)
    return list(schedule['Location'])

def load_dataset(year_list):
    """Load encoded, fixed-length per-driver race tables for the given seasons.

    For every year in ``year_list`` each race session ('R') is loaded and the
    per-lap table from ``get_data`` is built for each driver. Driver, race and
    compound values are replaced by their codes from ``dict_data``; the table
    is converted to a NumPy array, padded with rows of -1 until it has 78
    rows, and NaN values are replaced with -1.

    Returns
    -------
    list of dict
        One dictionary per year, keyed by ``(driver_encoded, race_encoded)``
        and holding the padded array for that driver and race.

    Raises
    ------
    KeyError
        If a driver or race is missing from ``dict_data.drivers`` /
        ``dict_data.races`` (compounds fall back to -1 instead).

    Notes
    -----
    The three ``*_encoding`` dictionaries are filled as a side product but
    are local and never returned.
    """
    driver_race_data_list = []
    driver_encoding = {}
    race_encoding = {}
    compound_encoding = {}

    for year in year_list:
        # Get the race list for the input year
        race_list = get_race_list(year) 

        # Collected arrays for this year only; a fresh dict per year.
        driver_race_data = {}

        for race in race_list:
            # No error handling here: a session that fails to load aborts the whole run.
            session = ff1.get_session(year, race, 'R')
            session.load()
            driver_list = pd.unique(session.laps['Driver'])

            for driver in driver_list:
                session_driver = session.laps.pick_driver(driver)

                # Load all the driver's information for the current session
                data = get_data(driver, session)

                # Encode and replace driver data.
                driver_encoding[driver] = dict_data.drivers[driver]
                driver_encoded = driver_encoding[driver]
                data['Driver'] = data['Driver'].replace(driver, driver_encoded)

                # Encode and replace race data.
                # NOTE(review): the 'Race' column is filled by get_data from
                # session.event['Location']; the replace only has an effect
                # when that value equals `race` - confirm they always match.
                race_encoding[race] = dict_data.races[race]
                race_encoded = race_encoding[race]
                data['Race'] = data['Race'].replace(race, race_encoded)

                # Compound's driver data from fastf1 library. 
                compound_list = session_driver['Compound']

                # Iterates once per lap (not per distinct compound), so the
                # replace/store/pad steps below are repeated for every lap.
                # If compound_list is empty, nothing is stored for this driver.
                for compound in compound_list:

                    # Encode and replace compound data.
                    # Unknown compounds are encoded as -1.
                    compound_encoding[compound] = dict_data.compound.get(compound, -1)
                    compound_encoded = compound_encoding[compound]
                    data['Compound'] = data['Compound'].replace(compound, compound_encoded) 

                    driver_race_data[(driver_encoded, race_encoded)] = data.values   

                    # Add rows until lap is equal to 78 (Monaco's grand prix lap). 
                    # Each filler row has 10 values, matching the 10 columns
                    # produced by get_data; only driver, race and lap are real.
                    while(driver_race_data[(driver_encoded, race_encoded)].shape[0] < 78):
                        lap = driver_race_data[(driver_encoded, race_encoded)].shape[0] + 1
                        new_row = np.array([[driver_encoded, race_encoded, lap, -1, -1, -1, -1, -1, -1, -1]])
                        driver_race_data[(driver_encoded, race_encoded)] = np.vstack(
                            (driver_race_data[(driver_encoded, race_encoded)], new_row))

        # Replace NaN values with -1
        # Only values are reassigned (no keys added or removed), so updating
        # the dict while iterating over it is safe.
        for key, value in driver_race_data.items():
            driver_race_data[key] = np.nan_to_num(value, nan=-1)

        driver_race_data_list.append(driver_race_data)

    return driver_race_data_list

def get_compound_for_time(session, t):
    """Return the compound of the driver who had the best lap time at ``t``.

    The best driver at position ``t`` is determined first; the entry at
    position ``t`` of that driver's ``'Compound'`` column is then returned.
    ``None`` is returned when the column has no entry at that position.
    """
    fastest_driver = get_best_driver_for_time(session, t)
    compounds = session.laps.pick_driver(fastest_driver)['Compound']

    # At most one position can equal t, so the first match is the only match.
    return next(
        (tyre for position, tyre in enumerate(compounds) if position == t),
        None,
    )


def get_information(session, race, year):
    """Build a per-lap weather table for one race.

    Parameters
    ----------
    session
        Loaded session exposing ``laps`` and ``event``.
    race
        Key into ``dict_data.laps`` giving the number of laps of the race.
    year
        Value written to the ``Year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Race`` (``session.event['Location']``), ``Year``, ``Lap``
        (0-based), ``Air Temperature``, ``Humidity``, ``Pressure``,
        ``Rainfall`` (1 where the weather record says True, otherwise 0),
        ``Track Temperature``, ``Wind Direction`` and ``Wind Speed``. The
        table has at most ``dict_data.laps[race]`` rows; it is shorter when
        fewer weather rows are available, because columns are zipped.

    Raises
    ------
    KeyError
        If ``race`` is not a key of ``dict_data.laps``.
    """
    lap_count = dict_data.laps[race]

    # The weather table is computed once and reused for every column; it was
    # previously recomputed for each of the seven columns read from it.
    weather = session.laps.get_weather_data()
    rainfall = np.where(weather['Rainfall'] == True, 1, 0)

    location = session.event['Location']

    # zip pairs the columns positionally and stops at the shortest one.
    list_of_tuples = list(zip(
        [location] * lap_count,
        [year] * lap_count,
        range(lap_count),
        weather['AirTemp'],
        weather['Humidity'],
        weather['Pressure'],
        rainfall,
        weather['TrackTemp'],
        weather['WindDirection'],
        weather['WindSpeed'],
    ))

    return pd.DataFrame(
        list_of_tuples,
        columns=['Race', 'Year', 'Lap', 'Air Temperature', 'Humidity',
                 'Pressure', 'Rainfall', 'Track Temperature', 'Wind Direction',
                 'Wind Speed'],
    )

In [3]:
def populate_dataset(year_list):
    """Build the per-lap weather/compound dataset for all races of the given seasons.

    For every race whose session loads, the per-lap table from
    ``get_information`` is extended with a ``DriverNumber`` column, its
    ``Race`` column is encoded through ``dict_data.races`` and a
    ``Compound_Encoded`` column stores, for each lap position, the encoded
    compound of the driver with the best lap time at that position (-1 when
    no driver, compound or encoding is available).

    Parameters
    ----------
    year_list : iterable
        Seasons to process.

    Returns
    -------
    pandas.DataFrame
        The tables of all processed races stacked under a fresh index; an
        empty frame when no race could be processed.

    Notes
    -----
    Races that fail to load, or whose data is not available, are reported
    with ``print`` and skipped rather than aborting the run.
    """
    race_frames = []  # One finished table per race; concatenated once at the end.
    race_encoding = {}
    compound_encoding = dict_data.compound

    for year in year_list:
        for race in get_race_list(year):
            session = ff1.get_session(year, race, 'R')

            # Deliberately broad: any loading problem skips this race only.
            try:
                session.load()
            except Exception as e:
                print(f"Failed to fully load session for {race} in {year}: {e}")
                continue  # Skip to the next race if loading fails

            try:
                # Per-lap weather table for the current session
                driver_information = get_information(session, race, year)
            # Referenced through fastf1 because this file never imports the
            # bare name DataNotLoadedError explicitly.
            except ff1.core.DataNotLoadedError as e:
                print(f"Data not loaded for {race} in {year}: {e}")
                continue  # Skip to the next race if data is not available

            # Assignment aligns on the index, so row i receives the
            # 'DriverNumber' found at index label i of session.laps.
            try:
                driver_information['DriverNumber'] = session.laps['DriverNumber']
            except KeyError:
                print(f"'DriverNumber' not found for race {race} in year {year}, filling with NaN")
                driver_information['DriverNumber'] = np.nan

            # Locations without a known code are encoded as -1.
            race_encoding[race] = dict_data.races.get(race, -1)
            driver_information['Race'] = driver_information['Race'].map(lambda x: race_encoding.get(x, -1))

            # Lap times are gathered once per session instead of once per
            # lap, and each driver's compound column is read at most once.
            lap_times = get_lap_times(session)
            compounds_by_driver = {}

            for i, lap in enumerate(driver_information['Lap']):
                best_driver = get_best_driver(get_data_for_time(lap_times, lap))

                target = None  # Compound of the best driver at this lap position
                if best_driver is not None:
                    if best_driver not in compounds_by_driver:
                        compounds_by_driver[best_driver] = list(
                            session.laps.pick_driver(best_driver)['Compound'])
                    driver_compounds = compounds_by_driver[best_driver]
                    if 0 <= lap < len(driver_compounds):
                        target = driver_compounds[lap]

                if target is not None:
                    compound_encoded = compound_encoding.get(target, -1)  # Unknown compound -> -1
                else:
                    compound_encoded = -1

                driver_information.loc[i, 'Compound_Encoded'] = compound_encoded

            race_frames.append(driver_information)

    if not race_frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the growing dataset for every race.
    return pd.concat(race_frames, ignore_index=True)

def get_dataset(year_list):
    """Build the dataset for ``year_list``, save it as CSV and return it.

    The frame produced by ``populate_dataset`` is written to
    ``'exp2_final_data.csv'`` (without the index column) in the current
    working directory, overwriting any existing file of that name.
    """
    full_dataset = populate_dataset(year_list)
    full_dataset.to_csv('exp2_final_data.csv', index=False)
    return full_dataset

In [11]:
# Enable the fastf1 on-disk cache in ./cache (relative to the working directory).
# fastf1 rejects a cache directory that does not exist, so it is created first;
# this keeps "Restart & Run All" working on a fresh checkout.
os.makedirs('cache', exist_ok=True)
ff1.Cache.enable_cache('cache')

In [5]:
# Register alternative location spellings in dict_data so that both spellings
# resolve to the same lap count and race code.
# Each pair is (alias to add, key that already exists in dict_data).
for alias, existing_key in (
    ('Monte Carlo', 'Monte-Carlo'),
    ('Singapore', 'Marina Bay'),
    ('Nürburgring', 'Nürburg'),
    ('Bahrain', 'Sakhir5'),
    ('Lusail', 'Al Daayen'),
):
    dict_data.laps[alias] = dict_data.laps[existing_key]
    dict_data.races[alias] = dict_data.races[existing_key]

In [6]:
# Seasons to include in the dataset.
year_list = [2019, 2020, 2021, 2022]
# Loads every race of those seasons (slow on a cold cache) and, through
# get_dataset, also writes the result to 'exp2_final_data.csv'.
exp2_final_data = get_dataset(year_list)

core           INFO 	Loading data for Australian Grand Prix - Race [v3.3.9]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['77', '44', '33', '5', '16', '20', '27', '7', '18', '26', '10', '4', '11', '23', '99', '63', '88', '8', '3', '55']
core           INFO 	Loading data for Bahrain Grand Prix - R

Failed to fully load session for Sakhir in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Australian Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Jeddah in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Melbourne in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Miami Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Imola in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Spanish Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Miami in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Barcelona in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Azerbaijan Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Monaco in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Canadian Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Baku in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for British Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Montréal in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Austrian Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Silverstone in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for French Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Spielberg in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Hungarian Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Le Castellet in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Belgian Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Budapest in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Dutch Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Spa-Francorchamps in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Italian Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Zandvoort in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Singapore Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Monza in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Japanese Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Marina Bay in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for United States Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Suzuka in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Austin in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for São Paulo Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for Mexico City in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v3.3.9]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Failed to fully load session for São Paulo in 2022: 'DriverNumber'


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...


Failed to fully load session for Yas Island in 2022: 'DriverNumber'


In [7]:
# Short alias only: df refers to the same DataFrame object, not a copy.
df = exp2_final_data

In [8]:
# Preview up to the first 50 rows of the dataset.
df.head(50)

Unnamed: 0,Race,Year,Lap,Air Temperature,Humidity,Pressure,Rainfall,Track Temperature,Wind Direction,Wind Speed,DriverNumber,Compound_Encoded
0,2,2019,0,23.7,70.3,1015.4,0,43.6,176,1.4,10,0.0
1,2,2019,1,23.6,70.1,1015.3,0,43.3,191,1.3,10,0.0
2,2,2019,2,23.5,70.5,1015.4,0,43.4,68,1.3,10,0.0
3,2,2019,3,23.7,69.1,1015.3,0,43.4,328,1.8,10,0.0
4,2,2019,4,23.7,69.1,1015.4,0,43.0,149,1.2,10,0.0
5,2,2019,5,23.5,70.5,1015.3,0,42.9,178,2.2,10,0.0
6,2,2019,6,23.4,70.8,1015.3,0,42.9,0,1.4,10,0.0
7,2,2019,7,23.4,70.7,1015.3,0,42.5,333,1.1,10,0.0
8,2,2019,8,23.5,70.0,1015.3,0,42.8,152,1.7,10,0.0
9,2,2019,9,23.5,70.3,1015.4,0,42.3,98,1.2,10,0.0


In [9]:
# Show the working directory; the relative paths used in this notebook
# ('cache', the CSV outputs) are resolved against it.
import os
os.getcwd()

'c:\\Users\\User\\Desktop\\Capstone Project'

In [10]:
# Write the dataset to 'dataset.csv' without the index column.
# get_dataset has already saved the same frame as 'exp2_final_data.csv'.
df.to_csv('dataset.csv', index=False)