In [1]:
NAME = "Alison Menezes"

### Building the dataset

The most comprehensive data source for my questions is FASTF1; however, it is a library, not a dataset. I will generate my own dataset using the FASTF1 library and use it to perform my analysis.

In [2]:
import fastf1
from fastf1.core import Laps
from fastf1.events import get_event_schedule
import pandas as pd

In [None]:
# drivers for the 2025 season
drivers_2025 = ['NOR', 'VER', 'RUS', 'PIA', 'ANT', 'ALB', 'OCO', 'STR', 'HAM', 'LEC', 
                'HUL', 'BEA', 'TSU', 'SAI', 'HAD', 'GAS', 'LAW', 'DOO', 'BOR', 'ALO']

# there is a lot of race data, so I will pull from 2021 to the most recent 
def get_all_races(start_year=2021, end_year=2024):
    races = []
    
    for year in range(start_year, end_year + 1):
        schedule = get_event_schedule(year)
        
        for i, event in schedule.iterrows():
            # only want conventional races (R) 
            if event['EventFormat'] == 'conventional':
                race_info = (year, event['EventName'], 'R')
                races.append(race_info)
    
    return races

races = get_all_races(2021, 2024)
for year, name, session in races:
    print(f"{year} - {name} ({session})")



2021 - Bahrain Grand Prix (R)
2021 - Emilia Romagna Grand Prix (R)
2021 - Portuguese Grand Prix (R)
2021 - Spanish Grand Prix (R)
2021 - Monaco Grand Prix (R)
2021 - Azerbaijan Grand Prix (R)
2021 - French Grand Prix (R)
2021 - Styrian Grand Prix (R)
2021 - Austrian Grand Prix (R)
2021 - Hungarian Grand Prix (R)
2021 - Belgian Grand Prix (R)
2021 - Dutch Grand Prix (R)
2021 - Russian Grand Prix (R)
2021 - Turkish Grand Prix (R)
2021 - United States Grand Prix (R)
2021 - Mexico City Grand Prix (R)
2021 - Qatar Grand Prix (R)
2021 - Saudi Arabian Grand Prix (R)
2021 - Abu Dhabi Grand Prix (R)
2022 - Bahrain Grand Prix (R)
2022 - Saudi Arabian Grand Prix (R)
2022 - Australian Grand Prix (R)
2022 - Miami Grand Prix (R)
2022 - Spanish Grand Prix (R)
2022 - Monaco Grand Prix (R)
2022 - Azerbaijan Grand Prix (R)
2022 - Canadian Grand Prix (R)
2022 - British Grand Prix (R)
2022 - French Grand Prix (R)
2022 - Hungarian Grand Prix (R)
2022 - Belgian Grand Prix (R)
2022 - Dutch Grand Prix (R)
202

In [4]:
def get_style_features(lap, driver, race_name, year):
    # this line gets telemetry for a lap (speed, throttle, brake, etc)
    telemetry = lap.get_car_data().add_distance()
    # these are all the features that could be useful for building driver profiles
    features = {
        'Driver': driver,
        'Track': race_name,
        'Year': year,
        'LapTime_s': lap['LapTime'].total_seconds(),
        'MeanSpeed': telemetry['Speed'].mean(),
        'MaxSpeed': telemetry['Speed'].max(),
        'ThrottleMean': telemetry['Throttle'].mean(),
        'BrakePct': (telemetry['Brake'] > 0).sum() / len(telemetry) * 100,
        'BrakingEvents': (telemetry['Brake'] > 0).astype(int).diff().fillna(0).gt(0).sum()
    }
    return features

In [None]:
import time
output = []

# popular tracks
tracks = ['Monaco', 'Monza', 'Silverstone', 'Spa', 'Suzuka', 'Bahrain', 'Australia', 'Azerbaijan', 'Austria', 'China']
years = [2021, 2022, 2023, 2024]
session_type = 'R' 

for year in years:
    for track in tracks:
        print(f"\nLoading session: {track} {year}")
        try:
            session = fastf1.get_session(year, track, session_type)
            session.load(laps=True)
        except Exception as e:
            print(f"Failed to load session {track} {year}: {e}")
            continue
        
        session_drivers = session.laps['Driver'].unique()
        valid_drivers = [d for d in session_drivers if d in drivers_2025]

        for driver in valid_drivers:
            try:
                driver_laps = session.laps.pick_driver(driver).pick_quicklaps()
                driver_laps = driver_laps.sort_values(by='LapTime').head(3)  # top 3 laps
                if driver_laps.empty:
                    print(f"No laps for {driver} at {track} {year}")
                    continue

                for _, lap in driver_laps.iterrows():
                    features = get_style_features(lap, driver, track, year)
                    if features:
                        output.append(features)
            except Exception as e:
                print(f"Skipping lap for {driver} at {track} {year}: {e}")

        time.sleep(3)  # slight delay because I was having data loading issues

df = pd.DataFrame(output)
df.to_csv("style_updated_0415.csv", index=False)

In [None]:
# Since the rookies have no previous race data in F1, 
# I'll use data from the 2025 races AND qualifying matches that have happened already 
# and just pull enough laps to match the rest of the drivers
rookies1 = ['ANT', 'BOR', 'BEA', 'LAW']
rookies2 = ['DOO', 'HAD']
races1 = [(2025, 'Australian Grand Prix', 'R'), (2025, 'Australian Grand Prix', 'Q'), 
          (2025, 'Chinese Grand Prix', 'R'), (2025, 'Chinese Grand Prix', 'Q'),
          (2025, 'Japanese Grand Prix', 'R'), (2025, 'Japanese Grand Prix', 'Q'),
          (2025, 'Bahrain Grand Prix', 'R'), (2025, 'Bahrain Grand Prix', 'Q')]
races2 = [(2025, 'Chinese Grand Prix', 'R'), (2025, 'Chinese Grand Prix', 'Q'), 
          (2025, 'Japanese Grand Prix', 'R'), (2025, 'Japanese Grand Prix', 'Q'),
          (2025, 'Bahrain Grand Prix', 'R'), (2025, 'Bahrain Grand Prix', 'Q')]

rookie1_data = []
rookie2_data = []

for year, track, session_type in races1:
    print(f"\nLoading session: {track} {year} ({session_type})")
    session = fastf1.get_session(year, track, session_type)
    session.load()

    for driver in rookies1:
        try:
            if session_type == 'Q':
                laps = session.laps.pick_driver(driver)
            else:
                laps = session.laps.pick_driver(driver).pick_quicklaps()

            laps = laps[laps['LapTime'].notna()]
            laps = laps.sort_values(by='LapTime').head(45 if session_type == 'R' else 10)

            if laps.empty:
                print(f"No laps for {driver} at {track} {year} ({session_type})")
                continue

            print(f"{driver}: {len(laps)} laps")

            for _, lap in laps.iterrows():
                try:
                    features = get_style_features(lap, driver, track, year)
                    rookie1_data.append(features)
                except Exception as e:
                    print(f"Skipping lap for {driver} at {track}: {e}")
        except Exception as e:
            print(f"Failed to process {driver} at {track} {year}: {e}")

for year, track, session_type in races2:
    print(f"\nLoading session: {track} {year} ({session_type})")
    session = fastf1.get_session(year, track, session_type)
    session.load()

    for driver in rookies2:
        try:
            if session_type == 'Q':
                laps = session.laps.pick_driver(driver)
            else:
                laps = session.laps.pick_driver(driver).pick_quicklaps()

            # Filter out invalid LapTime entries
            laps = laps[laps['LapTime'].notna()]
            # Cap the number of laps depending on session type
            laps = laps.sort_values(by='LapTime').head(45 if session_type == 'R' else 10)

            if laps.empty:
                print(f"No laps for {driver} at {track} {year} ({session_type})")
                continue

            print(f"{driver}: {len(laps)} laps")

            for _, lap in laps.iterrows():
                try:
                    features = get_style_features(lap, driver, track, year)
                    rookie2_data.append(features)
                except Exception as e:
                    print(f"Skipping lap for {driver} at {track}: {e}")
        except Exception as e:
            print(f"Failed to process {driver} at {track} {year}: {e}")

# Combine new and existing data
df_rookies1 = pd.DataFrame(rookie1_data)
df_rookies2 = pd.DataFrame(rookie2_data)
df_final = pd.concat([df, df_rookies1, df_rookies2], ignore_index=True)
df_final = df_final.sort_values(by=["Driver", "Year"])

# Save final dataset
df_final.to_csv("style_balanced_0415.csv", index=False)
print("\nFinal dataset saved as style_balanced_0415.csv")

### Data Preprocessing

In [None]:
# check for missing values - there shouldn't be any since I built the dataset
df = pd.read_csv('style_balanced_0415.csv')
missing_values = df.isnull().sum()
print(f"Missing values:\n {missing_values}")

Missing values:
 Driver           0
Track            0
Year             0
LapTime_s        0
MeanSpeed        0
MaxSpeed         0
ThrottleMean     0
BrakePct         0
BrakingEvents    0
dtype: int64


In [None]:
# checking for outliers or anomalies in data - there are none
print("Summary statistics:\n", df.describe())

Summary statistics:
               Year   LapTime_s   MeanSpeed    MaxSpeed  ThrottleMean  \
count   925.000000  925.000000  925.000000  925.000000    925.000000   
mean   2023.124324   87.856661  206.869021  309.754595     65.150611   
std       1.374120    8.309337   27.307816   16.466064      8.526918   
min    2021.000000   72.909000  148.710345  266.000000     43.627907   
25%    2022.000000   79.876000  197.002755  301.000000     61.079082   
50%    2023.000000   87.079000  209.549488  308.000000     66.073579   
75%    2024.000000   95.874000  228.165680  322.000000     71.608355   
max    2025.000000  108.837000  254.547855  357.000000     80.337349   

         BrakePct  BrakingEvents  
count  925.000000     925.000000  
mean    18.850414       9.235676  
std      5.241565       2.460208  
min      0.000000       0.000000  
25%     15.094340       7.000000  
50%     17.808219       9.000000  
75%     20.807453      11.000000  
max     44.360902      20.000000  
