In [1]:
import pandas as pd
import fastf1
from IPython.display import clear_output

## Pick all the anomalies from 2014-2019

In [72]:
# Collect, for every race of the 2014-2019 seasons, the result rows whose status
# is not on the exclusion list below. One small DataFrame per race is appended
# to `all_data`.
all_data = []

# Statuses this notebook does not treat as anomalies. Rows carrying one of them
# (or any status containing "lap") are filtered out.
non_failure_status = [
    'Finished', 'Collision', 'Disqualified', 'Collision damage', 'Wheel nut',
    'Accident', 'Retired', 'Withdrew', 'Spun off', 'Seat', 'Debris',
    'Excluded', '', 'Illness',
]

year = range(2014, 2020)
for y in year:
    try:
        # Testing events have no race session, so leave them out of the schedule.
        schedule = fastf1.get_event_schedule(y, include_testing=False)
        for _, race_info in schedule.iterrows():
            event_name = race_info['EventName']  # event name as listed in the schedule
            print(f"Loading data for {event_name} ({race_info['EventDate']})")

            try:
                race_session = fastf1.get_session(y, event_name, 'R')
                # Only `results` and `event` are read below, so telemetry, laps
                # and weather are not loaded.
                race_session.load(telemetry=False, laps=False, weather=False)
                clear_output()
                print(f"Loaded data for {event_name} ({race_info['EventDate']})")

                # Keep only the rows whose status is neither excluded by name
                # nor contains "lap" (case-insensitive; missing values are kept).
                results = race_session.results
                status = results['Status']
                contains_lap = status.str.contains('lap', case=False, na=False)
                relevant_status = results[~contains_lap & ~status.isin(non_failure_status)]

                # Keep just the columns needed for the anomaly list and tag
                # every row with the event it belongs to.
                partial_csv = relevant_status[['DriverNumber', 'Status']].reset_index(drop=True)
                partial_csv = partial_csv.assign(
                    EventName=race_session.event['EventName'],
                    EventDate=race_session.event['EventDate'],
                )

                all_data.append(partial_csv)

            except Exception as e:
                # Best effort: report the race that failed and carry on.
                print(f"Failed to load data for {event_name}: {e}")
    except Exception as e:
        # Report the season that failed, not the whole range object.
        print(f"Failed to process year {y}: {e}")


Loaded data for Abu Dhabi Grand Prix (2019-12-01 00:00:00)


In [73]:
# Stack the per-race frames into a single table and drop rows that appear more
# than once.
final_data = pd.concat(all_data, ignore_index=True).drop_duplicates()


In [89]:
# The year is matched as a text prefix, so the date column is cast to str first.
final_data['EventDate'] = final_data['EventDate'].astype(str)
is_2019 = final_data['EventDate'].str.startswith("2019")
final_data.loc[is_2019].shape

(30, 4)

## Pick all the anomalies from 2020-2024

In [90]:


# Collect, for every race of the 2020-2024 seasons, the result rows whose status
# is not on the exclusion list below. One small DataFrame per race is appended
# to `all_data`.
all_data = []

# Statuses this notebook does not treat as anomalies. Rows carrying one of them
# (or any status containing "lap") are filtered out.
non_failure_status = [
    'Finished', 'Collision', 'Disqualified', 'Collision damage', 'Wheel nut',
    'Accident', 'Retired', 'Withdrew', 'Spun off', 'Seat', 'Debris',
    'Excluded', '', 'Illness',
]

year = range(2020, 2025)  # the upper bound is exclusive: 2020 through 2024
for y in year:
    try:
        # Testing events have no race session, so leave them out of the schedule.
        schedule = fastf1.get_event_schedule(y, include_testing=False)
        for _, race_info in schedule.iterrows():
            event_name = race_info['EventName']  # event name as listed in the schedule
            print(f"Loading data for {event_name} ({race_info['EventDate']})")

            try:
                race_session = fastf1.get_session(y, event_name, 'R')
                # Only `results` and `event` are read below, so telemetry, laps
                # and weather are not loaded.
                race_session.load(telemetry=False, laps=False, weather=False)
                clear_output()
                print(f"Loaded data for {event_name} ({race_info['EventDate']})")

                # Keep only the rows whose status is neither excluded by name
                # nor contains "lap" (case-insensitive; missing values are kept).
                results = race_session.results
                status = results['Status']
                contains_lap = status.str.contains('lap', case=False, na=False)
                relevant_status = results[~contains_lap & ~status.isin(non_failure_status)]

                # Keep just the columns needed for the anomaly list and tag
                # every row with the event it belongs to.
                partial_csv = relevant_status[['DriverNumber', 'Status']].reset_index(drop=True)
                partial_csv = partial_csv.assign(
                    EventName=race_session.event['EventName'],
                    EventDate=race_session.event['EventDate'],
                )

                all_data.append(partial_csv)

            except Exception as e:
                # Best effort: report the race that failed and carry on.
                print(f"Failed to load data for {event_name}: {e}")
    except Exception as e:
        # Report the season that failed, not the whole range object.
        print(f"Failed to process year {y}: {e}")


Loaded data for Abu Dhabi Grand Prix (2024-12-08 00:00:00)


In [91]:
#concat the two dataframes
all_data = pd.concat(all_data, ignore_index=True)
all_data = all_data.drop_duplicates()
final_data = pd.concat([final_data,all_data], ignore_index=True)

In [96]:
# The year is matched as a text prefix, so the whole date column is cast to str first.
final_data['EventDate'] = final_data['EventDate'].astype(str)
is_2024 = final_data['EventDate'].str.startswith("2024")
final_data.loc[is_2024].shape

(13, 4)

In [97]:
# Define the file path and save it
csv_file = 'Failures2014_2024.csv'
final_data.to_csv(csv_file, mode='w', header=True, index=False)

print("Data has been written successfully to the CSV file.")


Data has been written successfully to the CSV file.


## Pre-processing

In [98]:
# Read the failures table back from the CSV file so that pre-processing starts
# from the data on disk, then preview the first rows.
df_anomalies = pd.read_csv('Failures2014_2024.csv')
df_anomalies.head()

Unnamed: 0,DriverNumber,Status,EventName,EventDate
0,8,ERS,Australian Grand Prix,2014-03-16 06:00:00
1,13,ERS,Australian Grand Prix,2014-03-16 06:00:00
2,9,Oil pressure,Australian Grand Prix,2014-03-16 06:00:00
3,1,Engine,Australian Grand Prix,2014-03-16 06:00:00
4,44,Engine,Australian Grand Prix,2014-03-16 06:00:00


In [99]:
# Function to extract the year from the 'EventDate' column
def extract_year(date):
    """Return the four-digit year of an event-date string, or None.

    Two layouts are recognised:

    * slash-separated with the year as the third field, e.g.
      ``16/03/14 06:00`` or ``16/03/2014`` (a two-digit year is prefixed
      with ``20``);
    * dash-separated with the year as the first field, e.g.
      ``2014-03-16 06:00:00``.

    Args:
        date: A value taken from the ``EventDate`` column.

    Returns:
        The year as a string, or None when ``date`` is not a string or
        matches neither layout.
    """
    if not isinstance(date, str):
        # Non-string values (None, float NaN) do not support `in`; treat them
        # as an unrecognised format instead of raising.
        return None

    if '/' in date:
        fields = date.split('/')
        if len(fields) < 3:
            return None  # no third field to read the year from
        year = fields[2].split(' ')[0]  # strip a trailing time-of-day, if any
        if not year:
            return None
        # Only a two-digit year needs the century prefix.
        return '20' + year if len(year) == 2 else year

    if '-' in date:
        return date.split('-')[0]

    return None

In [100]:
# Replace the full event date with just its year.
df_anomalies = (
    df_anomalies
    .assign(Year=df_anomalies['EventDate'].apply(extract_year))
    .drop(columns='EventDate')
)
df_anomalies

Unnamed: 0,DriverNumber,Status,EventName,Year
0,8,ERS,Australian Grand Prix,2014
1,13,ERS,Australian Grand Prix,2014
2,9,Oil pressure,Australian Grand Prix,2014
3,1,Engine,Australian Grand Prix,2014
4,44,Engine,Australian Grand Prix,2014
...,...,...,...,...
405,10,Hydraulics,Hungarian Grand Prix,2024
406,22,Overheating,Italian Grand Prix,2024
407,14,Brakes,Mexico City Grand Prix,2024
408,23,Radiator,Las Vegas Grand Prix,2024


In [101]:
# Count how often each status occurs and show the counts next to the table's shape.
failure_counts = df_anomalies.loc[:, 'Status'].value_counts()
(failure_counts, df_anomalies.shape)

(Status
 Engine            79
 Brakes            49
 Gearbox           42
 Power Unit        42
 Suspension        25
 Hydraulics        15
 Electrical        14
 Power loss        13
 Wheel             11
 Oil leak          10
 Overheating        8
 Water pressure     7
 Puncture           7
 Mechanical         7
 Turbo              6
 Battery            5
 ERS                5
 Exhaust            5
 Fuel pressure      5
 Water leak         4
 Electronics        4
 Tyre               4
 Undertray          4
 Radiator           3
 Transmission       3
 Rear wing          3
 Front wing         3
 Throttle           2
 Technical          2
 Fuel leak          2
 Clutch             2
 Damage             2
 Steering           2
 Driveshaft         2
 Vibrations         2
 Oil pressure       2
 Spark plugs        1
 Brake duct         1
 Out of fuel        1
 Drivetrain         1
 Fuel pump          1
 Cooling system     1
 Water pump         1
 Fuel system        1
 Differential       1
 N

In [102]:
# Save the pre-processed table (with Year in place of EventDate) without the index.
df_anomalies.to_csv('Failures2014_2024_cleaned.csv', index=False)

In [103]:
# Number of distinct (EventName, Year) combinations present in the table.
unique_pairs_count = len(df_anomalies.drop_duplicates(subset=['EventName', 'Year']))
unique_pairs_count

175

In [104]:
def retrieve_data(year, event_name, driver):
    """Load a race session and return the laps of a single driver.

    Args:
        year: Season passed to ``fastf1.get_session``.
        event_name: Event identifier passed to ``fastf1.get_session``.
        driver: Driver identifier; converted to ``str`` before the lap lookup.

    Returns:
        The result of ``laps.pick_driver(str(driver))`` for the loaded
        session, or None when any step raises (the error is printed rather
        than propagated).
    """
    try:
        session = fastf1.get_session(year, event_name, 'R')
        # Telemetry and laps are loaded; weather is skipped.
        session.load(telemetry=True, laps=True, weather=False)
        clear_output()
        print(f"Loaded data for {event_name} ({session.event['EventDate']})")
        # NOTE(review): newer FastF1 releases appear to deprecate `pick_driver`
        # in favour of `pick_drivers` - confirm against the installed version.
        driver_laps = session.laps.pick_driver(str(driver))
    except Exception as e:
        print(f"Failed to load data for {event_name}: {e}")
        return None
    return driver_laps
        

In [105]:
# Try the helper on one case: driver 11 in the 2019 'Singapore' race session.
test = retrieve_data(2019, 'Singapore', 11)

Loaded data for Singapore (2019-09-22 00:00:00)




In [9]:
# Show the timing columns of the selected driver's laps.
test[['Time', 'LapStartTime', 'LapStartDate', 'LapTime']]

Unnamed: 0,Time,LapStartTime,LapStartDate,LapTime
61,0 days 00:35:42.865000,0 days 00:33:40.429000,2019-09-22 12:13:41.397,0 days 00:02:02.205000
62,0 days 00:37:35.913000,0 days 00:35:42.865000,2019-09-22 12:15:43.833,0 days 00:01:53.048000
63,0 days 00:39:27.974000,0 days 00:37:35.913000,2019-09-22 12:17:36.881,0 days 00:01:52.061000
64,0 days 00:41:17.683000,0 days 00:39:27.974000,2019-09-22 12:19:28.942,0 days 00:01:49.709000
65,0 days 00:43:07.634000,0 days 00:41:17.683000,2019-09-22 12:21:18.651,0 days 00:01:49.951000
66,0 days 00:44:57.276000,0 days 00:43:07.634000,2019-09-22 12:23:08.602,0 days 00:01:49.642000
67,0 days 00:46:47.017000,0 days 00:44:57.276000,2019-09-22 12:24:58.244,0 days 00:01:49.741000
68,0 days 00:48:35.479000,0 days 00:46:47.017000,2019-09-22 12:26:47.985,0 days 00:01:48.462000
69,0 days 00:50:24.743000,0 days 00:48:35.479000,2019-09-22 12:28:36.447,0 days 00:01:49.264000
70,0 days 00:52:16.735000,0 days 00:50:24.743000,2019-09-22 12:30:25.711,0 days 00:01:51.992000
