In [42]:
import sys
!{sys.executable} -m pip install pyarrow
import fastf1
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Setup
cache_path = Path('cache')
cache_path.mkdir(exist_ok=True)
fastf1.Cache.enable_cache(str(cache_path))

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)



In [43]:
def collect_and_save_season(year, output_format="csv"):
    output_dir = Path(f"data/raw/{year}")
    output_dir.mkdir(parents=True, exist_ok=True)

    all_races = []
    schedule = fastf1.get_event_schedule(year)
    num_races = len(schedule)
    print(f"Collecting {year} season ({num_races} races)...")

    for round_num in range(1, num_races + 1):
        try:
            print(f"  Round {round_num}...", end="")
            session = fastf1.get_session(year, round_num, "R")
            session.load(laps=True, telemetry=False, weather=True, messages=True)



            keep = ["Abbreviation","FullName","TeamName","GridPosition","Position","Points","Status","Time","Laps"]
            race = session.results[keep].copy()
            race["Year"] = year
            race["Round"] = round_num
            race["TrackName"] = session.event["EventName"]
            race["Country"] = session.event["Country"]



            if session.laps is not None and not session.laps.empty:
                lap_agg = (
                    session.laps.groupby("Driver")
                    .agg(
                        AvgLapTime=("LapTime", lambda s: s.dt.total_seconds().mean()),
                        NumPitStops=("PitInTime", lambda s: s.notna().sum()),
                    )
                    .reset_index()
                    .rename(columns={"Driver": "Abbreviation"})
                )
                race = race.merge(lap_agg, on="Abbreviation", how="left")



            if session.weather_data is not None and not session.weather_data.empty:
                race["AvgTrackTemp"] = float(session.weather_data["TrackTemp"].mean())
                race["AvgAirTemp"] = float(session.weather_data["AirTemp"].mean())
                race["RainDuringRace"] = bool(session.weather_data["Rainfall"].any())


            sc = False
            rcm = getattr(session, "race_control_messages", None)
            if rcm is not None and not rcm.empty:
                sc = bool(rcm["Message"].str.contains("SAFETY CAR", na=False).any())
            race["SafetyCarDeployed"] = sc


            all_races.append(race)
            print(f" ✓ {session.event['EventName']}")
        except Exception as e:
            print(f" ✗ Error: {e}")
            continue



    if not all_races:
        return pd.DataFrame()



    combined = pd.concat(all_races, ignore_index=True)
    output_file = output_dir / (f"{year}_season_extended.csv" if output_format == "csv" else f"{year}_season_extended.parquet")
    if output_format == "csv":
        combined.to_csv(output_file, index=False)
    else:
        combined.to_parquet(output_file, index=False)
    print(f"Saved {len(all_races)} races to {output_file}")
    return combined


In [44]:
# Collect 2023 and 2022 data
for year in [2023, 2022]:
    data = collect_and_save_season(year, output_format='csv')
    if not data.empty:
        print(f"{year}: Collected {len(data)} driver entries from {data['Round'].nunique()} races\n")

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Collecting 2023 season (23 races)...
  Round 1...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Bahrain Grand Prix
  Round 2...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Saudi Arabian Grand Prix
  Round 3...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Australian Grand Prix
  Round 4...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Azerbaijan Grand Prix
  Round 5...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Miami Grand Prix
  Round 6...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Monaco Grand Prix
  Round 7...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Spanish Grand Prix
  Round 8...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Canadian Grand Prix
  Round 9...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
This might be a bug and should be reported.
req     

 ✓ Austrian Grand Prix
  Round 10...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ British Grand Prix
  Round 11...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Hungarian Grand Prix
  Round 12...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Belgian Grand Prix
  Round 13...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Dutch Grand Prix
  Round 14...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Italian Grand Prix
  Round 15...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Singapore Grand Prix
  Round 16...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Japanese Grand Prix
  Round 17...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Qatar Grand Prix
  Round 18...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ United States Grand Prix
  Round 19...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Mexico City Grand Prix
  Round 20...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ São Paulo Grand Prix
  Round 21...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Las Vegas Grand Prix
  Round 22...

req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

 ✓ Abu Dhabi Grand Prix
  Round 23... ✗ Error: Invalid round: 23
Saved 22 races to data/raw/2023/2023_season_extended.csv
2023: Collected 440 driver entries from 22 races



core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Collecting 2022 season (24 races)...
  Round 1...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Bahrain Grand Prix
  Round 2...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '55', '11', '63', '31', '4', '10', '20', '44', '24', '27', '18', '23', '77', '14', '3', '6', '22', '47']
core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Saudi Arabian Grand Prix
  Round 3...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '11', '63', '44', '4', '3', '31', '77', '10', '23', '24', '18', '47', '20', '22', '6', '14', '1', '5', '55']
core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Australian Grand Prix
  Round 4...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '4', '63', '77', '16', '22', '5', '20', '18', '23', '10', '44', '31', '24', '6', '47', '3', '14', '55']
core           INFO 	Loading data for Miami Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Emilia Romagna Grand Prix
  Round 5...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
Request for URL https://api.jolpi.ca/ergast/f1/2022/5/laps/1.json failed; using cached response
Traceback (most recent call last):
  File "/Users/atul/miniconda3/envs/cv-env/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/atul/miniconda3/envs/cv-env/lib/python3.12/site-packages/urllib3/connection.py", line 507, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/atul/miniconda3/envs/cv-env/lib/python3.12/http/client.py", line 1428, in getresponse
    response.b

 ✓ Miami Grand Prix
  Round 6...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '63', '55', '44', '77', '31', '4', '14', '22', '5', '3', '10', '47', '18', '6', '20', '23', '24', '16']
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Spanish Grand Prix
  Round 7...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['11', '55', '1', '16', '63', '4', '14', '44', '77', '5', '10', '31', '3', '18', '6', '24', '22', '23', '47', '20']
core           INFO 	Loading data for Azerbaijan Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Monaco Grand Prix
  Round 8...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '63', '44', '10', '5', '14', '3', '4', '31', '77', '23', '22', '47', '6', '18', '20', '24', '16', '55']
core           INFO 	Loading data for Canadian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Azerbaijan Grand Prix
  Round 9...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '55', '44', '63', '16', '31', '77', '24', '14', '18', '3', '5', '23', '10', '4', '6', '20', '22', '47', '11']
core           INFO 	Loading data for British Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Canadian Grand Prix
  Round 10...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '11', '44', '16', '14', '4', '1', '47', '5', '20', '18', '6', '3', '22', '31', '10', '77', '63', '24', '23']
core           INFO 	Loading data for Austrian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ British Grand Prix
  Round 11...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '1', '44', '63', '31', '47', '4', '20', '3', '14', '77', '23', '18', '24', '10', '22', '5', '55', '6', '11']
core           INFO 	Loading data for French Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Austrian Grand Prix
  Round 12...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '63', '11', '55', '14', '4', '31', '3', '18', '5', '10', '23', '77', '47', '24', '6', '20', '16', '22']
core           INFO 	Loading data for Hungarian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ French Grand Prix
  Round 13...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '63', '55', '11', '16', '4', '14', '31', '5', '18', '10', '24', '47', '3', '20', '23', '6', '22', '77']
core           INFO 	Loading data for Belgian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Hungarian Grand Prix
  Round 14...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '63', '14', '16', '31', '5', '10', '23', '18', '4', '22', '24', '3', '20', '47', '6', '77', '44']
core           INFO 	Loading data for Dutch Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Belgian Grand Prix
  Round 15...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '63', '16', '44', '11', '14', '4', '55', '31', '18', '10', '23', '47', '5', '20', '24', '3', '6', '77', '22']
core           INFO 	Loading data for Italian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Dutch Grand Prix
  Round 16...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '63', '55', '44', '11', '4', '10', '45', '24', '31', '47', '77', '22', '6', '20', '3', '18', '14', '5']
core           INFO 	Loading data for Singapore Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Italian Grand Prix
  Round 17...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['11', '16', '55', '4', '3', '18', '1', '5', '44', '10', '77', '20', '47', '63', '22', '31', '23', '14', '6', '24']
core           INFO 	Loading data for Japanese Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Singapore Grand Prix
  Round 18...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '31', '44', '5', '14', '63', '6', '4', '3', '18', '22', '20', '77', '24', '47', '10', '55', '23']
core           INFO 	Loading data for United States Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Japanese Grand Prix
  Round 19...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '16', '11', '63', '4', '14', '5', '20', '22', '31', '24', '23', '10', '47', '3', '6', '18', '77', '55']
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ United States Grand Prix
  Round 20...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '11', '63', '55', '16', '3', '31', '4', '77', '10', '23', '24', '5', '18', '47', '20', '6', '14', '22']
core           INFO 	Loading data for São Paulo Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ Mexico City Grand Prix
  Round 21...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['63', '44', '55', '16', '14', '1', '11', '31', '77', '18', '5', '24', '47', '10', '23', '6', '22', '4', '20', '3']
core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


 ✓ São Paulo Grand Prix
  Round 22...

req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '11', '55', '63', '4', '31', '18', '3', '5', '22', '24', '23', '10', '77', '47', '20', '44', '6', '14']


 ✓ Abu Dhabi Grand Prix
  Round 23... ✗ Error: Invalid round: 23
  Round 24... ✗ Error: Invalid round: 24
Saved 22 races to data/raw/2022/2022_season_extended.csv
2022: Collected 440 driver entries from 22 races



In [45]:
def load_season_data(year):
    p = Path(f"data/raw/{year}")
    for fn in [f"{year}_season_extended.csv", f"{year}_season_extended.parquet",
               f"{year}_season.csv", f"{year}_season.parquet"]:
        f = p / fn
        if f.exists():
            return pd.read_csv(f) if f.suffix==".csv" else pd.read_parquet(f)
    print(f"No data files found for {year}")
    return pd.DataFrame()


def load_multiple_seasons(years):
    all_seasons = []
    for year in years:
        season_data = load_season_data(year)
        if not season_data.empty:
            all_seasons.append(season_data)
    
    if all_seasons:
        return pd.concat(all_seasons, ignore_index=True)
    return pd.DataFrame()

In [46]:
from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 0)

def load_season_data(year: int) -> pd.DataFrame:
    base = Path(f"data/raw/{year}")
    for f in [base / f"{year}_season_extended.parquet",
              base / f"{year}_season_extended.csv",
              base / f"{year}_season.parquet",
              base / f"{year}_season.csv"]:
        if f.exists():
            return pd.read_parquet(f) if f.suffix == ".parquet" else pd.read_csv(f)
    print(f"[warn] No data files found for {year}")
    return pd.DataFrame()

def load_multiple_seasons(years) -> pd.DataFrame:
    frames = [load_season_data(y) for y in years]
    frames = [f for f in frames if not f.empty]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

train_data = load_multiple_seasons([2023, 2022])
print(f"\nCombined training data: {train_data.shape}")

for c in ["GridPosition","Position","Points","Laps","Year","Round","AvgLapTime","NumPitStops","AvgTrackTemp","AvgAirTemp"]:
    if c in train_data.columns:
        train_data[c] = pd.to_numeric(train_data[c], errors="coerce")

train_data["Time"] = pd.to_timedelta(train_data.get("Time", pd.Series([np.nan]*len(train_data))), errors="coerce")

for bc in ["RainDuringRace","SafetyCarDeployed"]:
    if bc in train_data.columns:
        train_data[bc] = train_data[bc].astype("boolean")

is_winner = train_data["Position"] == 1.0
train_data["GapToWinner_s"] = np.where(is_winner, 0.0, train_data["Time"].dt.total_seconds())

train_data["WinnerRaceTime_s"] = (
    train_data.groupby(["Year","Round"])["Time"]
              .transform(lambda s: s.max().total_seconds() if s.notna().any() else np.nan)
)

train_data["Time_s"] = train_data["WinnerRaceTime_s"] + train_data["GapToWinner_s"]

def _fmt_hms_ms(sec):
    if pd.isna(sec): return np.nan
    sec = float(sec)
    ms = int(round((sec - int(sec)) * 1000))
    if ms == 1000:
        ms = 0
        sec = int(sec) + 1
    s = int(sec) % 60
    m = (int(sec) // 60) % 60
    h = int(sec) // 3600
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

train_data["Time_str"] = train_data["Time_s"].apply(_fmt_hms_ms)

train_data = train_data.rename(columns={"Position":"RacePosition","GridPosition":"QualifyingPosition"})

for cat in ["Abbreviation","FullName","TeamName","Status","TrackName","Country"]:
    if cat in train_data.columns:
        train_data[cat] = train_data[cat].astype("string").fillna(pd.NA).str.strip()

if "Status" in train_data.columns:
    train_data["FinishedFlag"] = train_data["Status"].str.lower().eq("finished").astype("Int8")

train_data = train_data.drop(columns=["Time"])

cols_order = [
    "Year","Round","TrackName","Country",
    "Abbreviation","FullName","TeamName","Status",
    "QualifyingPosition","RacePosition","Points","Laps",
    "AvgLapTime","NumPitStops","AvgTrackTemp","AvgAirTemp",
    "RainDuringRace","SafetyCarDeployed",
    "Time_s","Time_str","GapToWinner_s","WinnerRaceTime_s",
    "FinishedFlag"
]
train_data = train_data[[c for c in cols_order if c in train_data.columns]]

out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
csv_path = out_dir / "f1_2022_2023_processed.csv"
parq_path = out_dir / "f1_2022_2023_processed.parquet"

train_data.to_csv(csv_path, index=False)
try:
    train_data.to_parquet(parq_path, index=False)
except Exception as e:
    print(f"[warn] parquet save failed: {e}")

print(f"\nSaved processed data to:\n- {csv_path}\n- {parq_path if parq_path.exists() else '(parquet not written)'}")
print("\nPreview:")
print(train_data.head(8).to_string(index=False))



Combined training data: (880, 19)

Saved processed data to:
- data/processed/f1_2022_2023_processed.csv
- data/processed/f1_2022_2023_processed.parquet

Preview:
 Year  Round          TrackName Country Abbreviation        FullName        TeamName   Status  QualifyingPosition  RacePosition  Points  Laps  AvgLapTime  NumPitStops  AvgTrackTemp  AvgAirTemp  RainDuringRace  SafetyCarDeployed   Time_s     Time_str  GapToWinner_s  WinnerRaceTime_s  FinishedFlag
 2023      1 Bahrain Grand Prix Bahrain          VER  Max Verstappen Red Bull Racing Finished                 1.0           1.0    25.0  57.0   98.890105          2.0     31.011801   27.431677           False               True 5636.736 01:33:56.736          0.000          5636.736             1
 2023      1 Bahrain Grand Prix Bahrain          PER    Sergio Perez Red Bull Racing Finished                 2.0           2.0    18.0  57.0   99.100404          2.0     31.011801   27.431677           False               True 5648.723 01:34:

In [47]:
train_data.dtypes
(
    train_data[train_data['RainDuringRace'] == False]  
    .groupby('TrackName')['AvgLapTime']    
    .mean()
    .sort_values()
    .apply(_fmt_hms_ms)
)


TrackName
Austrian Grand Prix          00:01:12.574
Dutch Grand Prix             00:01:20.707
Canadian Grand Prix          00:01:22.146
São Paulo Grand Prix         00:01:23.589
Spanish Grand Prix           00:01:25.616
Mexico City Grand Prix       00:01:25.996
Hungarian Grand Prix         00:01:28.345
Italian Grand Prix           00:01:29.440
Australian Grand Prix        00:01:29.944
Emilia Romagna Grand Prix    00:01:30.593
Abu Dhabi Grand Prix         00:01:31.620
Qatar Grand Prix             00:01:33.258
Miami Grand Prix             00:01:33.329
British Grand Prix           00:01:38.235
Saudi Arabian Grand Prix     00:01:38.365
Bahrain Grand Prix           00:01:41.729
French Grand Prix            00:01:42.506
Singapore Grand Prix         00:01:42.657
Las Vegas Grand Prix         00:01:43.163
Japanese Grand Prix          00:01:43.734
United States Grand Prix     00:01:45.137
Azerbaijan Grand Prix        00:01:50.426
Belgian Grand Prix           00:01:56.462
Name: AvgLapTime, dtype:

In [48]:
track_avg_speeds = {}
unique_tracks = train_data['TrackName'].unique()

for track in unique_tracks:
    mask = (train_data['TrackName'] == track) & (train_data['RainDuringRace'] == False)
    track_avg_speeds[track] = train_data[mask]['AvgLapTime'].mean()

print(track_avg_speeds)

{'Bahrain Grand Prix': 101.7290426951125, 'Saudi Arabian Grand Prix': 98.36462896678194, 'Australian Grand Prix': 89.94445004087463, 'Azerbaijan Grand Prix': 110.42628620108647, 'Miami Grand Prix': 93.32939793233083, 'Monaco Grand Prix': nan, 'Spanish Grand Prix': 85.61556414330751, 'Canadian Grand Prix': 82.14613000121706, 'Austrian Grand Prix': 72.57358236837692, 'British Grand Prix': 98.23494564893204, 'Hungarian Grand Prix': 88.34541538271375, 'Belgian Grand Prix': 116.46160744391466, 'Dutch Grand Prix': 80.70668699702406, 'Italian Grand Prix': 89.43952304923957, 'Singapore Grand Prix': 102.65677124231789, 'Japanese Grand Prix': 103.73429437063615, 'Qatar Grand Prix': 93.25838709273184, 'United States Grand Prix': 105.1367895087817, 'Mexico City Grand Prix': 85.99643963676294, 'São Paulo Grand Prix': 83.58870361949474, 'Las Vegas Grand Prix': 103.16262480341693, 'Abu Dhabi Grand Prix': 91.62016754324858, 'Emilia Romagna Grand Prix': 90.5926006491197, 'French Grand Prix': 102.506183

In [49]:
tracks_2023 = [
    'Bahrain Grand Prix',
    'Saudi Arabian Grand Prix', 
    'Australian Grand Prix',
    'Azerbaijan Grand Prix',
    'Miami Grand Prix',
    'Monaco Grand Prix',
    'Spanish Grand Prix',
    'Canadian Grand Prix',
    'Austrian Grand Prix',
    'British Grand Prix',
    'Hungarian Grand Prix',
    'Belgian Grand Prix',
    'Dutch Grand Prix',
    'Italian Grand Prix',
    'Singapore Grand Prix',  
    'Japanese Grand Prix',  
    'Qatar Grand Prix',
    'United States Grand Prix',
    'Mexico City Grand Prix',
    'São Paulo Grand Prix',
    'Las Vegas Grand Prix', 
    'Abu Dhabi Grand Prix'
]

train_data_2023 = train_data[train_data['TrackName'].isin(tracks_2023)]

In [50]:
unique_tracks = train_data_2023['TrackName'].unique()

sprint_races = [(2022, 'Austrian Grand Prix'), (2022, 'São Paulo Grand Prix')]

is_sprint = False
for year, track in sprint_races:
    is_sprint |= ((train_data['Year'] == year) & (train_data['TrackName'] == track))


train_data_no_sprints = train_data[~is_sprint]

overtake_difficulty_clean = {}
street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix']


for track in unique_tracks:
    mask = ((train_data_no_sprints['TrackName'] == track) & 
            (train_data_no_sprints['FinishedFlag'] == 1))
    
    data = train_data_no_sprints[mask][['QualifyingPosition', 'RacePosition']]
    data = data.dropna()
    
    
    if len(data) > 5:
        correlation = data['QualifyingPosition'].corr(data['RacePosition'])
        overtake_difficulty_clean[track] = correlation
        
        
    else:
        if track in street_circuits:
            overtake_difficulty_clean[track] = 0.8
        else:
            overtake_difficulty_clean[track] = 0.6
        
print(overtake_difficulty_clean)

{'Bahrain Grand Prix': 0.8679245366974948, 'Saudi Arabian Grand Prix': 0.7698650908833107, 'Australian Grand Prix': 0.6574632976481228, 'Azerbaijan Grand Prix': 0.9102403290471156, 'Miami Grand Prix': 0.6524571650562434, 'Monaco Grand Prix': 0.9024110108290754, 'Spanish Grand Prix': 0.7554003902973232, 'Canadian Grand Prix': 0.5695097367448471, 'Austrian Grand Prix': 0.3669623340094638, 'British Grand Prix': 0.6920630134086893, 'Hungarian Grand Prix': 0.6098598612507029, 'Belgian Grand Prix': 0.6172727537796998, 'Dutch Grand Prix': 0.5524884656892283, 'Italian Grand Prix': 0.6641261187528871, 'Singapore Grand Prix': 0.4080014072134992, 'Japanese Grand Prix': 0.8585329536393139, 'Qatar Grand Prix': 0.6538033027198474, 'United States Grand Prix': 0.6065647890978583, 'Mexico City Grand Prix': 0.6761609297974968, 'São Paulo Grand Prix': 0.679529340507525, 'Las Vegas Grand Prix': 0.25424174750829126, 'Abu Dhabi Grand Prix': 0.8676385860425169}


In [51]:
# Create track features with just 2 features
track_features = pd.DataFrame({'TrackName': list(overtake_difficulty_clean.keys()), 'overtaking_difficulty': list(overtake_difficulty_clean.values())})

street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix']
track_features['is_street'] = track_features['TrackName'].isin(street_circuits).astype(int)

print(track_features.head())

                  TrackName  overtaking_difficulty  is_street
0        Bahrain Grand Prix               0.867925          0
1  Saudi Arabian Grand Prix               0.769865          1
2     Australian Grand Prix               0.657463          0
3     Azerbaijan Grand Prix               0.910240          1
4          Miami Grand Prix               0.652457          1


In [52]:
train_data_with_features = train_data_no_sprints.merge(
    track_features, 
    on='TrackName', 
    how='left'
)

print(train_data_with_features[['TrackName', 'is_street', 'overtaking_difficulty']].head())

            TrackName  is_street  overtaking_difficulty
0  Bahrain Grand Prix        0.0               0.867925
1  Bahrain Grand Prix        0.0               0.867925
2  Bahrain Grand Prix        0.0               0.867925
3  Bahrain Grand Prix        0.0               0.867925
4  Bahrain Grand Prix        0.0               0.867925


In [53]:
print(train_data_with_features[train_data_with_features['is_street'] == 1]['TrackName'].unique())

['Saudi Arabian Grand Prix' 'Azerbaijan Grand Prix' 'Miami Grand Prix'
 'Monaco Grand Prix']


In [54]:
quali_points = {1:8, 2:7, 3:6, 4:5, 5:4, 6:3, 7:2, 8:1, 9:0, 10:0, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0, 17:0, 18:0, 19:0, 20:0}
recent_form = {}

for year in [2022, 2023]:
    season_data = train_data[train_data['Year'] == year]
    
    max_round = season_data['Round'].max()
    for race in range(4, max_round + 1):
        current_round = season_data[season_data['Round'] == race]
            
        for driver in current_round["Abbreviation"].unique():
            prev_race1 = season_data[(season_data['Round'] == race-1) & (season_data['Abbreviation'] == driver)]
            points1 = prev_race1['Points'].sum()
            prev_race2 = season_data[(season_data['Round'] == race-2) & (season_data['Abbreviation'] == driver)]
            points2 = prev_race2['Points'].sum()
            prev_race3 = season_data[(season_data['Round'] == race-3) & (season_data['Abbreviation'] == driver)]
            points3 = prev_race3['Points'].sum()
            
            points = sum([points1, points2, points3])
            
            quali1 = prev_race1['QualifyingPosition'].sum()
            sprint_points1 = quali_points.get(quali1, 0)
            quali2 = prev_race2['QualifyingPosition'].sum()
            sprint_points2 = quali_points.get(quali2, 0)
            quali3 = prev_race3['QualifyingPosition'].sum()
            sprint_points3 = quali_points.get(quali3, 0)
            
            points += sum([sprint_points1, sprint_points2, sprint_points3])
            
            recent_form[(year, race, driver)] = points

train_data['recent_form'] = np.nan

for key, form_value in recent_form.items():
    year, round_num, driver_code = key  
    
    season = train_data['Year'] == year
    race = train_data['Round'] == round_num  
    racer = train_data['Abbreviation'] == driver_code
    
    row_to_update = season & race & racer
    
    train_data.loc[row_to_update, 'recent_form'] = form_value
    
print(recent_form[(2022, 4, 'HAM')])


36.0


In [55]:
rain_specialists = ['VER', 'HAM', 'STR']
train_data['rain_specialist'] = train_data['Abbreviation'].isin(rain_specialists).astype(int)
print(train_data[train_data['RainDuringRace'] == True][['Abbreviation', 'rain_specialist', 'RacePosition']].head(10))

    Abbreviation  rain_specialist  RacePosition
100          VER                1           1.0
101          ALO                0           2.0
102          OCO                0           3.0
103          HAM                1           4.0
104          RUS                0           5.0
105          LEC                0           6.0
106          GAS                0           7.0
107          SAI                0           8.0
108          NOR                0           9.0
109          PIA                0          10.0


In [56]:
# Check DNF positions
dnf = train_data[~train_data['Status'].isin(['Finished', '+1 Lap', '+2 Laps', '+3 Laps'])]
print(dnf[['Status', 'RacePosition', 'QualifyingPosition', 'Laps']].head(10))

     Status  RacePosition  QualifyingPosition  Laps
11   Lapped          12.0                16.0  56.0
12   Lapped          13.0                17.0  56.0
13   Lapped          14.0                19.0  56.0
14   Lapped          15.0                10.0  56.0
15   Lapped          16.0                13.0  56.0
16   Lapped          17.0                11.0  55.0
17  Retired          18.0                 9.0  41.0
18  Retired          19.0                 3.0  39.0
19  Retired          20.0                18.0  13.0
37   Lapped          18.0                14.0  49.0


In [57]:
driver_error = ['Collision', 'Collision damage', 'Accident', 'Spun off']
mechanical = ['Engine', 'Gearbox', 'Power Unit', 'Hydraulics', 'Brakes', 'Suspension', 'Fuel pressure', 'Power loss', 'Water pressure', 'Water leak', 'Mechanical', 'Undertray', 'Turbo', 'Oil leak', 'Cooling system', 'Vibrations', 'Differential', ...]
lapped = ['+1 Lap', '+2 Laps', '+3 Laps']



adjusted_positions = []

for index, row in train_data.iterrows():

    if row['Status'] == "Finished" or row['Status'] in lapped:
        adjusted_positions.append(row['RacePosition'])
        
    elif row['Status'] in mechanical:
        adjusted_positiondnf = (row['RacePosition'] + row['QualifyingPosition']) / 2
        adjusted_positions.append(adjusted_positiondnf)
    
    elif row['Status'] in driver_error:
        adjusted_positions.append(row['RacePosition'])
    
    else:
        adjusted_positions.append(row['RacePosition'])

train_data['adjusted_position'] = adjusted_positions



In [60]:
#train_data.shape
#train_data.info()
#train_data.describe()
train_data.head()
#train_data[train_data['Points'] > 6].head()
#train_data.groupby('Abbreviation')['Points'].sum().head()
train_data[train_data['Year'] == 2023].groupby('Abbreviation')['Points'].sum().sort_values(ascending=False).head(10)

Abbreviation
VER    530.0
PER    260.0
HAM    217.0
ALO    198.0
LEC    185.0
NOR    184.0
SAI    178.0
RUS    157.0
PIA     82.0
STR     68.0
Name: Points, dtype: float64

In [59]:
train_data.head()

Unnamed: 0,Year,Round,TrackName,Country,Abbreviation,FullName,TeamName,Status,QualifyingPosition,RacePosition,Points,Laps,AvgLapTime,NumPitStops,AvgTrackTemp,AvgAirTemp,RainDuringRace,SafetyCarDeployed,Time_s,Time_str,GapToWinner_s,WinnerRaceTime_s,FinishedFlag,recent_form,rain_specialist,adjusted_position
0,2023,1,Bahrain Grand Prix,Bahrain,VER,Max Verstappen,Red Bull Racing,Finished,1.0,1.0,25.0,57.0,98.890105,2.0,31.011801,27.431677,False,True,5636.736,01:33:56.736,0.0,5636.736,1,,1,1.0
1,2023,1,Bahrain Grand Prix,Bahrain,PER,Sergio Perez,Red Bull Racing,Finished,2.0,2.0,18.0,57.0,99.100404,2.0,31.011801,27.431677,False,True,5648.723,01:34:08.723,11.987,5636.736,1,,0,2.0
2,2023,1,Bahrain Grand Prix,Bahrain,ALO,Fernando Alonso,Aston Martin,Finished,5.0,3.0,15.0,57.0,99.567947,2.0,31.011801,27.431677,False,True,5675.373,01:34:35.373,38.637,5636.736,1,,0,3.0
3,2023,1,Bahrain Grand Prix,Bahrain,SAI,Carlos Sainz,Ferrari,Finished,4.0,4.0,12.0,57.0,99.733123,2.0,31.011801,27.431677,False,True,5684.788,01:34:44.788,48.052,5636.736,1,,0,4.0
4,2023,1,Bahrain Grand Prix,Bahrain,HAM,Lewis Hamilton,Mercedes,Finished,7.0,5.0,10.0,57.0,99.784439,2.0,31.011801,27.431677,False,True,5687.713,01:34:47.713,50.977,5636.736,1,,1,5.0


In [63]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'xgboost'