In [44]:
import sys
!{sys.executable} -m pip install pyarrow
import fastf1
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Setup: point FastF1's cache at a local 'cache' directory (created if missing)
cache_path = Path('cache')
cache_path.mkdir(exist_ok=True)
fastf1.Cache.enable_cache(str(cache_path))

# pandas display limits used when DataFrames are rendered in this notebook
for _option_name, _option_value in (('display.max_columns', 50),
                                    ('display.max_rows', 100)):
    pd.set_option(_option_name, _option_value)



In [45]:
def collect_and_save_season(year, output_format='csv'):
    """Download race results for every round of a season and save them to one file.

    For each championship round in the FastF1 event schedule, the race ('R')
    session is loaded (without telemetry, laps or weather) and a fixed set of
    result columns is kept, plus Q1/Q2/Q3 when those columns are present.
    ``Year``, ``Round``, ``TrackName`` and ``Country`` columns are added to
    each race. A round that raises while loading is reported on stdout and
    skipped, so one bad round does not abort the whole season.

    Args:
        year: Season to collect.
        output_format: ``'csv'`` writes ``data/raw/<year>/<year>_season.csv``;
            any other value writes ``<year>_season.parquet`` in the same
            directory.

    Returns:
        A DataFrame with all successfully loaded rounds concatenated (fresh
        integer index), or an empty DataFrame when no round could be loaded
        (in which case no file is written).
    """
    output_dir = Path(f'data/raw/{year}')
    output_dir.mkdir(parents=True, exist_ok=True)

    schedule = fastf1.get_event_schedule(year)

    # Take the rounds from the schedule itself instead of hard-coding a count
    # per year: seasons differ in length (a fixed cap silently drops the last
    # rounds of longer seasons), and the schedule may also contain pre-season
    # testing rows, which FastF1 lists with RoundNumber 0 and which are not races.
    round_numbers = sorted(
        int(round_no) for round_no in schedule['RoundNumber'] if round_no > 0
    )
    num_races = len(round_numbers)

    # Columns kept from every race result table
    core_columns = ['Abbreviation', 'FullName', 'TeamName',
                    'GridPosition', 'Position', 'Points', 'Status', 'Laps']
    qualifying_columns = ['Q1', 'Q2', 'Q3']

    print(f"Collecting {year} season ({num_races} races)...")

    all_races = []
    for round_num in round_numbers:
        try:
            print(f"  Round {round_num}...", end="")
            session = fastf1.get_session(year, round_num, 'R')
            session.load(telemetry=False, laps=False, weather=False)

            results = session.results
            # Qualifying columns are optional; keep only those that exist
            columns_to_keep = core_columns + [
                q for q in qualifying_columns if q in results.columns
            ]

            event_name = session.event['EventName']
            race_data = results[columns_to_keep].copy()
            race_data['Year'] = year
            race_data['Round'] = round_num
            race_data['TrackName'] = event_name
            race_data['Country'] = session.event['Country']

            all_races.append(race_data)
            print(f" ✓ {event_name}")

        except Exception as e:
            # Deliberate best-effort: report the failing round and move on
            print(f" ✗ Error: {e}")

    if not all_races:
        return pd.DataFrame()

    # Save all races in one file
    combined = pd.concat(all_races, ignore_index=True)

    if output_format == 'csv':
        output_file = output_dir / f'{year}_season.csv'
        combined.to_csv(output_file, index=False)
    else:
        output_file = output_dir / f'{year}_season.parquet'
        combined.to_parquet(output_file, index=False)

    print(f"Saved {len(all_races)} races to {output_file}")
    return combined

In [46]:
# Download the 2021 and 2022 seasons, writing each one to CSV,
# and report how much was collected for every season that returned data
for year in (2021, 2022):
    data = collect_and_save_season(year, output_format='csv')
    if data.empty:
        continue
    print(f"{year}: Collected {len(data)} driver entries from {data['Round'].nunique()} races\n")

In [None]:
def load_season_data(year):
    """Read the saved season file for ``year`` from ``data/raw/<year>/``.

    The CSV file ``<year>_season.csv`` is preferred; if it does not exist the
    Parquet file ``<year>_season.parquet`` is tried. When neither exists a
    message is printed and an empty DataFrame is returned.
    """
    season_dir = Path(f'data/raw/{year}')

    # CSV takes precedence over Parquet when both are present
    csv_path = season_dir / f'{year}_season.csv'
    if csv_path.exists():
        return pd.read_csv(csv_path)

    parquet_path = season_dir / f'{year}_season.parquet'
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)

    print(f"No data files found for {year}")
    return pd.DataFrame()

def load_multiple_seasons(years):
    """Load several seasons and stack them into a single DataFrame.

    Each year is loaded with ``load_season_data``; seasons that come back
    empty are left out. The remaining frames are concatenated with a fresh
    integer index. Returns an empty DataFrame when nothing was loaded.
    """
    loaded = (load_season_data(year) for year in years)
    non_empty = [frame for frame in loaded if not frame.empty]

    if not non_empty:
        return pd.DataFrame()
    return pd.concat(non_empty, ignore_index=True)

In [None]:
# Read each saved season back from disk
data_2021 = load_season_data(2021)
data_2022 = load_season_data(2022)

_loaded_seasons = ((2021, data_2021), (2022, data_2022))

# Shapes first, for every season (even empty ones)
for _season_year, _season_df in _loaded_seasons:
    print(f"{_season_year}: {_season_df.shape}")

# Then race/driver counts, only for seasons that actually have rows
for _season_year, _season_df in _loaded_seasons:
    if _season_df.empty:
        continue
    print(f"{_season_year} - Races: {_season_df['Round'].nunique()}, Drivers: {_season_df['Abbreviation'].nunique()}")

# Combine both seasons
train_data = load_multiple_seasons([2021, 2022])
print(f"\nCombined training data: {train_data.shape}")