In [None]:
# Install pyarrow with the same interpreter that runs this kernel
# (presumably needed for the parquet read/write calls in later cells).
import sys
!{sys.executable} -m pip install pyarrow
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')


In [None]:

import sys
import fastf1
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Setup: create a local ./cache directory and point the FastF1 cache at it.
cache_path = Path('cache')
cache_path.mkdir(exist_ok=True)
fastf1.Cache.enable_cache(str(cache_path))

# Limits for how many columns / rows pandas shows when displaying a frame.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [None]:
def collect_and_save_season(year, output_format="csv"):
    """Collect per-driver race results for every round of a season and save them.

    For each championship round the race session is loaded through FastF1 and
    reduced to one row per driver: the classification columns that exist in
    ``session.results``, the laps completed, lap-time / pit-stop aggregates,
    average weather values and a safety-car flag.

    Args:
        year: Season to collect.
        output_format: ``"csv"`` writes a CSV file; any other value writes
            parquet.

    Returns:
        The combined DataFrame of all rounds that loaded successfully, or an
        empty DataFrame when no round could be loaded (nothing is written in
        that case).

    Side effects:
        Creates ``data/raw/<year>/`` and writes
        ``<year>_season_extended.<csv|parquet>`` into it; prints progress.
    """
    output_dir = Path(f"data/raw/{year}")
    output_dir.mkdir(parents=True, exist_ok=True)

    schedule = fastf1.get_event_schedule(year)
    # The schedule may also contain pre-season testing events (round number 0).
    # Counting rows would then request rounds that do not exist, so iterate
    # over the real round numbers instead.
    if "RoundNumber" in schedule.columns:
        round_numbers = sorted(
            {int(r) for r in schedule["RoundNumber"].dropna() if int(r) > 0}
        )
    else:
        round_numbers = list(range(1, len(schedule) + 1))
    num_races = len(round_numbers)
    print(f"Collecting {year} season ({num_races} races)...")

    keep_base = ["Abbreviation", "FullName", "TeamName", "GridPosition",
                 "Position", "Points", "Status", "Time"]

    all_races = []
    for round_num in round_numbers:
        try:
            print(f"  Round {round_num}...", end="")
            session = fastf1.get_session(year, round_num, "R")
            session.load(laps=True, telemetry=False, weather=True, messages=True)

            results = session.results
            # Only keep the base columns this FastF1 version actually provides.
            keep = [col for col in keep_base if col in results.columns]

            race = results[keep].copy()
            race["Year"] = year
            race["Round"] = round_num
            race["TrackName"] = session.event["EventName"]
            race["Country"] = session.event["Country"]

            laps = session.laps
            has_laps = laps is not None and not laps.empty

            if "Laps" in results.columns:
                # Previously this column was dropped because it was not part
                # of the kept columns; carry it over explicitly.
                race["Laps"] = results["Laps"].to_numpy()
            elif has_laps:
                # Fall back to the highest lap number each driver recorded.
                laps_completed = laps.groupby("Driver")["LapNumber"].max()
                race = race.merge(
                    laps_completed.rename("Laps").reset_index().rename(columns={"Driver": "Abbreviation"}),
                    on="Abbreviation",
                    how="left",
                )

            if has_laps:
                lap_agg = (
                    laps.groupby("Driver")
                    .agg(
                        AvgLapTime=("LapTime", lambda s: s.dt.total_seconds().mean()),
                        NumPitStops=("PitInTime", lambda s: s.notna().sum()),
                    )
                    .reset_index()
                    .rename(columns={"Driver": "Abbreviation"})
                )
                race = race.merge(lap_agg, on="Abbreviation", how="left")

            weather = session.weather_data
            if weather is not None and not weather.empty:
                # Race-wide values, repeated on every driver row.
                race["AvgTrackTemp"] = float(weather["TrackTemp"].mean())
                race["AvgAirTemp"] = float(weather["AirTemp"].mean())
                race["RainDuringRace"] = bool(weather["Rainfall"].any())

            sc = False
            rcm = getattr(session, "race_control_messages", None)
            if rcm is not None and not rcm.empty:
                # Substring match, so any message containing "SAFETY CAR" counts.
                sc = bool(rcm["Message"].str.contains("SAFETY CAR", na=False).any())
            race["SafetyCarDeployed"] = sc

            all_races.append(race)
            print(f" ✓ {session.event['EventName']}")
        except Exception as e:
            # Best effort: report the failed round and carry on with the rest.
            print(f" ✗ Error: {e}")
            continue

    if not all_races:
        return pd.DataFrame()

    combined = pd.concat(all_races, ignore_index=True)
    output_file = output_dir / (f"{year}_season_extended.csv" if output_format == "csv" else f"{year}_season_extended.parquet")
    if output_format == "csv":
        combined.to_csv(output_file, index=False)
    else:
        combined.to_parquet(output_file, index=False)
    print(f"Saved {len(all_races)} races to {output_file}")
    return combined

In [None]:
# Collect and save the 2022 and 2023 seasons (in that order) as CSV.
for year in [2022, 2023]:
    data = collect_and_save_season(year, output_format='csv')
    if data.empty:
        # Nothing could be loaded for this season, so there is nothing to report.
        continue
    print(f"{year}: Collected {len(data)} driver entries from {data['Round'].nunique()} races\n")

In [None]:
def load_season_data(year):
    """Read the saved data for one season from ``data/raw/<year>/``.

    The first existing file wins, searched in this order: extended CSV,
    extended parquet, plain season CSV, plain season parquet. When none
    exists a message is printed and an empty DataFrame is returned.

    NOTE(review): a later cell defines ``load_season_data`` again with a
    parquet-first search order; once that cell runs it replaces this one.
    """
    season_dir = Path(f"data/raw/{year}")
    candidate_names = (
        f"{year}_season_extended.csv",
        f"{year}_season_extended.parquet",
        f"{year}_season.csv",
        f"{year}_season.parquet",
    )
    for name in candidate_names:
        candidate = season_dir / name
        if not candidate.exists():
            continue
        if candidate.suffix == ".csv":
            return pd.read_csv(candidate)
        return pd.read_parquet(candidate)

    print(f"No data files found for {year}")
    return pd.DataFrame()


def load_multiple_seasons(years):
    """Load each season in ``years`` and stack the non-empty ones.

    Seasons whose load returns an empty frame are skipped. Returns one
    DataFrame with a fresh index, or an empty DataFrame when nothing loaded.
    """
    loaded = (load_season_data(year) for year in years)
    non_empty = [frame for frame in loaded if not frame.empty]

    if not non_empty:
        return pd.DataFrame()
    return pd.concat(non_empty, ignore_index=True)

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Show every column and let pandas pick the display width automatically.
# This replaces the max_columns limit of 50 set in the earlier setup cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 0)

def load_season_data(year: int) -> pd.DataFrame:
    """Read the saved data for one season from ``data/raw/<year>/``.

    Candidates are tried in order: extended parquet, extended CSV, plain
    season parquet, plain season CSV. The first file that exists is read and
    returned. If none exists a warning is printed and an empty DataFrame is
    returned.
    """
    base = Path(f"data/raw/{year}")
    candidates = [
        base / f"{year}_{variant}.{ext}"
        for variant in ("season_extended", "season")
        for ext in ("parquet", "csv")
    ]
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        print(f"[warn] No data files found for {year}")
        return pd.DataFrame()

    reader = pd.read_parquet if found.suffix == ".parquet" else pd.read_csv
    return reader(found)

def load_multiple_seasons(years) -> pd.DataFrame:
    """Load every season in ``years`` and concatenate the non-empty frames.

    Returns a single DataFrame with a fresh index, or an empty DataFrame when
    no season produced any rows.
    """
    loaded = []
    for season_year in years:
        loaded.append(load_season_data(season_year))

    usable = []
    for frame in loaded:
        if not frame.empty:
            usable.append(frame)

    if not usable:
        return pd.DataFrame()
    return pd.concat(usable, ignore_index=True)

# Load both saved seasons into one frame (2023 rows first, then 2022).
train_data = load_multiple_seasons([2023, 2022])
print(f"\nCombined training data: {train_data.shape}")

# Coerce the numeric columns that are present; unparseable values become NaN.
for c in ["GridPosition","Position","Points","Laps","Year","Round","AvgLapTime","NumPitStops","AvgTrackTemp","AvgAirTemp"]:
    if c in train_data.columns:
        train_data[c] = pd.to_numeric(train_data[c], errors="coerce")

# Parse Time into timedeltas (unparseable -> NaT). If the column is missing,
# an all-NaN placeholder of the same length is parsed instead.
train_data["Time"] = pd.to_timedelta(train_data.get("Time", pd.Series([np.nan]*len(train_data))), errors="coerce")

# Nullable boolean dtype, so missing flags stay missing instead of becoming False.
for bc in ["RainDuringRace","SafetyCarDeployed"]:
    if bc in train_data.columns:
        train_data[bc] = train_data[bc].astype("boolean")

# The winner's gap is forced to 0; every other row uses its Time value in seconds.
# NOTE(review): this presumes Time holds the gap to the winner for non-winners
# (not a total race time) - confirm against the collected data.
is_winner = train_data["Position"] == 1.0
train_data["GapToWinner_s"] = np.where(is_winner, 0.0, train_data["Time"].dt.total_seconds())

# Largest Time within each (Year, Round) group, in seconds, repeated on every
# row of that race; NaN when the race has no Time value at all.
# NOTE(review): treated as the winner's race time, which relies on the
# assumption above (the winner's full time being the largest value).
train_data["WinnerRaceTime_s"] = (
    train_data.groupby(["Year","Round"])["Time"]
              .transform(lambda s: s.max().total_seconds() if s.notna().any() else np.nan)
)

# Per-driver race time in seconds: winner's time plus the driver's gap.
train_data["Time_s"] = train_data["WinnerRaceTime_s"] + train_data["GapToWinner_s"]

def _fmt_hms_ms(sec):
    if pd.isna(sec): return np.nan
    sec = float(sec)
    ms = int(round((sec - int(sec)) * 1000))
    if ms == 1000:
        ms = 0
        sec = int(sec) + 1
    s = int(sec) % 60
    m = (int(sec) // 60) % 60
    h = int(sec) // 3600
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

# Human-readable HH:MM:SS.mmm version of Time_s (NaN stays NaN).
train_data["Time_str"] = train_data["Time_s"].apply(_fmt_hms_ms)

# From here on the finishing position is "RacePosition" and the grid slot is
# "QualifyingPosition".
train_data = train_data.rename(columns={"Position":"RacePosition","GridPosition":"QualifyingPosition"})

# Text columns: nullable string dtype with surrounding whitespace removed.
for cat in ["Abbreviation","FullName","TeamName","Status","TrackName","Country"]:
    if cat in train_data.columns:
        train_data[cat] = train_data[cat].astype("string").fillna(pd.NA).str.strip()

# 1 only when Status is exactly "finished" (case-insensitive); any other
# status, including lapped classifications such as "+1 Lap", gives 0.
if "Status" in train_data.columns:
    train_data["FinishedFlag"] = train_data["Status"].str.lower().eq("finished").astype("Int8")

# The raw timedelta column is dropped; the derived *_s columns replace it.
train_data = train_data.drop(columns=["Time"])

# Final column order; names missing from the frame are skipped and any
# column not listed here is dropped.
cols_order = [
    "Year","Round","TrackName","Country",
    "Abbreviation","FullName","TeamName","Status",
    "QualifyingPosition","RacePosition","Points","Laps",
    "AvgLapTime","NumPitStops","AvgTrackTemp","AvgAirTemp",
    "RainDuringRace","SafetyCarDeployed",
    "Time_s","Time_str","GapToWinner_s","WinnerRaceTime_s",
    "FinishedFlag"
]
train_data = train_data[[c for c in cols_order if c in train_data.columns]]

# Write the processed frame to data/processed/ as CSV and, if possible, parquet.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
csv_path = out_dir / "f1_2022_2023_processed.csv"
parq_path = out_dir / "f1_2022_2023_processed.parquet"

train_data.to_csv(csv_path, index=False)
# Parquet is best effort: a failure is reported but does not stop the cell.
try:
    train_data.to_parquet(parq_path, index=False)
except Exception as e:
    print(f"[warn] parquet save failed: {e}")

print(f"\nSaved processed data to:\n- {csv_path}\n- {parq_path if parq_path.exists() else '(parquet not written)'}")
print("\nPreview:")
print(train_data.head(8).to_string(index=False))


In [None]:
# Column dtypes of the processed frame. Printed explicitly because only the
# last expression of a cell is displayed; a bare `train_data.dtypes` here
# would produce no output.
print(train_data.dtypes)

# Mean of the per-driver average lap times at each track, using only rows
# whose RainDuringRace flag is False, sorted fastest first and formatted as
# HH:MM:SS.mmm.
(
    train_data[train_data['RainDuringRace'] == False]
    .groupby('TrackName')['AvgLapTime']
    .mean()
    .sort_values()
    .apply(_fmt_hms_ms)
)


In [None]:
# Mean AvgLapTime per track over rows whose RainDuringRace flag is False.
# Despite the name, the stored values are lap times, not speeds.
unique_tracks = train_data['TrackName'].unique()
dry_rows = train_data['RainDuringRace'] == False

track_avg_speeds = {
    track_name: train_data[(train_data['TrackName'] == track_name) & dry_rows]['AvgLapTime'].mean()
    for track_name in unique_tracks
}

print(track_avg_speeds)

In [None]:
# Event names kept for the "2023" subset, in the order listed by the author.
tracks_2023 = [
    'Bahrain Grand Prix', 'Saudi Arabian Grand Prix', 'Australian Grand Prix',
    'Azerbaijan Grand Prix', 'Miami Grand Prix', 'Monaco Grand Prix',
    'Spanish Grand Prix', 'Canadian Grand Prix', 'Austrian Grand Prix',
    'British Grand Prix', 'Hungarian Grand Prix', 'Belgian Grand Prix',
    'Dutch Grand Prix', 'Italian Grand Prix', 'Singapore Grand Prix',
    'Japanese Grand Prix', 'Qatar Grand Prix', 'United States Grand Prix',
    'Mexico City Grand Prix', 'São Paulo Grand Prix', 'Las Vegas Grand Prix',
    'Abu Dhabi Grand Prix',
]

# NOTE(review): the filter is by event name only, so rows from any year whose
# TrackName is in the list are kept - confirm a Year == 2023 filter is not
# what was intended.
on_listed_track = train_data['TrackName'].isin(tracks_2023)
train_data_2023 = train_data[on_listed_track]

In [None]:
import pandas as pd


# (year, event name) pairs that are removed from the analysis below.
sprint_races = [
    (2022, 'Emilia Romagna Grand Prix'),
    (2022, 'Austrian Grand Prix'),
    (2022, 'São Paulo Grand Prix'),
    (2023, 'Azerbaijan Grand Prix'),
    (2023, 'Austrian Grand Prix'),
    (2023, 'Belgian Grand Prix'),
    (2023, 'Qatar Grand Prix'),
    (2023, 'United States Grand Prix'),
    (2023, 'São Paulo Grand Prix'),
]

# Row mask that is True for every row belonging to one of the listed events.
is_sprint = pd.Series([False] * len(train_data), index=train_data.index)
for year, track in sprint_races:
    same_year = train_data['Year'] == year
    same_track = train_data['TrackName'] == track
    is_sprint = is_sprint | (same_year & same_track)

train_data_no_sprints = train_data[~is_sprint]

street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix']
unique_tracks = train_data_no_sprints['TrackName'].unique()

# Per track: correlation between grid and finishing position among rows with
# FinishedFlag == 1. Tracks with 5 or fewer usable rows get a fixed fallback
# (0.8 for the listed street circuits, 0.6 otherwise).
overtake_difficulty_clean = {}
for track in unique_tracks:
    on_track = train_data_no_sprints['TrackName'] == track
    finished = train_data_no_sprints['FinishedFlag'] == 1
    data = train_data_no_sprints[on_track & finished][['QualifyingPosition', 'RacePosition']].dropna()

    if len(data) > 5:
        overtake_difficulty_clean[track] = data['QualifyingPosition'].corr(data['RacePosition'])
        continue

    overtake_difficulty_clean[track] = 0.8 if track in street_circuits else 0.6

print(overtake_difficulty_clean)

In [None]:
# Create track features with just 2 features
track_features = pd.DataFrame({'TrackName': list(overtake_difficulty_clean.keys()), 'overtaking_difficulty': list(overtake_difficulty_clean.values())})

street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix']
track_features['is_street'] = track_features['TrackName'].isin(street_circuits).astype(int)

print(track_features.head())

In [None]:
# Attach the track-level features to every row; the left join keeps all rows
# of train_data_no_sprints.
train_data_with_features = pd.merge(
    train_data_no_sprints,
    track_features,
    how='left',
    on='TrackName',
)

preview_columns = ['TrackName', 'is_street', 'overtaking_difficulty']
print(train_data_with_features[preview_columns].head())

In [None]:
# Which track names ended up flagged as street circuits after the merge.
street_rows = train_data_with_features[train_data_with_features['is_street'] == 1]
print(street_rows['TrackName'].unique())

In [None]:
# Bonus by grid position: 8 points for P1 down to 1 point for P8, 0 for P9-P20.
# NOTE(review): the bonus is added for every previous round, not only sprint
# weekends - confirm that this is the intended proxy.
quali_points = {position: max(9 - position, 0) for position in range(1, 21)}

# Number of preceding rounds that make up a driver's "recent form".
FORM_LOOKBACK = 3

# (year, round, driver) -> points scored in the previous FORM_LOOKBACK rounds
# plus the grid-position bonus for each of those rounds.
recent_form = {}

for year in [2022, 2023]:
    season_data = train_data[train_data['Year'] == year]

    max_round = season_data['Round'].max()
    if pd.isna(max_round):
        # No rows (or no usable round numbers) for this season; range() below
        # would fail on NaN, so skip it.
        continue

    # The first FORM_LOOKBACK rounds have no full history and stay NaN.
    for race in range(FORM_LOOKBACK + 1, int(max_round) + 1):
        current_round = season_data[season_data['Round'] == race]

        for driver in current_round['Abbreviation'].dropna().unique():
            race_points = 0
            grid_bonus = 0
            for offset in range(1, FORM_LOOKBACK + 1):
                # Empty when the driver did not take part in that round, in
                # which case both sums are 0.
                prev_race = season_data[(season_data['Round'] == race - offset)
                                        & (season_data['Abbreviation'] == driver)]
                race_points += prev_race['Points'].sum()
                grid_bonus += quali_points.get(prev_race['QualifyingPosition'].sum(), 0)

            recent_form[(year, race, driver)] = race_points + grid_bonus

# One dictionary lookup per row instead of one full-frame scan per key; rows
# without an entry get NaN.
form_keys = zip(train_data['Year'], train_data['Round'], train_data['Abbreviation'])
train_data['recent_form'] = [recent_form.get(key, np.nan) for key in form_keys]

# Spot check; prints None instead of raising when the entry does not exist.
print(recent_form.get((2022, 4, 'HAM')))


In [None]:
# Hand-picked driver codes flagged with rain_specialist = 1; all others get 0.
rain_specialists = ['VER', 'HAM', 'STR']
is_rain_specialist = train_data['Abbreviation'].isin(rain_specialists)
train_data['rain_specialist'] = is_rain_specialist.astype(int)

# Preview the flag on rows from races where rain was recorded.
wet_race_rows = train_data[train_data['RainDuringRace'] == True]
print(wet_race_rows[['Abbreviation', 'rain_specialist', 'RacePosition']].head(10))

In [None]:
# Check DNF positions
classified_statuses = ['Finished', '+1 Lap', '+2 Laps', '+3 Laps']
is_classified = train_data['Status'].isin(classified_statuses)
# Rows whose status is none of the above are treated as DNFs.
dnf = train_data[~is_classified]
print(dnf[['Status', 'RacePosition', 'QualifyingPosition', 'Laps']].head(10))

In [None]:
# Status groups. Only `mechanical` changes the result below; `driver_error`
# and `lapped` rows keep their classified position, like every other status.
driver_error = ['Collision', 'Collision damage', 'Accident', 'Spun off']
# The stray `...` (Ellipsis) placeholder that sat at the end of this list was
# removed; it could never match a status string.
mechanical = ['Engine', 'Gearbox', 'Power Unit', 'Hydraulics', 'Brakes', 'Suspension', 'Fuel pressure', 'Power loss', 'Water pressure', 'Water leak', 'Mechanical', 'Undertray', 'Turbo', 'Oil leak', 'Cooling system', 'Vibrations', 'Differential']
lapped = ['+1 Lap', '+2 Laps', '+3 Laps']

# A mechanical retirement is scored halfway between grid slot and classified
# position; all other rows keep their RacePosition unchanged.
is_mechanical_dnf = train_data['Status'].isin(mechanical)
grid_race_midpoint = (train_data['RacePosition'] + train_data['QualifyingPosition']) / 2
adjusted_positions = train_data['RacePosition'].mask(is_mechanical_dnf, grid_race_midpoint)

train_data['adjusted_position'] = adjusted_positions



In [None]:
# Top 10 drivers by total points in the 2023 rows of train_data.
(
    train_data[train_data['Year'] == 2023]
    .groupby('Abbreviation')['Points']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Preview the first rows of train_data, including the columns added above.
train_data.head()

In [None]:
# Check that xgboost can be imported and show which version is installed.
import xgboost as xgb
print(xgb.__version__)
