In [None]:
# Install pyarrow with the same interpreter that runs this kernel
# (presumably needed for the parquet read/write calls further down — confirm).
import sys
!{sys.executable} -m pip install pyarrow
import warnings
# NOTE(review): this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')


In [None]:

import sys
import fastf1
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Setup: point FastF1 at a local cache directory, creating it on first run
cache_path = Path('cache')
cache_path.mkdir(exist_ok=True)
fastf1.Cache.enable_cache(str(cache_path))

# Display limits for rendered frames (set_option accepts several option/value pairs)
pd.set_option('display.max_columns', 50, 'display.max_rows', 100)

In [None]:
def _build_race_results(session, year, round_num):
    """
    Assemble the per-driver results table for one already-loaded race session.

    Args:
        session: FastF1 session on which ``load(...)`` has already been called
        year: Season year, written to the ``Year`` column
        round_num: Round number, written to the ``Round`` column

    Returns:
        DataFrame with one row per entry in ``session.results``, extended with
        event, lap, weather and safety-car columns where that data is available.
    """
    results = session.results
    available_columns = results.columns.tolist()
    keep_base = ["Abbreviation","FullName","TeamName","GridPosition","Position","Points","Status","Time"]
    keep = [col for col in keep_base if col in available_columns]

    race = results[keep].copy()
    race["Year"] = year
    race["Round"] = round_num
    race["TrackName"] = session.event["EventName"]
    race["Country"] = session.event["Country"]

    laps = session.laps
    has_laps = laps is not None and not laps.empty

    # Laps completed: prefer the column shipped with the results; otherwise
    # derive it from the highest lap number each driver recorded.
    if "Laps" in available_columns:
        race["Laps"] = results["Laps"]
    elif has_laps:
        laps_completed = laps.groupby('Driver')['LapNumber'].max()
        race = race.merge(
            laps_completed.rename('Laps').reset_index().rename(columns={'Driver': 'Abbreviation'}),
            on='Abbreviation',
            how='left'
        )

    # Per-driver lap statistics: mean lap time in seconds and number of pit entries
    if has_laps:
        lap_agg = (
            laps.groupby("Driver")
            .agg(
                AvgLapTime=("LapTime", lambda s: s.dt.total_seconds().mean()),
                NumPitStops=("PitInTime", lambda s: s.notna().sum()),
            )
            .reset_index()
            .rename(columns={"Driver": "Abbreviation"})
        )
        race = race.merge(lap_agg, on="Abbreviation", how="left")

    # Race-level weather summary, repeated on every driver row
    weather = session.weather_data
    if weather is not None and not weather.empty:
        race["AvgTrackTemp"] = float(weather["TrackTemp"].mean())
        race["AvgAirTemp"] = float(weather["AirTemp"].mean())
        race["RainDuringRace"] = bool(weather["Rainfall"].any())

    # Safety-car flag: substring match on race-control messages.
    # NOTE(review): "SAFETY CAR" also matches "VIRTUAL SAFETY CAR" messages.
    sc = False
    rcm = getattr(session, "race_control_messages", None)
    if rcm is not None and not rcm.empty:
        sc = bool(rcm["Message"].str.contains("SAFETY CAR", na=False).any())
    race["SafetyCarDeployed"] = sc

    return race


def collect_and_save_season_individual(year, rounds_to_collect=None):
    """
    Collect F1 season data and save each race as individual file.

    Rounds whose CSV already exists are skipped. A round that fails to load or
    process is reported and skipped so the remaining rounds are still collected.

    Args:
        year: Season year
        rounds_to_collect: Iterable of specific rounds to collect (None = every
            championship round listed in the season schedule)
    """
    output_dir = Path(f"data/raw/{year}")
    output_dir.mkdir(parents=True, exist_ok=True)

    if rounds_to_collect is None:
        # Take the real round numbers from the schedule. Counting rows would
        # include pre-season testing entries (round 0) and request a round
        # that does not exist.
        schedule = fastf1.get_event_schedule(year)
        rounds_to_collect = sorted(int(r) for r in schedule["RoundNumber"] if r > 0)
    else:
        # Materialise so len() works for any iterable the caller passes
        rounds_to_collect = list(rounds_to_collect)

    print(f"Collecting {year} season - {len(rounds_to_collect)} races")

    for round_num in rounds_to_collect:
        # Check if file already exists
        race_file = output_dir / f"{year}_round_{round_num:02d}.csv"
        if race_file.exists():
            print(f"  Round {round_num}: Already exists, skipping")
            continue

        try:
            print(f"  Round {round_num}...", end="")
            session = fastf1.get_session(year, round_num, "R")
            session.load(laps=True, telemetry=False, weather=True, messages=True)

            race = _build_race_results(session, year, round_num)

            # Save individual race
            race.to_csv(race_file, index=False)
            print(f" ✓ {session.event['EventName']} saved")

        except Exception as e:
            # Deliberate best-effort: one bad round must not abort the season
            print(f" ✗ Error: {e}")
            continue

def combine_season_files(year):
    """
    Concatenate a season's per-round CSV files and write the combined CSV.

    Reads every ``{year}_round_*.csv`` under ``data/raw/{year}`` in sorted
    filename order, writes ``{year}_season_combined.csv`` next to them and
    returns the combined frame. Returns an empty DataFrame (and writes
    nothing) when no round files are found.
    """
    season_dir = Path(f"data/raw/{year}")
    round_paths = sorted(season_dir.glob(f"{year}_round_*.csv"))

    if len(round_paths) == 0:
        print(f"No race files found for {year}")
        return pd.DataFrame()

    combined = pd.concat(
        [pd.read_csv(round_path) for round_path in round_paths],
        ignore_index=True,
    )

    output_file = season_dir / f"{year}_season_combined.csv"
    combined.to_csv(output_file, index=False)
    print(f"Combined {len(round_paths)} races into {output_file}")
    return combined

# Usage examples:

# Collect all races for a year (skips existing files)
collect_and_save_season_individual(2023)

# Collect only specific races (e.g., just Hungarian GP which is round 11)
collect_and_save_season_individual(2023, rounds_to_collect=[11])

# Combine all individual files into one.
# combine_season_files() builds each season from the per-round CSVs written
# above (and returns an empty frame when a season has none), so this step does
# not depend on hand-made season files that nothing in this notebook creates.
train_data_2023 = combine_season_files(2023)
train_data_2022 = combine_season_files(2022)
train_data = pd.concat([train_data_2023, train_data_2022], ignore_index=True)

In [None]:
# Collect data with individual race files
for year in [2022, 2023]:
    data_dir = Path(f"data/raw/{year}")

    # Only hit the network when the season has no per-round files on disk yet
    has_race_files = data_dir.exists() and any(data_dir.glob(f"{year}_round_*.csv"))
    if has_race_files:
        print(f"{year}: Found existing race files, combining...")
    else:
        print(f"{year}: Collecting races individually...")
        collect_and_save_season_individual(year)

    # Both paths finish by (re)building the combined season frame
    data = combine_season_files(year)

    if not data.empty:
        print(f"{year}: {len(data)} driver entries from {data['Round'].nunique()} races\n")

In [None]:
from pathlib import Path
import pandas as pd

def load_season_data(year):
    """
    Load one season from ``data/raw/{year}``.

    Lookup order: the first existing season-level file among
    ``_season_combined.csv``, ``_season_extended.csv``, ``_season.csv`` and
    ``_season.parquet``; otherwise the concatenation of all
    ``{year}_round_*.csv`` files in sorted filename order. Returns an empty
    DataFrame when nothing is found.
    """
    season_dir = Path(f"data/raw/{year}")

    # Check for combined files first
    candidate_names = (
        f"{year}_season_combined.csv",
        f"{year}_season_extended.csv",
        f"{year}_season.csv",
        f"{year}_season.parquet",
    )
    for candidate_name in candidate_names:
        candidate = season_dir / candidate_name
        if not candidate.exists():
            continue
        if candidate.suffix == ".csv":
            return pd.read_csv(candidate)
        return pd.read_parquet(candidate)

    # If no combined file, try to combine individual race files
    round_paths = sorted(season_dir.glob(f"{year}_round_*.csv"))
    if not round_paths:
        print(f"No data files found for {year}")
        return pd.DataFrame()

    print(f"Found {len(round_paths)} individual race files for {year}, combining...")
    return pd.concat(
        [pd.read_csv(round_path) for round_path in round_paths],
        ignore_index=True,
    )

def load_multiple_seasons(years):
    """
    Load several seasons and stack them into one frame.

    Each year is loaded with ``load_season_data``; seasons that come back
    empty are left out. Returns an empty DataFrame when no season has data.
    """
    loaded = (load_season_data(season_year) for season_year in years)
    non_empty = [season_frame for season_frame in loaded if not season_frame.empty]

    if not non_empty:
        return pd.DataFrame()
    return pd.concat(non_empty, ignore_index=True)

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Display settings for this cell onward: no cap on the number of columns shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 0)

# Load data using the functions from previous cell
train_data = load_multiple_seasons([2023, 2022])
print(f"\nCombined training data: {train_data.shape}")

# Convert numeric columns (values that cannot be parsed become NaN; columns
# missing from the loaded data are skipped)
for c in ["GridPosition","Position","Points","Laps","Year","Round","AvgLapTime","NumPitStops","AvgTrackTemp","AvgAirTemp"]:
    if c in train_data.columns:
        train_data[c] = pd.to_numeric(train_data[c], errors="coerce")

# Handle time data: parse "Time" into timedeltas; if the column is absent an
# all-NaN placeholder is parsed instead, so "Time" always exists afterwards.
train_data["Time"] = pd.to_timedelta(train_data.get("Time", pd.Series([np.nan]*len(train_data))), errors="coerce")

# Convert boolean columns to the nullable "boolean" dtype
for bc in ["RainDuringRace","SafetyCarDeployed"]:
    if bc in train_data.columns:
        train_data[bc] = train_data[bc].astype("boolean")

# Calculate time gaps
# NOTE(review): this treats "Time" on non-winner rows as the gap to the winner
# (and on the winner row as the total race time) — confirm against the data source.
is_winner = train_data["Position"] == 1.0
train_data["GapToWinner_s"] = np.where(is_winner, 0.0, train_data["Time"].dt.total_seconds())

# Winner's race time per (Year, Round), taken as the largest "Time" in that
# race; NaN when the race has no valid "Time" at all.
train_data["WinnerRaceTime_s"] = (
    train_data.groupby(["Year","Round"])["Time"]
              .transform(lambda s: s.max().total_seconds() if s.notna().any() else np.nan)
)

# Total race time in seconds = winner's race time + this row's gap
train_data["Time_s"] = train_data["WinnerRaceTime_s"] + train_data["GapToWinner_s"]

def _fmt_hms_ms(sec):
    if pd.isna(sec): return np.nan
    sec = float(sec)
    ms = int(round((sec - int(sec)) * 1000))
    if ms == 1000:
        ms = 0
        sec = int(sec) + 1
    s = int(sec) % 60
    m = (int(sec) // 60) % 60
    h = int(sec) // 3600
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

# Human-readable version of the total race time
train_data["Time_str"] = train_data["Time_s"].apply(_fmt_hms_ms)

# Rename columns
train_data = train_data.rename(columns={"Position":"RacePosition","GridPosition":"QualifyingPosition"})

# Clean string columns: nullable string dtype, surrounding whitespace removed
for cat in ["Abbreviation","FullName","TeamName","Status","TrackName","Country"]:
    if cat not in train_data.columns:
        continue
    train_data[cat] = train_data[cat].astype("string").fillna(pd.NA).str.strip()

# Add finished flag (1 only when Status is exactly "finished", case-insensitive)
if "Status" in train_data.columns:
    status_is_finished = train_data["Status"].str.lower() == "finished"
    train_data["FinishedFlag"] = status_is_finished.astype("Int8")

# Drop original Time column
train_data = train_data.drop(columns=["Time"], errors="ignore")

# Reorder columns, keeping only the ones that are present
cols_order = [
    "Year","Round","TrackName","Country",
    "Abbreviation","FullName","TeamName","Status",
    "QualifyingPosition","RacePosition","Points","Laps",
    "AvgLapTime","NumPitStops","AvgTrackTemp","AvgAirTemp",
    "RainDuringRace","SafetyCarDeployed",
    "Time_s","Time_str","GapToWinner_s","WinnerRaceTime_s",
    "FinishedFlag"
]
present_cols = [c for c in cols_order if c in train_data.columns]
train_data = train_data[present_cols]

# Save processed data (CSV always; parquet is best-effort)
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
csv_path = out_dir / "f1_2022_2023_processed.csv"
parq_path = out_dir / "f1_2022_2023_processed.parquet"

train_data.to_csv(csv_path, index=False)
try:
    train_data.to_parquet(parq_path, index=False)
except Exception as e:
    print(f"[warn] parquet save failed: {e}")

print(f"\nSaved processed data to:\n- {csv_path}\n- {parq_path if parq_path.exists() else '(parquet not written)'}")
print("\nPreview:")
print(train_data.head(8).to_string(index=False))

In [None]:
# Column dtypes of the processed frame. Printed explicitly: a notebook only
# renders a cell's final expression, so a bare `train_data.dtypes` here would
# be evaluated and silently discarded.
print(train_data.dtypes)

# Mean AvgLapTime per track over rows where RainDuringRace is False,
# fastest first, formatted as HH:MM:SS.mmm
(
    train_data[train_data['RainDuringRace'] == False]  
    .groupby('TrackName')['AvgLapTime']    
    .mean()
    .sort_values()
    .apply(_fmt_hms_ms)
)


In [None]:
unique_tracks = train_data['TrackName'].unique()

# Rows from races without rain; the same filter applies to every track
dry_rows = train_data['RainDuringRace'] == False

# Mean AvgLapTime per track (NaN for a track with no matching rows).
# Despite the name, the values are lap times, not speeds.
track_avg_speeds = {
    track: train_data[(train_data['TrackName'] == track) & dry_rows]['AvgLapTime'].mean()
    for track in unique_tracks
}

print(track_avg_speeds)

In [None]:
import pandas as pd

# (year, event name) pairs whose rows are excluded from the track statistics
sprint_races = [
    (2022, 'Emilia Romagna Grand Prix'),
    (2022, 'Austrian Grand Prix'),
    (2022, 'São Paulo Grand Prix'),
    (2023, 'Azerbaijan Grand Prix'),
    (2023, 'Austrian Grand Prix'),
    (2023, 'Belgian Grand Prix'),
    (2023, 'Qatar Grand Prix'),
    (2023, 'United States Grand Prix'),
    (2023, 'São Paulo Grand Prix'),
]

# Flag every row that belongs to one of the listed weekends
is_sprint = pd.Series([False] * len(train_data), index=train_data.index)
for year, track in sprint_races:
    same_year = train_data['Year'] == year
    same_event = train_data['TrackName'] == track
    is_sprint |= (same_year & same_event)

train_data_no_sprints = train_data[~is_sprint]

street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix']
overtake_difficulty_clean = {}

unique_tracks = train_data_no_sprints['TrackName'].unique()

for track in unique_tracks:
    # Rows at this track whose FinishedFlag is 1
    mask = (
        (train_data_no_sprints['FinishedFlag'] == 1)
        & (train_data_no_sprints['TrackName'] == track)
    )
    data = train_data_no_sprints[mask][['QualifyingPosition', 'RacePosition']].dropna()

    if len(data) <= 5:
        # Too few rows for a correlation: fall back to a fixed value
        overtake_difficulty_clean[track] = 0.8 if track in street_circuits else 0.6
        continue

    # Grid-vs-finish correlation as the overtaking-difficulty score
    overtake_difficulty_clean[track] = data['QualifyingPosition'].corr(data['RacePosition'])

print(overtake_difficulty_clean)

In [None]:
# Track-level feature table: one row per track with its overtaking score
track_features = pd.DataFrame(
    list(overtake_difficulty_clean.items()),
    columns=['TrackName', 'overtaking_difficulty'],
)

# 0/1 flag for the events listed as street circuits
street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix']
track_features = track_features.assign(
    is_street=track_features['TrackName'].isin(street_circuits).astype(int)
)

print(track_features.head(10))

In [None]:
# Attach the track-level features to every driver row (left join on TrackName)
train_data_with_features = pd.merge(
    train_data_no_sprints,
    track_features,
    how='left',
    on='TrackName',
)

# Select features for modeling
feature_cols = [
    'QualifyingPosition', 'overtaking_difficulty', 'is_street',
    'AvgLapTime', 'AvgTrackTemp', 'AvgAirTemp',
    'RainDuringRace', 'SafetyCarDeployed',
]

target_col = 'RacePosition'

# Keep only the rows that have a target value
has_target = train_data_with_features[target_col].notna()
model_data = train_data_with_features[has_target]

print(f"Dataset ready: {len(model_data)} samples")
print(f"Features: {feature_cols}")
print(f"Target: {target_col}")

In [None]:
# Sanity check: which seasons contain Hungarian GP rows, and what they look like
is_hungarian = train_data_no_sprints['TrackName'] == 'Hungarian Grand Prix'
hungarian_data = train_data_no_sprints[is_hungarian]

print(f"Hungarian GP races: {hungarian_data['Year'].unique()}")
print(f"Sample positions:\n{hungarian_data[['Year', 'QualifyingPosition', 'RacePosition', 'FinishedFlag']].head(30)}")

In [None]:
# Rebuild the merged frame and preview the track-level columns
train_data_with_features = pd.merge(
    train_data_no_sprints,
    track_features,
    how='left',
    on='TrackName',
)

preview_cols = ['TrackName', 'is_street', 'overtaking_difficulty']
print(train_data_with_features[preview_cols].head())

In [None]:
# Which events ended up flagged as street circuits after the merge
street_rows = train_data_with_features['is_street'] == 1
print(train_data_with_features[street_rows]['TrackName'].unique())

In [None]:
# Bonus awarded for a grid position (top 8 score 8..1, everything else 0).
# NOTE(review): this bonus is applied to every previous race, not only to
# sprint weekends — confirm that is the intended "form" signal.
quali_points = {1:8, 2:7, 3:6, 4:5, 5:4, 6:3, 7:2, 8:1, 9:0, 10:0, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0, 17:0, 18:0, 19:0, 20:0}
recent_form = {}

for year in [2022, 2023]:
    season_data = train_data[train_data['Year'] == year]

    season_rounds = season_data['Round'].dropna()
    if season_rounds.empty:
        # No rows for this season: nothing to compute
        continue
    # range() needs a real int; the column max can be a float (or NaN) after coercion
    max_round = int(season_rounds.max())

    # One pass over the season: total Points and grid position per (round, driver).
    # Looking these up replaces re-filtering the whole season for every driver.
    totals = (
        season_data.dropna(subset=['Round', 'Abbreviation'])
        .groupby(['Round', 'Abbreviation'])[['Points', 'QualifyingPosition']]
        .sum()
    )
    points_lookup = {(int(rnd), drv): total for (rnd, drv), total in totals['Points'].items()}
    grid_lookup = {(int(rnd), drv): total for (rnd, drv), total in totals['QualifyingPosition'].items()}

    # Form is defined from round 4 on, once three previous rounds exist
    for race in range(4, max_round + 1):
        drivers = season_data.loc[season_data['Round'] == race, 'Abbreviation'].dropna().unique()

        for driver in drivers:
            previous = [(race - lag, driver) for lag in (1, 2, 3)]

            # A driver absent from a previous round contributes 0 for that round
            points = sum(points_lookup.get(prev, 0) for prev in previous)
            points += sum(quali_points.get(grid_lookup.get(prev, 0), 0) for prev in previous)

            recent_form[(year, race, driver)] = points

# Write the values back in a single pass; rows without an entry stay NaN
row_keys = zip(train_data['Year'], train_data['Round'], train_data['Abbreviation'])
train_data['recent_form'] = [recent_form.get(row_key, np.nan) for row_key in row_keys]

# Spot check; prints None instead of raising when that entry does not exist
print(recent_form.get((2022, 4, 'HAM')))


In [None]:
# Hand-picked drivers flagged as rain specialists (0/1 column)
rain_specialists = ['VER', 'HAM', 'STR']
is_rain_specialist = train_data['Abbreviation'].isin(rain_specialists)
train_data['rain_specialist'] = is_rain_specialist.astype(int)

# Preview rows from races where rain was recorded
wet_rows = train_data[train_data['RainDuringRace'] == True]
print(wet_rows[['Abbreviation', 'rain_specialist', 'RacePosition']].head(10))

In [None]:
# Check DNF positions: rows whose Status is none of the finishing statuses below
finishing_statuses = ['Finished', '+1 Lap', '+2 Laps', '+3 Laps']
is_finisher = train_data['Status'].isin(finishing_statuses)
dnf = train_data[~is_finisher]
print(dnf[['Status', 'RacePosition', 'QualifyingPosition', 'Laps']].head(10))

In [None]:
# Status groups. Only `mechanical` changes the adjusted position below;
# `driver_error` and `lapped` rows keep their classified RacePosition.
driver_error = ['Collision', 'Collision damage', 'Accident', 'Spun off']
mechanical = [
    'Engine', 'Gearbox', 'Power Unit', 'Hydraulics', 'Brakes', 'Suspension',
    'Fuel pressure', 'Power loss', 'Water pressure', 'Water leak', 'Mechanical',
    'Undertray', 'Turbo', 'Oil leak', 'Cooling system', 'Vibrations', 'Differential',
]
lapped = ['+1 Lap', '+2 Laps', '+3 Laps']

# Mechanical retirements are credited with the midpoint between grid and
# classified position; every other row keeps RacePosition unchanged.
# isin() treats a missing Status as "not mechanical" instead of raising.
is_mechanical = train_data['Status'].isin(mechanical)
grid_race_midpoint = (train_data['RacePosition'] + train_data['QualifyingPosition']) / 2

adjusted_positions = train_data['RacePosition'].where(~is_mechanical, grid_race_midpoint)
train_data['adjusted_position'] = adjusted_positions



In [None]:
# Top 10 drivers by summed Points over the 2023 rows
(
    train_data[train_data['Year'] == 2023]
    .groupby('Abbreviation')['Points']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Preview the first rows of train_data, including the columns added in the cells above.
train_data.head()

In [None]:
# Confirm xgboost can be imported in this kernel and show which version is installed.
import xgboost as xgb
print(xgb.__version__)
