In [41]:
import sys
!{sys.executable} -m pip install pyarrow
import warnings
warnings.filterwarnings('ignore')




In [42]:

import sys
import fastf1
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Setup
cache_path = Path('cache')
cache_path.mkdir(exist_ok=True)
fastf1.Cache.enable_cache(str(cache_path))

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [50]:
import requests
import pandas as pd
from pathlib import Path
import time
import numpy as np

def collect_comprehensive_race_data(year, skip_existing=True):
    """
    Collect comprehensive F1 data with robust error handling.
    """
    output_dir = Path(f"data/raw/{year}_comprehensive")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Get all sessions for the year
    race_sessions = requests.get(f"https://api.openf1.org/v1/sessions?year={year}&session_type=Race").json()
    qual_sessions = requests.get(f"https://api.openf1.org/v1/sessions?year={year}&session_type=Qualifying").json()
    
    # Create qualifying position mapping
    quali_map = {}
    for qual in qual_sessions:
        try:
            positions = requests.get(f"https://api.openf1.org/v1/position?session_key={qual['session_key']}").json()
            if positions and isinstance(positions, list):
                pos_df = pd.DataFrame(positions)
                final_quali = pos_df.groupby('driver_number')['position'].last().to_dict()
                quali_map[qual['meeting_key']] = final_quali
        except:
            continue
    
    print(f"Collecting {year} - {len(race_sessions)} races with lap times and gaps")
    successful = 0
    
    for idx, session in enumerate(race_sessions, 1):
        race_file = output_dir / f"{year}_round_{idx:02d}.csv"
        
        if skip_existing and race_file.exists():
            print(f"  Round {idx}: Already exists, skipping")
            successful += 1
            continue
            
        try:
            print(f"  Round {idx} - {session['location']}...", end="")
            session_key = session['session_key']
            meeting_key = session['meeting_key']
            
            # Get drivers - check if valid response
            drivers_resp = requests.get(f"https://api.openf1.org/v1/drivers?session_key={session_key}").json()
            if not drivers_resp or not isinstance(drivers_resp, list):
                print(f" ✗ No driver data")
                continue
            
            drivers = drivers_resp
            driver_map = {d['driver_number']: d for d in drivers if isinstance(d, dict)}
            
            # Get lap data - check for valid response
            avg_lap_times = {}
            try:
                laps_resp = requests.get(f"https://api.openf1.org/v1/laps?session_key={session_key}").json()
                if laps_resp and isinstance(laps_resp, list):
                    lap_df = pd.DataFrame(laps_resp)
                    if 'lap_duration' in lap_df.columns and 'is_pit_out_lap' in lap_df.columns:
                        for driver_num in lap_df['driver_number'].unique():
                            driver_laps = lap_df[(lap_df['driver_number'] == driver_num) & 
                                                (lap_df['is_pit_out_lap'] == False)]['lap_duration'].dropna()
                            if len(driver_laps) > 3:
                                q1 = driver_laps.quantile(0.25)
                                q3 = driver_laps.quantile(0.75)
                                iqr = q3 - q1
                                filtered = driver_laps[(driver_laps >= q1 - 1.5*iqr) & (driver_laps <= q3 + 1.5*iqr)]
                                if len(filtered) > 0:
                                    avg_lap_times[driver_num] = filtered.mean()
            except:
                pass  # Continue without lap times
            
            # Get intervals - check for valid response
            final_gaps = {}
            try:
                intervals_resp = requests.get(f"https://api.openf1.org/v1/intervals?session_key={session_key}").json()
                if intervals_resp and isinstance(intervals_resp, list):
                    int_df = pd.DataFrame(intervals_resp)
                    if 'gap_to_leader' in int_df.columns:
                        for driver_num in int_df['driver_number'].unique():
                            driver_gaps = int_df[int_df['driver_number'] == driver_num]['gap_to_leader']
                            if not driver_gaps.empty:
                                last_gap = driver_gaps.dropna().iloc[-1] if not driver_gaps.dropna().empty else None
                                final_gaps[driver_num] = last_gap
            except:
                pass  # Continue without gaps
            
            # Get final positions
            final_positions = {}
            try:
                positions_resp = requests.get(f"https://api.openf1.org/v1/position?session_key={session_key}").json()
                if positions_resp and isinstance(positions_resp, list):
                    pos_df = pd.DataFrame(positions_resp)
                    final_positions = pos_df.groupby('driver_number')['position'].last().to_dict()
            except:
                pass
            
            # Get stints
            stint_data = {}
            try:
                stints_resp = requests.get(f"https://api.openf1.org/v1/stints?session_key={session_key}").json()
                if stints_resp and isinstance(stints_resp, list):
                    for driver_num in driver_map.keys():
                        driver_stints = [s for s in stints_resp if isinstance(s, dict) and s.get('driver_number') == driver_num]
                        if driver_stints:
                            total_laps = max([s.get('lap_end', 0) for s in driver_stints], default=0)
                            stint_data[driver_num] = {
                                'laps': total_laps,
                                'pit_stops': max(0, len(driver_stints) - 1)
                            }
                        else:
                            stint_data[driver_num] = {'laps': 0, 'pit_stops': 0}
                else:
                    for driver_num in driver_map.keys():
                        stint_data[driver_num] = {'laps': 0, 'pit_stops': 0}
            except:
                for driver_num in driver_map.keys():
                    stint_data[driver_num] = {'laps': 0, 'pit_stops': 0}
            
            # Build results
            race_results = []
            for driver_num, driver in driver_map.items():
                quali_pos = None
                if meeting_key in quali_map:
                    quali_pos = quali_map[meeting_key].get(driver_num)
                
                result = {
                    'Abbreviation': driver.get('name_acronym'),
                    'FullName': driver.get('full_name'),
                    'TeamName': driver.get('team_name'),
                    'QualifyingPosition': quali_pos,
                    'RacePosition': final_positions.get(driver_num),
                    'GapToLeader': final_gaps.get(driver_num),
                    'AvgLapTime': avg_lap_times.get(driver_num),
                    'Laps': stint_data.get(driver_num, {}).get('laps', 0),
                    'NumPitStops': stint_data.get(driver_num, {}).get('pit_stops', 0),
                    'Year': year,
                    'Round': idx,
                    'TrackName': session['location'],
                    'Country': session['country_name']
                }
                
                points_map = {1: 25, 2: 18, 3: 15, 4: 12, 5: 10, 6: 8, 7: 6, 8: 4, 9: 2, 10: 1}
                race_pos = result['RacePosition']
                result['Points'] = points_map.get(race_pos, 0) if race_pos and race_pos <= 10 else 0
                result['Status'] = 'Finished' if race_pos else 'DNF'
                
                race_results.append(result)
            
            if not race_results:
                print(f" ✗ No valid results")
                continue
                
            race = pd.DataFrame(race_results)
            
            # Add weather (simplified)
            try:
                weather_resp = requests.get(f"https://api.openf1.org/v1/weather?session_key={session_key}").json()
                if weather_resp and isinstance(weather_resp, list):
                    weather_df = pd.DataFrame(weather_resp)
                    race['AvgTrackTemp'] = weather_df['track_temperature'].mean() if 'track_temperature' in weather_df else None
                    race['AvgAirTemp'] = weather_df['air_temperature'].mean() if 'air_temperature' in weather_df else None
                    race['RainDuringRace'] = weather_df['rainfall'].any() if 'rainfall' in weather_df else False
                else:
                    race['AvgTrackTemp'] = None
                    race['AvgAirTemp'] = None
                    race['RainDuringRace'] = False
            except:
                race['AvgTrackTemp'] = None
                race['AvgAirTemp'] = None
                race['RainDuringRace'] = False
            
            race['SafetyCarDeployed'] = False
            
            # Save
            race.to_csv(race_file, index=False)
            laps_with_time = sum(1 for x in avg_lap_times.values() if x is not None)
            gaps_filled = sum(1 for x in final_gaps.values() if x is not None)
            print(f" ✓ (Lap times: {laps_with_time}/{len(driver_map)}, Gaps: {gaps_filled}/{len(driver_map)})")
            successful += 1
            
            time.sleep(0.5)
            
        except Exception as e:
            print(f" ✗ Error: {e}")
            continue
    
    print(f"\nSuccessfully collected {successful}/{len(race_sessions)} races")
    
    # Combine all races
    race_files = sorted(output_dir.glob(f"{year}_round_*.csv"))
    if race_files:
        all_races = [pd.read_csv(f) for f in race_files]
        combined = pd.concat(all_races, ignore_index=True)
        output_file = output_dir / f"{year}_comprehensive.csv"
        combined.to_csv(output_file, index=False)
        print(f"Combined into {output_file}")
        print(f"  Total: {len(combined)} entries")
        print(f"  AvgLapTime filled: {combined['AvgLapTime'].notna().sum()}/{len(combined)} ({combined['AvgLapTime'].notna().sum()*100/len(combined):.1f}%)")
        print(f"  GapToLeader filled: {combined['GapToLeader'].notna().sum()}/{len(combined)} ({combined['GapToLeader'].notna().sum()*100/len(combined):.1f}%)")
        return combined
    
    return pd.DataFrame()

# Delete failed races and recollect
import shutil
shutil.rmtree('data/raw/2023_comprehensive', ignore_errors=True)

# Collect again with better error handling
data_2023 = collect_comprehensive_race_data(2023)

Collecting 2023 - 28 races with lap times and gaps
  Round 1 - Sakhir... ✓ (Lap times: 20/20, Gaps: 20/20)
  Round 2 - Jeddah... ✓ (Lap times: 20/20, Gaps: 20/20)
  Round 3 - Melbourne... ✗ No driver data
  Round 4 - Baku... ✗ No driver data
  Round 5 - Baku... ✗ No driver data
  Round 6 - Miami... ✗ No driver data
  Round 7 - Monaco... ✓ (Lap times: 20/20, Gaps: 20/20)
  Round 8 - Barcelona... ✗ No driver data
  Round 9 - Montréal... ✓ (Lap times: 0/20, Gaps: 20/20)
  Round 10 - Spielberg... ✗ No driver data
  Round 11 - Spielberg... ✓ (Lap times: 20/20, Gaps: 20/20)
  Round 12 - Silverstone... ✓ (Lap times: 20/20, Gaps: 20/20)
  Round 13 - Budapest... ✓ (Lap times: 18/20, Gaps: 20/20)
  Round 14 - Spa-Francorchamps... ✓ (Lap times: 19/20, Gaps: 20/20)
  Round 15 - Spa-Francorchamps... ✓ (Lap times: 19/20, Gaps: 20/20)
  Round 16 - Zandvoort... ✓ (Lap times: 20/20, Gaps: 20/20)
  Round 17 - Monza... ✓ (Lap times: 19/20, Gaps: 19/20)
  Round 18 - Marina Bay... ✓ (Lap times: 18/19, Gaps

In [59]:
from pathlib import Path
import pandas as pd

# Check what races were successfully saved
data_dir = Path("data/raw/2023_comprehensive")
saved_files = sorted(data_dir.glob("*.csv"))
print(f"Files saved: {len(saved_files)}")

# Load and check the comprehensive file if it exists
if (data_dir / "2023_comprehensive.csv").exists():
    data_2023 = pd.read_csv(data_dir / "2023_comprehensive.csv")
    print(f"\n2023 comprehensive data:")
    print(f"  Total entries: {len(data_2023)}")
    print(f"  Races: {data_2023['Round'].nunique()}")
    print(f"  AvgLapTime filled: {data_2023['AvgLapTime'].notna().sum()}/{len(data_2023)} ({data_2023['AvgLapTime'].notna().sum()*100/len(data_2023):.1f}%)")
    print(f"  GapToLeader filled: {data_2023['GapToLeader'].notna().sum()}/{len(data_2023)} ({data_2023['GapToLeader'].notna().sum()*100/len(data_2023):.1f}%)")
    print(f"\nRounds collected: {sorted(data_2023['Round'].unique())}")

Files saved: 23

2023 comprehensive data:
  Total entries: 439
  Races: 22
  AvgLapTime filled: 384/439 (87.5%)
  GapToLeader filled: 416/439 (94.8%)

Rounds collected: [1, 2, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]


In [61]:
import requests
import pandas as pd
from pathlib import Path

def update_comprehensive_with_qualifying():
    """Update comprehensive data with qualifying positions."""
    
    # Get all qualifying and race sessions
    qual_sessions = requests.get("https://api.openf1.org/v1/sessions?year=2023&session_type=Qualifying").json()
    race_sessions = requests.get("https://api.openf1.org/v1/sessions?year=2023&session_type=Race").json()
    
    print(f"Processing {len(qual_sessions)} qualifying sessions...")
    
    # Build qualifying position mapping
    quali_results = {}
    for qual_session in qual_sessions:
        location = qual_session['location']
        qual_key = qual_session['session_key']
        
        # Find matching race
        matching_race = None
        for race in race_sessions:
            if race['location'] == location and race['meeting_key'] == qual_session['meeting_key']:
                matching_race = race
                break
        
        if not matching_race:
            continue
            
        # Get qualifying positions
        positions = requests.get(f"https://api.openf1.org/v1/position?session_key={qual_key}").json()
        
        if positions and isinstance(positions, list):
            pos_df = pd.DataFrame(positions)
            final_positions = pos_df.groupby('driver_number')['position'].last().sort_values()
            
            quali_results[matching_race['session_key']] = {
                'location': location,
                'quali_positions': dict(zip(final_positions.index, final_positions.values))
            }
            
            print(f"  {location}: {len(final_positions)} drivers")
    
    # Update comprehensive data files
    data_dir = Path("data/raw/2023_comprehensive")  # Changed from 2023_fixed
    updated = 0
    
    for idx, race_session in enumerate(race_sessions, 1):
        race_file = data_dir / f"2023_round_{idx:02d}.csv"
        
        if race_file.exists():
            race_df = pd.read_csv(race_file)
            
            if race_session['session_key'] in quali_results:
                quali_positions = quali_results[race_session['session_key']]['quali_positions']
                
                # Get driver mapping
                drivers_resp = requests.get(f"https://api.openf1.org/v1/drivers?session_key={race_session['session_key']}").json()
                if drivers_resp and isinstance(drivers_resp, list):
                    driver_mapping = {d['name_acronym']: d['driver_number'] for d in drivers_resp}
                    
                    # Update QualifyingPosition column
                    for i, row in race_df.iterrows():
                        driver_abbr = row['Abbreviation']
                        if driver_abbr in driver_mapping:
                            driver_num = driver_mapping[driver_abbr]
                            if driver_num in quali_positions:
                                race_df.at[i, 'QualifyingPosition'] = quali_positions[driver_num]
                    
                    race_df.to_csv(race_file, index=False)
                    updated += 1
                    print(f"  Round {idx}: Updated {race_df['QualifyingPosition'].notna().sum()}/{len(race_df)} positions")
    
    print(f"\nUpdated {updated} race files")
    
    # Recombine
    race_files = sorted(data_dir.glob("2023_round_*.csv"))
    if race_files:
        all_races = [pd.read_csv(f) for f in race_files]
        combined = pd.concat(all_races, ignore_index=True)
        combined.to_csv(data_dir / "2023_comprehensive.csv", index=False)
        print(f"Recombined: {len(combined)} entries, Qualifying filled: {combined['QualifyingPosition'].notna().sum()}/{len(combined)}")
        return combined

# Run the update
update_comprehensive_with_qualifying()

Processing 28 qualifying sessions...
  Sakhir: 20 drivers
  Jeddah: 20 drivers
  Melbourne: 20 drivers
  Baku: 20 drivers
  Baku: 20 drivers
  Miami: 20 drivers
  Monaco: 20 drivers
  Barcelona: 20 drivers
  Montréal: 20 drivers
  Spielberg: 20 drivers
  Spielberg: 20 drivers
  Silverstone: 20 drivers
  Budapest: 20 drivers
  Spa-Francorchamps: 20 drivers
  Spa-Francorchamps: 20 drivers
  Zandvoort: 20 drivers
  Monza: 20 drivers
  Marina Bay: 20 drivers
  Suzuka: 20 drivers
  Lusail: 20 drivers
  Lusail: 20 drivers
  Austin: 20 drivers
  Austin: 20 drivers
  Mexico City: 20 drivers
  São Paulo: 20 drivers
  São Paulo: 20 drivers
  Las Vegas: 20 drivers
  Yas Island: 20 drivers
  Round 1: Updated 20/20 positions
  Round 2: Updated 20/20 positions
  Round 7: Updated 20/20 positions
  Round 9: Updated 20/20 positions
  Round 12: Updated 20/20 positions
  Round 13: Updated 20/20 positions
  Round 14: Updated 20/20 positions
  Round 16: Updated 20/20 positions
  Round 17: Updated 20/20 pos

Unnamed: 0,Abbreviation,FullName,TeamName,QualifyingPosition,RacePosition,GapToLeader,AvgLapTime,Laps,NumPitStops,Year,Round,TrackName,Country,Points,Status,AvgTrackTemp,AvgAirTemp,RainDuringRace,SafetyCarDeployed
0,VER,Max VERSTAPPEN,Red Bull Racing,1,1,0.0,97.465420,57.0,2,2023,1,Sakhir,Bahrain,25,Finished,31.011801,27.431677,False,False
1,SAR,Logan SARGEANT,Williams,16,12,+1 LAP,98.964240,56.0,3,2023,1,Sakhir,Bahrain,0,Finished,31.011801,27.431677,False,False
2,NOR,Lando NORRIS,McLaren,11,17,+2 LAPS,98.796766,55.0,6,2023,1,Sakhir,Bahrain,0,Finished,31.011801,27.431677,False,False
3,GAS,Pierre GASLY,Alpine,20,9,73.753,98.308680,57.0,3,2023,1,Sakhir,Bahrain,2,Finished,31.011801,27.431677,False,False
4,PER,Sergio PEREZ,Red Bull Racing,2,2,11.987,97.792231,57.0,2,2023,1,Sakhir,Bahrain,18,Finished,31.011801,27.431677,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,HAM,Lewis HAMILTON,Mercedes,11,9,44.424,89.936185,58.0,2,2023,28,Yas Island,United Arab Emirates,2,Finished,33.460897,26.962821,False,False
435,SAI,Carlos SAINZ,Ferrari,16,18,76.844,90.519962,57.0,2,2023,28,Yas Island,United Arab Emirates,0,Finished,33.460897,26.962821,False,False
436,RUS,George RUSSELL,Mercedes,4,3,20.328,89.624182,58.0,2,2023,28,Yas Island,United Arab Emirates,15,Finished,33.460897,26.962821,False,False
437,BOT,Valtteri BOTTAS,Alfa Romeo,18,19,+1 LAP,91.251019,57.0,1,2023,28,Yas Island,United Arab Emirates,0,Finished,33.460897,26.962821,False,False


In [62]:
import pandas as pd
from pathlib import Path

# Load 2022 data from your processed file
existing = pd.read_csv("data/processed/f1_2022_2023_processed.csv")
data_2022 = existing[existing['Year'] == 2022].copy()
print(f"2022 FastF1 data: {len(data_2022)} entries, {data_2022['Round'].nunique()} races")
print(f"2022 Grid positions: {data_2022['QualifyingPosition'].notna().sum()}/{len(data_2022)}")

# Load updated 2023 data
data_2023 = pd.read_csv("data/raw/2023_fixed/2023_season_combined.csv")
print(f"\n2023 OpenF1 data: {len(data_2023)} entries, {data_2023['Round'].nunique()} races")
print(f"2023 Grid positions: {data_2023['GridPosition'].notna().sum()}/{len(data_2023)}")

# Rename columns to match
data_2023 = data_2023.rename(columns={
    'Position': 'RacePosition', 
    'GridPosition': 'QualifyingPosition'
})

# Get common columns
common_cols = list(set(data_2022.columns) & set(data_2023.columns))
print(f"\nCommon columns: {len(common_cols)}")

# Combine the datasets
train_data = pd.concat([data_2022[common_cols], data_2023[common_cols]], ignore_index=True)

print(f"\n=== FINAL COMBINED DATASET ===")
print(f"Total entries: {len(train_data)}")
print(f"Years: {train_data['Year'].unique()}")
print(f"Total races: {len(train_data.groupby(['Year', 'Round']))}")
print(f"Qualifying positions filled: {train_data['QualifyingPosition'].notna().sum()}/{len(train_data)} ({train_data['QualifyingPosition'].notna().sum()*100/len(train_data):.1f}%)")
print(f"Race positions filled: {train_data['RacePosition'].notna().sum()}/{len(train_data)} ({train_data['RacePosition'].notna().sum()*100/len(train_data):.1f}%)")

# Save final dataset
train_data.to_csv("data/processed/f1_2022_2023_final_complete.csv", index=False)
print(f"\n✓ Saved to: data/processed/f1_2022_2023_final_complete.csv")

# Quick preview
print("\nSample data:")
print(train_data[['Year', 'Round', 'FullName', 'QualifyingPosition', 'RacePosition', 'Points']].head(10))

2022 FastF1 data: 440 entries, 22 races
2022 Grid positions: 439/440

2023 OpenF1 data: 558 entries, 28 races
2023 Grid positions: 438/558

Common columns: 17

=== FINAL COMBINED DATASET ===
Total entries: 998
Years: [2022 2023]
Total races: 50
Qualifying positions filled: 877/998 (87.9%)
Race positions filled: 997/998 (99.9%)

✓ Saved to: data/processed/f1_2022_2023_final_complete.csv

Sample data:
   Year  Round         FullName  QualifyingPosition  RacePosition  Points
0  2022      1  Charles Leclerc                 1.0           1.0    26.0
1  2022      1     Carlos Sainz                 3.0           2.0    18.0
2  2022      1   Lewis Hamilton                 5.0           3.0    15.0
3  2022      1   George Russell                 9.0           4.0    12.0
4  2022      1  Kevin Magnussen                 7.0           5.0    10.0
5  2022      1  Valtteri Bottas                 6.0           6.0     8.0
6  2022      1     Esteban Ocon                11.0           7.0     6.0
7  20

In [63]:
import pandas as pd

# Load final dataset
train_data = pd.read_csv("data/processed/f1_2022_2023_final_complete.csv")
print(f"Loaded: {train_data.shape}")

Loaded: (998, 17)


In [73]:
from pathlib import Path
import pandas as pd

# Check what comprehensive data exists
comprehensive_path = Path("data/raw/2023_comprehensive/2023_comprehensive.csv")
if comprehensive_path.exists():
    data_2023_new = pd.read_csv(comprehensive_path)
    print("2023 COMPREHENSIVE data:")
    print(f"Shape: {data_2023_new.shape}")
    print(f"Columns: {data_2023_new.columns.tolist()}")
    print(f"\nData completeness:")
    print(f"  GapToLeader: {data_2023_new['GapToLeader'].notna().sum()}/{len(data_2023_new)}")
    print(f"  AvgLapTime: {data_2023_new['AvgLapTime'].notna().sum()}/{len(data_2023_new)}")
    print(f"\nSample:")
    print(data_2023_new[['FullName', 'RacePosition', 'GapToLeader', 'AvgLapTime']].head())
else:
    print("No comprehensive file found!")
    
    # Check for individual race files
    race_files = list(Path("data/raw/2023_comprehensive").glob("*.csv"))
    print(f"\nFound {len(race_files)} files in 2023_comprehensive/")
    if race_files:
        # Try combining them manually
        all_races = [pd.read_csv(f) for f in sorted(race_files) if 'round' in f.name]
        if all_races:
            combined = pd.concat(all_races, ignore_index=True)
            print(f"Combined shape: {combined.shape}")
            print(f"Has GapToLeader: {'GapToLeader' in combined.columns}")
            print(f"Has AvgLapTime: {'AvgLapTime' in combined.columns}")

2023 COMPREHENSIVE data:
Shape: (439, 19)
Columns: ['Abbreviation', 'FullName', 'TeamName', 'QualifyingPosition', 'RacePosition', 'GapToLeader', 'AvgLapTime', 'Laps', 'NumPitStops', 'Year', 'Round', 'TrackName', 'Country', 'Points', 'Status', 'AvgTrackTemp', 'AvgAirTemp', 'RainDuringRace', 'SafetyCarDeployed']

Data completeness:
  GapToLeader: 416/439
  AvgLapTime: 384/439

Sample:
         FullName  RacePosition GapToLeader  AvgLapTime
0  Max VERSTAPPEN             1         0.0   97.465420
1  Logan SARGEANT            12      +1 LAP   98.964240
2    Lando NORRIS            17     +2 LAPS   98.796766
3    Pierre GASLY             9      73.753   98.308680
4    Sergio PEREZ             2      11.987   97.792231


In [75]:
import pandas as pd

# Load comprehensive 2023 data
data_2023_comprehensive = pd.read_csv("data/raw/2023_comprehensive/2023_comprehensive.csv")

# Load your existing 2022 data
existing = pd.read_csv("data/processed/f1_2022_2023_processed.csv")
data_2022 = existing[existing['Year'] == 2022].copy()

print(f"2022: {len(data_2022)} entries")
print(f"2023: {len(data_2023_comprehensive)} entries")

# Combine them
train_data_comprehensive = pd.concat([data_2022, data_2023_comprehensive], ignore_index=True)

print(f"\nCombined comprehensive dataset:")
print(f"Shape: {train_data_comprehensive.shape}")
print(f"Years: {train_data_comprehensive['Year'].unique()}")
print(f"GapToLeader filled: {train_data_comprehensive['GapToLeader'].notna().sum()}/{len(train_data_comprehensive)}")
print(f"AvgLapTime filled: {train_data_comprehensive['AvgLapTime'].notna().sum()}/{len(train_data_comprehensive)}")

# Save this as your new training data
train_data_comprehensive.to_csv("data/processed/train_data_comprehensive.csv", index=False)
print("\nSaved as: train_data_comprehensive.csv")

2022: 440 entries
2023: 439 entries

Combined comprehensive dataset:
Shape: (879, 24)
Years: [2022 2023]
GapToLeader filled: 416/879
AvgLapTime filled: 813/879

Saved as: train_data_comprehensive.csv


In [76]:
import pandas as pd
import numpy as np

# Load the comprehensive data
train_data_comprehensive = pd.read_csv("data/processed/train_data_comprehensive.csv")

# Calculate track average lap times
track_avg_speeds = {}
unique_tracks = train_data_comprehensive['TrackName'].unique()

for track in unique_tracks:
    mask = (train_data_comprehensive['TrackName'] == track) & (train_data_comprehensive['RainDuringRace'] == False)
    track_avg_speeds[track] = train_data_comprehensive[mask]['AvgLapTime'].mean()

print("Track average lap times (seconds):")
for track, time in sorted(track_avg_speeds.items(), key=lambda x: x[1] if pd.notna(x[1]) else float('inf')):
    if pd.notna(time):
        print(f"  {track}: {time:.2f}s")

# Filter out sprint races
sprint_races = [
    (2022, 'Emilia Romagna Grand Prix'),
    (2022, 'Austrian Grand Prix'), 
    (2022, 'São Paulo Grand Prix'),
    (2023, 'Azerbaijan Grand Prix'),
    (2023, 'Austrian Grand Prix'),
    (2023, 'Belgian Grand Prix'),
    (2023, 'Qatar Grand Prix'),
    (2023, 'United States Grand Prix'),
    (2023, 'São Paulo Grand Prix')
]

is_sprint = pd.Series([False] * len(train_data_comprehensive), index=train_data_comprehensive.index)
for year, track in sprint_races:
    is_sprint |= ((train_data_comprehensive['Year'] == year) & (train_data_comprehensive['TrackName'] == track))

train_data_no_sprints = train_data_comprehensive[~is_sprint]

# Calculate overtaking difficulty
overtake_difficulty_clean = {}
street_circuits = ['Monaco Grand Prix', 'Azerbaijan Grand Prix', 'Saudi Arabian Grand Prix', 'Miami Grand Prix', 'Singapore Grand Prix']

unique_tracks = train_data_no_sprints['TrackName'].unique()

for track in unique_tracks:
    # Use Status == 'Finished' instead of FinishedFlag
    mask = ((train_data_no_sprints['TrackName'] == track) & 
            (train_data_no_sprints['Status'] == 'Finished'))
    
    data = train_data_no_sprints[mask][['QualifyingPosition', 'RacePosition']].dropna()
    
    if len(data) > 5:
        correlation = data['QualifyingPosition'].corr(data['RacePosition'])
        overtake_difficulty_clean[track] = correlation
    else:
        if track in street_circuits:
            overtake_difficulty_clean[track] = 0.8
        else:
            overtake_difficulty_clean[track] = 0.6

print("\nOvertaking difficulty (higher = harder):")
for track, diff in sorted(overtake_difficulty_clean.items(), key=lambda x: x[1], reverse=True):
    print(f"  {track}: {diff:.3f}")

Track average lap times (seconds):
  Austrian Grand Prix: 72.57s
  Dutch Grand Prix: 80.71s
  Canadian Grand Prix: 82.89s
  São Paulo Grand Prix: 83.59s
  Monaco: 83.80s
  Mexico City: 84.04s
  Budapest: 84.69s
  Mexico City Grand Prix: 84.90s
  Monza: 86.94s
  Spanish Grand Prix: 90.12s
  Yas Island: 90.20s
  Emilia Romagna Grand Prix: 90.59s
  Italian Grand Prix: 91.01s
  Australian Grand Prix: 91.67s
  Abu Dhabi Grand Prix: 92.24s
  Silverstone: 93.11s
  Jeddah: 95.19s
  Sakhir: 98.78s
  Las Vegas: 98.94s
  Saudi Arabian Grand Prix: 99.18s
  Lusail: 99.97s
  Marina Bay: 100.37s
  Austin: 101.99s
  French Grand Prix: 102.51s
  Bahrain Grand Prix: 102.87s
  United States Grand Prix: 106.69s
  Azerbaijan Grand Prix: 111.89s
  Belgian Grand Prix: 116.46s

Overtaking difficulty (higher = harder):
  Azerbaijan Grand Prix: 0.967
  Australian Grand Prix: 0.925
  Saudi Arabian Grand Prix: 0.912
  Monaco Grand Prix: 0.905
  Spanish Grand Prix: 0.887
  Abu Dhabi Grand Prix: 0.886
  Yas Island:

In [77]:
# Check Hungarian GP data specifically
hungarian_data = train_data_no_sprints[
    (train_data_no_sprints['TrackName'] == 'Hungarian Grand Prix') & 
    (train_data_no_sprints['Status'] == 'Finished')
][['Year', 'Round', 'FullName', 'QualifyingPosition', 'RacePosition']]

print(f"Hungarian GP samples: {len(hungarian_data)}")
print("\nSample of position changes:")
print(hungarian_data[['FullName', 'QualifyingPosition', 'RacePosition']].head(10))

# Check correlation by year
for year in hungarian_data['Year'].unique():
    year_data = hungarian_data[hungarian_data['Year'] == year][['QualifyingPosition', 'RacePosition']].dropna()
    if len(year_data) > 0:
        corr = year_data['QualifyingPosition'].corr(year_data['RacePosition'])
        print(f"\n{year} Hungarian GP: {len(year_data)} finishers, correlation: {corr:.3f}")

Hungarian GP samples: 7

Sample of position changes:
            FullName  QualifyingPosition  RacePosition
240   Max Verstappen                10.0           1.0
241   Lewis Hamilton                 7.0           2.0
242   George Russell                 1.0           3.0
243     Carlos Sainz                 2.0           4.0
244     Sergio Perez                11.0           5.0
245  Charles Leclerc                 3.0           6.0
246     Lando Norris                 4.0           7.0

2022 Hungarian GP: 7 finishers, correlation: -0.312


In [78]:
# Check if all these results are from the same race
hungarian_grouped = train_data_no_sprints[
    (train_data_no_sprints['TrackName'] == 'Hungarian Grand Prix') & 
    (train_data_no_sprints['Status'] == 'Finished')
].groupby(['Year', 'Round']).size()

print("Hungarian GP races in dataset:")
print(hungarian_grouped)

# For tracks with limited data, consider using a default value
def calculate_overtaking_difficulty(track_data, track_name, min_samples=10):
    """Calculate overtaking difficulty with safeguards."""
    
    clean_data = track_data[['QualifyingPosition', 'RacePosition']].dropna()
    
    if len(clean_data) < min_samples:
        # Use predefined difficulty based on track characteristics
        default_difficulties = {
            'Monaco Grand Prix': 0.85,
            'Hungarian Grand Prix': 0.75,  # Known to be hard to overtake
            'Singapore Grand Prix': 0.80,
            'Abu Dhabi Grand Prix': 0.65,
            # Add more tracks
        }
        return default_difficulties.get(track_name, 0.60)
    
    correlation = clean_data['QualifyingPosition'].corr(clean_data['RacePosition'])
    
    # Bound correlation between 0.1 and 0.9 to avoid extremes
    return max(0.1, min(0.9, correlation))

# Recalculate with safeguards
overtake_difficulty_improved = {}
for track in unique_tracks:
    mask = ((train_data_no_sprints['TrackName'] == track) & 
            (train_data_no_sprints['Status'] == 'Finished'))
    track_data = train_data_no_sprints[mask]
    
    overtake_difficulty_improved[track] = calculate_overtaking_difficulty(track_data, track)

print("\nImproved overtaking difficulty:")
for track, diff in sorted(overtake_difficulty_improved.items(), key=lambda x: x[1], reverse=True):
    print(f"  {track}: {diff:.3f}")

Hungarian GP races in dataset:
Year  Round
2022  13       7
dtype: int64

Improved overtaking difficulty:
  Saudi Arabian Grand Prix: 0.900
  Australian Grand Prix: 0.900
  Monaco Grand Prix: 0.900
  Azerbaijan Grand Prix: 0.900
  Abu Dhabi Grand Prix: 0.886
  Yas Island: 0.880
  Bahrain Grand Prix: 0.872
  Monaco: 0.862
  Monza: 0.856
  Dutch Grand Prix: 0.833
  Japanese Grand Prix: 0.817
  Silverstone: 0.755
  Suzuka: 0.750
  Hungarian Grand Prix: 0.750
  Austin: 0.750
  British Grand Prix: 0.722
  French Grand Prix: 0.686
  Miami Grand Prix: 0.631
  Budapest: 0.630
  São Paulo: 0.630
  Lusail: 0.620
  Jeddah: 0.602
  Spanish Grand Prix: 0.600
  Mexico City Grand Prix: 0.600
  United States Grand Prix: 0.571
  Spa-Francorchamps: 0.551
  Montréal: 0.541
  Mexico City: 0.526
  Spielberg: 0.519
  Sakhir: 0.510
  Canadian Grand Prix: 0.496
  Singapore Grand Prix: 0.493
  Zandvoort: 0.448
  Las Vegas: 0.412
  Marina Bay: 0.395
  Italian Grand Prix: 0.343
  Belgian Grand Prix: 0.183


In [79]:
# Create track features dataframe
track_features = pd.DataFrame({
    'TrackName': list(overtake_difficulty_improved.keys()),
    'overtaking_difficulty': list(overtake_difficulty_improved.values())
})

# Add street circuit flag
street_circuits = ['Monaco Grand Prix', 'Monaco', 'Azerbaijan Grand Prix', 'Baku', 
                   'Saudi Arabian Grand Prix', 'Jeddah', 'Miami Grand Prix', 
                   'Singapore Grand Prix', 'Marina Bay', 'Las Vegas']
track_features['is_street'] = track_features['TrackName'].isin(street_circuits).astype(int)

# Add average lap time from your earlier calculation
track_features['avg_lap_time'] = track_features['TrackName'].map(track_avg_speeds)

print(f"Track features shape: {track_features.shape}")
print("\nSample:")
print(track_features.head(10))

# Save track features
track_features.to_csv("data/processed/track_features.csv", index=False)
print("\nSaved track features")

# Merge with main dataset
train_data_with_features = train_data_no_sprints.merge(
    track_features,
    on='TrackName',
    how='left'
)

print(f"\nMerged dataset shape: {train_data_with_features.shape}")
print(f"New columns added: overtaking_difficulty, is_street, avg_lap_time")

Track features shape: (37, 4)

Sample:
                  TrackName  overtaking_difficulty  is_street  avg_lap_time
0        Bahrain Grand Prix               0.872369          0    102.874575
1  Saudi Arabian Grand Prix               0.900000          1     99.182703
2     Australian Grand Prix               0.900000          0     91.668394
3          Miami Grand Prix               0.630502          1           NaN
4        Spanish Grand Prix               0.600000          0     90.123956
5         Monaco Grand Prix               0.900000          1           NaN
6     Azerbaijan Grand Prix               0.900000          1    111.894604
7       Canadian Grand Prix               0.496262          0     82.890262
8        British Grand Prix               0.721970          0           NaN
9         French Grand Prix               0.685823          0    102.506183

Saved track features

Merged dataset shape: (819, 27)
New columns added: overtaking_difficulty, is_street, avg_lap_time


In [80]:
# Create complete track features dataframe
track_features = pd.DataFrame({
    'TrackName': list(overtake_difficulty_improved.keys()),
    'overtaking_difficulty': list(overtake_difficulty_improved.values())
})

# Add street circuit flag (expanded list)
street_circuits = ['Monaco Grand Prix', 'Monaco', 'Azerbaijan Grand Prix', 'Baku', 
                   'Saudi Arabian Grand Prix', 'Jeddah', 'Miami Grand Prix', 
                   'Singapore Grand Prix', 'Marina Bay', 'Las Vegas']
track_features['is_street'] = track_features['TrackName'].isin(street_circuits).astype(int)

# Add average lap time
track_features['avg_lap_time'] = track_features['TrackName'].map(track_avg_speeds)

print(f"Track features shape: {track_features.shape}")
print(track_features.head(10))

# Save for future use
track_features.to_csv("data/processed/track_features.csv", index=False)

Track features shape: (37, 4)
                  TrackName  overtaking_difficulty  is_street  avg_lap_time
0        Bahrain Grand Prix               0.872369          0    102.874575
1  Saudi Arabian Grand Prix               0.900000          1     99.182703
2     Australian Grand Prix               0.900000          0     91.668394
3          Miami Grand Prix               0.630502          1           NaN
4        Spanish Grand Prix               0.600000          0     90.123956
5         Monaco Grand Prix               0.900000          1           NaN
6     Azerbaijan Grand Prix               0.900000          1    111.894604
7       Canadian Grand Prix               0.496262          0     82.890262
8        British Grand Prix               0.721970          0           NaN
9         French Grand Prix               0.685823          0    102.506183


In [81]:
import numpy as np

# Calculate recent form (last 3 races performance)
quali_points = {1:8, 2:7, 3:6, 4:5, 5:4, 6:3, 7:2, 8:1}  # Only top 8 get points
recent_form = {}

for year in [2022, 2023]:
    season_data = train_data_comprehensive[train_data_comprehensive['Year'] == year]
    
    max_round = season_data['Round'].max()
    for race in range(4, max_round + 1):  # Start from round 4 (need 3 previous races)
        current_round = season_data[season_data['Round'] == race]
            
        for driver in current_round["Abbreviation"].unique():
            # Get previous 3 races
            prev_races_points = []
            prev_quali_points = []
            
            for prev in [1, 2, 3]:
                prev_race = season_data[(season_data['Round'] == race-prev) & 
                                       (season_data['Abbreviation'] == driver)]
                if not prev_race.empty:
                    # Race points
                    prev_races_points.append(prev_race['Points'].values[0] if len(prev_race) > 0 else 0)
                    
                    # Qualifying "points" (bonus for good qualifying)
                    quali_pos = prev_race['QualifyingPosition'].values[0] if len(prev_race) > 0 else 20
                    if pd.notna(quali_pos):
                        prev_quali_points.append(quali_points.get(int(quali_pos), 0))
            
            # Sum all points
            total_points = sum(prev_races_points) + sum(prev_quali_points)
            recent_form[(year, race, driver)] = total_points

# Add to dataset
train_data_comprehensive['recent_form'] = np.nan

for key, form_value in recent_form.items():
    year, round_num, driver_code = key
    
    mask = ((train_data_comprehensive['Year'] == year) & 
            (train_data_comprehensive['Round'] == round_num) & 
            (train_data_comprehensive['Abbreviation'] == driver_code))
    
    train_data_comprehensive.loc[mask, 'recent_form'] = form_value

# Check results
print(f"Recent form calculated for {len(recent_form)} driver-race combinations")
print(f"Recent form filled: {train_data_comprehensive['recent_form'].notna().sum()}/{len(train_data_comprehensive)}")

# Example check
if (2022, 4, 'HAM') in recent_form:
    print(f"\nHamilton Round 4 2022 recent form: {recent_form[(2022, 4, 'HAM')]}")

Recent form calculated for 779 driver-race combinations
Recent form filled: 779/879

Hamilton Round 4 2022 recent form: 36.0


In [82]:
rain_specialists = ['VER', 'HAM', 'STR', 'ALO', 'RUS']  # Known rain performers
train_data_comprehensive['rain_specialist'] = train_data_comprehensive['Abbreviation'].isin(rain_specialists).astype(int)

# Check rain races
rain_races = train_data_comprehensive[train_data_comprehensive['RainDuringRace'] == True]
print(f"Rain races in dataset: {rain_races['Round'].nunique()}")
print(f"Rain specialists in rain races: {rain_races[rain_races['rain_specialist']==1]['Abbreviation'].value_counts()}")


Rain races in dataset: 12
Rain specialists in rain races: Abbreviation
VER    12
RUS    12
HAM    12
STR    12
ALO    12
Name: count, dtype: int64


In [83]:
# Check DNF positions
dnf = train_data[~train_data['Status'].isin(['Finished', '+1 Lap', '+2 Laps', '+3 Laps'])]
print(dnf[['Status', 'RacePosition', 'QualifyingPosition', 'Laps']].head(10))

              Status  RacePosition  QualifyingPosition  Laps
17     Fuel pressure          18.0                 4.0  56.0
18     Fuel pressure          19.0                 2.0  54.0
19        Power Unit          20.0                10.0  44.0
33  Collision damage          14.0                16.0  47.0
34    Cooling system          15.0                 8.0  36.0
35        Water pump          16.0                 7.0  35.0
36           Gearbox          17.0                14.0  35.0
37          Accident          18.0                18.0  14.0
38        Power Unit          19.0                19.0   0.0
39          Withdrew           NaN                 NaN   0.0


In [84]:
driver_error_statuses = ['Collision', 'Collision damage', 'Accident', 'Spun off', 'Crashed']
mechanical_statuses = ['Engine', 'Gearbox', 'Power Unit', 'Hydraulics', 'Brakes', 'Suspension', 
                       'Fuel pressure', 'Power loss', 'Mechanical', 'Electrical', 'Transmission']

def categorize_dnf(status):
    if pd.isna(status) or status == 'Finished' or '+' in str(status):
        return 'Finished'
    elif any(error in str(status) for error in driver_error_statuses):
        return 'Driver_Error'
    elif any(mech in str(status) for mech in mechanical_statuses):
        return 'Mechanical'
    else:
        return 'Other_DNF'

train_data_comprehensive['dnf_category'] = train_data_comprehensive['Status'].apply(categorize_dnf)

print(f"\nDNF categories:")
print(train_data_comprehensive['dnf_category'].value_counts())


DNF categories:
dnf_category
Finished        805
Driver_Error     32
Mechanical       26
Other_DNF        16
Name: count, dtype: int64


In [89]:
train_data_comprehensive.head(80)

Unnamed: 0,Year,Round,TrackName,Country,Abbreviation,FullName,TeamName,Status,QualifyingPosition,RacePosition,Points,Laps,AvgLapTime,NumPitStops,AvgTrackTemp,AvgAirTemp,RainDuringRace,SafetyCarDeployed,Time_s,Time_str,GapToWinner_s,WinnerRaceTime_s,FinishedFlag,GapToLeader,recent_form,rain_specialist,dnf_category
0,2022,1,Bahrain Grand Prix,Bahrain,LEC,Charles Leclerc,Ferrari,Finished,1.0,1.0,26.0,57.0,100.697709,3.0,28.610429,23.617791,False,True,5853.584,01:37:33.584,0.0,5853.584,1.0,,,0,Finished
1,2022,1,Bahrain Grand Prix,Bahrain,SAI,Carlos Sainz,Ferrari,Finished,3.0,2.0,18.0,57.0,102.792667,3.0,28.610429,23.617791,False,True,5859.182,01:37:39.182,5.598,5853.584,1.0,,,0,Finished
2,2022,1,Bahrain Grand Prix,Bahrain,HAM,Lewis Hamilton,Mercedes,Finished,5.0,3.0,15.0,57.0,102.864193,3.0,28.610429,23.617791,False,True,5863.259,01:37:43.259,9.675,5853.584,1.0,,,1,Finished
3,2022,1,Bahrain Grand Prix,Bahrain,RUS,George Russell,Mercedes,Finished,9.0,4.0,12.0,57.0,102.040107,3.0,28.610429,23.617791,False,True,5864.795,01:37:44.795,11.211,5853.584,1.0,,,1,Finished
4,2022,1,Bahrain Grand Prix,Bahrain,MAG,Kevin Magnussen,Haas F1 Team,Finished,7.0,5.0,10.0,57.0,102.953298,3.0,28.610429,23.617791,False,True,5868.338,01:37:48.338,14.754,5853.584,1.0,,,0,Finished
5,2022,1,Bahrain Grand Prix,Bahrain,BOT,Valtteri Bottas,Alfa Romeo,Finished,6.0,6.0,8.0,57.0,102.977246,3.0,28.610429,23.617791,False,True,5869.703,01:37:49.703,16.119,5853.584,1.0,,,0,Finished
6,2022,1,Bahrain Grand Prix,Bahrain,OCO,Esteban Ocon,Alpine,Finished,11.0,7.0,6.0,57.0,103.035211,3.0,28.610429,23.617791,False,True,5873.007,01:37:53.007,19.423,5853.584,1.0,,,0,Finished
7,2022,1,Bahrain Grand Prix,Bahrain,TSU,Yuki Tsunoda,AlphaTauri,Finished,16.0,8.0,4.0,57.0,103.052105,3.0,28.610429,23.617791,False,True,5873.97,01:37:53.970,20.386,5853.584,1.0,,,0,Finished
8,2022,1,Bahrain Grand Prix,Bahrain,ALO,Fernando Alonso,Alpine,Finished,8.0,9.0,2.0,57.0,103.087263,3.0,28.610429,23.617791,False,True,5875.974,01:37:55.974,22.39,5853.584,1.0,,,1,Finished
9,2022,1,Bahrain Grand Prix,Bahrain,ZHO,Guanyu Zhou,Alfa Romeo,Finished,15.0,10.0,1.0,57.0,102.161071,3.0,28.610429,23.617791,False,True,5876.648,01:37:56.648,23.064,5853.584,1.0,,,0,Finished
