In [7]:
import fastf1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm import tqdm

# For modeling
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,  OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Enable FastF1 caching to avoid repeated API calls
fastf1.Cache.enable_cache('f1_cache')


In [8]:

# MILESTONE 2: Feature Engineering, Feature Selection, and Data Modeling

## 1. Data Collection Using FastF1
# Let's collect data from the 2023 season for training
def collect_race_data(year=2023, race_types=None):
    """
    Collect race data for a specific year
    
    Parameters:
    year - The F1 season year
    race_types - Types of sessions to collect (e.g., 'R' for Race, 'Q' for Qualifying)
    
    Returns:
    DataFrame with combined race data
    """
    if race_types is None:
        race_types = ['R']  # Default to Race sessions
    
    all_data = []
    # Get all events for the year
    events = fastf1.get_event_schedule(year)
    
    for _, event in tqdm(events.iterrows(), desc=f"Processing {year} events", total=len(events)):
        for session_type in race_types:
            try:
                # Load session
                session = fastf1.get_session(year, event['EventName'], session_type)
                session.load(laps=True, telemetry=True, weather=True)
                
                # Get lap data
                laps_data = session.laps.copy()
                
                # Add event info
                laps_data['EventName'] = event['EventName']
                laps_data['CircuitName'] = event['OfficialEventName']
                laps_data['Year'] = year
                
                all_data.append(laps_data)
                print(f"Added data for {event['EventName']} - {session_type}")
            except Exception as e:
                print(f"Error loading {event['EventName']} - {session_type}: {str(e)}")
                continue
    
    if not all_data:
        print("No data collected")
        return None
    
    # Combine all data
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

# Collect Race and Qualifying data
print("Collecting 2023 race data...")
race_data_2023 = collect_race_data(year=2023, race_types=['R'])
qualifying_data_2023 = collect_race_data(year=2023, race_types=['Q'])



Collecting 2023 race data...


Processing 2023 events:   0%|          | 0/23 [00:00<?, ?it/s]core           INFO 	Loading data for British Grand Prix - Race [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Par

Added data for Pre-Season Testing - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Bahrain Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Saudi Arabian Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Australian Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Azerbaijan Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Miami Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Monaco Grand Prix - R


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data.

Added data for Spanish Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Canadian Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
This might be a bug and should be reported.
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req       

Added data for Austrian Grand Prix - R


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '44', '81', '63', '11', '14', '23', '16', '55', '2', '77', '27', '18', '24', '22', '21', '10', '20', '31']
Processing 2023 events:  48%|████▊     | 11/23 [08:11<07:18, 36.52s/it]core           INFO 	Loading data for Hungarian Grand Prix - Race [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driv

Added data for British Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Hungarian Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Belgian Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Dutch Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Italian Grand Prix - R


core           INFO 	Loading data for Singapore Grand Prix - Race [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been writt

Added data for Singapore Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Japanese Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Qatar Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for United States Grand Prix - R


core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been wri

Added data for Mexico City Grand Prix - R


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for São Paulo Grand Prix - R


core           INFO 	Loading data for Las Vegas Grand Prix - Race [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
req            INFO 	No cached data found for weather_data. Loading data...
_api           INFO 	Fetching weather data...
req            INFO 	No cached data found for race_control_m

Error loading Las Vegas Grand Prix - R: The data you are trying to access has not been loaded yet. See `Session.load`


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
c

Added data for Abu Dhabi Grand Prix - R


Processing 2023 events:   0%|          | 0/23 [00:00<?, ?it/s]core           INFO 	Loading data for British Grand Prix - Qualifying [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           I

Added data for Pre-Season Testing - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Bahrain Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Saudi Arabian Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Australian Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Azerbaijan Grand Prix - Q


core           INFO 	Loading data for Miami Grand Prix - Qualifying [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has b

Added data for Miami Grand Prix - Q


_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api       

Added data for Monaco Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Spanish Grand Prix - Q


core           INFO 	Loading data for Canadian Grand Prix - Qualifying [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data ha

Added data for Canadian Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Austrian Grand Prix - Q


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '55', '63', '44', '23', '14', '10', '27', '18', '31', '2', '77', '11', '22', '24', '21', '20']
Processing 2023 events:  48%|████▊     | 11/23 [05:27<04:26, 22.20s/it]core           INFO 	Loading data for Hungarian Grand Prix - Qualifying [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...


Added data for British Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Hungarian Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Belgian Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Dutch Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Italian Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Singapore Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Japanese Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Qatar Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for United States Grand Prix - Q


core           INFO 	Loading data for Mexico City Grand Prix - Qualifying [v3.0.0]
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data

Added data for Mexico City Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for São Paulo Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Las Vegas Grand Prix - Q


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
_api          

Added data for Abu Dhabi Grand Prix - Q





In [9]:
drivers_df = pd.read_csv('../Data/drivers.csv')
races_df = pd.read_csv('../Data/races.csv')
lap_times_df = pd.read_csv('../Data/lap_times.csv')
results_df = pd.read_csv('../Data/results.csv')
constructors_df = pd.read_csv('../Data/constructors.csv')

# Preprocess date columns
races_df['date'] = pd.to_datetime(races_df['date'])
drivers_df['dob'] = pd.to_datetime(drivers_df['dob'])
   

In [None]:
# Optional: Collect 2022 data for additional training
# race_data_2022 = collect_race_data(year=2022, race_types=['R'])
# qualifying_data_2022 = collect_race_data(year=2022, race_types=['Q'])

## 2. Feature Engineering

def engineer_features(race_data, qualifying_data=None):
    """
    Engineer features from race and qualifying data
    
    Parameters:
    race_data - Race session lap data
    qualifying_data - Qualifying session lap data (optional)
    
    Returns:
    DataFrame with engineered features
    """
    # 2.1 Create driver performance metrics
    # Calculate average lap time per driver per race
    race_features = race_data.copy()
    
    # Filter out outlier laps
    race_features = race_features[
        (race_features['IsPersonalBest'] == True) |
        (~race_features['LapTime'].isna())
    ]
    
    # Convert time deltas to seconds for easier manipulation
    for col in ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']:
        if col in race_features.columns:
            race_features[f'{col}_seconds'] = race_features[col].dt.total_seconds()
    
    # 2.2 Driver Consistency Features
    # Calculate standard deviation of lap times per driver per race
    driver_consistency = race_features.groupby(['Year', 'EventName', 'Driver'])[
        'LapTime_seconds'
    ].agg(['mean', 'std', 'count']).reset_index()
    
    driver_consistency.columns = [
        'Year', 'EventName', 'Driver', 'AvgLapTime', 'LapTimeStd', 'LapCount'
    ]
    
    # 2.3 Sector Performance Features - FIX: Use a list instead of tuple for column selection
    # Check if all sector columns exist
    sector_columns = []
    for col in ['Sector1Time_seconds', 'Sector2Time_seconds', 'Sector3Time_seconds']:
        if col in race_features.columns:
            sector_columns.append(col)
    
    # Only proceed if we have sector data
    if sector_columns:
        sector_performance = race_features.groupby(['Year', 'EventName', 'Driver'])[
            sector_columns  # Use list of columns that exist
        ].agg(['mean', 'std']).reset_index()
        
        # Flatten the multi-level columns
        sector_performance.columns = [
            '_'.join(col).strip('_') if isinstance(col, tuple) else col 
            for col in sector_performance.columns.values
        ]
    else:
        # Create empty DataFrame with correct columns if sector data not available
        sector_performance = pd.DataFrame(columns=['Year', 'EventName', 'Driver'])
    
    # 2.4 Qualifying Performance (if available)
    qualifying_features = None
    if qualifying_data is not None:
        qualifying_features = qualifying_data.copy()
        
        # Get fastest qualifying lap per driver per race
        qualifying_best = qualifying_features.groupby(['Year', 'EventName', 'Driver']).agg({
            'LapTime': 'min'
        }).reset_index()
        
        qualifying_best['QualifyingTime_seconds'] = qualifying_best['LapTime'].dt.total_seconds()
    
    # 2.5 Speed Features - check if speed columns exist
    speed_columns = []
    for col in ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']:
        if col in race_features.columns:
            speed_columns.append(col)
    
    if speed_columns:
        speed_features = race_features.groupby(['Year', 'EventName', 'Driver'])[
            speed_columns
        ].agg(['mean', 'max']).reset_index()
        
        # Flatten the multi-level columns
        speed_features.columns = [
            '_'.join(col).strip('_') if isinstance(col, tuple) else col 
            for col in speed_features.columns.values
        ]
    else:
        # Create empty DataFrame with correct columns if speed data not available
        speed_features = pd.DataFrame(columns=['Year', 'EventName', 'Driver'])
    
    # 2.6 Tire Strategy Features - check if Compound column exists
    if 'Compound' in race_features.columns:
        tire_features = race_features.groupby(['Year', 'EventName', 'Driver', 'Compound'])[
            'LapTime_seconds'
        ].count().reset_index()
        
        tire_pivot = tire_features.pivot_table(
            index=['Year', 'EventName', 'Driver'],
            columns='Compound',
            values='LapTime_seconds',
            fill_value=0
        ).reset_index()
    else:
        # Create empty DataFrame with correct columns if compound data not available
        tire_pivot = pd.DataFrame(columns=['Year', 'EventName', 'Driver'])
    
    # 2.7 Grid Position and Position Change - check if position columns exist
    position_columns = {'GridPosition': 'first'}
    if 'Position' in race_features.columns:
        position_columns['Position'] = ['first', 'last']
    
    position_features = race_features.groupby(['Year', 'EventName', 'Driver']).agg(
        position_columns
    ).reset_index()
    
    # Flatten the multi-level columns if they exist
    if 'Position' in position_columns:
        position_features.columns = [
            '_'.join(col).strip('_') if isinstance(col, tuple) else col 
            for col in position_features.columns.values
        ]
        
        # Calculate position change if both grid and final position are available
        if 'GridPosition' in position_features.columns and 'Position_last' in position_features.columns:
            position_features['PositionChange'] = position_features['GridPosition'] - position_features['Position_last']
    
    # 2.9 Combine all features - start with driver consistency as base
    combined_features = driver_consistency.copy()
    
    # Function to safely merge DataFrames
    def safe_merge(left, right, on_cols, how='left'):
        # Only merge if right DataFrame is not empty
        if not right.empty and all(col in right.columns for col in on_cols):
            return pd.merge(left, right, on=on_cols, how=how)
        return left
    
    # Merge sector performance if available
    combined_features = safe_merge(
        combined_features,
        sector_performance,
        on=['Year', 'EventName', 'Driver']
    )
    
    # Merge speed features if available
    combined_features = safe_merge(
        combined_features,
        speed_features,
        on=['Year', 'EventName', 'Driver']
    )
    
    # Merge tire features if available
    combined_features = safe_merge(
        combined_features,
        tire_pivot,
        on=['Year', 'EventName', 'Driver']
    )
    
    # Merge position features if available
    combined_features = safe_merge(
        combined_features,
        position_features,
        on=['Year', 'EventName', 'Driver']
    )
    
    # Merge qualifying features if available
    if qualifying_features is not None:
        qualifying_best = qualifying_best[['Year', 'EventName', 'Driver', 'QualifyingTime_seconds']]
        combined_features = safe_merge(
            combined_features,
            qualifying_best,
            on=['Year', 'EventName', 'Driver']
        )
    
    # Fill missing values with median to avoid NaNs
    numeric_cols = combined_features.select_dtypes(include=['number']).columns
    combined_features[numeric_cols] = combined_features[numeric_cols].fillna(
        combined_features[numeric_cols].median()
    )
    
    return combined_features
# Engineer features from collected data
print("Engineering features...")
features_df = engineer_features(race_data_2023, qualifying_data_2023)


In [None]:

## 3. Feature Selection

def select_features(features_df, target_col='AvgLapTime', test_size=0.2):
    """
    Select relevant features using feature importance
    
    Parameters:
    features_df - DataFrame with all engineered features
    target_col - Target column for prediction
    test_size - Test set size for train-test split
    
    Returns:
    Dictionary with selected features and train-test split data
    """
    # Drop non-feature columns for modeling
    non_feature_cols = ['Year', 'EventName', 'Driver']
    
    # Separate features and target
    X = features_df.drop(non_feature_cols + [target_col], axis=1)
    y = features_df[target_col]
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    
    print(f"Training set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")
    
    # Use Random Forest to evaluate feature importance
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("Top 10 most important features:")
    print(feature_importance.head(10))
    
    # Visualize feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
    plt.title('Feature Importance (Random Forest)')
    plt.tight_layout()
    plt.show()
    
    # Select top features (80% of cumulative importance)
    cum_importance = feature_importance['Importance'].cumsum()
    top_features = feature_importance[cum_importance <= 0.8]['Feature'].tolist()
    
    if not top_features:  # If no features have cumulative importance <= 0.8
        top_features = feature_importance['Feature'].head(10).tolist()
    
    print(f"Selected {len(top_features)} features with 80% cumulative importance")
    
    # Filter features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    return {
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'feature_names': top_features,
        'feature_importance': feature_importance,
        'scaler': scaler
    }

# Select features
print("Selecting features...")
selected_features = select_features(features_df, target_col='AvgLapTime')


In [None]:

## 4. Data Modeling

def train_models(selected_features):
    """
    Train multiple regression models and evaluate performance
    
    Parameters:
    selected_features - Dictionary with selected features and train-test data
    
    Returns:
    Dictionary with trained models and performance metrics
    """
    X_train = selected_features['X_train']
    X_test = selected_features['X_test']
    y_train = selected_features['y_train']
    y_test = selected_features['y_test']
    
    # Initialize models
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    }
    
    # Train and evaluate models
    results = []
    trained_models = {}
    
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        
        print(f"{name} Performance:")
        print(f"MAE: {mae:.4f} seconds")
        print(f"RMSE: {rmse:.4f} seconds")
        print(f"R²: {r2:.4f}")
        print("-" * 50)
        
        results.append({
            'model': name,
            'mae': mae,
            'rmse': rmse,
            'r2': r2
        })
        
        trained_models[name] = model
    
    # Convert to DataFrame for comparison
    results_df = pd.DataFrame(results)
    
    # Visualize model comparison
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    sns.barplot(x='model', y='mae', data=results_df)
    plt.title('Model Comparison - MAE (lower is better)')
    plt.xticks(rotation=45)
    plt.ylabel('Mean Absolute Error (seconds)')
    
    plt.subplot(1, 2, 2)
    sns.barplot(x='model', y='r2', data=results_df)
    plt.title('Model Comparison - R² (higher is better)')
    plt.xticks(rotation=45)
    plt.ylabel('R² Score')
    
    plt.tight_layout()
    plt.show()
    
    # Find best model
    best_model_name = results_df.loc[results_df['mae'].idxmin(), 'model']
    print(f"Best model based on MAE: {best_model_name}")
    
    return {
        'trained_models': trained_models,
        'results': results_df,
        'best_model_name': best_model_name
    }

# Train models
print("Training models...")
model_results = train_models(selected_features)


In [None]:

## 5. Race Prediction Implementation

def predict_race_results(qualifying_data, model, feature_names, scaler):
    """
    Predict race results using qualifying data
    
    Parameters:
    qualifying_data - Qualifying data for the race
    model - Trained prediction model
    feature_names - Names of features used by the model
    scaler - Fitted scaler for the model
    
    Returns:
    DataFrame with predicted race results
    """
    # Prepare features from qualifying data
    # This would need to be adapted based on available data
    X_pred = qualifying_data[feature_names].copy()
    
    # Scale features
    X_pred_scaled = scaler.transform(X_pred)
    
    # Make predictions
    predicted_times = model.predict(X_pred_scaled)
    
    # Add predictions to qualifying data
    results = qualifying_data.copy()
    results['PredictedRaceTime'] = predicted_times
    
    # Sort by predicted race time (faster times first)
    results = results.sort_values('PredictedRaceTime').reset_index(drop=True)
    
    return results

# Example: Prepare data for the next race prediction
# In a real-world scenario, you would need to collect actual qualifying data
# for the upcoming race and engineer the same features

# For demonstration, let's create a dummy qualifying dataset
print("Creating sample prediction for a future race...")
next_race_qualifying = features_df.copy()

# Select the best model
best_model = model_results['trained_models'][model_results['best_model_name']]

# Predict race results
predicted_results = predict_race_results(
    next_race_qualifying,
    best_model,
    selected_features['feature_names'],
    selected_features['scaler']
)

# Show predicted race results
print("\nPredicted Race Results:")
print(predicted_results[['Driver', 'PredictedRaceTime']].head(10))



In [None]:
## 6. Deliverables Summary

# 6.1 Selected Features with Justification
print("\nSelected Features with Justification:")
top_features = selected_features['feature_importance']
for feature, importance in zip(top_features['Feature'].head(10), top_features['Importance'].head(10)):
    print(f"- {feature}: Importance score = {importance:.4f}")

# 6.2 Model Performance Summary
print("\nModel Performance Summary:")
print(model_results['results'].sort_values('mae'))

# 6.3 Best Model Analysis
print(f"\nBest Model: {model_results['best_model_name']}")
best_model_performance = model_results['results'][model_results['results']['model'] == model_results['best_model_name']]
print(f"MAE: {best_model_performance['mae'].values[0]:.4f} seconds")
print(f"RMSE: {best_model_performance['rmse'].values[0]:.4f} seconds")
print(f"R²: {best_model_performance['r2'].values[0]:.4f}")

# 6.4 Conclusions and Next Steps
print("\nConclusions:")
print("1. The most important features for predicting F1 lap times are:")
for i, (feature, importance) in enumerate(zip(top_features['Feature'].head(3), top_features['Importance'].head(3))):
    print(f"   {i+1}. {feature} (Importance: {importance:.4f})")

print(f"\n2. The {model_results['best_model_name']} model performed best with MAE of {best_model_performance['mae'].values[0]:.4f} seconds")

print("\n3. Next steps for further improvement:")
print("   - Collect more historical data (e.g., from 2021 and 2022)")
print("   - Incorporate additional telemetry data for more detailed driver performance metrics")
print("   - Add weather forecast data for future race predictions")
print("   - Implement hyperparameter tuning for model optimization")