# 03.6 - FastF1 Feature Engineering

This notebook extracts features from FastF1 data (2018+) and appends them to `master_races_clean.csv`.

**Features:**
- **Relative features** (standardized within race): DRS patterns, overtaking skill, position change rate, lap time std dev, pit stop timing, sector speed vs lap time, tyre efficiency index
- **Weather features**: AirTemp, TrackTemp, Humidity, Pressure, WindSpeed, Rainfall flags and transitions

**Key Requirements:**
- Features calculated from historical race data (rolling averages)
- Relative features standardized per race using z-score for cross-track comparability
- Features must be available before the race (Practice/Qualifying + historical data)
- All features joined to master_races_clean.csv (from 03.8)

**Input:** 
- `data/processed/master_races_clean.csv` (from 03.8)
- `data/raw/fastf1_2018plus/ALL_LAPS_*.csv`
- `data/raw/fastf1_2018plus/ALL_TELEMETRY_*.csv`
- `data/raw/fastf1_2018plus/ALL_WEATHER_*.csv`

**Output:** `data/processed/master_races_with_fastf1.csv` - Clean dataset with FastF1 features


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
from typing import Dict, List, Optional, Tuple
from scipy import stats
import logging

warnings.filterwarnings('ignore')

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)8s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# Set up paths robustly for both script and notebook usage.

import os

# Try typical environment variables from Jupyter/Colab, fall back to cwd.
def find_project_root():
    # 1. Check environment variable for absolute project root.
    prj_env = os.environ.get('PROJECT_ROOT', None)
    if prj_env and Path(prj_env).exists():
        return Path(prj_env).resolve()
    # 2. Check common notebook variable for repo root.
    if 'PWD' in os.environ:  # Jupyter sets $PWD to the notebook's start dir
        root = Path(os.environ['PWD']).resolve()
        # If this is inside the project, try project root
        # Navigate up if typical subdirectory present
        for test_up in [root, root.parent, root.parent.parent]:
            # Check if 'data/processed/master_races_clean.csv' exists there
            if (test_up / "data" / "processed" / "master_races_clean.csv").exists():
                return test_up
    # 3. Try cwd and parents upwards
    cwd = Path.cwd().resolve()
    for up in [cwd, *cwd.parents]:
        if (up / "data" / "processed" / "master_races_clean.csv").exists():
            return up
    # 4. Fallback to relative parent (legacy; may be wrong on some setups)
    fallback = Path("..").resolve()
    return fallback

PROJECT_ROOT = find_project_root()

PROCESSED_ROOT = PROJECT_ROOT / "data" / "processed"
RAW_ROOT = PROJECT_ROOT / "data" / "raw" / "fastf1_2018plus"

# FastF1 data years (2018+)
FASTF1_YEARS = range(2018, 2026)

logger.info(f"PROJECT_ROOT: {PROJECT_ROOT}")
logger.info(f"PROCESSED_ROOT: {PROCESSED_ROOT}")
logger.info(f"RAW_ROOT: {RAW_ROOT}")

# Load master dataset (from 03.8 - cleaned dataset)
master_path = PROCESSED_ROOT / "master_races_clean.csv"
if not master_path.exists():
    raise FileNotFoundError(f"master_races_clean.csv not found at {master_path}. Please run notebook 03.8 first.")

logger.info("Loading master_races_clean.csv...")
master = pd.read_csv(master_path, low_memory=False)
master['date'] = pd.to_datetime(master['date'], errors='coerce')
master = master.sort_values(['year', 'round', 'date']).reset_index(drop=True)

# Filter to 2018+ for FastF1 data
master_2018plus = master[master['year'] >= 2018].copy()

# Initialize variables for later cells
fastf1_data = {'LAPS': {}, 'TELEMETRY': {}, 'WEATHER': {}}
historical_metrics = pd.DataFrame()
metrics_with_rolling = pd.DataFrame()
weather_features = pd.DataFrame()
master_with_features = pd.DataFrame()

logger.info(f"Master dataset: {len(master):,} rows (total), {len(master_2018plus):,} rows (2018+)")
logger.info(f"Date range: {master_2018plus['date'].min()} to {master_2018plus['date'].max()}")


17:46:24 |     INFO | PROJECT_ROOT: C:\Users\erikv\Downloads\F1
17:46:24 |     INFO | PROCESSED_ROOT: C:\Users\erikv\Downloads\F1\data\processed
17:46:24 |     INFO | RAW_ROOT: C:\Users\erikv\Downloads\F1\data\raw\fastf1_2018plus
17:46:24 |     INFO | Loading master_races_clean.csv...
17:46:25 |     INFO | Master dataset: 12,358 rows (total), 2,979 rows (2018+)
17:46:25 |     INFO | Date range: 2018-03-25 00:00:00 to 2024-12-08 00:00:00


## Step 1: Load FastF1 Data and Enrich with raceId

Load all FastF1 CSV files (LAPS, TELEMETRY, WEATHER) and enrich them with `raceId` from master CSV by matching Year and Event. This makes temporal ordering and rolling averages much easier since raceId is in chronological order.


In [None]:
def normalize_event_name(name: str) -> str:
    """Normalize event name for matching between FastF1 and master CSV."""
    if pd.isna(name):
        return None
    # Convert to string and strip
    name = str(name).strip()
    return name

def enrich_fastf1_with_raceid(fastf1_data: Dict, master_df: pd.DataFrame) -> Dict:
    """Enrich FastF1 data with raceId from master CSV by matching Year and Event.
    
    This makes temporal ordering and historical calculations much easier.
    """
    logger.info("Enriching FastF1 data with raceId from master CSV...")
    
    # Create a mapping from (year, event) -> raceId
    # Use the first raceId for each year+event combination (since one race = one raceId)
    race_mapping = master_df[['year', 'name', 'raceId']].drop_duplicates(['year', 'name'])
    race_mapping_dict = {}
    for _, row in race_mapping.iterrows():
        key = (int(row['year']), normalize_event_name(row['name']))
        race_mapping_dict[key] = int(row['raceId'])
    
    logger.info(f"Created raceId mapping for {len(race_mapping_dict)} unique race events")
    
    # Enrich each dataset
    enriched_data = {
        'LAPS': {},
        'TELEMETRY': {},
        'WEATHER': {}
    }
    
    for dataset_type in ['LAPS', 'TELEMETRY', 'WEATHER']:
        for year in fastf1_data[dataset_type].keys():
            df = fastf1_data[dataset_type][year].copy()
            
            # Add raceId by matching Year and Event
            def get_raceid(row):
                event = normalize_event_name(row['Event']) if 'Event' in row.index else None
                year_val = int(row['Year']) if 'Year' in row.index else None
                if event and year_val:
                    key = (year_val, event)
                    return race_mapping_dict.get(key, None)
                return None
            
            # Apply raceId mapping
            df['raceId'] = df.apply(get_raceid, axis=1)
            
            # Log statistics
            total_rows = len(df)
            matched_rows = df['raceId'].notna().sum()
            match_rate = (matched_rows / total_rows * 100) if total_rows > 0 else 0
            
            logger.info(f"  {dataset_type} {year}: {matched_rows:,}/{total_rows:,} rows matched ({match_rate:.1f}%)")
            
            enriched_data[dataset_type][year] = df
    
    return enriched_data

def load_fastf1_data(years: range, raw_root: Path) -> Dict[str, Dict[int, pd.DataFrame]]:
    """Load all FastF1 CSV files for specified years.
    
    Returns:
        Dictionary with keys: 'LAPS', 'TELEMETRY', 'WEATHER'
        Each contains a dict mapping year -> DataFrame
    """
    data = {
        'LAPS': {},
        'TELEMETRY': {},
        'WEATHER': {}
    }
    
    for year in years:
        # Load LAPS
        laps_path = raw_root / f"ALL_LAPS_{year}.csv"
        if laps_path.exists():
            try:
                logger.info(f"Loading LAPS {year}...")
                df = pd.read_csv(laps_path, low_memory=False)
                if len(df) > 0:
                    # Convert LapTime to timedelta for easier calculations
                    if 'LapTime' in df.columns:
                        df['LapTime'] = pd.to_timedelta(df['LapTime'], errors='coerce')
                    data['LAPS'][year] = df
                    logger.info(f"  LAPS {year}: {len(df):,} rows")
            except Exception as e:
                logger.warning(f"  Failed to load LAPS {year}: {e}")
        
        # Load WEATHER
        weather_path = raw_root / f"ALL_WEATHER_{year}.csv"
        if weather_path.exists():
            try:
                logger.info(f"Loading WEATHER {year}...")
                df = pd.read_csv(weather_path, low_memory=False)
                if len(df) > 0:
                    data['WEATHER'][year] = df
                    logger.info(f"  WEATHER {year}: {len(df):,} rows")
            except Exception as e:
                logger.warning(f"  Failed to load WEATHER {year}: {e}")
        
        # Load TELEMETRY (in chunks if very large)
        telemetry_path = raw_root / f"ALL_TELEMETRY_{year}.csv"
        if telemetry_path.exists():
            try:
                logger.info(f"Loading TELEMETRY {year}...")
                # Check file size first
                file_size = telemetry_path.stat().st_size / (1024**3)  # GB
                if file_size > 1.0:  # > 1GB, load in chunks
                    logger.info(f"  TELEMETRY {year} is large ({file_size:.2f} GB), loading in chunks...")
                    chunks = []
                    chunk_size = 100000
                    for chunk in pd.read_csv(telemetry_path, chunksize=chunk_size, low_memory=False):
                        chunks.append(chunk)
                    df = pd.concat(chunks, ignore_index=True)
                else:
                    df = pd.read_csv(telemetry_path, low_memory=False)
                
                if len(df) > 0:
                    data['TELEMETRY'][year] = df
                    logger.info(f"  TELEMETRY {year}: {len(df):,} rows")
            except Exception as e:
                logger.warning(f"  Failed to load TELEMETRY {year}: {e}")
    
    return data

# Load FastF1 data
logger.info("Loading FastF1 data files...")
fastf1_data = load_fastf1_data(FASTF1_YEARS, RAW_ROOT)

# Summary
for dataset_type in ['LAPS', 'TELEMETRY', 'WEATHER']:
    years_loaded = list(fastf1_data[dataset_type].keys())
    logger.info(f"{dataset_type}: {len(years_loaded)} years loaded: {years_loaded}")

logger.info("FastF1 data loading complete!")

# Enrich FastF1 data with raceId from master CSV
logger.info("\n" + "="*60)
logger.info("Enriching FastF1 data with raceId...")
fastf1_data = enrich_fastf1_with_raceid(fastf1_data, master_2018plus)
logger.info("FastF1 data enrichment complete!")


17:46:25 |     INFO | Loading FastF1 data files...
17:46:25 |     INFO | Loading LAPS 2018...


17:46:26 |     INFO |   LAPS 2018: 58,002 rows
17:46:26 |     INFO | Loading WEATHER 2018...
17:46:26 |     INFO |   WEATHER 2018: 9,707 rows
17:46:26 |     INFO | Loading TELEMETRY 2018...
17:46:26 |     INFO |   TELEMETRY 2018 is large (2.34 GB), loading in chunks...
17:47:43 |     INFO |   TELEMETRY 2018: 19,140,560 rows
17:47:43 |     INFO | Loading LAPS 2019...
17:47:45 |     INFO |   LAPS 2019: 45,169 rows
17:47:45 |     INFO | Loading WEATHER 2019...
17:47:45 |     INFO |   WEATHER 2019: 8,077 rows
17:47:45 |     INFO | Loading TELEMETRY 2019...
17:47:45 |     INFO |   TELEMETRY 2019 is large (6.31 GB), loading in chunks...
17:53:13 |     INFO |   TELEMETRY 2019: 51,628,140 rows
17:53:13 |     INFO | Loading LAPS 2020...
17:53:16 |     INFO |   LAPS 2020: 39,040 rows
17:53:16 |     INFO | Loading WEATHER 2020...
17:53:16 |     INFO |   WEATHER 2020: 7,822 rows
17:53:16 |     INFO | Loading TELEMETRY 2020...
17:53:16 |     INFO |   TELEMETRY 2020 is large (2.01 GB), loading in ch

## Step 2: Calculate Historical Race Metrics

Calculate base metrics from historical race data (Session='R') for each driver-race combination:
- DRS patterns (from TELEMETRY)
- Overtaking events (position changes)
- Lap time statistics (mean, std dev per stint)
- Pit stop timings (first pit lap)
- Sector speed correlations
- Tyre efficiency metrics


In [4]:
def calculate_drs_patterns(telemetry_df: pd.DataFrame, driver_code: str, driver_number: int, 
                           year: int, event: str, race_id: Optional[int] = None) -> Dict[str, float]:
    """Calculate DRS usage patterns for a driver in a race."""
    # Filter telemetry for this driver/race
    # Telemetry Driver column can be driver number (int) or driver code (str)
    if 'Driver' not in telemetry_df.columns:
        return {'drs_activation_rate': np.nan, 'drs_time_fraction': np.nan}
    
    # Use raceId if available for more efficient matching
    if 'raceId' in telemetry_df.columns and race_id is not None:
        race_telemetry = telemetry_df[
            (telemetry_df['raceId'] == race_id) &
            (telemetry_df['Session'] == 'R')
        ].copy()
    else:
        # Fallback to Year + Event matching
        race_telemetry = telemetry_df[
            (telemetry_df['Year'] == year) &
            (telemetry_df['Event'] == event) &
            (telemetry_df['Session'] == 'R')
        ].copy()
    
    # Match on driver number (telemetry usually has numeric driver IDs)
    if len(race_telemetry) > 0:
        if race_telemetry['Driver'].dtype in [np.int64, np.float64]:
            race_telemetry = race_telemetry[race_telemetry['Driver'] == driver_number]
        else:
            # Try matching on driver code as string
            race_telemetry = race_telemetry[race_telemetry['Driver'].astype(str) == str(driver_code)]
    
    if len(race_telemetry) == 0 or 'DRS' not in race_telemetry.columns:
        return {'drs_activation_rate': np.nan, 'drs_time_fraction': np.nan}
    
    # DRS activation rate (percentage of samples with DRS=1)
    drs_activation_rate = (race_telemetry['DRS'] == 1).mean() if 'DRS' in race_telemetry.columns else np.nan
    
    # DRS time fraction (if we have time data)
    drs_time_fraction = drs_activation_rate  # Simplified - could be more sophisticated
    
    return {
        'drs_activation_rate': drs_activation_rate,
        'drs_time_fraction': drs_time_fraction
    }

def calculate_overtaking_skill(laps_df: pd.DataFrame, driver_code: str, driver_number: int,
                               year: int, event: str) -> Dict[str, float]:
    """Calculate overtaking skill metrics (position changes)."""
    # Filter laps for this driver/race
    race_laps = laps_df[
        (laps_df['Year'] == year) &
        (laps_df['Event'] == event) &
        (laps_df['Session'] == 'R') &
        (laps_df['Driver'] == driver_code) &
        (laps_df['DriverNumber'] == driver_number) &
        (laps_df['Position'].notna())
    ].copy()
    
    if len(race_laps) < 2:
        return {'position_changes': np.nan, 'position_change_rate': np.nan}
    
    race_laps = race_laps.sort_values('LapNumber')
    
    # Calculate position changes
    position_changes = (race_laps['Position'].diff() != 0).sum() - 1  # Subtract 1 for first lap
    position_changes = max(0, position_changes)  # Can't be negative
    
    # Position change rate (changes per lap)
    total_laps = len(race_laps)
    position_change_rate = position_changes / total_laps if total_laps > 0 else np.nan
    
    return {
        'position_changes': position_changes,
        'position_change_rate': position_change_rate
    }

def calculate_lap_time_stats(laps_df: pd.DataFrame, driver_code: str, driver_number: int,
                            year: int, event: str) -> Dict[str, float]:
    """Calculate lap time statistics (mean, std dev) per stint."""
    # Filter laps for this driver/race
    race_laps = laps_df[
        (laps_df['Year'] == year) &
        (laps_df['Event'] == event) &
        (laps_df['Session'] == 'R') &
        (laps_df['Driver'] == driver_code) &
        (laps_df['DriverNumber'] == driver_number) &
        (laps_df['LapTime'].notna()) &
        (laps_df['Deleted'] != True)  # Exclude deleted laps
    ].copy()
    
    if len(race_laps) == 0:
        return {'lap_time_mean': np.nan, 'lap_time_std': np.nan}
    
    # Convert LapTime to seconds if it's timedelta
    if pd.api.types.is_timedelta64_dtype(race_laps['LapTime']):
        lap_times_seconds = race_laps['LapTime'].dt.total_seconds()
    else:
        lap_times_seconds = pd.to_timedelta(race_laps['LapTime'], errors='coerce').dt.total_seconds()
    
    lap_time_mean = lap_times_seconds.mean()
    lap_time_std = lap_times_seconds.std()
    
    return {
        'lap_time_mean': lap_time_mean,
        'lap_time_std': lap_time_std
    }

def calculate_pit_stop_timing(laps_df: pd.DataFrame, driver_code: str, driver_number: int,
                             year: int, event: str) -> Dict[str, float]:
    """Calculate pit stop timing (lap number of first pit stop)."""
    # Filter laps for this driver/race
    race_laps = laps_df[
        (laps_df['Year'] == year) &
        (laps_df['Event'] == event) &
        (laps_df['Session'] == 'R') &
        (laps_df['Driver'] == driver_code) &
        (laps_df['DriverNumber'] == driver_number) &
        (laps_df['LapNumber'].notna())
    ].copy()
    
    if len(race_laps) == 0:
        return {'first_pit_lap': np.nan, 'pit_stops': 0}
    
    race_laps = race_laps.sort_values('LapNumber')
    
    # Check for pit stops (Stint changes or PitInTime/PitOutTime)
    if 'Stint' in race_laps.columns:
        stint_changes = race_laps[race_laps['Stint'].diff() > 0]
        if len(stint_changes) > 0:
            first_pit_lap = stint_changes.iloc[0]['LapNumber']
            pit_stops = len(stint_changes)
        else:
            first_pit_lap = np.nan
            pit_stops = 0
    else:
        # Alternative: check PitInTime
        pit_laps = race_laps[race_laps['PitInTime'].notna()]
        if len(pit_laps) > 0:
            first_pit_lap = pit_laps.iloc[0]['LapNumber']
            pit_stops = len(pit_laps)
        else:
            first_pit_lap = np.nan
            pit_stops = 0
    
    return {
        'first_pit_lap': first_pit_lap,
        'pit_stops': pit_stops
    }

def calculate_sector_speed_correlation(laps_df: pd.DataFrame, driver_code: str, driver_number: int,
                                      year: int, event: str) -> Dict[str, float]:
    """Calculate correlation between sector speeds and lap time."""
    # Filter laps for this driver/race
    race_laps = laps_df[
        (laps_df['Year'] == year) &
        (laps_df['Event'] == event) &
        (laps_df['Session'] == 'R') &
        (laps_df['Driver'] == driver_code) &
        (laps_df['DriverNumber'] == driver_number) &
        (laps_df['LapTime'].notna()) &
        (laps_df['Deleted'] != True)
    ].copy()
    
    if len(race_laps) < 3:
        return {'sector_speed_laptime_corr': np.nan}
    
    # Convert LapTime to seconds
    if pd.api.types.is_timedelta64_dtype(race_laps['LapTime']):
        lap_times_seconds = race_laps['LapTime'].dt.total_seconds()
    else:
        lap_times_seconds = pd.to_timedelta(race_laps['LapTime'], errors='coerce').dt.total_seconds()
    
    # Calculate average sector speed (using SpeedI1 and SpeedI2)
    if 'SpeedI1' in race_laps.columns and 'SpeedI2' in race_laps.columns:
        sector_speeds = (race_laps['SpeedI1'].fillna(0) + race_laps['SpeedI2'].fillna(0)) / 2
        sector_speeds = sector_speeds.replace(0, np.nan)  # Replace 0 with NaN
        
        # Calculate correlation
        valid_data = pd.DataFrame({
            'speed': sector_speeds,
            'lap_time': lap_times_seconds
        }).dropna()
        
        if len(valid_data) >= 3:
            corr = valid_data['speed'].corr(valid_data['lap_time'])
            # Invert correlation (higher speed should correlate with lower lap time)
            # So we use -corr or (1/corr) for efficiency metric
            sector_speed_laptime_corr = -corr if not pd.isna(corr) else np.nan
        else:
            sector_speed_laptime_corr = np.nan
    else:
        sector_speed_laptime_corr = np.nan
    
    return {
        'sector_speed_laptime_corr': sector_speed_laptime_corr
    }

def calculate_tyre_efficiency(laps_df: pd.DataFrame, driver_code: str, driver_number: int,
                             year: int, event: str) -> Dict[str, float]:
    """Calculate tyre efficiency index (average lap time per compound normalized by tyre life)."""
    # Filter laps for this driver/race
    race_laps = laps_df[
        (laps_df['Year'] == year) &
        (laps_df['Event'] == event) &
        (laps_df['Session'] == 'R') &
        (laps_df['Driver'] == driver_code) &
        (laps_df['DriverNumber'] == driver_number) &
        (laps_df['LapTime'].notna()) &
        (laps_df['Compound'].notna()) &
        (laps_df['Deleted'] != True)
    ].copy()
    
    if len(race_laps) == 0 or 'Compound' not in race_laps.columns:
        return {'tyre_efficiency_index': np.nan}
    
    # Convert LapTime to seconds
    if pd.api.types.is_timedelta64_dtype(race_laps['LapTime']):
        lap_times_seconds = race_laps['LapTime'].dt.total_seconds()
    else:
        lap_times_seconds = pd.to_timedelta(race_laps['LapTime'], errors='coerce').dt.total_seconds()
    
    race_laps['LapTimeSeconds'] = lap_times_seconds
    
    # Group by stint/compound and calculate efficiency
    if 'Stint' in race_laps.columns and 'TyreLife' in race_laps.columns:
        # Calculate per stint
        stints = race_laps.groupby('Stint')
        efficiencies = []
        
        for stint_num, stint_data in stints:
            if len(stint_data) < 2:
                continue
            
            avg_lap_time = stint_data['LapTimeSeconds'].mean()
            max_tyre_life = stint_data['TyreLife'].max()
            
            if max_tyre_life > 0 and not pd.isna(avg_lap_time):
                # Efficiency = average lap time / tyre life (lower is better)
                efficiency = avg_lap_time / max_tyre_life
                efficiencies.append(efficiency)
        
        if len(efficiencies) > 0:
            tyre_efficiency_index = np.mean(efficiencies)
        else:
            tyre_efficiency_index = np.nan
    else:
        # Simplified: overall average
        if len(race_laps) > 0:
            avg_lap_time = race_laps['LapTimeSeconds'].mean()
            tyre_efficiency_index = avg_lap_time  # Simplified metric
        else:
            tyre_efficiency_index = np.nan
    
    return {
        'tyre_efficiency_index': tyre_efficiency_index
    }

logger.info("Historical race metrics calculation functions defined.")


03:18:45 |     INFO | Historical race metrics calculation functions defined.


In [5]:
def calculate_all_historical_metrics(fastf1_data: Dict, master_df: pd.DataFrame) -> pd.DataFrame:
    """Calculate all historical metrics for each race-driver combination.
    
    Returns DataFrame with one row per (year, event, driver_code, driver_number) 
    with all calculated metrics.
    """
    results = []
    
    # Get unique race-driver combinations from master CSV (2018+)
    races = master_df[master_df['year'] >= 2018].copy()
    
    logger.info(f"Calculating historical metrics for {len(races):,} race-driver combinations...")
    
    # Combine all years of data into single DataFrames for efficiency
    all_laps = []
    all_telemetry = []
    all_weather = []
    
    for year in sorted(fastf1_data['LAPS'].keys()):
        if year in fastf1_data['LAPS']:
            df = fastf1_data['LAPS'][year].copy()
            all_laps.append(df)
    
    for year in sorted(fastf1_data['TELEMETRY'].keys()):
        if year in fastf1_data['TELEMETRY']:
            df = fastf1_data['TELEMETRY'][year].copy()
            all_telemetry.append(df)
    
    for year in sorted(fastf1_data['WEATHER'].keys()):
        if year in fastf1_data['WEATHER']:
            df = fastf1_data['WEATHER'][year].copy()
            all_weather.append(df)
    
    if all_laps:
        combined_laps = pd.concat(all_laps, ignore_index=True)
        logger.info(f"Combined LAPS data: {len(combined_laps):,} rows")
    else:
        combined_laps = pd.DataFrame()
        logger.warning("No LAPS data available!")
    
    if all_telemetry:
        combined_telemetry = pd.concat(all_telemetry, ignore_index=True)
        logger.info(f"Combined TELEMETRY data: {len(combined_telemetry):,} rows")
    else:
        combined_telemetry = pd.DataFrame()
        logger.warning("No TELEMETRY data available!")
    
    if all_weather:
        combined_weather = pd.concat(all_weather, ignore_index=True)
        logger.info(f"Combined WEATHER data: {len(combined_weather):,} rows")
    else:
        combined_weather = pd.DataFrame()
        logger.warning("No WEATHER data available!")
    
    # Process each race-driver combination
    processed = 0
    total = len(races)
    
    for idx, row in races.iterrows():
        year = int(row['year'])
        event = normalize_event_name(row['name'])
        driver_code = row['code']
        driver_number = row['number_driver']
        race_id = row.get('raceId', None)
        
        if pd.isna(event) or pd.isna(driver_code) or pd.isna(driver_number):
            continue
        
        # Initialize result dict
        result = {
            'year': year,
            'event': event,
            'driver_code': driver_code,
            'driver_number': int(driver_number) if not pd.isna(driver_number) else None,
            'race_date': row['date'],
            'race_id': race_id,
            'driver_id': row.get('driverId', None),
        }
        
        # Calculate metrics from LAPS data (race sessions only)
        # Use raceId if available for more efficient matching
        if len(combined_laps) > 0:
            if 'raceId' in combined_laps.columns and race_id is not None:
                # Use raceId for matching (faster and more reliable)
                race_laps = combined_laps[
                    (combined_laps['raceId'] == race_id) &
                    (combined_laps['Session'] == 'R')
                ]
            else:
                # Fallback to Year + Event matching
                race_laps = combined_laps[
                    (combined_laps['Year'] == year) &
                    (combined_laps['Event'] == event) &
                    (combined_laps['Session'] == 'R')
                ]
            
            if len(race_laps) > 0:
                # Overtaking skill
                overtaking = calculate_overtaking_skill(race_laps, driver_code, int(driver_number), year, event)
                result.update(overtaking)
                
                # Lap time stats
                lap_stats = calculate_lap_time_stats(race_laps, driver_code, int(driver_number), year, event)
                result.update(lap_stats)
                
                # Pit stop timing
                pit_timing = calculate_pit_stop_timing(race_laps, driver_code, int(driver_number), year, event)
                result.update(pit_timing)
                
                # Sector speed correlation
                sector_corr = calculate_sector_speed_correlation(race_laps, driver_code, int(driver_number), year, event)
                result.update(sector_corr)
                
                # Tyre efficiency
                tyre_eff = calculate_tyre_efficiency(race_laps, driver_code, int(driver_number), year, event)
                result.update(tyre_eff)
        
        # Calculate DRS patterns from TELEMETRY
        if len(combined_telemetry) > 0:
            drs_patterns = calculate_drs_patterns(combined_telemetry, driver_code, int(driver_number), year, event, race_id)
            result.update(drs_patterns)
        
        results.append(result)
        processed += 1
        
        if processed % 100 == 0:
            logger.info(f"Processed {processed}/{total} race-driver combinations...")
    
    logger.info(f"Completed calculation for {len(results):,} race-driver combinations")
    
    # Convert to DataFrame
    metrics_df = pd.DataFrame(results)
    
    return metrics_df

# Calculate all historical metrics
logger.info("Starting historical metrics calculation...")
if len(fastf1_data['LAPS']) > 0 or len(fastf1_data['TELEMETRY']) > 0:
    historical_metrics = calculate_all_historical_metrics(fastf1_data, master_2018plus)
    
    logger.info(f"Historical metrics calculated: {historical_metrics.shape}")
    logger.info(f"Columns: {list(historical_metrics.columns)}")
    logger.info(f"Sample metrics:")
    print(historical_metrics.head())
else:
    logger.warning("No FastF1 data available to calculate metrics!")
    historical_metrics = pd.DataFrame()


03:18:45 |     INFO | Starting historical metrics calculation...
03:18:45 |     INFO | Calculating historical metrics for 2,979 race-driver combinations...
03:18:57 |     INFO | Combined LAPS data: 403,492 rows
03:18:57 |     INFO | Combined TELEMETRY data: 29,934,157 rows
03:18:57 |     INFO | Combined WEATHER data: 72,836 rows
03:21:30 |     INFO | Processed 100/2979 race-driver combinations...
03:23:50 |     INFO | Processed 200/2979 race-driver combinations...
03:26:24 |     INFO | Processed 300/2979 race-driver combinations...
03:29:10 |     INFO | Processed 400/2979 race-driver combinations...
03:32:07 |     INFO | Processed 500/2979 race-driver combinations...
03:34:57 |     INFO | Processed 600/2979 race-driver combinations...
03:37:53 |     INFO | Processed 700/2979 race-driver combinations...
03:40:33 |     INFO | Processed 800/2979 race-driver combinations...
03:42:59 |     INFO | Processed 900/2979 race-driver combinations...
03:45:14 |     INFO | Processed 1000/2979 race-d

   year                  event driver_code  driver_number  race_date  race_id  \
0  2018  Australian Grand Prix         VET              5 2018-03-25      989   
1  2018  Australian Grand Prix         HAM             44 2018-03-25      989   
2  2018  Australian Grand Prix         RAI              7 2018-03-25      989   
3  2018  Australian Grand Prix         RIC              3 2018-03-25      989   
4  2018  Australian Grand Prix         ALO             14 2018-03-25      989   

   driver_id  position_changes  position_change_rate  lap_time_mean  \
0         20               2.0              0.034483      92.642810   
1          1               1.0              0.017241      92.729638   
2          8               1.0              0.017241      92.751586   
3        817               4.0              0.068966      92.764690   
4          4               5.0              0.086207      93.123603   

   lap_time_std  first_pit_lap  pit_stops  sector_speed_laptime_corr  \
0     13.36604

In [7]:
def calculate_rolling_averages_and_standardize(metrics_df: pd.DataFrame, master_df: pd.DataFrame) -> pd.DataFrame:
    """Calculate rolling averages for each driver and standardize relative features per race.
    
    IMPORTANT: Rolling averages use ONLY previous races (shifted by 1) for predictive features.
    Uses raceId for proper temporal ordering (raceId is in chronological order).
    
    Relative features to standardize:
    - drs_activation_rate
    - position_change_rate  
    - lap_time_std
    - first_pit_lap
    - sector_speed_laptime_corr
    - tyre_efficiency_index
    
    Returns DataFrame with rolling averages and standardized features.
    """
    logger.info("Calculating rolling averages and standardizing features...")
    
    # Prepare master_df - ensure number_driver is numeric for merging
    master_df_prep = master_df[['year', 'name', 'code', 'number_driver', 'date', 'raceId', 'driverId']].copy()
    
    # Convert number_driver to numeric (handles both int and string representations)
    master_df_prep['number_driver'] = pd.to_numeric(master_df_prep['number_driver'], errors='coerce')
    
    # Also ensure metrics_df driver_number is numeric
    metrics_df_prep = metrics_df.copy()
    if 'driver_number' in metrics_df_prep.columns:
        metrics_df_prep['driver_number'] = pd.to_numeric(metrics_df_prep['driver_number'], errors='coerce')
    
    # Merge with master to get date and ensure we have raceId and driverId
    metrics_with_master = metrics_df_prep.merge(
        master_df_prep,
        left_on=['year', 'event', 'driver_code', 'driver_number'],
        right_on=['year', 'name', 'code', 'number_driver'],
        how='left'
    )
    
    # Sort by driver and raceId (raceId is in chronological order, more reliable than date)
    # Use raceId for temporal ordering since it's guaranteed to be sequential
    metrics_sorted = metrics_with_master.sort_values(['driver_code', 'driver_number', 'raceId']).reset_index(drop=True)
    
    # Define relative features to standardize
    relative_features = [
        'drs_activation_rate',
        'position_change_rate',
        'lap_time_std',
        'first_pit_lap',
        'sector_speed_laptime_corr',
        'tyre_efficiency_index'
    ]
    
    # Standardize relative features per race (z-score within each race)
    logger.info("Standardizing relative features per race...")
    
    standardized_metrics = []
    
    # Group by raceId (more reliable than year+event)
    for race_id, race_group in metrics_sorted.groupby('raceId'):
        if pd.isna(race_id):
            continue
            
        race_metrics = race_group.copy()
        
        # Standardize each relative feature within this race
        for feature in relative_features:
            if feature not in race_metrics.columns:
                continue
            
            feature_values = race_metrics[feature].values
            feature_mean = np.nanmean(feature_values)
            feature_std = np.nanstd(feature_values)
            
            if feature_std > 0:
                # Z-score standardization
                race_metrics[f'{feature}_relative'] = (feature_values - feature_mean) / feature_std
            else:
                race_metrics[f'{feature}_relative'] = 0.0
        
        standardized_metrics.append(race_metrics)
    
    result_df = pd.concat(standardized_metrics, ignore_index=True)
    
    # Calculate rolling averages per driver (ONLY previous races, shifted by 1)
    # Use raceId for proper chronological ordering
    result_df = result_df.sort_values(['driver_code', 'driver_number', 'raceId']).reset_index(drop=True)
    
    logger.info("Calculating rolling averages from previous races only (using raceId for ordering)...")
    
    for feature in relative_features:
        if feature not in result_df.columns:
            continue
        
        for driver_code in result_df['driver_code'].unique():
            driver_mask = result_df['driver_code'] == driver_code
            driver_data = result_df[driver_mask].copy()
            
            # IMPORTANT: Use shift(1) to exclude current race from rolling average
            # This ensures we only use PREVIOUS races for prediction
            # raceId ordering ensures chronological correctness
            rolling_3 = driver_data[feature].shift(1).rolling(window=3, min_periods=1).mean()
            rolling_5 = driver_data[feature].shift(1).rolling(window=5, min_periods=1).mean()
            rolling_10 = driver_data[feature].shift(1).rolling(window=10, min_periods=1).mean()
            
            result_df.loc[driver_mask, f'{feature}_avg_last_3'] = rolling_3.values
            result_df.loc[driver_mask, f'{feature}_avg_last_5'] = rolling_5.values
            result_df.loc[driver_mask, f'{feature}_avg_last_10'] = rolling_10.values
    
    logger.info(f"Rolling averages and standardization complete. Shape: {result_df.shape}")
    
    return result_df

# Calculate rolling averages and standardize
if 'historical_metrics' in globals() and len(historical_metrics) > 0:
    metrics_with_rolling = calculate_rolling_averages_and_standardize(historical_metrics, master_2018plus)
    logger.info(f"Metrics with rolling averages: {metrics_with_rolling.shape}")
    logger.info(f"New columns: {[c for c in metrics_with_rolling.columns if 'relative' in c or 'avg_last' in c][:10]}")
else:
    logger.warning("No historical metrics to process!")
    metrics_with_rolling = pd.DataFrame()


10:18:56 |     INFO | Calculating rolling averages and standardizing features...
10:18:56 |     INFO | Standardizing relative features per race...
10:18:57 |     INFO | Calculating rolling averages from previous races only (using raceId for ordering)...
10:18:57 |     INFO | Rolling averages and standardization complete. Shape: (2979, 47)
10:18:57 |     INFO | Metrics with rolling averages: (2979, 47)
10:18:57 |     INFO | New columns: ['drs_activation_rate_relative', 'position_change_rate_relative', 'lap_time_std_relative', 'first_pit_lap_relative', 'sector_speed_laptime_corr_relative', 'tyre_efficiency_index_relative', 'drs_activation_rate_avg_last_3', 'drs_activation_rate_avg_last_5', 'drs_activation_rate_avg_last_10', 'position_change_rate_avg_last_3']


## Step 5: Calculate Weather Features

Calculate weather features from WEATHER data (Session='R'): averages, rainfall flags, transitions.


In [8]:
def calculate_weather_features(weather_data: Dict, master_df: pd.DataFrame) -> pd.DataFrame:
    """Calculate weather features for each race.
    
    Features:
    - AirTemp (average)
    - TrackTemp (average)
    - Humidity (average)
    - Pressure (average)
    - WindSpeed (average)
    - Rainfall (bool - any rain during race)
    - Rainfall transitions (count of dry→wet or wet→dry)
    - Dry vs wet start flag (rainfall at race start)
    """
    logger.info("Calculating weather features...")
    
    # Combine all weather data
    all_weather = []
    for year in sorted(weather_data.keys()):
        if year in weather_data:
            df = weather_data[year].copy()
            all_weather.append(df)
    
    if not all_weather:
        logger.warning("No weather data available!")
        return pd.DataFrame()
    
    combined_weather = pd.concat(all_weather, ignore_index=True)
    logger.info(f"Combined weather data: {len(combined_weather):,} rows")
    
    # Filter for race sessions only
    race_weather = combined_weather[combined_weather['Session'] == 'R'].copy()
    
    if len(race_weather) == 0:
        logger.warning("No race weather data available!")
        return pd.DataFrame()
    
    # Calculate weather features per race
    # Use raceId if available for more efficient grouping
    weather_features = []
    
    if 'raceId' in race_weather.columns:
        # Group by raceId (more reliable and efficient)
        for race_id, race_weather_group in race_weather.groupby('raceId'):
            if pd.isna(race_id):
                continue
            
            # Get year and event from first row
            first_row = race_weather_group.iloc[0]
            year = int(first_row['Year']) if 'Year' in first_row.index else None
            event = normalize_event_name(first_row['Event']) if 'Event' in first_row.index else None
            
            if year is None or event is None:
                continue
            
            race_metrics = {
                'year': year,
                'event': event,
                'raceId': int(race_id),
            }
            
            # Average values over race
            if 'AirTemp' in race_weather_group.columns:
                race_metrics['weather_airtemp_avg'] = race_weather_group['AirTemp'].mean()
            else:
                race_metrics['weather_airtemp_avg'] = np.nan
            
            if 'TrackTemp' in race_weather_group.columns:
                race_metrics['weather_tracktemp_avg'] = race_weather_group['TrackTemp'].mean()
            else:
                race_metrics['weather_tracktemp_avg'] = np.nan
            
            if 'Humidity' in race_weather_group.columns:
                race_metrics['weather_humidity_avg'] = race_weather_group['Humidity'].mean()
            else:
                race_metrics['weather_humidity_avg'] = np.nan
            
            if 'Pressure' in race_weather_group.columns:
                race_metrics['weather_pressure_avg'] = race_weather_group['Pressure'].mean()
            else:
                race_metrics['weather_pressure_avg'] = np.nan
            
            if 'WindSpeed' in race_weather_group.columns:
                race_metrics['weather_windspeed_avg'] = race_weather_group['WindSpeed'].mean()
            else:
                race_metrics['weather_windspeed_avg'] = np.nan
            
            # Rainfall flag (any rain during race)
            if 'Rainfall' in race_weather_group.columns:
                race_metrics['weather_rainfall'] = race_weather_group['Rainfall'].any() if race_weather_group['Rainfall'].dtype == bool else (race_weather_group['Rainfall'] == True).any()
            else:
                race_metrics['weather_rainfall'] = False
            
            # Rainfall transitions (dry→wet or wet→dry)
            if 'Rainfall' in race_weather_group.columns:
                # Sort by time
                race_weather_sorted = race_weather_group.sort_values('Time' if 'Time' in race_weather_group.columns else race_weather_group.index)
                rainfall_values = race_weather_sorted['Rainfall']
                
                # Count transitions
                transitions = 0
                prev_value = None
                for val in rainfall_values:
                    if prev_value is not None and val != prev_value:
                        transitions += 1
                    prev_value = val
                
                race_metrics['weather_rainfall_transitions'] = transitions
            else:
                race_metrics['weather_rainfall_transitions'] = 0
            
            # Dry vs wet start flag (rainfall at race start)
            if 'Rainfall' in race_weather_group.columns and 'Time' in race_weather_group.columns:
                # Get first time entry
                race_weather_sorted = race_weather_group.sort_values('Time')
                first_row = race_weather_sorted.iloc[0]
                race_metrics['weather_dry_wet_start'] = bool(first_row['Rainfall']) if pd.notna(first_row['Rainfall']) else False
            elif 'Rainfall' in race_weather_group.columns:
                # Use first entry
                first_row = race_weather_group.iloc[0]
                race_metrics['weather_dry_wet_start'] = bool(first_row['Rainfall']) if pd.notna(first_row['Rainfall']) else False
            else:
                race_metrics['weather_dry_wet_start'] = False
            
            weather_features.append(race_metrics)
    else:
        # Fallback to Year + Event grouping
        for (year, event), race_weather_group in race_weather.groupby(['Year', 'Event']):
            race_metrics = {
                'year': year,
                'event': normalize_event_name(event),
            }
            
            # Average values over race
            if 'AirTemp' in race_weather_group.columns:
                race_metrics['weather_airtemp_avg'] = race_weather_group['AirTemp'].mean()
            else:
                race_metrics['weather_airtemp_avg'] = np.nan
            
            if 'TrackTemp' in race_weather_group.columns:
                race_metrics['weather_tracktemp_avg'] = race_weather_group['TrackTemp'].mean()
            else:
                race_metrics['weather_tracktemp_avg'] = np.nan
            
            if 'Humidity' in race_weather_group.columns:
                race_metrics['weather_humidity_avg'] = race_weather_group['Humidity'].mean()
            else:
                race_metrics['weather_humidity_avg'] = np.nan
            
            if 'Pressure' in race_weather_group.columns:
                race_metrics['weather_pressure_avg'] = race_weather_group['Pressure'].mean()
            else:
                race_metrics['weather_pressure_avg'] = np.nan
            
            if 'WindSpeed' in race_weather_group.columns:
                race_metrics['weather_windspeed_avg'] = race_weather_group['WindSpeed'].mean()
            else:
                race_metrics['weather_windspeed_avg'] = np.nan
            
            # Rainfall flag (any rain during race)
            if 'Rainfall' in race_weather_group.columns:
                race_metrics['weather_rainfall'] = race_weather_group['Rainfall'].any() if race_weather_group['Rainfall'].dtype == bool else (race_weather_group['Rainfall'] == True).any()
            else:
                race_metrics['weather_rainfall'] = False
            
            # Rainfall transitions (dry→wet or wet→dry)
            if 'Rainfall' in race_weather_group.columns:
                # Sort by time
                race_weather_sorted = race_weather_group.sort_values('Time' if 'Time' in race_weather_group.columns else race_weather_group.index)
                rainfall_values = race_weather_sorted['Rainfall']
                
                # Count transitions
                transitions = 0
                prev_value = None
                for val in rainfall_values:
                    if prev_value is not None and val != prev_value:
                        transitions += 1
                    prev_value = val
                
                race_metrics['weather_rainfall_transitions'] = transitions
            else:
                race_metrics['weather_rainfall_transitions'] = 0
            
            # Dry vs wet start flag (rainfall at race start)
            if 'Rainfall' in race_weather_group.columns and 'Time' in race_weather_group.columns:
                # Get first time entry
                race_weather_sorted = race_weather_group.sort_values('Time')
                first_row = race_weather_sorted.iloc[0]
                race_metrics['weather_dry_wet_start'] = bool(first_row['Rainfall']) if pd.notna(first_row['Rainfall']) else False
            elif 'Rainfall' in race_weather_group.columns:
                # Use first entry
                first_row = race_weather_group.iloc[0]
                race_metrics['weather_dry_wet_start'] = bool(first_row['Rainfall']) if pd.notna(first_row['Rainfall']) else False
            else:
                race_metrics['weather_dry_wet_start'] = False
            
            weather_features.append(race_metrics)
    
    weather_df = pd.DataFrame(weather_features)
    
    logger.info(f"Weather features calculated for {len(weather_df):,} races")
    
    return weather_df

# Calculate weather features
if len(fastf1_data['WEATHER']) > 0:
    weather_features = calculate_weather_features(fastf1_data['WEATHER'], master_2018plus)
    logger.info(f"Weather features: {weather_features.shape}")
    logger.info(f"Weather feature columns: {list(weather_features.columns)}")
else:
    logger.warning("No weather data available!")
    weather_features = pd.DataFrame()


10:18:57 |     INFO | Calculating weather features...
10:18:57 |     INFO | Combined weather data: 72,836 rows
10:18:58 |     INFO | Weather features calculated for 149 races
10:18:58 |     INFO | Weather features: (149, 11)
10:18:58 |     INFO | Weather feature columns: ['year', 'event', 'raceId', 'weather_airtemp_avg', 'weather_tracktemp_avg', 'weather_humidity_avg', 'weather_pressure_avg', 'weather_windspeed_avg', 'weather_rainfall', 'weather_rainfall_transitions', 'weather_dry_wet_start']


## Step 6: Merge Features to Master CSV

Merge all calculated features back to master_races_augmented.csv.


In [9]:
def merge_features_to_master(master_df: pd.DataFrame, metrics_df: pd.DataFrame, 
                            weather_df: pd.DataFrame) -> pd.DataFrame:
    """Merge all FastF1 features to master CSV.
    
    Join on: year + name (normalized) + code + number_driver
    """
    logger.info("Merging features to master CSV...")
    
    master_result = master_df.copy()
    
    # Prepare metrics_df for merge (need to match on master CSV columns)
    if len(metrics_df) > 0:
        # Prepare DataFrames with consistent types for merging
        metrics_df_prep = metrics_df.copy()
        if 'driver_number' in metrics_df_prep.columns:
            metrics_df_prep['driver_number'] = pd.to_numeric(metrics_df_prep['driver_number'], errors='coerce')
        
        master_df_prep = master_df[['year', 'name', 'code', 'number_driver', 'raceId', 'driverId', 'date']].copy()
        master_df_prep['number_driver'] = pd.to_numeric(master_df_prep['number_driver'], errors='coerce')
        
        # Merge metrics on year + event + driver code + driver number
        metrics_for_merge = metrics_df_prep.merge(
            master_df_prep,
            left_on=['year', 'event', 'driver_code', 'driver_number'],
            right_on=['year', 'name', 'code', 'number_driver'],
            how='right',
            suffixes=('', '_master')
        )
        
        # Select feature columns to merge (exclude join keys already in master)
        feature_cols = [c for c in metrics_for_merge.columns 
                       if c not in ['year', 'event', 'driver_code', 'driver_number', 'name', 'code', 'number_driver'] 
                       and c not in master_df.columns
                       and not c.endswith('_master')]
        
        if feature_cols:
            # Merge on raceId and driverId for exact matching
            metrics_merge = metrics_for_merge[['raceId', 'driverId'] + feature_cols].copy()
            master_result = master_result.merge(
                metrics_merge,
                on=['raceId', 'driverId'],
                how='left'
            )
            logger.info(f"Merged {len(feature_cols)} metric features")
    else:
        logger.warning("No metrics to merge!")
    
    # Prepare weather_df for merge (one row per race, not per driver)
    if len(weather_df) > 0:
        # Use raceId if available for more efficient merging
        if 'raceId' in weather_df.columns:
            # Merge directly on raceId
            weather_cols = [c for c in weather_df.columns 
                           if c not in ['year', 'event', 'raceId'] 
                           and c not in master_df.columns]
            
            if weather_cols:
                weather_merge = weather_df[['raceId'] + weather_cols].copy()
                master_result = master_result.merge(
                    weather_merge,
                    on='raceId',
                    how='left'
                )
                logger.info(f"Merged {len(weather_cols)} weather features (using raceId)")
        else:
            # Fallback to year + name matching
            weather_for_merge = weather_df.merge(
                master_df[['year', 'name', 'raceId']].drop_duplicates(),
                left_on=['year', 'event'],
                right_on=['year', 'name'],
                how='right'
            )
            
            weather_cols = [c for c in weather_for_merge.columns 
                           if c not in ['year', 'event', 'name'] 
                           and c not in master_df.columns]
            
            if weather_cols:
                weather_merge = weather_for_merge[['raceId'] + weather_cols].copy()
                master_result = master_result.merge(
                    weather_merge,
                    on='raceId',
                    how='left'
                )
                logger.info(f"Merged {len(weather_cols)} weather features (using year+name)")
    else:
        logger.warning("No weather features to merge!")
    
    logger.info(f"Merge complete. Master shape: {master_result.shape}")
    
    # Count missing data
    if len(metrics_df) > 0 or len(weather_df) > 0:
        new_features = [c for c in master_result.columns if c not in master_df.columns]
        logger.info(f"New features added: {len(new_features)}")
        
        for feature in new_features[:10]:  # Show first 10
            missing = master_result[feature].isna().sum()
            missing_pct = (missing / len(master_result)) * 100
            logger.info(f"  {feature}: {missing:,} missing ({missing_pct:.1f}%)")
    
    return master_result

# Merge all features
if ('metrics_with_rolling' in globals() and len(metrics_with_rolling) > 0) or ('weather_features' in globals() and len(weather_features) > 0):
    metrics_df_to_merge = metrics_with_rolling if 'metrics_with_rolling' in globals() else pd.DataFrame()
    weather_df_to_merge = weather_features if 'weather_features' in globals() else pd.DataFrame()
    
    master_with_features = merge_features_to_master(master, metrics_df_to_merge, weather_df_to_merge)
    logger.info(f"Master with features: {master_with_features.shape}")
    
    # Compare to original
    original_cols = set(master.columns)
    new_cols = set(master_with_features.columns) - original_cols
    logger.info(f"New features added: {len(new_cols)}")
    logger.info(f"New feature names: {sorted(list(new_cols))[:20]}...")  # Show first 20
else:
    logger.warning("No features to merge!")
    master_with_features = master.copy()


10:18:58 |     INFO | Merging features to master CSV...
10:18:58 |     INFO | Merged 37 metric features
10:18:58 |     INFO | Merged 8 weather features (using raceId)
10:18:58 |     INFO | Merge complete. Master shape: (12358, 165)
10:18:58 |     INFO | New features added: 45
10:18:58 |     INFO |   race_date: 9,379 missing (75.9%)
10:18:58 |     INFO |   race_id: 9,379 missing (75.9%)
10:18:58 |     INFO |   driver_id: 9,379 missing (75.9%)
10:18:58 |     INFO |   position_changes: 9,546 missing (77.2%)
10:18:58 |     INFO |   position_change_rate: 9,546 missing (77.2%)
10:18:58 |     INFO |   lap_time_mean: 9,530 missing (77.1%)
10:18:58 |     INFO |   lap_time_std: 9,569 missing (77.4%)
10:18:58 |     INFO |   first_pit_lap: 9,659 missing (78.2%)
10:18:58 |     INFO |   pit_stops: 9,379 missing (75.9%)
10:18:58 |     INFO |   sector_speed_laptime_corr: 9,573 missing (77.5%)
10:18:58 |     INFO | Master with features: (12358, 165)
10:18:58 |     INFO | New features added: 45
10:18:58

## Step 7: Analyze Missing Data (2018+)

Analyze missingness of FastF1 features for 2018+ data to understand data coverage and quality.


In [11]:
# Analyze missing data for 2018+ only
logger.info("="*60)
logger.info("MISSING DATA ANALYSIS (2018+)")
logger.info("="*60)

if 'master_with_features' in globals() and len(master_with_features) > 0:
    # Filter to 2018+ data
    master_2018plus_analysis = master_with_features[master_with_features['year'] >= 2018].copy()
    
    logger.info(f"\nTotal rows (2018+): {len(master_2018plus_analysis):,}")
    logger.info(f"Date range: {master_2018plus_analysis['date'].min()} to {master_2018plus_analysis['date'].max()}")
    
    # Identify FastF1 features (new columns added)
    original_cols = set(master.columns)
    fastf1_features = [col for col in master_with_features.columns if col not in original_cols]
    
    if len(fastf1_features) > 0:
        logger.info(f"\nFastF1 features added: {len(fastf1_features)}")
        
        # Analyze missingness for each feature
        missing_analysis = []
        
        for feature in sorted(fastf1_features):
            missing_count = master_2018plus_analysis[feature].isna().sum()
            total_count = len(master_2018plus_analysis)
            missing_pct = (missing_count / total_count * 100) if total_count > 0 else 0
            
            missing_analysis.append({
                'feature': feature,
                'missing_count': missing_count,
                'total_count': total_count,
                'missing_pct': missing_pct,
                'available_count': total_count - missing_count,
                'available_pct': 100 - missing_pct
            })
        
        missing_df = pd.DataFrame(missing_analysis)
        
        # Display summary
        logger.info("\n" + "="*60)
        logger.info("MISSING DATA SUMMARY (2018+)")
        logger.info("="*60)
        
        # Sort by missing percentage (highest first)
        missing_df_sorted = missing_df.sort_values('missing_pct', ascending=False)
        
        logger.info("\nTop 10 features with highest missingness:")
        for idx, row in missing_df_sorted.head(10).iterrows():
            logger.info(f"  {row['feature']:40s} | Missing: {row['missing_count']:5,}/{row['total_count']:5,} ({row['missing_pct']:6.2f}%)")
        
        logger.info("\nTop 10 features with lowest missingness:")
        for idx, row in missing_df_sorted.tail(10).iterrows():
            logger.info(f"  {row['feature']:40s} | Missing: {row['missing_count']:5,}/{row['total_count']:5,} ({row['missing_pct']:6.2f}%)")
        
        # Group by feature type
        logger.info("\n" + "="*60)
        logger.info("MISSING DATA BY FEATURE TYPE")
        logger.info("="*60)
        
        feature_categories = {
            'DRS Patterns': [f for f in fastf1_features if 'drs' in f.lower()],
            'Overtaking/Position': [f for f in fastf1_features if any(x in f.lower() for x in ['position', 'overtak'])],
            'Lap Time': [f for f in fastf1_features if 'lap_time' in f.lower()],
            'Pit Stop': [f for f in fastf1_features if 'pit' in f.lower()],
            'Sector Speed': [f for f in fastf1_features if 'sector' in f.lower()],
            'Tyre Efficiency': [f for f in fastf1_features if 'tyre' in f.lower()],
            'Weather': [f for f in fastf1_features if 'weather' in f.lower()],
            'Relative Features': [f for f in fastf1_features if '_relative' in f],
            'Rolling Averages': [f for f in fastf1_features if '_avg_last_' in f],
        }
        
        for category, features in feature_categories.items():
            if features:
                cat_missing = missing_df[missing_df['feature'].isin(features)]
                if len(cat_missing) > 0:
                    avg_missing_pct = cat_missing['missing_pct'].mean()
                    logger.info(f"\n{category} ({len(features)} features):")
                    logger.info(f"  Average missing: {avg_missing_pct:.2f}%")
                    logger.info(f"  Features: {', '.join(features[:5])}{'...' if len(features) > 5 else ''}")
        
        # Year-by-year analysis
        logger.info("\n" + "="*60)
        logger.info("MISSING DATA BY YEAR (2018+)")
        logger.info("="*60)
        
        for year in sorted(master_2018plus_analysis['year'].unique()):
            year_data = master_2018plus_analysis[master_2018plus_analysis['year'] == year]
            year_features_missing = {}
            
            for feature in fastf1_features:
                missing_pct = (year_data[feature].isna().sum() / len(year_data) * 100) if len(year_data) > 0 else 100
                year_features_missing[feature] = missing_pct
            
            avg_missing = np.mean(list(year_features_missing.values()))
            logger.info(f"  {year}: {len(year_data):,} rows | Avg missing: {avg_missing:.2f}%")
        
        # Save missing data analysis to CSV
        missing_analysis_path = PROCESSED_ROOT / "fastf1_missing_data_analysis_2018plus.csv"
        missing_df_sorted.to_csv(missing_analysis_path, index=False)
        logger.info(f"\nMissing data analysis saved to: {missing_analysis_path}")
        
    else:
        logger.warning("No FastF1 features found to analyze!")
else:
    logger.warning("master_with_features not available for missing data analysis!")


12:26:02 |     INFO | MISSING DATA ANALYSIS (2018+)
12:26:02 |     INFO | 
Total rows (2018+): 2,979
12:26:02 |     INFO | Date range: 2018-03-25 00:00:00 to 2024-12-08 00:00:00
12:26:02 |     INFO | 
FastF1 features added: 45
12:26:02 |     INFO | 
12:26:02 |     INFO | MISSING DATA SUMMARY (2018+)
12:26:02 |     INFO | 
Top 10 features with highest missingness:
12:26:02 |     INFO |   drs_activation_rate                      | Missing: 2,713/2,979 ( 91.07%)
12:26:02 |     INFO |   drs_time_fraction                        | Missing: 2,713/2,979 ( 91.07%)
12:26:02 |     INFO |   drs_activation_rate_avg_last_3           | Missing: 2,673/2,979 ( 89.73%)
12:26:02 |     INFO |   drs_activation_rate_avg_last_5           | Missing: 2,633/2,979 ( 88.39%)
12:26:02 |     INFO |   drs_activation_rate_avg_last_10          | Missing: 2,539/2,979 ( 85.23%)
12:26:02 |     INFO |   first_pit_lap                            | Missing:   280/2,979 (  9.40%)
12:26:02 |     INFO |   first_pit_lap_relative

## Step 8: Export Augmented Dataset

Save the augmented dataset with FastF1 features.


In [None]:
# Export augmented dataset
output_path = PROCESSED_ROOT / "master_races_combined.csv"

logger.info(f"Saving augmented dataset to {output_path}...")
master_with_features.to_csv(output_path, index=False)

logger.info("="*60)
logger.info("FEATURE ENGINEERING COMPLETE!")
logger.info("="*60)
logger.info(f"Original shape: {master.shape}")
logger.info(f"New shape: {master_with_features.shape}")
logger.info(f"Features added: {len(set(master_with_features.columns) - set(master.columns))}")

# Display summary statistics
new_features = [c for c in master_with_features.columns if c not in master.columns]
if new_features:
    logger.info("\nNew features summary:")
    for feature in sorted(new_features)[:20]:  # Show first 20
        if master_with_features[feature].dtype in [np.float64, np.int64]:
            mean_val = master_with_features[feature].mean()
            std_val = master_with_features[feature].std()
            missing = master_with_features[feature].isna().sum()
            logger.info(f"  {feature}: mean={mean_val:.3f}, std={std_val:.3f}, missing={missing}")

logger.info(f"\nDataset saved to: {output_path}")


10:18:58 |     INFO | Saving augmented dataset to C:\Users\Erik Viljamaa\Downloads\projects\f1-podium-predictor\data\processed\master_races_augmented.csv...
10:18:59 |     INFO | FEATURE ENGINEERING COMPLETE!
10:18:59 |     INFO | Original shape: (12358, 120)
10:18:59 |     INFO | New shape: (12358, 165)
10:18:59 |     INFO | Features added: 45
10:18:59 |     INFO | 
New features summary:
10:18:59 |     INFO |   driver_id: mean=694.881, std=309.250, missing=9379
10:18:59 |     INFO |   drs_activation_rate: mean=0.434, std=0.191, missing=12092
10:18:59 |     INFO |   drs_activation_rate_avg_last_10: mean=0.425, std=0.088, missing=11918
10:18:59 |     INFO |   drs_activation_rate_avg_last_3: mean=0.431, std=0.111, missing=12052
10:18:59 |     INFO |   drs_activation_rate_avg_last_5: mean=0.427, std=0.099, missing=12012
10:18:59 |     INFO |   drs_activation_rate_relative: mean=0.000, std=0.300, missing=9393
10:18:59 |     INFO |   drs_time_fraction: mean=0.434, std=0.191, missing=12092
1