# 04 - Feature Engineering

This notebook creates all features for modeling from the master_races dataset.

**Feature Categories:**
1. **Static Features**: Grid position, circuit characteristics, driver age, constructor
2. **Rolling Historical Features**: Driver/constructor performance over last N races (N=3, 5, 10)
3. **Driver Style Features**: Placeholder for FastF1 telemetry features (2018+ only)
4. **Weather Features**: Placeholder for FastF1 weather features (2018+ only)

**Input:** `data/processed/master_races.csv`  
**Output:** `data/processed/features.csv`


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the whole kernel, including
# pandas deprecation notices — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Locations: the notebook is expected to run from a folder one level below the
# project root, with processed data under <root>/data/processed.
PROJECT_ROOT = Path("..").resolve()
PROCESSED_ROOT = PROJECT_ROOT / "data" / "processed"

# Read the master table, parse the race date (unparseable values become NaT)
# and order rows chronologically within each driver with a fresh 0..n-1 index.
master = (
    pd.read_csv(PROCESSED_ROOT / "master_races.csv")
    .assign(date=lambda raw: pd.to_datetime(raw['date'], errors='coerce'))
    .sort_values(['driverId', 'date'])
    .reset_index(drop=True)
)

print(f"Loaded master table: {master.shape}")
print(f"Date range: {master['date'].min()} to {master['date'].max()}")

# Seed the feature table with the row keys and the target column; it shares
# master's index, so later column assignments line up row for row.
features = master.loc[:, ['raceId', 'driverId', 'year', 'date', 'podium']].copy()
print(f"Starting with {len(features)} rows")


## 1. Static Features

Grid position, circuit characteristics, driver age, constructor information.


In [None]:
# Static (per-row) features copied or derived directly from the master table.
# Every source column is optional: a feature is only added when its input
# column exists in `master`.

# Grid position features
if 'grid' in master.columns:
    grid = master['grid']
    features['grid'] = grid
    # Grid slots are 1-based (pole is 1, see grid_pole below), so a value
    # below 1 is not a real starting slot and must not count as a front start.
    # NOTE(review): Ergast-style data presumably uses grid == 0 for pit-lane
    # starts — confirm against master_races.
    features['grid_top3'] = grid.between(1, 3).astype(int)
    features['grid_top10'] = grid.between(1, 10).astype(int)
    features['grid_pole'] = (grid == 1).astype(int)

# Circuit features
if 'circuit_name' in master.columns:
    features['circuit_name'] = master['circuit_name']
if 'country' in master.columns:
    features['circuit_country'] = master['country']

# Driver age in years on race day (NaN when either date failed to parse)
if 'dob' in master.columns and 'date' in master.columns:
    master['dob'] = pd.to_datetime(master['dob'], errors='coerce')
    features['driver_age'] = (master['date'] - master['dob']).dt.days / 365.25

# Constructor features
if 'constructorId' in master.columns:
    features['constructorId'] = master['constructorId']
if 'name' in master.columns:
    # NOTE(review): assumes the bare `name` column is the constructor name and
    # not the race/circuit name left over from a merge — TODO confirm.
    features['constructor_name'] = master['name']

# Qualifying features
if 'qualifying_position' in master.columns:
    features['qualifying_position'] = master['qualifying_position']
elif 'position' in master.columns:
    # NOTE(review): if `position` is the race finishing position rather than
    # the qualifying result, this fallback leaks the target — verify upstream.
    features['qualifying_position'] = master['position']

print(f"Static features added. Features shape: {features.shape}")


## 2. Rolling Historical Features

Create rolling window features for driver and constructor performance.


In [None]:
def rolling_rate(series, window):
    """Mean of the previous ``window`` observations, excluding the current one.

    The series is shifted down by one position before rolling so that each
    value only summarises earlier rows. The first row has no history and is
    NaN; after that a single available observation is enough (min_periods=1).
    """
    previous = series.shift(1)
    roller = previous.rolling(window=window, min_periods=1)
    return roller.mean()

def rolling_sum(series, window):
    """Sum of the previous ``window`` observations, excluding the current one.

    The series is shifted down by one position before rolling so that each
    value only summarises earlier rows. The first row has no history and is
    NaN; after that a single available observation is enough (min_periods=1).
    """
    previous = series.shift(1)
    roller = previous.rolling(window=window, min_periods=1)
    return roller.sum()

# Rolling historical features.
#
# Results are assigned to `features` as index-aligned Series rather than raw
# `.values`. A positional assignment of `groupby(...).apply(...)` output is
# only correct when the result comes back in the frame's row order; pandas
# >= 2.0 returns transform-like `apply` results grouped by key, which silently
# attaches constructor features to the wrong rows.
ROLLING_WINDOWS = [3, 5, 10]


def _driver_rolling_mean(frame, value_col, window):
    """Rolling mean of ``value_col`` over each driver's previous ``window`` rows.

    ``frame`` must already be ordered chronologically within each driver. The
    returned Series carries ``frame``'s index labels, so it can be assigned to
    another frame with the same index by alignment.
    """
    return frame.groupby('driverId')[value_col].transform(
        lambda s: rolling_rate(s, window)
    )


def _constructor_rolling_mean(frame, value_col, window):
    """Rolling mean of ``value_col`` over each constructor's previous ``window`` races.

    The constructor's entries are first averaged per race and the races are
    ordered by date, for two reasons:
    * rolling over raw rows would follow ``frame``'s row order, which need not
      be chronological for a constructor, so a window could contain later races;
    * a constructor can have several rows in one race, and shifting raw rows by
      one would let one car's result in the current race feed its teammate.

    Returns a Series indexed like ``frame``; every row of the same constructor
    in the same race receives the same value.
    """
    per_race = (
        frame.groupby(['constructorId', 'raceId'], as_index=False)
        .agg(race_date=('date', 'min'), race_value=(value_col, 'mean'))
        .sort_values(['constructorId', 'race_date', 'raceId'])
    )
    per_race['rolled'] = per_race.groupby('constructorId')['race_value'].transform(
        lambda s: rolling_rate(s, window)
    )
    # (constructorId, raceId) is unique in per_race, so the left merge keeps
    # frame's row order and length; re-attach frame's index for alignment.
    aligned = frame[['constructorId', 'raceId']].merge(
        per_race[['constructorId', 'raceId', 'rolled']],
        on=['constructorId', 'raceId'],
        how='left',
    )
    return pd.Series(aligned['rolled'].to_numpy(), index=frame.index)


# Chronological order within each driver; index labels are preserved, so
# anything computed on this view still aligns with `features`.
driver_history = master.sort_values(['driverId', 'date'])

# Driver historical features
driver_specs = [
    ('podium', 'driver_podium_rate_last_{}'),
    ('points', 'driver_points_avg_last_{}'),
    ('positionOrder', 'driver_avg_position_last_{}'),
]
for source_col, name_pattern in driver_specs:
    if source_col in master.columns:
        for window in ROLLING_WINDOWS:
            features[name_pattern.format(window)] = _driver_rolling_mean(
                driver_history, source_col, window
            )

# Constructor historical features
if 'constructorId' in master.columns:
    constructor_specs = [
        ('podium', 'constructor_podium_rate_last_{}'),
        ('points', 'constructor_points_avg_last_{}'),
    ]
    for source_col, name_pattern in constructor_specs:
        if source_col in master.columns:
            for window in ROLLING_WINDOWS:
                features[name_pattern.format(window)] = _constructor_rolling_mean(
                    master, source_col, window
                )

# Career totals
if 'podium' in master.columns:
    # Podiums scored strictly before the current row. Filling before the
    # cumulative sum gives 0 for a driver's first row and keeps the running
    # total intact if a podium value is missing mid-career.
    features['driver_total_podiums'] = driver_history.groupby('driverId')['podium'].transform(
        lambda s: s.shift(1).fillna(0).cumsum()
    )

# Number of earlier rows for the same driver (0 on the first one).
features['driver_races_completed'] = driver_history.groupby('driverId').cumcount()

print(f"Rolling historical features added. Features shape: {features.shape}")


In [None]:
# Placeholder features for FastF1 telemetry and weather.
#
# Each column is copied from `master` when it is available there and created
# as an all-NaN column otherwise, so the output schema is stable whether or
# not the FastF1 data has been merged in. Weather columns follow the same rule
# as telemetry columns instead of always being overwritten with NaN.
telemetry_features = ['lap_time_variance', 'throttle_variance', 'overtake_attempts']

# Weather placeholders
weather_features = ['air_temp', 'track_temp', 'rainfall_mm', 'wind_speed', 'wind_direction']

for feat in telemetry_features + weather_features:
    features[feat] = master[feat] if feat in master.columns else np.nan

print("Placeholder features added for FastF1 data.")


## 4. Feature Selection & Export

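Drop one feature from each highly correlated pair (absolute Pearson correlation above 0.95), then save the result to `data/processed/features.csv`.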


In [None]:
# Remove highly correlated features
CORR_THRESHOLD = 0.95
# The target and identifier columns never take part in correlation pruning:
# they are keys/labels, not model inputs to be de-duplicated.
PROTECTED_COLUMNS = {'podium', 'raceId', 'driverId', 'year', 'constructorId'}


def _redundant_columns(corr_matrix, threshold):
    """Return the columns to drop so no retained pair exceeds ``threshold``.

    Columns are visited in order; a column is dropped only if it is correlated
    above ``threshold`` with a column that is being kept. Comparing against
    kept columns only avoids discarding a feature whose sole highly correlated
    partner has already been removed. NaN correlations (e.g. all-NaN
    placeholder columns) compare as False and never trigger a drop.
    """
    kept, dropped = [], []
    for col in corr_matrix.columns:
        if kept and (corr_matrix.loc[col, kept] > threshold).any():
            dropped.append(col)
        else:
            kept.append(col)
    return dropped


numeric_features = [
    col
    for col in features.select_dtypes(include=[np.number]).columns
    if col not in PROTECTED_COLUMNS
]

if len(numeric_features) > 1:
    corr_matrix = features[numeric_features].corr().abs()
    cols_to_remove = _redundant_columns(corr_matrix, CORR_THRESHOLD)

    if cols_to_remove:
        features = features.drop(columns=cols_to_remove)
        print(f"Removed {len(cols_to_remove)} highly correlated features")

# Save features
output_path = PROCESSED_ROOT / "features.csv"
features.to_csv(output_path, index=False)
print(f"\nFeatures saved to: {output_path}")
print(f"  Rows: {features.shape[0]:,}")
print(f"  Columns: {features.shape[1]}")
