# 03.7 - FastF1 Features EDA

Exploratory Data Analysis of FastF1 features:
- Correlation analysis with target (podium)
- Feature distributions
- Feature importance analysis
- Missing data patterns

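**Input:**
- `data/processed/master_races_with_fastf1.csv`
- `data/processed/master_races_clean.csv` (used only to tell which columns are FastF1 additions)

**Output:** Correlation analysis and visualizations (PNG plots and CSV summaries saved to `data/processed/`)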


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Locate the project root. The notebook may be started either from the
# project folder itself or from its notebooks/ subfolder; in the latter
# case step up one level.
_cwd = Path().resolve()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Folder that holds the processed datasets.
PROCESSED_ROOT = PROJECT_ROOT.joinpath("data", "processed")

# Plot defaults: seaborn grid theme and a wide default figure.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print(f"PROJECT_ROOT: {PROJECT_ROOT}")


## 1. Load Data


In [None]:
# Read the FastF1-enriched dataset; fail early with a pointer to the
# notebook that produces it when the file is missing.
fastf1_path = PROCESSED_ROOT / "master_races_with_fastf1.csv"
if not fastf1_path.exists():
    raise FileNotFoundError(
        "master_races_with_fastf1.csv not found. "
        "Please run notebook 04_fastf1_feature_engineering.ipynb first."
    )

df = pd.read_csv(fastf1_path, low_memory=False)
# Unparseable dates become NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# The clean master dataset is the baseline: every column of `df` that it
# does not contain is treated as a FastF1 feature.
original_path = PROCESSED_ROOT / "master_races_clean.csv"
original = pd.read_csv(original_path, low_memory=False)

original_cols = set(original.columns)
is_new_column = ~df.columns.isin(original.columns)
fastf1_features = df.columns[is_new_column].tolist()

first_date = df['date'].min()
last_date = df['date'].max()
print(f"Dataset shape: {df.shape}")
print(f"FastF1 features: {len(fastf1_features)}")
print(f"Date range: {first_date} to {last_date}")

# Restrict the analysis frame to seasons from 2018 onwards; copy so later
# edits do not touch `df`.
df_2018plus = df.loc[df['year'].ge(2018)].copy()
print(f"\n2018+ rows: {len(df_2018plus):,}")


## 2. Feature Overview


In [None]:
print("="*70)
print("FASTF1 FEATURES OVERVIEW")
print("="*70)


def _matching(keywords, *, ignore_case=True):
    """Return the FastF1 features whose name contains any of *keywords*.

    With ``ignore_case=True`` the feature name is lower-cased before the
    substring test; otherwise the match is case-sensitive.
    """
    return [
        name
        for name in fastf1_features
        if any(kw in (name.lower() if ignore_case else name) for kw in keywords)
    ]


# Group features by substring of their name. A feature can land in more than
# one category (the groups are not mutually exclusive).
feature_categories = {
    'DRS Patterns': _matching(['drs']),
    'Overtaking/Position': _matching(['position', 'overtak']),
    'Lap Time': _matching(['lap_time']),
    'Pit Stop': _matching(['pit']),
    'Sector Speed': _matching(['sector']),
    'Tyre Efficiency': _matching(['tyre']),
    'Weather': _matching(['weather']),
    'Relative Features': _matching(['_relative'], ignore_case=False),
    'Rolling Averages': _matching(['_avg_last_'], ignore_case=False),
}

for category, features in feature_categories.items():
    if not features:
        continue
    print(f"\n{category} ({len(features)} features):")
    for feat in sorted(features):
        print(f"  - {feat}")

# Missing data summary for 2018+
print("\n" + "="*70)
print("MISSING DATA SUMMARY (2018+)")
print("="*70)

# Row count is the same for every feature, so compute it once.
total_rows = len(df_2018plus)

missing_summary = []
for feature in sorted(fastf1_features):
    missing = df_2018plus[feature].isna().sum()
    missing_summary.append({
        'feature': feature,
        'missing': missing,
        # Guard against division by zero when no 2018+ rows exist.
        'missing_pct': (missing / total_rows * 100) if total_rows > 0 else 0,
        'available': total_rows - missing,
    })

# Explicit columns keep the frame sortable even when no FastF1 features were
# found (a column-less DataFrame would raise KeyError in sort_values).
missing_df = pd.DataFrame(
    missing_summary,
    columns=['feature', 'missing', 'missing_pct', 'available'],
).sort_values('missing_pct')

# missing_df is sorted ascending, so head = most complete, tail = most sparse.
for heading, subset in (
    ("\nTop 10 features with lowest missingness:", missing_df.head(10)),
    ("\nTop 10 features with highest missingness:", missing_df.tail(10)),
):
    print(heading)
    for row in subset.itertuples(index=False):
        print(f"  {row.feature:40s} | Missing: {row.missing:5,}/{total_rows:5,} ({row.missing_pct:6.2f}%)")


In [None]:
print("="*70)
print("CORRELATION WITH TARGET (PODIUM)")
print("="*70)

# Select numeric FastF1 features for correlation. The pandas dtype helpers
# accept every integer/float width and the nullable numeric dtypes (not just
# float64/int64); booleans are excluded so flags are not treated as
# measurements.
numeric_fastf1 = [
    feat
    for feat in fastf1_features
    if pd.api.types.is_numeric_dtype(df_2018plus[feat].dtype)
    and not pd.api.types.is_bool_dtype(df_2018plus[feat].dtype)
]

print(f"\nNumeric FastF1 features: {len(numeric_fastf1)}")

# Calculate correlations with podium (2018+ only)
min_samples = 100  # a feature needs more than this many complete rows
correlations = []
for feat in numeric_fastf1:
    # Pairwise-complete rows only: drop rows where feature or target is null.
    valid_data = df_2018plus[[feat, 'podium']].dropna()
    if len(valid_data) <= min_samples:
        continue
    corr = valid_data[feat].corr(valid_data['podium'])
    # A NaN correlation (e.g. a constant column) carries no information.
    if pd.isna(corr):
        continue
    correlations.append({
        'feature': feat,
        'correlation': corr,
        'abs_correlation': abs(corr),
        'n_samples': len(valid_data),
    })

# Explicit columns keep the frame usable when no feature qualified; without
# them sort_values raises KeyError on the column-less empty frame.
corr_df = pd.DataFrame(
    correlations,
    columns=['feature', 'correlation', 'abs_correlation', 'n_samples'],
).sort_values('abs_correlation', ascending=False)

print(f"\nFeatures with correlation calculated: {len(corr_df)}")
print("\nTop 20 features by absolute correlation with podium:")
print("="*70)
for _, row in corr_df.head(20).iterrows():
    direction = "+" if row['correlation'] > 0 else "-"
    print(f"{row['feature']:45s} | {direction} {abs(row['correlation']):.4f} | n={row['n_samples']:5,}")

# Visualize top correlations
top_n = 15
top_features = corr_df.head(top_n)

if top_features.empty:
    # Nothing to draw; avoid saving a blank chart.
    print("\nNo correlations could be computed; skipping correlation plot.")
else:
    corr_plot_path = PROCESSED_ROOT / 'fastf1_podium_correlation.png'
    positions = range(len(top_features))

    plt.figure(figsize=(10, 8))
    # Red bars mark negative correlations, blue bars non-negative ones.
    colors = ['red' if x < 0 else 'blue' for x in top_features['correlation']]
    plt.barh(positions, top_features['correlation'], color=colors, alpha=0.7)
    plt.yticks(positions, top_features['feature'])
    plt.xlabel('Correlation with Podium')
    plt.title(f'Top {top_n} FastF1 Features Correlated with Podium (2018+)')
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig(corr_plot_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\n✓ Correlation plot saved to: {corr_plot_path}")


In [None]:
# Columns that actually exist in the 2018+ frame; used to filter the lists below.
available_cols = df_2018plus.columns

# The 20 FastF1 features most strongly correlated with podium.
top_features_list = list(corr_df['feature'].head(20))

# A few pre-existing features for comparison, kept only when present.
key_existing_features = [
    name
    for name in (
        'grid',
        'driver_standings_points_PRE_RACE',
        'driver_standings_position_PRE_RACE',
    )
    if name in available_cols
]

# Final column list for the matrix: top features, reference features, target.
features_for_matrix = [
    name
    for name in [*top_features_list, *key_existing_features, 'podium']
    if name in available_cols
]

# Correlation matrix over the selected columns.
corr_matrix = df_2018plus.loc[:, features_for_matrix].corr()

# Hide the upper triangle (diagonal included) since the matrix is symmetric.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

matrix_plot_path = PROCESSED_ROOT / 'fastf1_correlation_matrix.png'

plt.figure(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Matrix: Top FastF1 Features + Key Existing Features (2018+)', fontsize=14)
plt.tight_layout()
plt.savefig(matrix_plot_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"✓ Correlation matrix saved to: {matrix_plot_path}")


## 5. Feature Distributions


In [None]:
# Plot histograms of the features most strongly correlated with podium.
top_rows = corr_df.head(5)
top_5_features = top_rows['feature'].tolist()

# 2x3 grid flattened to a 1-D array so axes can be paired with features.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Walk features and their correlations together; this avoids re-filtering
# corr_df for every title.
for ax, feat, corr_value in zip(axes, top_rows['feature'], top_rows['correlation']):
    data = df_2018plus[feat].dropna()
    if data.empty:
        continue

    ax.hist(data, bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'{feat}\n(n={len(data):,}, corr={corr_value:.3f})')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Remove every subplot that has no feature assigned. Removing only the last
# axis would leave blank frames whenever fewer than five features exist.
for unused_ax in axes[len(top_5_features):]:
    unused_ax.remove()

distributions_path = PROCESSED_ROOT / 'fastf1_feature_distributions.png'

plt.suptitle('Distributions of Top 5 FastF1 Features (2018+)', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(distributions_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"✓ Feature distributions saved to: {distributions_path}")


## 6. Feature Comparison: Podium vs Non-Podium


In [None]:
# Compare feature values between podium and non-podium finishers
podium_data = df_2018plus[df_2018plus['podium'] == 1]
non_podium_data = df_2018plus[df_2018plus['podium'] == 0]

print("="*70)
print("FEATURE COMPARISON: PODIUM vs NON-PODIUM (2018+)")
print("="*70)

comparisons = []
# NOTE: this is the first 20 numeric features in column order, kept short for
# brevity; it is NOT the 20 features ranked highest by correlation.
for feat in numeric_fastf1[:20]:
    podium_vals = podium_data[feat].dropna()
    non_podium_vals = non_podium_data[feat].dropna()

    # Require more than 10 observations in each group before comparing means.
    if len(podium_vals) <= 10 or len(non_podium_vals) <= 10:
        continue

    podium_mean = podium_vals.mean()
    non_podium_mean = non_podium_vals.mean()
    difference = podium_mean - non_podium_mean

    comparisons.append({
        'feature': feat,
        'podium_mean': podium_mean,
        'non_podium_mean': non_podium_mean,
        'difference': difference,
        # Relative difference vs. the non-podium mean; reported as 0 when that
        # mean is exactly 0 to avoid dividing by zero.
        'pct_difference': (difference / abs(non_podium_mean) * 100) if non_podium_mean != 0 else 0,
    })

# Explicit columns keep the frame usable when no feature had enough data;
# a column-less empty frame would raise KeyError on 'difference' below.
comp_df = pd.DataFrame(
    comparisons,
    columns=['feature', 'podium_mean', 'non_podium_mean', 'difference', 'pct_difference'],
)
# astype(float) is a no-op for real data and gives the empty case a numeric dtype.
comp_df['abs_difference'] = comp_df['difference'].astype(float).abs()
comp_df = comp_df.sort_values('abs_difference', ascending=False)

print("\nTop 10 features with largest differences:")
print("="*70)
for _, row in comp_df.head(10).iterrows():
    print(f"{row['feature']:40s} | Podium: {row['podium_mean']:8.3f} | Non-Podium: {row['non_podium_mean']:8.3f} | Diff: {row['difference']:8.3f}")

# Visualize top differences
top_comp = comp_df.head(10)

if top_comp.empty:
    # Nothing to draw; avoid saving a blank chart.
    print("\nNo features had enough data to compare; skipping comparison plot.")
else:
    comparison_plot_path = PROCESSED_ROOT / 'fastf1_podium_comparison.png'

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(top_comp))
    width = 0.35

    # Side-by-side bars: podium to the left of each tick, non-podium to the right.
    ax.bar(x - width/2, top_comp['podium_mean'], width, label='Podium', alpha=0.7)
    ax.bar(x + width/2, top_comp['non_podium_mean'], width, label='Non-Podium', alpha=0.7)

    ax.set_xlabel('Feature')
    ax.set_ylabel('Mean Value')
    ax.set_title('Top 10 FastF1 Features: Podium vs Non-Podium Comparison (2018+)')
    ax.set_xticks(x)
    ax.set_xticklabels(top_comp['feature'], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.savefig(comparison_plot_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\n✓ Comparison plot saved to: {comparison_plot_path}")


In [None]:
# Persist the per-feature correlations with podium.
corr_output = PROCESSED_ROOT.joinpath('fastf1_correlations_with_podium.csv')
corr_df.to_csv(corr_output, index=False)
print(f"✓ Correlation results saved to: {corr_output}")

# Persist the podium vs non-podium mean comparison.
comp_output = PROCESSED_ROOT.joinpath('fastf1_podium_comparison.csv')
comp_df.to_csv(comp_output, index=False)
print(f"✓ Comparison results saved to: {comp_output}")

# Closing banner and a short recap of what was analysed.
banner = "=" * 70
summary_lines = [
    "\n" + banner,
    "EDA COMPLETE!",
    banner,
    "\nSummary:",
    f"  - FastF1 features analyzed: {len(fastf1_features)}",
    f"  - Numeric features: {len(numeric_fastf1)}",
    f"  - Features with correlation calculated: {len(corr_df)}",
]
# corr_df is sorted by absolute correlation, so its first row is the strongest.
if len(corr_df) > 0:
    strongest = corr_df.iloc[0]
    summary_lines.append(
        f"  - Top correlated feature: {strongest['feature']} (corr={strongest['correlation']:.4f})"
    )
print("\n".join(summary_lines))
