# 03 - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis (EDA) on the master_races dataset to guide feature engineering and model selection.

**Goals:**
- Understand data distributions and missing patterns
- Analyze target variable (podium) distribution
- Explore relationships between features and target
- Identify temporal trends and patterns
- Generate insights for feature engineering

**Input:** `data/processed/master_races.csv`


In [None]:
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
PROCESSED_ROOT = PROJECT_ROOT / "data" / "processed"

# Read the master table; dates that cannot be parsed are coerced to NaT.
master = pd.read_csv(PROCESSED_ROOT / "master_races.csv")
master['date'] = pd.to_datetime(master['date'], errors='coerce')

# Size and coverage summary of the loaded table.
memory_mb = master.memory_usage(deep=True).sum() / 1024**2
first_date, last_date = master['date'].min(), master['date'].max()
first_year, last_year = master['year'].min(), master['year'].max()

print(f"Dataset shape: {master.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")
print(f"Date range: {first_date} to {last_date}")
print(f"Years: {first_year} - {last_year}")

# Plot defaults: seaborn theme first, then the default figure size.
sns.set_theme(style="darkgrid")
plt.rcParams['figure.figsize'] = (12, 6)


## 1. Dataset Overview

Basic statistics, data types, and missing data patterns.


In [None]:
# How many columns there are of each dtype
print("Data Types:")
print(master.dtypes.value_counts())

# Percentage of missing values per column, restricted to columns that have
# at least one missing value and ordered from most to least missing
missing_pct = master.isnull().sum().div(len(master)).mul(100)
missing_pct = missing_pct.loc[missing_pct.gt(0)].sort_values(ascending=False)

print(f"\nColumns with missing data ({len(missing_pct)}):")
print(missing_pct.head(20).to_string())

# Horizontal bar chart of the missing percentages (skipped when nothing is missing).
# The figure grows with the number of columns so the labels stay readable.
if not missing_pct.empty:
    chart_height = max(6, len(missing_pct) * 0.3)
    plt.figure(figsize=(10, chart_height))
    missing_pct.plot(kind='barh')
    plt.title('Missing Data Percentage by Column')
    plt.xlabel('Missing Percentage (%)')
    plt.tight_layout()
    plt.show()


## 2. Target Variable Analysis

Analyze the distribution of the podium target variable.


In [None]:
# Overall class balance of the podium target
podium_rate = master['podium'].mean()

print("Target Variable (Podium) Distribution:")
print(master['podium'].value_counts())
print(f"\nPodium rate: {podium_rate:.2%}")
print(f"Class imbalance ratio: {(1 - podium_rate) / podium_rate:.2f}:1")

# Per-year podium count, row count and podium rate
podium_by_year = (
    master.groupby('year')['podium']
    .agg(['sum', 'count', 'mean'])
    .rename(columns={'sum': 'podiums', 'count': 'total', 'mean': 'podium_rate'})
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_trend, ax_circuit = axes

# Left panel: podium rate per year
ax_trend.plot(podium_by_year.index, podium_by_year['podium_rate'], marker='o')
ax_trend.set(title='Podium Rate Over Time', xlabel='Year', ylabel='Podium Rate')
ax_trend.grid(True, alpha=0.3)

# Right panel: the ten circuits with the highest podium rate, best at the top
# (left blank when the table has no circuit_name column)
if 'circuit_name' in master.columns:
    podium_by_circuit = master.groupby('circuit_name')['podium'].mean().sort_values(ascending=False)
    top_circuits = podium_by_circuit.head(10)
    bar_positions = range(len(top_circuits))
    ax_circuit.barh(bar_positions, top_circuits.values)
    ax_circuit.set_yticks(bar_positions)
    ax_circuit.set_yticklabels(top_circuits.index)
    ax_circuit.set(title='Top 10 Circuits by Podium Rate', xlabel='Podium Rate')
    ax_circuit.invert_yaxis()

plt.tight_layout()
plt.show()


## 3. Feature Distributions

Explore distributions of key features.


In [None]:
# Key numeric features
numeric_features = ['grid', 'positionOrder', 'points', 'laps', 'driver_age']
available_numeric = [f for f in numeric_features if f in master.columns]

if available_numeric:
    fig, axes = plt.subplots(2, len(available_numeric), figsize=(5*len(available_numeric), 10))
    if len(available_numeric) == 1:
        axes = axes.reshape(-1, 1)
    
    for idx, feature in enumerate(available_numeric):
        # Distribution
        axes[0, idx].hist(master[feature].dropna(), bins=30, edgecolor='black')
        axes[0, idx].set_title(f'{feature} Distribution')
        axes[0, idx].set_xlabel(feature)
        axes[0, idx].set_ylabel('Frequency')
        
        # Box plot by podium
        if 'podium' in master.columns:
            master.boxplot(column=feature, by='podium', ax=axes[1, idx])
            axes[1, idx].set_title(f'{feature} by Podium')
            axes[1, idx].set_xlabel('Podium')
            axes[1, idx].set_ylabel(feature)
    
    plt.tight_layout()
    plt.show()


## 4. Relationships & Correlations

Analyze relationships between features and the target variable.


In [None]:
# Correlation matrix
numeric_cols = master.select_dtypes(include=[np.number]).columns.tolist()
if 'podium' in numeric_cols:
    corr_cols = ['podium'] + [c for c in numeric_cols if c != 'podium' and c in master.columns][:15]
    corr_matrix = master[corr_cols].corr()
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Matrix (Top Features vs Podium)')
    plt.tight_layout()
    plt.show()
    
    # Top correlations with podium
    podium_corr = corr_matrix['podium'].drop('podium').abs().sort_values(ascending=False)
    print("Top correlations with podium:")
    print(podium_corr.head(10).to_string())


In [None]:
# Grid position vs podium probability
MIN_ROWS_PER_GRID_SLOT = 10  # positions with fewer rows are too noisy to plot

if 'grid' in master.columns and 'podium' in master.columns:
    # Grid slots are counted from 1. Values below 1, if the data contains any, are
    # presumably a code for "no regular grid slot" (e.g. pit-lane start) - TODO confirm
    # against the data dictionary. They are excluded so they are not counted as
    # front-of-grid starts by the "<= N" statistics below.
    on_grid = master[master['grid'] >= 1]

    grid_podium = on_grid.groupby('grid')['podium'].agg(['mean', 'count'])
    grid_podium = grid_podium[grid_podium['count'] >= MIN_ROWS_PER_GRID_SLOT]

    if grid_podium.empty:
        # Without this guard int(NaN) below would raise on the empty index.
        print("No grid position has enough rows to plot podium probability.")
    else:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(grid_podium.index, grid_podium['mean'], marker='o', linewidth=2, markersize=8)
        ax.set_title('Podium Probability by Grid Position')
        ax.set_xlabel('Grid Position')
        ax.set_ylabel('Podium Probability')
        # One tick per position, capped at position 20
        ax.set_xticks(range(1, min(21, int(grid_podium.index.max()) + 1)))
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    # Podium rate when starting within the first N grid slots
    for label, last_slot in (("pole (grid=1)", 1), ("top 3", 3), ("top 10", 10)):
        starts_in_range = master['grid'].between(1, last_slot)
        rate = master.loc[starts_in_range, 'podium'].mean()
        print(f"Podium rate from {label}: {rate:.2%}")


## 5. Temporal Analysis

Analyze trends over time and performance by era.


In [None]:
# Performance trends by era
# pd.cut uses right-closed bins: (1993, 2017] -> '1994-2017', (2017, 2025] -> '2018-2024'.
# Note that the upper edge also places 2025 under the '2018-2024' label; years outside
# the bins get a missing era and drop out of the groupbys below.
master['era'] = pd.cut(master['year'],
                       bins=[1993, 2017, 2025],
                       labels=['1994-2017', '2018-2024'])

# 'era' is categorical; observed=False is stated explicitly so that both eras always
# appear in the result regardless of the pandas version's default.
era_stats = master.groupby('era', observed=False).agg({
    'podium': 'mean',
    'grid': 'mean',
    'points': 'mean'
})

print("Performance by Era:")
print(era_stats.to_string())

# Driver performance trends
if 'driverId' in master.columns and 'surname' in master.columns:
    # Rank drivers by driverId rather than surname: a surname is not a unique key,
    # so grouping on it would pool the results of different drivers who share one.
    # NOTE(review): the ranking has no minimum-race threshold, so drivers with very
    # few starts can rank highly - confirm whether that is intended.
    top_driver_ids = master.groupby('driverId')['podium'].mean().nlargest(10).index
    top_rows = master[master['driverId'].isin(top_driver_ids)]

    if len(top_rows) > 0:
        # Podium rate per (driver, era), one row per driver and one column per era,
        # ordered by overall podium rate
        pivot = (
            top_rows.groupby(['driverId', 'era'], observed=False)['podium']
            .mean()
            .unstack('era')
            .reindex(top_driver_ids)
        )
        # Surnames are used only as axis labels
        driver_labels = (
            top_rows.drop_duplicates('driverId')
            .set_index('driverId')['surname']
            .reindex(top_driver_ids)
            .astype(str)
        )

        fig, ax = plt.subplots(figsize=(10, 6))
        x = np.arange(len(pivot))
        width = 0.35
        eras = pivot.columns.tolist()
        for i, era_label in enumerate(eras):
            ax.bar(x + i*width, pivot[era_label], width, label=era_label)
        ax.set_xlabel('Driver')
        ax.set_ylabel('Podium Rate')
        ax.set_title('Top 10 Drivers: Podium Rate by Era')
        # Centre each tick under its group of bars (x + width/2 for two eras)
        ax.set_xticks(x + width * (len(eras) - 1) / 2)
        ax.set_xticklabels(driver_labels, rotation=45, ha='right')
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        plt.show()


## 6. EDA Insights Summary

Summarize key findings for feature engineering.


In [None]:
# Generate insights summary
# Only the target_distribution figures are computed from the data; the key_features,
# missing_data and recommendations entries are hand-written notes.
podium_rate = float(master['podium'].mean())

insights = {
    'target_distribution': {
        'podium_rate': podium_rate,
        'class_imbalance': float((1 - podium_rate) / podium_rate)
    },
    'key_features': {
        'grid_position': 'Strong negative correlation with podium (lower grid = higher podium probability)',
        'qualifying': 'Qualifying performance likely important predictor',
        'driver_history': 'Driver historical performance should be included',
        'constructor_history': 'Constructor/team performance important'
    },
    'missing_data': {
        'sprint_results': 'Sprint results only available for recent years',
        'qualifying': 'Some historical races may lack qualifying data',
        'placeholders': 'FastF1 features (lap_time_variance, throttle_variance, etc.) are placeholders'
    },
    'recommendations': [
        'Create rolling historical features (last N races) for drivers and constructors',
        'Engineer grid position features (top 3, top 10, etc.)',
        'Add circuit-specific performance features',
        'Include driver age and experience features',
        'Create era-based features (pre/post 2018)',
        'Handle missing data appropriately (sprint results, qualifying)'
    ]
}

print("EDA Insights Summary:")
print("=" * 60)
print("\nTarget Distribution:")
print(f"  Podium rate: {insights['target_distribution']['podium_rate']:.2%}")
print(f"  Class imbalance: {insights['target_distribution']['class_imbalance']:.2f}:1")

print("\nKey Features for Modeling:")
for feature, description in insights['key_features'].items():
    print(f"  - {feature}: {description}")

print("\nRecommendations for Feature Engineering:")
for i, rec in enumerate(insights['recommendations'], 1):
    print(f"  {i}. {rec}")

# Save insights next to the processed data. The encoding is given explicitly so the
# file does not depend on the platform default; default=str stringifies any value
# that json cannot serialise natively.
insights_path = PROCESSED_ROOT / "eda_insights.json"
with open(insights_path, 'w', encoding='utf-8') as f:
    json.dump(insights, f, indent=2, default=str)

print(f"\nInsights saved to: {insights_path}")
