# Lap Time Predictor Training

This notebook trains the Lap Time Predictor model using telemetry data from multiple tracks.
We'll use Barber and COTA for training, and test on Indianapolis to validate generalization.

In [None]:
import sys
sys.path.append('../app')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import xgboost as xgb

from data.loader import load_race_telemetry_wide, load_lap_times, segment_laps_by_time
from data.features import calculate_lap_features, calculate_tire_degradation_features, create_ml_features
from ml.models import LapTimePredictor

# Set up paths
# The dataset location can be overridden with the GR2025_DATASET_ROOT
# environment variable so the notebook runs on machines other than the
# author's. The original hard-coded location stays the default, so existing
# setups behave exactly as before.
import os

dataset_root = Path(
    os.environ.get(
        "GR2025_DATASET_ROOT",
        "/Users/arsh/Developer/Projects/gr2025/dataset",
    )
)

# Plot styling: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Load Training Data (Barber + COTA)

In [None]:
# Load data from multiple tracks for training
training_tracks = [
    ("barber", "R1"),
    ("barber", "R2"),
    ("cota", "R1"),
    ("cota", "R2"),
]

all_lap_features = []
failed_sessions = []  # (track, race) pairs that raised while being processed

for track, race in training_tracks:
    print(f"Processing {track} {race}...")

    try:
        # Load telemetry and lap data
        df_telemetry = load_race_telemetry_wide(dataset_root, track, race)
        # `lapt` is returned by the loader but not needed for segmentation here
        start, end, lapt = load_lap_times(dataset_root, track, race)

        # Segment telemetry by laps
        df_segmented = segment_laps_by_time(df_telemetry, start, end)

        # Calculate lap features and tag each row with its source session
        lap_features = calculate_lap_features(df_segmented)
        lap_features['track'] = track
        lap_features['race'] = race

        all_lap_features.append(lap_features)
        print(f"  - {len(lap_features)} laps processed")

    except Exception as e:
        # Deliberate best effort: one unreadable session should not stop the
        # others from loading. The failure is recorded so it is reported below
        # instead of disappearing into the per-session log.
        print(f"  - Error: {e}")
        failed_sessions.append((track, race))

if failed_sessions:
    print(
        f"\nWarning: skipped {len(failed_sessions)} of {len(training_tracks)} "
        f"sessions: {failed_sessions}"
    )

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with an actionable message (same exception type) instead.
if not all_lap_features:
    raise ValueError(
        f"No training data could be loaded from {dataset_root}; "
        f"all sessions failed: {failed_sessions}"
    )

# Combine all training data
df_training = pd.concat(all_lap_features, ignore_index=True)
print(f"\nTotal training laps: {len(df_training)}")
print(f"Vehicles: {df_training['vehicle_id'].nunique()}")
print(f"Tracks: {df_training['track'].unique()}")

## 2. Feature Engineering & Tire Degradation

In [None]:
# Extend the combined lap table with tire degradation features
df_training = calculate_tire_degradation_features(df_training)

# Build the model-ready feature table, with lap duration as the target column
df_features = create_ml_features(df_training, target_col="lap_duration_s")

print(f"Feature matrix shape: {df_features.shape}")
print(f"Features: {list(df_features.columns)}")

# Count nulls per column and report only the columns that actually have some
missing_counts = df_features.isna().sum()
columns_with_gaps = missing_counts[missing_counts > 0]
if columns_with_gaps.empty:
    print("\nNo missing values!")
else:
    print("\nMissing values:")
    print(columns_with_gaps)

## 3. Train Lap Time Predictor Model

In [None]:
# Identifier columns and the target are excluded from the model inputs
non_feature_cols = ("vehicle_id", "lap_id", "lap_duration_s")
feature_cols = [
    name for name in df_features.columns if name not in non_feature_cols
]

# Target vector and input matrix (used here for reporting; `feature_cols` is
# reused when scoring the held-out track)
y = df_features["lap_duration_s"]
X = df_features[feature_cols]

print(f"Training features: {len(feature_cols)}")
print(f"Training samples: {len(X)}")

# Fit the predictor on the full feature table; `fit` returns a mapping that
# provides at least the 'mae' and 'r2' entries printed below.
# NOTE(review): whether these metrics come from a held-out split or from the
# training data is decided inside LapTimePredictor.fit - confirm there.
predictor = LapTimePredictor()
metrics = predictor.fit(df_features, target_col="lap_duration_s")

print("\nModel Performance:")
print(f"MAE: {metrics['mae']:.3f} seconds")
print(f"R²: {metrics['r2']:.3f}")

# Rank features by the fitted model's importances, highest first
if predictor.model is not None:
    importance_table = {
        'feature': predictor.feature_names,
        'importance': predictor.model.feature_importances_,
    }
    importance_df = pd.DataFrame(importance_table)
    importance_df = importance_df.sort_values(by='importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))

## 4. Test on Unseen Track (Indianapolis)

In [None]:
# Load Indianapolis data for testing
print("Testing on Indianapolis Race 1...")

try:
    # Run the same pipeline as for training: load, segment, lap features,
    # tire degradation features, ML feature table
    df_test_telemetry = load_race_telemetry_wide(dataset_root, "indianapolis", "R1")
    start_test, end_test, lapt_test = load_lap_times(dataset_root, "indianapolis", "R1")

    df_test_segmented = segment_laps_by_time(df_test_telemetry, start_test, end_test)
    lap_features_test = calculate_lap_features(df_test_segmented)
    lap_features_test['track'] = 'indianapolis'
    lap_features_test['race'] = 'R1'

    df_test_features = calculate_tire_degradation_features(lap_features_test)
    df_test_ml = create_ml_features(df_test_features, target_col="lap_duration_s")

    # Make predictions using exactly the columns selected at training time
    X_test = df_test_ml[feature_cols]
    y_test = df_test_ml["lap_duration_s"]

    predictions = predictor.predict(X_test)

    # Calculate test metrics
    test_mae = mean_absolute_error(y_test, predictions)
    test_r2 = r2_score(y_test, predictions)
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print(f"\nTest Performance (Indianapolis):")
    print(f"MAE: {test_mae:.3f} seconds")
    print(f"R²: {test_r2:.3f}")
    print(f"RMSE: {test_rmse:.3f} seconds")

    # Visualize predictions vs actual
    plt.figure(figsize=(12, 8))

    # Top-left: predicted vs actual, with the y = x reference line
    plt.subplot(2, 2, 1)
    plt.scatter(y_test, predictions, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Lap Time (s)')
    plt.ylabel('Predicted Lap Time (s)')
    plt.title(f'Predictions vs Actual\nR² = {test_r2:.3f}')

    # Top-right: residuals (predicted minus actual) against predictions
    plt.subplot(2, 2, 2)
    residuals = predictions - y_test
    plt.scatter(predictions, residuals, alpha=0.6)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Lap Time (s)')
    plt.ylabel('Residuals (s)')
    plt.title('Residual Plot')

    # Bottom-left: residual histogram
    plt.subplot(2, 2, 3)
    plt.hist(residuals, bins=20, alpha=0.7)
    plt.xlabel('Residuals (s)')
    plt.ylabel('Frequency')
    plt.title('Residual Distribution')

    plt.subplot(2, 2, 4)
    # Show lap progression for one vehicle
    vehicle_id = df_test_ml['vehicle_id'].iloc[0]
    vehicle_mask = (df_test_ml['vehicle_id'] == vehicle_id).to_numpy()
    vehicle_rows = df_test_ml[vehicle_mask]
    # Apply ONE ordering to both the actual rows and their predictions.
    # Sorting only the rows by lap_id while leaving the predictions in table
    # order would pair predicted times with the wrong laps whenever the
    # table is not already sorted by lap_id.
    lap_order = np.argsort(vehicle_rows['lap_id'].to_numpy(), kind='stable')
    vehicle_data = vehicle_rows.iloc[lap_order]
    vehicle_preds = np.asarray(predictions)[vehicle_mask][lap_order]
    plt.plot(vehicle_data['lap_id'], vehicle_data['lap_duration_s'], 'o-', label='Actual')
    plt.plot(vehicle_data['lap_id'], vehicle_preds, 's-', label='Predicted')
    plt.xlabel('Lap Number')
    plt.ylabel('Lap Time (s)')
    plt.title(f'Lap Progression - {vehicle_id}')
    plt.legend()

    plt.tight_layout()
    plt.show()

except Exception as e:
    # Best effort: a failure anywhere in the test pipeline or plotting is
    # reported here and does not stop the rest of the notebook.
    print(f"Error testing on Indianapolis: {e}")

## 5. Model Analysis & Insights

In [None]:
# Analyze feature importance
if predictor.model is not None:
    plt.figure(figsize=(12, 8))

    # Feature importance plot (highest importance at the top)
    plt.subplot(2, 2, 1)
    top_features = importance_df.head(15)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 15 Feature Importances')
    plt.gca().invert_yaxis()

    # Performance by track: mean lap time with standard deviation error bars
    plt.subplot(2, 2, 2)
    track_performance = df_training.groupby('track')['lap_duration_s'].agg(['mean', 'std'])
    plt.bar(track_performance.index, track_performance['mean'],
            yerr=track_performance['std'], capsize=5)
    plt.xlabel('Track')
    plt.ylabel('Average Lap Time (s)')
    plt.title('Average Lap Times by Track')

    # Tire degradation analysis
    plt.subplot(2, 2, 3)
    if 'lap_progression' in df_training.columns:
        # Take the first row's vehicle within that row's own session only.
        # The training table stacks several races, so filtering on vehicle_id
        # alone would join laps from different sessions into one line.
        first_lap = df_training.iloc[0]
        session_keys = [
            key for key in ('vehicle_id', 'track', 'race')
            if key in df_training.columns
        ]
        same_session = (
            df_training[session_keys] == first_lap[session_keys]
        ).all(axis=1)
        # Sort by the x-axis column so the connecting line runs left to right
        degradation_sample = df_training[same_session].sort_values('lap_progression')
        plt.plot(degradation_sample['lap_progression'], degradation_sample['lap_duration_s'], 'o-')
        plt.xlabel('Lap Number')
        plt.ylabel('Lap Time (s)')
        plt.title('Tire Degradation Example')

    # Speed vs lap time correlation
    plt.subplot(2, 2, 4)
    if 'avg_speed_kmh' in df_training.columns:
        plt.scatter(df_training['avg_speed_kmh'], df_training['lap_duration_s'], alpha=0.5)
        plt.xlabel('Average Speed (km/h)')
        plt.ylabel('Lap Time (s)')
        plt.title('Speed vs Lap Time')

    plt.tight_layout()
    plt.show()

# Final summary, using the metrics returned when the predictor was fitted
print("\nModel Training Complete!")
print(f"The model can predict lap times with {metrics['mae']:.2f}s average error")
print(f"and explains {metrics['r2']*100:.1f}% of lap time variance.")