In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime

In [2]:
# --- Model artifacts -------------------------------------------------------
# NOTE: pickle.load executes arbitrary code embedded in the file; only load
# artifacts that were produced by a trusted training run.

# Trained Random Forest model used for finishing-position prediction
with open('f1_random_forest_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

# Fitted encoders for the categorical features, stored as one dict
with open('label_encoders.pkl', 'rb') as encoder_file:
    encoders = pickle.load(encoder_file)

le_track, le_driver, le_track_type, le_wet_dry = (
    encoders[key] for key in ('track', 'driver', 'track_type', 'wet_dry')
)

# Ordered list of feature names the model expects at prediction time
with open('feature_columns.pkl', 'rb') as columns_file:
    feature_columns = pickle.load(columns_file)

# Engineered dataset; supplies the historical statistics used at prediction time
df_sorted = pd.read_csv('f1_data.csv')

print("‚úÖ Model and data loaded successfully!")



‚úÖ Model and data loaded successfully!


In [3]:

def _print_prediction_report(results_df):
    """Print the top-10 table and the predicted podium for a ranked results frame.

    Expects the columns produced by ``predict_race_results`` (including
    ``predicted_rank``) and at least one row.
    """
    print("\n" + "="*80)
    print("üèÜ PREDICTED RACE RESULTS - TOP 10")
    print("="*80)
    print(f"{'Rank':<6} {'Driver':<22} {'Pred Pos':<10} {'Grid':<8} {'Podium%':<10} {'Recent Form':<12}")
    print("-"*80)

    top_labels = {1: "ü•á", 2: "ü•à", 3: "ü•â"}
    for _, row in results_df.head(10).iterrows():
        rank = int(row['predicted_rank'])
        label = top_labels.get(rank, f"P{rank}")
        # Build each cell first and pad afterwards so the unit sign stays
        # attached to its number (padding the number alone left a gap
        # before the '%').
        grid_cell = f"P{row['grid_position']:.0f}"
        podium_cell = f"{row['podium_rate'] * 100:.1f}%"
        form_cell = f"P{row['recent_form']:.2f}"
        print(f"{label:<6} {row['driver']:<22} {row['predicted_position']:<10.2f} "
              f"{grid_cell:<8} {podium_cell:<10} {form_cell:<12}")

    # Highlight podium predictions
    print("\n" + "="*80)
    print("üèÅ PREDICTED PODIUM (TOP 3)")
    print("="*80)
    podium_labels = {1: "ü•á 1st", 2: "ü•à 2nd", 3: "ü•â 3rd"}
    for _, row in results_df.head(3).iterrows():
        label = podium_labels[int(row['predicted_rank'])]
        print(f"{label:8} | {row['driver']:<20} | Predicted: P{row['predicted_position']:.2f} | Grid: P{row['grid_position']:.0f}")

    print("\n" + "="*80)


def predict_race_results(race_name, track_name, year=2025,
                         grid_positions=None, weather='dry',
                         track_type='circuit', drivers_list=None):
    """
    Predict race finishing positions and identify top 3 podium finishers

    Relies on the module-level objects loaded earlier in the notebook:
    ``df_sorted``, ``rf_model``, ``feature_columns`` and the ``le_*`` encoders.

    Parameters:
    -----------
    race_name : str
        Name of the Grand Prix (e.g., "Las Vegas Grand Prix"). Used for display only.
    track_name : str
        Track name (e.g., "Las Vegas Strip Circuit", "Monza", "Spa-Francorchamps").
        Must match the ``track`` values in ``df_sorted`` for track-specific
        statistics to be used. If the track encoder does not know it, a
        warning is printed once and encoding 0 is used.
    year : int
        Race year (default: 2025)
    grid_positions : dict, optional
        Dictionary of {driver_name: grid_position}. Drivers missing from the
        dict (or all drivers, if None) fall back to their historical average grid.
        Example: {'Max VERSTAPPEN': 1, 'Lando NORRIS': 2, ...}
    weather : str
        'dry' or 'wet' (default: 'dry')
    track_type : str
        'circuit' or 'street' (default: 'circuit')
    drivers_list : list, optional
        List of drivers to predict for. If None, uses every driver that appears
        in the most recent ``year`` present in ``df_sorted``.

    Returns:
    --------
    DataFrame with one row per successfully predicted driver, sorted by
    predicted position, with a 1-based ``predicted_rank`` column. If no driver
    could be predicted, an empty DataFrame with the same columns is returned.

    Raises:
    -------
    Whatever ``le_track_type.transform`` / ``le_wet_dry.transform`` raise for an
    unknown ``track_type`` / ``weather`` (ValueError for scikit-learn's
    LabelEncoder). Errors that concern a single driver are reported and
    that driver is skipped.
    """
    driver_stat_cols = [
        'avg_finish', 'finish_std', 'race_count', 'avg_grid', 'grid_std',
        'avg_pitstops', 'podium_rate', 'win_rate', 'dnf_rate',
    ]
    track_stat_cols = [
        'track_avg_finish', 'track_avg_grid', 'track_podium_rate', 'track_race_count',
    ]
    result_cols = [
        'driver', 'predicted_position', 'grid_position', 'podium_rate',
        'win_rate', 'recent_form', 'track_experience',
    ]

    print("\n" + "="*80)
    print(f"üèÅ PREDICTING: {race_name} {year}")
    print(f"üìç Track: {track_name}")
    print(f"üå¶Ô∏è  Weather: {weather.upper()}")
    print("="*80)

    # Get active drivers (from most recent data)
    if drivers_list is None:
        latest_year = df_sorted['year'].max()
        active_drivers = df_sorted[df_sorted['year'] == latest_year]['driver_name'].unique()
    else:
        active_drivers = drivers_list

    print(f"\nüèéÔ∏è  Analyzing {len(active_drivers)} drivers...")

    # Race-level encodings are the same for every driver, so compute them once
    # up front instead of once per loop iteration.
    try:
        track_enc = le_track.transform([track_name])[0]
    except (ValueError, KeyError):
        # Unknown track: fall back to encoding 0. This is an arbitrary default
        # (whichever class the encoder mapped to 0), not a "most common" track.
        print(f"‚ö†Ô∏è  Track '{track_name}' not in training data, using default")
        track_enc = 0
    # An unknown track type or weather label would invalidate every prediction,
    # so let the encoder's error surface here rather than be swallowed per driver.
    track_type_enc = le_track_type.transform([track_type])[0]
    wet_dry_enc = le_wet_dry.transform([weather])[0]

    # Historical statistics: first non-null value per driver / per (driver, track)
    driver_stats = (
        df_sorted.groupby('driver_name')
        .agg(dict.fromkeys(driver_stat_cols, 'first'))
        .reset_index()
    )
    track_stats = (
        df_sorted.groupby(['driver_name', 'track'])
        .agg(dict.fromkeys(track_stat_cols, 'first'))
        .reset_index()
    )

    predictions = []

    for driver in active_drivers:
        try:
            driver_data = driver_stats[driver_stats['driver_name'] == driver]

            if len(driver_data) == 0:
                print(f"‚ö†Ô∏è  Skipping {driver} - no historical data")
                continue

            overall = {col: driver_data[col].values[0] for col in driver_stat_cols}

            # Grid position: explicit input wins, otherwise historical average
            if grid_positions and driver in grid_positions:
                grid_pos = grid_positions[driver]
            else:
                grid_pos = overall['avg_grid']

            # Track-specific stats, falling back to the driver's overall stats
            track_data = track_stats[
                (track_stats['driver_name'] == driver) &
                (track_stats['track'] == track_name)
            ]
            if len(track_data) > 0:
                track_features = {col: track_data[col].values[0] for col in track_stat_cols}
            else:
                track_features = {
                    'track_avg_finish': overall['avg_finish'],
                    'track_avg_grid': overall['avg_grid'],
                    'track_podium_rate': overall['podium_rate'],
                    'track_race_count': 0,
                }
            track_race_count = track_features['track_race_count']

            # Recent form: the driver's last 5 rows in df_sorted
            # (assumes df_sorted is in chronological order - TODO confirm).
            recent_races = df_sorted[df_sorted['driver_name'] == driver].tail(5)
            if len(recent_races) > 0:
                recent_finishes = recent_races['finished_position']
                recent_form_5 = recent_finishes.mean()
                recent_form_3 = recent_finishes.tail(3).mean()
                recent_podiums_5 = (recent_finishes <= 3).sum()
                recent_wins_5 = (recent_finishes == 1).sum()
            else:
                recent_form_5 = overall['avg_finish']
                recent_form_3 = overall['avg_finish']
                recent_podiums_5 = 0
                recent_wins_5 = 0

            driver_enc = le_driver.transform([driver])[0]

            # Interaction features
            grid_advantage = overall['avg_grid'] / (grid_pos + 0.1)
            consistency_score = 1 / (1 + overall['finish_std'])
            performance_momentum = (
                (overall['avg_finish'] - recent_form_5) / (overall['avg_finish'] + 1)
            )

            # Feature vector; column order is fixed by feature_columns below
            features = {
                'grid_position': grid_pos,
                **overall,
                **track_features,
                'recent_form_5': recent_form_5,
                'recent_form_3': recent_form_3,
                'recent_podiums_5': recent_podiums_5,
                'recent_wins_5': recent_wins_5,
                'track_encoded': track_enc,
                'track_type_encoded': track_type_enc,
                'wet_dry_encoded': wet_dry_enc,
                'driver_encoded': driver_enc,
                'grid_advantage': grid_advantage,
                'consistency_score': consistency_score,
                'performance_momentum': performance_momentum,
                'year': year,
            }

            # Predict per driver so one driver's bad inputs cannot abort the rest
            X_pred = pd.DataFrame([features])[feature_columns]
            predicted_position = rf_model.predict(X_pred)[0]

            predictions.append({
                'driver': driver,
                'predicted_position': predicted_position,
                'grid_position': grid_pos,
                'podium_rate': overall['podium_rate'],
                'win_rate': overall['win_rate'],
                'recent_form': recent_form_5,
                'track_experience': 'Yes' if track_race_count > 0 else 'No'
            })

        except Exception as e:
            # Best effort: report the failure and carry on with the other drivers
            print(f"‚ö†Ô∏è  Error predicting for {driver}: {e}")
            continue

    # Without this guard, sorting an empty frame on a missing column raises KeyError
    if not predictions:
        print("\nNo predictions could be generated - check the driver names and inputs.")
        return pd.DataFrame(columns=result_cols + ['predicted_rank'])

    # Create results DataFrame and sort by predicted position
    results_df = pd.DataFrame(predictions).sort_values('predicted_position')
    results_df['predicted_rank'] = range(1, len(results_df) + 1)

    _print_prediction_report(results_df)

    return results_df



In [None]:
if __name__ == "__main__":

    # Starting grid for the example race, listed in grid order.
    # Keys must match the dataset's driver_name values exactly; any driver not
    # found here is predicted from their historical average grid position.
    # (The original literal listed 'Charles LECLERC' twice; duplicate keys in a
    # dict literal silently overwrite each other, so it now appears once.)
    grid = {
        'Lando NORRIS': 1,
        'Max VERSTAPPEN': 2,
        'Carlos SAINZ': 3,
        'George RUSSELL': 4,
        'Oscar PIASTRI': 5,
        'Liam LAWSON': 6,
        'Fernando ALONSO': 7,
        'Isack HADJAR': 8,
        'Charles LECLERC': 9,
        'Pierre GASLY': 10,
        'Nico HULKENBERG': 11,
        'Lance STROLL': 12,
        'Esteban OCON': 13,
        'Oliver BEARMAN': 14,
        'Franco COLAPINTO': 15,
        'Alexander ALBON': 16,
        'Kimi ANTONELLI': 17,
        'Gabriel BORTOLETO': 18,
        'Yuki TSUNODA': 19,
        'Lewis HAMILTON': 20,
    }

    # Every slot from P1 to P<n> must be assigned exactly once
    assert sorted(grid.values()) == list(range(1, len(grid) + 1)), "grid positions must be 1..N with no repeats"

    results = predict_race_results(
        race_name="Las Vegas Grand Prix",
        track_name="Las Vegas",
        year=2025,
        grid_positions=grid,
        weather='dry',
    )


üèÅ PREDICTING: Las Vegas Grand Prix 2025
üìç Track: Las Vegas
üå¶Ô∏è  Weather: DRY

üèéÔ∏è  Analyzing 22 drivers...


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Do


üèÜ PREDICTED RACE RESULTS - TOP 10
Rank   Driver                 Pred Pos   Grid     Podium%    Recent Form 
--------------------------------------------------------------------------------
ü•á      Charles LECLERC        3.92       P5       44.8     % P3.75       
ü•à      Max VERSTAPPEN         4.16       P3       72.3     % P3.60       
ü•â      George RUSSELL         4.92       P5       22.6     % P4.00       
P4     Lando NORRIS           5.10       P5       56.1     % P2.40       
P5     Lewis HAMILTON         5.12       P8       18.3     % P6.25       
P6     Oscar PIASTRI          6.08       P6       38.1     % P3.20       
P7     Andrea Kimi ANTONELLI  7.60       P16      0.0      % P4.00       
P8     Kimi ANTONELLI         7.94       P8       12.5     % P6.50       
P9     Carlos SAINZ           8.83       P7       22.4     % P11.25      
P10    Fernando ALONSO        9.91       P9       13.6     % P10.20      

üèÅ PREDICTED PODIUM (TOP 3)
ü•á 1st    | Charles LECLE

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | 