In [21]:
# Cell 1: Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from IPython.display import display, Markdown # For nice table output
import warnings # To suppress potential warnings
import joblib # For saving/loading model
import os # For path handling

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [22]:
# Cell 2: Load Data
try:
    # Ensure these paths are correct for your environment
    qualifying_df = pd.read_csv("../data/processed/updated_qualifying.csv", parse_dates=["date"])
    races_df = pd.read_csv("../data/processed/updated_races.csv", parse_dates=["date"])
    print("CSV files loaded successfully.")
    print(f"Races data includes {races_df['season'].max()} season up to round {races_df[races_df['season'] == races_df['season'].max()]['round'].max()}")

except FileNotFoundError as e:
    print(f"Error: File not found. {e}")
    print("Please ensure 'qualifying.csv' and your races CSV are in the correct directory.")
    raise
except Exception as e:
    print(f"An error occurred during file loading: {e}")
    raise

# --- Basic Data Cleaning (Applied to loaded DataFrames) ---
print("Applying basic data cleaning...")
races_df['points'] = pd.to_numeric(races_df['points'], errors='coerce').fillna(0)
races_df['position'] = pd.to_numeric(races_df['position'], errors='coerce')
races_df['grid'] = pd.to_numeric(races_df['grid'], errors='coerce')

def clean_driver_name(name):
    if isinstance(name, str):
        name = name.replace(' Jr.', '').replace('Hülkenberg', 'Hulkenberg').replace('Perez', 'Pérez').replace('Raikkonen', 'Räikkönen')
        if "Antonelli" in name: # This should normalize "antonelli" and "Kimi Antonelli"
            return "Kimi Antonelli"
        # Add other known variations if necessary
    return name

races_df['driverFullName'] = races_df['driverFullName'].apply(clean_driver_name)
qualifying_df['driverFullName'] = qualifying_df['driverFullName'].apply(clean_driver_name)

# Ensure IDs are strings for consistency
id_cols = ['driverId', 'constructorId']
for col in id_cols:
    if col in races_df.columns:
        races_df[col] = races_df[col].astype(str)
    if col in qualifying_df.columns:
        qualifying_df[col] = qualifying_df[col].astype(str)
print("Basic data cleaning complete.")

CSV files loaded successfully.
Races data includes 2025 season up to round 6
Applying basic data cleaning...
Basic data cleaning complete.


In [23]:
# Cell 3: Feature Engineering Function & Initial Calculation

def calculate_features(df_races, df_qualifying):
    """
    Calculates rolling metrics and standings features on race data.
    Args:
        df_races (pd.DataFrame): DataFrame with race results.
        df_qualifying (pd.DataFrame): DataFrame with qualifying info for names.
    Returns:
        pd.DataFrame: DataFrame with added features.
    """
    print("Calculating features...")
    df_races_processed = df_races.sort_values(by=["season", "round", "date"]).copy()

    # --- Ensure Full Names are Present (Merge if necessary) ---
    # This is primarily for the historical data being processed.
    # For future predictions, names will come from the raw_grid_list.
    if 'driverFullName' not in df_races_processed.columns or 'constructorName' not in df_races_processed.columns:
        print("Full names not in races_df, attempting to merge from qualifying_df...")
        latest_qual_names = df_qualifying.sort_values(
            by="date", ascending=False
        ).drop_duplicates(subset=["driverId", "constructorId"])
        name_map_df = latest_qual_names[[
            "driverId", "constructorId", "driverFullName", "constructorName"
        ]].copy()

        driver_id_to_name = name_map_df.drop_duplicates(subset="driverId").set_index("driverId")["driverFullName"]
        constructor_id_to_name = name_map_df.drop_duplicates(subset="constructorId").set_index("constructorId")["constructorName"]

        df_races_processed["driverFullName"] = df_races_processed["driverId"].map(driver_id_to_name)
        df_races_processed["constructorName"] = df_races_processed["constructorId"].map(constructor_id_to_name)

        df_races_processed["driverFullName"].fillna(df_races_processed["driverId"], inplace=True)
        df_races_processed["constructorName"].fillna(df_races_processed["constructorId"], inplace=True)
        print("Names merged/filled in races_df.")
    else:
        print("Full names already present in races_df for feature calculation.")

    # --- Target Variable ---
    df_races_processed["is_winner"] = (df_races_processed["position"] == 1).astype(int)

    # --- Grid Handling ---
    df_races_processed["grid"] = df_races_processed["grid"].replace(0, 21).fillna(21)
    df_races_processed["grid"] = df_races_processed["grid"].astype(int)

    # --- Rolling Performance Metrics (per driver) ---
    df_races_processed = df_races_processed.sort_values(by=["driverId", "season", "round", "date"])
    rolling_features_cols = ["points", "position", "grid"]
    for feature_col in rolling_features_cols:
        # Calculate rolling mean
        roll_mean = df_races_processed.groupby("driverId")[feature_col].rolling(window=5, min_periods=1).mean()
        # Drop the driverId level created by groupby, shift, and assign back
        df_races_processed[f"avg_{feature_col}_last_5"] = roll_mean.reset_index(level=0, drop=True).groupby(level=0).shift(1)


    # --- Championship Standings (Points Before Race) ---
    df_races_processed["season_points"] = df_races_processed.groupby(["season", "driverId"])["points"].cumsum()
    df_races_processed["points_standings_prev_race"] = df_races_processed.groupby(["season", "driverId"])["season_points"].shift(1)

    # --- Handle NaNs created by shift/rolling ---
    df_races_processed["avg_points_last_5"].fillna(0, inplace=True)
    df_races_processed["avg_position_last_5"].fillna(21, inplace=True)
    df_races_processed["avg_grid_last_5"].fillna(21, inplace=True)
    df_races_processed["points_standings_prev_race"].fillna(0, inplace=True)

    df_races_processed = df_races_processed.drop(columns=["season_points"], errors='ignore')
    print("Features calculated.")
    return df_races_processed.sort_values(by=["season", "round", "date"])

# --- Execute Feature Engineering on Initial Data ---
# Pass copies to avoid modifying original DataFrames in memory if re-running cells
data_df_featured = calculate_features(races_df.copy(), qualifying_df.copy())

# --- Create Name -> ID maps (for prepare_grid_for_prediction) ---
# These are built from the *final* data_df_featured which should have your manually cleaned names
# and all driver/constructor IDs present in your historical data.
latest_driver_entries = data_df_featured.drop_duplicates(subset="driverFullName", keep="last")
latest_driver_name_to_id_map = latest_driver_entries.set_index("driverFullName")["driverId"].to_dict()

latest_constructor_entries = data_df_featured.drop_duplicates(subset="constructorName", keep="last")
latest_constructor_name_to_id_map = latest_constructor_entries.set_index("constructorName")["constructorId"].to_dict()

print("\nName -> ID maps created from processed data for prediction input handling.")
print("Sample of featured data:")
print(data_df_featured[[
    "season", "round", "driverFullName", "constructorName", "grid", "position",
    "avg_points_last_5", "points_standings_prev_race", "is_winner"
]].tail())

Calculating features...
Full names already present in races_df for feature calculation.
Features calculated.

Name -> ID maps created from processed data for prediction input handling.
Sample of featured data:
      season  round  driverFullName               constructorName  grid  \
2239    2025      6   Oscar Piastri              McLaren-Mercedes     4   
2241    2025      6  George Russell                      Mercedes     5   
2247    2025      6    Carlos Sainz             Williams-Mercedes     6   
2254    2025      6    Lance Stroll  Aston Martin Aramco-Mercedes    18   
2248    2025      6    Yuki Tsunoda    Red Bull Racing-Honda RBPT    10   

      position  avg_points_last_5  points_standings_prev_race  is_winner  
2239         1                0.0                        92.0          1  
2241         3                0.0                        68.0          0  
2247         9                0.0                         5.0          0  
2254        16                0.0      

In [24]:
# Cell 4: Model Definition and Preprocessing Setup

# Define features including the new ones
features = [
    "grid",
    "circuitId",
    "driverId",
    "constructorId",
    "avg_points_last_5",
    "avg_position_last_5",
    "avg_grid_last_5",
    "points_standings_prev_race",
]
target = "is_winner"

# Define numerical and categorical features FOR THE PREPROCESSOR
numerical_features = [
    "grid",
    "avg_points_last_5",
    "avg_position_last_5",
    "avg_grid_last_5",
    "points_standings_prev_race",
]
categorical_features = ["circuitId", "driverId", "constructorId"]

# Create preprocessing pipelines
numerical_transformer = SimpleImputer(strategy="median")
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "onehot",
        OneHotEncoder(
            handle_unknown="ignore", sparse_output=False
        ),
    ),
])

# Create the preprocessor object
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

print("Preprocessor configured with new features.")
print("Features for model:", features)

Preprocessor configured with new features.
Features for model: ['grid', 'circuitId', 'driverId', 'constructorId', 'avg_points_last_5', 'avg_position_last_5', 'avg_grid_last_5', 'points_standings_prev_race']


In [25]:
# Cell 5: Model Training (with Model Saving)

# Select features and target from the data with calculated features
X = data_df_featured[features]
y = data_df_featured[target]

# Create the full model pipeline
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    (
        "classifier",
        RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            class_weight="balanced",
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=3,
            n_jobs=-1,
        ),
    ),
])

print("Training the RandomForestClassifier model with new features...")
if "X" in locals() and "y" in locals() and not X.empty and not y.empty:
    model_pipeline.fit(X, y)
    print("Model training complete.")

    # --- SAVE THE TRAINED MODEL ---
    model_filename = "f1_winner_predictor_model.joblib"
    try:
        joblib.dump(model_pipeline, model_filename)
        print(f"Trained model saved to {model_filename}")
    except Exception as e:
        print(f"Error saving model: {e}")
    # -----------------------------
else:
    print(
        "Error: Feature data (X) or target data (y) not found or empty."
        " Please run the feature engineering cell (Cell 3) successfully."
    )

Training the RandomForestClassifier model with new features...
Model training complete.
Trained model saved to f1_winner_predictor_model.joblib


In [26]:
# Cell 6: Reusable Grid Preparation Function

# Define known team name changes/mappings for future seasons if needed
# Map NEW team name (key) to the constructorId used in TRAINING data (value)
TEAM_REBRAND_MAP = {
    "Red Bull Racing Honda RBPT": "red_bull",
    
    "McLaren Mercedes": "mclaren",
    "McLaren-Mercedes": "mclaren", # Added variation
    
    "Ferrari": "ferrari",
    
    "Mercedes": "mercedes",
    
    "Racing Bulls Honda RBPT": "rb",
    "Racing Bulls-Honda RBPT": "rb", # Added variation
    
    "Williams Mercedes": "williams",
    "Williams-Mercedes": "williams", # Added variation
    
    "Haas Ferrari": "haas",
    "Haas-Ferrari": "haas", # Added variation
    
    "Alpine Renault": "alpine",
    "Alpine-Renault": "alpine", # Added variation
    
    "Aston Martin Aramco Mercedes": "aston_martin",
    "Aston Martin Aramco-Mercedes": "aston_martin", # Added variation
    
    "Kick Sauber Ferrari": "sauber",
    "Kick Sauber-Ferrari": "sauber", # Added variation
    
    
    # Historical for completeness
    "AlphaTauri": "alphatauri",
    "Racing Point": "racing_point",
    "Alfa Romeo": "alfa",
    "Renault": "renault",
    "RB F1 Team": "rb",
    "Sauber": "sauber",
}

def prepare_grid_for_prediction(
    raw_grid_list, # List of dictionaries with driver and team names
    driver_name_to_id_map_hist, # Historical Name -> ID map
    constructor_name_to_id_map_hist, # Historical Name -> ID map
    team_rebrand_map_current # Current Season Team Name -> Historical ID map
):
    """
    Processes a raw grid list.
    It expects 'driverFullName' and 'constructorName' in raw_grid_list.
    It will try to find existing 'driverId' and 'constructorId' using the maps.
    If not found, it creates placeholder IDs.
    """
    prepared_grid = []
    print("Preparing grid for prediction...")

    for entry in raw_grid_list:
        driver_name = entry["driver"] # Expects 'driver' key for full name
        team_name = entry["team"]     # Expects 'team' key for current full team name
        grid_pos = entry["grid"]      # Expects 'grid' key for grid position

        # --- Determine driverId ---
        driver_id = driver_name_to_id_map_hist.get(driver_name)
        if driver_id is None:
            driver_id = f"new_driver_{driver_name.lower().replace(' ', '_')}"
            print(f"Note: Using placeholder ID for new/unmapped driver: {driver_name} -> {driver_id}")

        # --- Determine constructorId for the model ---
        
        # 1. Try current season rebrand map
        constructor_id_for_model = team_rebrand_map_current.get(team_name)
        
        # 2. If not in rebrand, try historical name map (in case an old name is used)
        if constructor_id_for_model is None:
            constructor_id_for_model = constructor_name_to_id_map_hist.get(team_name)
            
        # 3. If still not found, it's a truly new/unmapped team for the model
        if constructor_id_for_model is None:
            constructor_id_for_model = f"new_team_{team_name.lower().replace(' ', '_')}"
            print(f"Note: Using placeholder ID for new/unmapped team: {team_name} -> {constructor_id_for_model}")

        prepared_grid.append({
            "driverId": str(driver_id),
            "constructorId": str(constructor_id_for_model),
            "grid": grid_pos,
            "driverFullName": driver_name, # This is the name for display
            "constructorName": team_name, # This is the team name for display
        })
    print("Grid preparation complete.")
    return prepared_grid

print("Grid preparation function defined.")

Grid preparation function defined.


In [27]:
# Cell 7: Prediction Function Definition

def predict_race_winner_probabilities(
    circuit_id, # Informational
    predict_rows_featured,
    model,
    model_features_list,
    driver_detail_map # Map of {model_driver_id: {'FullName': ..., 'ConstructorName': ..., 'Grid': ...}}
    ):
    """
    Predicts the win probability for each driver in a given grid DataFrame.
    Accepts pre-calculated features and a driver detail map for display names.
    """
    if predict_rows_featured.empty:
        print("Error: predict_rows_featured DataFrame is empty.")
        return {}
    if not model:
        print("Error: Model is not provided or not trained.")
        return {}
    if not driver_detail_map:
        print("Error: driver_detail_map not provided.")
        return {}

    required_cols = model_features_list + ["driverId"] # 'grid' is in model_features_list
    if not all(col in predict_rows_featured.columns for col in required_cols):
        missing = [col for col in required_cols if col not in predict_rows_featured.columns]
        print(f"Error: predict_rows_featured DataFrame is missing required columns: {missing}")
        return {}

    try:
        predict_X = predict_rows_featured[model_features_list]
        probabilities = model.predict_proba(predict_X)
        win_probabilities = probabilities[:, 1]

        total_prob = np.sum(win_probabilities)
        if total_prob > 0:
            normalized_probs = win_probabilities / total_prob
        else:
            print("Warning: Model predicted zero probability for all drivers. Assigning equal probability.")
            normalized_probs = np.ones(len(predict_X)) / len(predict_X)

        results = {}
        for i, index in enumerate(predict_rows_featured.index):
            model_driver_id = predict_rows_featured.loc[index, "driverId"]
            details = driver_detail_map.get(model_driver_id)

            if details:
                 results[model_driver_id] = {
                    "Probability": normalized_probs[i],
                    "DriverFullName": details["FullName"],
                    "ConstructorName": details["ConstructorName"],
                    "Grid": details["Grid"],
                }
            else:
                print(f"Warning: Could not find display details for driverId '{model_driver_id}'. Using ID as name.")
                results[model_driver_id] = {
                    "Probability": normalized_probs[i],
                    "DriverFullName": model_driver_id,
                    "ConstructorName": predict_rows_featured.loc[index, "constructorId"] if "constructorId" in predict_rows_featured.columns else "Unknown",
                    "Grid": predict_rows_featured.loc[index, "grid"],
                }

        return dict(sorted(results.items(), key=lambda item: item[1]["Probability"], reverse=True))

    except Exception as e:
        print(f"An error occurred during prediction: {e}")
        import traceback
        traceback.print_exc()
        return {}

print("Prediction function updated.")

Prediction function updated.


In [28]:
# Cell 8: Wrapper Function for Prediction and Display (Corrected Arguments)

import os
import joblib # For loading model if you choose to load it inside this wrapper
from IPython.display import display, Markdown

def predict_and_display_results(
    circuit_id,
    future_season,
    future_round,
    raw_grid_list,
    model, # Expecting the trained model pipeline to be passed
    base_races_df,
    base_qualifying_df,
    # Corrected argument names to match what prepare_grid_for_prediction expects
    driver_name_to_id_hist_map,
    constructor_name_to_id_hist_map,
    team_rebrand_map_current,
    model_features_list, # The list of features the model was trained on
    race_description="Future Race",
    save_path="predictions" # Folder to save CSV predictions
):
    """
    Orchestrates prediction: prepares grid, adds it to history, recalculates features,
    runs prediction using the provided model, displays results, and saves results to CSV.
    """
    # --- Create save directory if it doesn't exist ---
    if save_path and not os.path.exists(save_path):
        try:
            os.makedirs(save_path)
            print(f"Created directory: {save_path}")
        except OSError as e:
            print(f"Error creating directory {save_path}: {e}")
            save_path = None # Disable saving if directory creation fails
    # -------------------------------------------------

    print(f"--- Predicting for: {race_description} ({circuit_id}) ---")

    print("Preparing future grid data (mapping names to IDs)...")
    # Pass the correctly named map arguments here
    prepared_grid_list_with_names = prepare_grid_for_prediction(
        raw_grid_list,
        driver_name_to_id_hist_map,
        constructor_name_to_id_hist_map,
        team_rebrand_map_current
    )
    if not prepared_grid_list_with_names:
        print("Grid preparation failed. Cannot predict.")
        return

    # Create the definitive map for display names
    driver_detail_map_for_display = {
        item["driverId"]: {
            "FullName": item["driverFullName"],
            "ConstructorName": item["constructorName"],
            "Grid": item["grid"]
        } for item in prepared_grid_list_with_names
    }

    print("Creating temporary DataFrame for feature calculation...")
    future_race_df_for_features = pd.DataFrame(prepared_grid_list_with_names)[
        ['driverId', 'constructorId', 'grid'] # Use model IDs here
    ].copy()
    future_race_df_for_features["season"] = future_season
    future_race_df_for_features["round"] = future_round
    future_race_df_for_features["circuitId"] = circuit_id
    future_race_df_for_features["date"] = pd.Timestamp.now() # Placeholder
    future_race_df_for_features["position"] = np.nan
    future_race_df_for_features["points"] = 0.0

    print("Combining with historical data...")
    required_base_cols = ['season', 'round', 'date', 'driverId', 'constructorId', 'circuitId', 'grid', 'position', 'points']
    if not all(col in base_races_df.columns for col in required_base_cols):
        print(f"Error: base_races_df is missing required columns. Needed: {required_base_cols}")
        return
    combined_df = pd.concat([base_races_df[required_base_cols], future_race_df_for_features], ignore_index=True)

    print("Recalculating features on combined data...")
    combined_featured_df = calculate_features(combined_df, base_qualifying_df) # base_qualifying_df is used for name mapping within calculate_features if needed

    print("Isolating prediction rows...")
    predict_rows_featured = combined_featured_df[
        (combined_featured_df["season"] == future_season) &
        (combined_featured_df["round"] == future_round)
    ].copy()

    if predict_rows_featured.empty:
        print("Error: Could not find rows for the future race after feature calculation.")
        return

    model_ready = model is not None and hasattr(model, 'predict_proba') and hasattr(model, 'steps')

    if model_ready:
        print(f"\nRunning prediction for {race_description}...")
        winner_probs_dict = predict_race_winner_probabilities(
            circuit_id,
            predict_rows_featured,
            model,
            model_features_list,
            driver_detail_map_for_display
        )

        print(f"\nPredicted Win Probabilities ({race_description}):")
        if winner_probs_dict:
            results_list = [
                {
                    "Driver": details["DriverFullName"],
                    "Grid": details["Grid"],
                    "Team": details["ConstructorName"],
                    "Probability_Num": details["Probability"],
                }
                for _, details in winner_probs_dict.items()
            ]
            results_df = pd.DataFrame(results_list)

            if save_path:
                safe_filename = "".join(c if c.isalnum() else "_" for c in race_description)
                csv_filename = os.path.join(save_path, f"{future_season}_R{future_round:02d}_{safe_filename}_{circuit_id}_predictions.csv")
                try:
                    results_df.to_csv(csv_filename, index=False, float_format='%.6f')
                    print(f"Predictions saved to: {csv_filename}")
                except Exception as e:
                    print(f"Error saving predictions to CSV: {e}")

            display_df = results_df.copy()
            display_df["Probability"] = display_df["Probability_Num"].map("{:.2%}".format)
            display_df = display_df.drop(columns=["Probability_Num"])

            markdown_table = "| Driver             | Grid | Team                           | Probability |\n"
            markdown_table += "|--------------------|------|--------------------------------|-------------|\n"
            for _, row in display_df.iterrows():
                 markdown_table += (
                    f"| {row['Driver']:<18} | {row['Grid']:<4} |"
                    f" {row['Team']:<30} | {row['Probability']:>11} |\n"
                )
            display(Markdown(markdown_table))
        else:
            print("Prediction failed or returned no results.")
    else:
         print("Model was not provided or does not appear to be a valid trained model.")

print("Prediction and display wrapper function arguments corrected and defined.")

Prediction and display wrapper function arguments corrected and defined.


In [29]:
# Cell 9: Define Grid for 2025 Emilia Romagna Grand Prix

imola_2025_raw_grid = [
    # McLaren Mercedes
    {'driver': 'Lando Norris',       'team': 'McLaren Mercedes',              'grid': 4},
    {'driver': 'Oscar Piastri',      'team': 'McLaren Mercedes',              'grid': 1},
    
    # Red Bull Racing Honda RBPT
    {'driver': 'Max Verstappen',     'team': 'Red Bull Racing Honda RBPT',    'grid': 2},
    {'driver': 'Yuki Tsunoda',        'team': 'Red Bull Racing Honda RBPT',    'grid': 20},
    
    # Ferrari
    {'driver': 'Charles Leclerc',    'team': 'Ferrari',                       'grid': 11},
    {'driver': 'Lewis Hamilton',     'team': 'Ferrari',                       'grid': 12},
    
    # Mercedes
    {'driver': 'George Russell',     'team': 'Mercedes',                      'grid': 3},
    {'driver': 'Kimi Antonelli',     'team': 'Mercedes',                      'grid': 13}, # New Driver
    
    # Alpine Renault
    {'driver': 'Pierre Gasly',       'team': 'Alpine Renault',                'grid': 10},
    {'driver': 'Franco Colapinto',   'team': 'Alpine Renault',                'grid': 16}, # New Driver
    
    # Williams Mercedes
    {'driver': 'Alexander Albon',    'team': 'Williams Mercedes',             'grid': 7},
    {'driver': 'Carlos Sainz',       'team': 'Williams Mercedes',             'grid': 6},
    
    # Racing Bulls Honda RBPT
    {'driver': 'Liam Lawson',       'team': 'Racing Bulls Honda RBPT',       'grid': 15},
    {'driver': 'Isack Hadjar',       'team': 'Racing Bulls Honda RBPT',       'grid': 9}, # New Driver
    
    # Aston Martin Aramco Mercedes
    {'driver': 'Fernando Alonso',    'team': 'Aston Martin Aramco Mercedes', 'grid': 5},
    {'driver': 'Lance Stroll',       'team': 'Aston Martin Aramco Mercedes', 'grid': 8}, 
    
    # Kick Sauber Ferrari
    {'driver': 'Gabriel Bortoleto',  'team': 'Kick Sauber Ferrari',           'grid': 14}, # New Driver
    {'driver': 'Nico Hulkenberg',    'team': 'Kick Sauber Ferrari',           'grid': 17},
    
    # Haas Ferrari
    {'driver': 'Esteban Ocon',       'team': 'Haas Ferrari',                  'grid': 18},
    {'driver': 'Oliver Bearman',     'team': 'Haas Ferrari',                  'grid': 19},
]

future_race_circuit_imola = "imola"
future_season_imola = 2025
future_round_imola = 7
race_description_imola = "2025 Emilia Romagna Grand Prix"

print(f"Grid defined for {race_description_imola}")

Grid defined for 2025 Emilia Romagna Grand Prix


In [30]:
# Cell 10: Execute Prediction for 2025 Emilia Romagna Grand Prix

# --- Load the Saved Model ---
model_filename = "f1_winner_predictor_model.joblib" # Ensure this path is correct
loaded_model = None
try:
    loaded_model = joblib.load(model_filename)
    print(f"Loaded trained model from {model_filename}")
except FileNotFoundError:
    print(f"Error: Saved model file '{model_filename}' not found. Please train the model first (run Cell 5).")
    # Optionally, you could trigger training here if the model isn't found,
    # but for now, we'll assume it should exist.
except Exception as e:
    print(f"Error loading model: {e}")
# --------------------------

# --- Execute Prediction ---
# Check if model loaded successfully before proceeding
if loaded_model:
    predict_and_display_results(
        circuit_id=future_race_circuit_imola,
        future_season=future_season_imola,
        future_round=future_round_imola,
        raw_grid_list=imola_2025_raw_grid, # Defined in Cell 9
        model=loaded_model, # Use the loaded model
        base_races_df=races_df, # Original races_df from Cell 2 (with 2025 Bahrain data)
        base_qualifying_df=qualifying_df, # Original qualifying_df from Cell 2
        driver_name_to_id_hist_map=latest_driver_name_to_id_map, # From Cell 3
        constructor_name_to_id_hist_map=latest_constructor_name_to_id_map, # From Cell 3
        team_rebrand_map_current=TEAM_REBRAND_MAP, # From Cell 6
        model_features_list=features, # Defined in Cell 4
        race_description=race_description_imola, # Defined in Cell 9
        save_path="predictions" # Specify folder to save CSV
    )
else:
    print("Prediction skipped because the model could not be loaded.")

Loaded trained model from f1_winner_predictor_model.joblib
--- Predicting for: 2025 Emilia Romagna Grand Prix (imola) ---
Preparing future grid data (mapping names to IDs)...
Preparing grid for prediction...
Grid preparation complete.
Creating temporary DataFrame for feature calculation...
Combining with historical data...
Recalculating features on combined data...
Calculating features...
Full names not in races_df, attempting to merge from qualifying_df...
Names merged/filled in races_df.
Features calculated.
Isolating prediction rows...

Running prediction for 2025 Emilia Romagna Grand Prix...

Predicted Win Probabilities (2025 Emilia Romagna Grand Prix):
Predictions saved to: predictions/2025_R07_2025_Emilia_Romagna_Grand_Prix_imola_predictions.csv


| Driver             | Grid | Team                           | Probability |
|--------------------|------|--------------------------------|-------------|
| Max Verstappen     | 2    | Red Bull Racing Honda RBPT     |      28.31% |
| Oscar Piastri      | 1    | McLaren Mercedes               |      24.38% |
| George Russell     | 3    | Mercedes                       |      15.27% |
| Lando Norris       | 4    | McLaren Mercedes               |       9.47% |
| Yuki Tsunoda       | 20   | Red Bull Racing Honda RBPT     |       5.79% |
| Lewis Hamilton     | 12   | Ferrari                        |       4.73% |
| Kimi Antonelli     | 13   | Mercedes                       |       3.55% |
| Charles Leclerc    | 11   | Ferrari                        |       2.57% |
| Esteban Ocon       | 18   | Haas Ferrari                   |       1.06% |
| Isack Hadjar       | 9    | Racing Bulls Honda RBPT        |       0.70% |
| Pierre Gasly       | 10   | Alpine Renault                 |       0.60% |
| Liam Lawson        | 15   | Racing Bulls Honda RBPT        |       0.60% |
| Franco Colapinto   | 16   | Alpine Renault                 |       0.53% |
| Gabriel Bortoleto  | 14   | Kick Sauber Ferrari            |       0.49% |
| Carlos Sainz       | 6    | Williams Mercedes              |       0.47% |
| Fernando Alonso    | 5    | Aston Martin Aramco Mercedes   |       0.41% |
| Nico Hulkenberg    | 17   | Kick Sauber Ferrari            |       0.37% |
| Oliver Bearman     | 19   | Haas Ferrari                   |       0.32% |
| Alexander Albon    | 7    | Williams Mercedes              |       0.19% |
| Lance Stroll       | 8    | Aston Martin Aramco Mercedes   |       0.18% |
