In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# For nice table output
from IPython.display import display, Markdown

# To suppress potential warnings during prediction if needed
import warnings 

# Optional: suppress some sklearn warnings
warnings.filterwarnings("ignore", category=UserWarning) 

# Optional: suppress some pandas warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

In [2]:
# --- Data Loading ---
try:
    # Parse dates for correct chronological sorting
    qualifying_df = pd.read_csv("../data/processed/updated_qualifying.csv", parse_dates=["date"])
    
    # Use the updated races CSV you provided
    races_df = pd.read_csv("../data/processed/updated_races.csv", parse_dates=["date"])
    
    # Check if the CSV files are loaded correctly
    print("CSV files loaded successfully.")
    print(f"Races data includes {races_df['season'].max()} season up to round {races_df[races_df['season'] == races_df['season'].max()]['round'].max()}")

except FileNotFoundError as e:
    print(f"Error: File not found. {e}")
    print("Please ensure 'qualifying.csv' and your updated races CSV are in the correct directory.")
    raise
except Exception as e:
    print(f"An error occurred during file loading: {e}")
    raise

# --- Basic Data Cleaning ---

# Ensure correct data types where needed
races_df['points'] = pd.to_numeric(races_df['points'], errors='coerce').fillna(0)
races_df['position'] = pd.to_numeric(races_df['position'], errors='coerce')
races_df['grid'] = pd.to_numeric(races_df['grid'], errors='coerce')

# Clean driver names (remove Jr., handle Hulkenberg) - apply consistently
def clean_driver_name(name):
    if isinstance(name, str):
        name = name.replace(' Jr.', '').replace('Hülkenberg', 'Hulkenberg')
        # Add other known variations if necessary
    return name

races_df['driverFullName'] = races_df['driverFullName'].apply(clean_driver_name)
qualifying_df['driverFullName'] = qualifying_df['driverFullName'].apply(clean_driver_name)

# Handle Perez name variation
races_df['driverFullName'] = races_df['driverFullName'].replace('Sergio Perez', 'Sergio Pérez')
qualifying_df['driverFullName'] = qualifying_df['driverFullName'].replace('Sergio Perez', 'Sergio Pérez')

# Ensure IDs are strings
races_df['driverId'] = races_df['driverId'].astype(str)
races_df['constructorId'] = races_df['constructorId'].astype(str)
qualifying_df['driverId'] = qualifying_df['driverId'].astype(str)
qualifying_df['constructorId'] = qualifying_df['constructorId'].astype(str)

CSV files loaded successfully.
Races data includes 2025 season up to round 4


In [3]:
# --- Feature Engineering Function ---

def calculate_features(df_races, df_qualifying):
    """
    Calculates rolling metrics and standings features on race data.

    Args:
        df_races (pd.DataFrame): DataFrame with race results.
        df_qualifying (pd.DataFrame): DataFrame with qualifying info for names.

    Returns:
        pd.DataFrame: DataFrame with added features.
    """
    print("Calculating features...")
    
    # Ensure chronological order
    df_races = df_races.sort_values(by=["season", "round", "date"]).copy()

    # --- Merge Full Names (using latest known name for each ID) ---
    latest_qual_names = df_qualifying.sort_values(by="date", ascending=False).drop_duplicates(subset=["driverId", "constructorId"])
    name_map = latest_qual_names[["driverId", "constructorId", "driverFullName", "constructorName"]].copy()

    # Create unique driver and constructor name maps based on latest data
    driver_id_to_name = name_map.drop_duplicates(subset="driverId").set_index("driverId")["driverFullName"]
    constructor_id_to_name = name_map.drop_duplicates(subset="constructorId").set_index("constructorId")["constructorName"]

    df_races["driverFullName"] = df_races["driverId"].map(driver_id_to_name).fillna(df_races["driverId"])
    df_races["constructorName"] = df_races["constructorId"].map(constructor_id_to_name).fillna(df_races["constructorId"])

    # --- Target Variable ---
    df_races["is_winner"] = (df_races["position"] == 1).astype(int)

    # --- Grid Handling ---
    df_races["grid"] = df_races["grid"].replace(0, 21).fillna(21) # Pit lane/missing -> last
    df_races["grid"] = df_races["grid"].astype(int)

    # --- Rolling Performance Metrics (per driver) ---
    # Sort is crucial before grouping and shifting
    df_races = df_races.sort_values(by=["driverId", "season", "round", "date"])

    # Calculate rolling averages over last 5 races *before* the current one
    rolling_features = ["points", "position", "grid"]
    for feature in rolling_features:
        # Calculate rolling mean
        roll_mean = df_races.groupby("driverId")[feature].rolling(window=5, min_periods=1).mean()
        # Drop the driverId level created by groupby, shift, and assign back
        df_races[f"avg_{feature}_last_5"] = roll_mean.reset_index(level=0, drop=True).groupby(level=0).shift(1)

    # --- Championship Standings (Points Before Race) ---
    # Calculate cumulative points within each season for each driver
    df_races["season_points"] = df_races.groupby(["season", "driverId"])["points"].cumsum()
    # Get the points total *before* the current race
    df_races["points_standings_prev_race"] = df_races.groupby(["season", "driverId"])["season_points"].shift(1)

    # --- Handle NaNs created by shift/rolling ---
    # First race for a driver will have NaN rolling/standings features
    df_races["avg_points_last_5"].fillna(0, inplace=True)
    df_races["avg_position_last_5"].fillna(21, inplace=True) # Fill with "last place" avg
    df_races["avg_grid_last_5"].fillna(21, inplace=True) # Fill with "last place" avg
    df_races["points_standings_prev_race"].fillna(0, inplace=True)

    # Drop intermediate columns if desired
    df_races = df_races.drop(columns=["season_points"])

    print("Features calculated.")
    return df_races.sort_values(by=["season", "round", "date"]) # Resort chronologically

# --- Execute Feature Engineering on Initial Data ---
data_df_featured = calculate_features(races_df, qualifying_df)

# --- Create Name Mappings (needed for preparing prediction input later) ---
latest_driver_entries = data_df_featured.sort_values(
    by=["season", "round"], ascending=False
).drop_duplicates(subset="driverFullName", keep="first")
latest_driver_name_to_id_map = latest_driver_entries.set_index(
    "driverFullName"
)["driverId"].to_dict()

latest_constructor_entries = data_df_featured.sort_values(
    by=["season", "round"], ascending=False
).drop_duplicates(subset="constructorName", keep="first")
latest_constructor_name_to_id_map = latest_constructor_entries.set_index(
    "constructorName"
)["constructorId"].to_dict()

print("\nName -> ID maps created.")
print("Sample of featured data:")
print(data_df_featured[[
    "season", "round", "driverFullName", "grid", "position", "points",
    "avg_points_last_5", "avg_position_last_5", "points_standings_prev_race", "is_winner"
]].tail())

Calculating features...
Features calculated.

Name -> ID maps created.
Sample of featured data:
      season  round  driverFullName  grid  position  points  \
2199    2025      4   Oscar Piastri     1         1    25.0   
2200    2025      4  George Russell     3         2    18.0   
2217    2025      4    Carlos Sainz     8        19     0.0   
2215    2025      4    Lance Stroll    19        17     0.0   
2207    2025      4    Yuki Tsunoda    10         9     2.0   

      avg_points_last_5  avg_position_last_5  points_standings_prev_race  \
2199                0.0                 21.0                        42.0   
2200                0.0                 21.0                        40.0   
2217                0.0                 21.0                         1.0   
2215                0.0                 21.0                        10.0   
2207                0.0                 21.0                         0.0   

      is_winner  
2199          1  
2200          0  
2217          

In [4]:
# --- Model Definition & Preprocessing Setup ---

# Define features including the new ones
features = [
    "grid",
    "circuitId",
    "driverId",
    "constructorId",
    "avg_points_last_5",
    "avg_position_last_5",
    "avg_grid_last_5",
    "points_standings_prev_race",
]
target = "is_winner"

# Define numerical and categorical features FOR THE PREPROCESSOR
numerical_features = [
    "grid",
    "avg_points_last_5",
    "avg_position_last_5",
    "avg_grid_last_5",
    "points_standings_prev_race",
]
categorical_features = ["circuitId", "driverId", "constructorId"]

# Create preprocessing pipelines
numerical_transformer = SimpleImputer(strategy="median") # Median is robust to outliers
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "onehot",
        OneHotEncoder(
            handle_unknown="ignore", sparse_output=False
        ),
    ),
])

# Create the preprocessor object
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

print("Preprocessor configured with new features.")
print("Features for model:", features)

Preprocessor configured with new features.
Features for model: ['grid', 'circuitId', 'driverId', 'constructorId', 'avg_points_last_5', 'avg_position_last_5', 'avg_grid_last_5', 'points_standings_prev_race']


In [5]:
# Cell 5: Model Training (with Model Saving)

import joblib # Make sure joblib is imported

# Select features and target from the data with calculated features
X = data_df_featured[features]
y = data_df_featured[target]

# Create the full model pipeline
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    (
        "classifier",
        RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            class_weight="balanced",
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=3,
            n_jobs=-1,
        ),
    ),
])

print("Training the RandomForestClassifier model with new features...")
if "X" in locals() and "y" in locals():
    model_pipeline.fit(X, y)
    print("Model training complete.")

    # --- SAVE THE TRAINED MODEL ---
    model_filename = "./joblogs/f1_winner_predictor_model.joblib"
    try:
        joblib.dump(model_pipeline, model_filename)
        print(f"Trained model saved to {model_filename}")
    except Exception as e:
        print(f"Error saving model: {e}")
    # -----------------------------

else:
    print(
        "Error: Feature data (X) or target data (y) not found."
        " Please run the feature engineering cell (Cell 3)."
    )



Training the RandomForestClassifier model with new features...
Model training complete.
Trained model saved to ./joblogs/f1_winner_predictor_model.joblib


In [6]:
# --- Reusable Grid Preparation Function ---

# Define known team name changes/mappings for future seasons if needed
# Map NEW team name (key) to the constructorId used in TRAINING data (value)
TEAM_REBRAND_MAP = {
    # 2024 -> 2025 examples (adjust based on actual final names/IDs)
    "Red Bull Racing Honda RBPT": "red_bull",
    "McLaren Mercedes": "mclaren",
    "Ferrari": "ferrari",
    "Mercedes": "mercedes",
    "Racing Bulls Honda RBPT": "rb", # Visa Cash App RB -> rb
    "Williams Mercedes": "williams",
    "Haas Ferrari": "haas",
    "Alpine Renault": "alpine",
    "Aston Martin Aramco Mercedes": "aston_martin",
    "Kick Sauber Ferrari": "sauber", # Stake F1 Team Kick Sauber -> sauber
    # Add mappings for other historical rebrands if necessary for consistency
    "AlphaTauri": "alphatauri", # Ensure historical names map too
    "Racing Point": "racing_point",
    "Alfa Romeo": "alfa",
    "Renault": "renault",
    "RB F1 Team": "rb", # 2024 name
    "Sauber": "sauber", # 2024 name
}


def prepare_grid_for_prediction(
    raw_grid_list, driver_map, constructor_map, team_rebrand_map
):
    """
    Processes a raw grid list, mapping names to historical IDs and handling new entries.

    Args:
        raw_grid_list (list): List of dicts with 'driver', 'team', 'grid'.
        driver_map (dict): Map of Driver Full Name -> Historical driverId.
        constructor_map (dict): Map of Historical Constructor Name -> Historical constructorId.
        team_rebrand_map (dict): Map of New Team Name -> Historical constructorId.

    Returns:
        list: List of dicts formatted for the prediction function,
              or None if input is invalid.
    """
    prepared_grid = []
    unknown_drivers = set()
    unknown_teams = set()

    if not isinstance(raw_grid_list, list):
        print("Error: raw_grid_list must be a list.")
        return None

    for entry in raw_grid_list:
        if not isinstance(entry, dict) or not all(
            k in entry for k in ["driver", "team", "grid"]
        ):
            print(f"Error: Invalid entry format in raw_grid_list: {entry}")
            return None

        driver_name = entry["driver"]
        team_name = entry["team"]
        grid_pos = entry["grid"]

        # --- Find driverId ---
        driver_id = driver_map.get(driver_name)
        if driver_id is None:
            # Handle common variations like Hulkenberg
            if driver_name == "Nico Hulkenberg":
                 driver_id = driver_map.get("Nico Hülkenberg") # Check common variation
            if driver_id is None:
                driver_id = f"new_driver_{driver_name.lower().replace(' ', '_')}"
                unknown_drivers.add(driver_name)

        # --- Find constructorId ---
        # 1. Check direct rebrand map first
        constructor_id = team_rebrand_map.get(team_name)
        # 2. If not in rebrand map, check historical name map
        if constructor_id is None:
            constructor_id = constructor_map.get(team_name)
        # 3. If still not found, it's a truly new/unmapped team
        if constructor_id is None:
            constructor_id = f"new_team_{team_name.lower().replace(' ', '_')}"
            unknown_teams.add(team_name)

        prepared_grid.append({
            "driverId": driver_id, # ID for the model
            "constructorId": constructor_id, # ID for the model
            "grid": grid_pos,
            "driverFullName": driver_name, # Name for display
            "constructorName": team_name, # Name for display
        })

    if unknown_drivers:
        print(
            f"Note: Using placeholder IDs for new/unmapped drivers:"
            f" {', '.join(sorted(list(unknown_drivers)))}."
            " Predictions less reliable."
        )
    if unknown_teams:
        print(
            f"Note: Using placeholder IDs for new/unmapped teams:"
            f" {', '.join(sorted(list(unknown_teams)))}."
            " Predictions less reliable."
        )

    return prepared_grid


print("Grid preparation function defined.")


Grid preparation function defined.


In [7]:
# --- Prediction Function ---
# (Use the function definition from the previous answer that returns
# the dictionary containing 'DriverFullName', 'ConstructorName', etc.)

def predict_race_winner_probabilities(circuit_id, grid_data, model, features_list):
    """
    Predicts the win probability for each driver in a given grid DataFrame.
    MODIFIED to accept pre-calculated features.

    Args:
        circuit_id (str): The ID of the circuit for the race (used for filtering).
        grid_data (pd.DataFrame): DataFrame containing the grid and PRE-CALCULATED features
                                   for the race to predict. Must include columns in features_list
                                   plus 'driverId', 'driverFullName', 'constructorName'.
        model (Pipeline): The trained scikit-learn pipeline model.
        features_list (list): The list of feature names the model expects.


    Returns:
        dict: A dictionary mapping driverId to their predicted win probability
              and full names, sorted by probability in descending order.
              Returns empty dict on error.
    """
    if grid_data.empty:
        print("Error: grid_data DataFrame is empty.")
        return {}
    if not model:
        print("Error: Model is not provided or not trained.")
        return {}

    # Check if required columns are present
    required_cols = features_list + ["driverId", "driverFullName", "constructorName", "grid"]
    if not all(col in grid_data.columns for col in required_cols):
        print(
            "Error: grid_data DataFrame must contain required columns:"
            f" {', '.join(required_cols)}"
        )
        return {}

    try:
        # Select only the features the model was trained on, in the correct order
        predict_X = grid_data[features_list]

        # Predict probabilities
        probabilities = model.predict_proba(predict_X)
        win_probabilities = probabilities[:, 1] # Probability of class 1 (winner)

        # Normalize probabilities
        total_prob = np.sum(win_probabilities)
        if total_prob > 0:
            normalized_probs = win_probabilities / total_prob
        else:
            print(
                "Warning: Model predicted zero probability for all drivers."
                " Assigning equal probability."
            )
            normalized_probs = np.ones(len(predict_X)) / len(predict_X)

        # Create result dictionary mapping driverId to details including full names
        results = {}
        for i, index in enumerate(grid_data.index):
            driver_id = grid_data.loc[index, "driverId"]
            results[driver_id] = {
                "Probability": normalized_probs[i],
                "DriverFullName": grid_data.loc[index, "driverFullName"],
                "ConstructorName": grid_data.loc[index, "constructorName"],
                "Grid": grid_data.loc[index, "grid"],
            }

        # Sort by probability descending
        sorted_results = dict(
            sorted(
                results.items(),
                key=lambda item: item[1]["Probability"],
                reverse=True,
            )
        )

        return sorted_results

    except Exception as e:
        print(f"An error occurred during prediction: {e}")
        return {}


print("Prediction function defined (accepts pre-calculated features).")


Prediction function defined (accepts pre-calculated features).


In [8]:
# Cell 8: Wrapper Function for Prediction and Display (with CSV Saving)

import os # Import os to handle file paths
import joblib # Needed if loading model inside, though we load outside now
from IPython.display import display, Markdown # Ensure this is imported

def predict_and_display_results(
    circuit_id,
    future_season,
    future_round,
    raw_grid_list,
    model, # Expecting the loaded model to be passed in
    base_races_df,
    base_qualifying_df,
    driver_map,
    constructor_map,
    team_rebrand_map,
    model_features_list,
    race_description="Future Race",
    save_path="predictions" # Folder to save CSV predictions
):
    """
    Prepares grid data, ADDS IT TO HISTORY, recalculates features,
    runs prediction using the provided model, displays results,
    and saves results to CSV.
    """
    # --- Create save directory if it doesn't exist ---
    if save_path and not os.path.exists(save_path):
        try:
            os.makedirs(save_path)
            print(f"Created directory: {save_path}")
        except OSError as e:
            print(f"Error creating directory {save_path}: {e}")
            save_path = None # Disable saving if directory creation fails
    # -------------------------------------------------

    print(f"--- Predicting for: {race_description} ({circuit_id}) ---")

    print("Preparing future grid data...")
    prepared_grid_list = prepare_grid_for_prediction(
        raw_grid_list, driver_map, constructor_map, team_rebrand_map
    )
    if prepared_grid_list is None:
        print("Grid preparation failed. Cannot predict.")
        return

    print("Creating temporary DataFrame for feature calculation...")
    future_race_df = pd.DataFrame(prepared_grid_list)
    future_race_df["season"] = future_season
    future_race_df["round"] = future_round
    future_race_df["circuitId"] = circuit_id
    future_race_df["date"] = pd.Timestamp.now() # Placeholder
    future_race_df["position"] = np.nan
    future_race_df["points"] = 0

    print("Combining with historical data...")
    # Ensure base_races_df has the necessary columns before concat
    required_base_cols = ['season', 'round', 'date', 'driverId', 'constructorId', 'circuitId', 'grid', 'position', 'points']
    if not all(col in base_races_df.columns for col in required_base_cols):
        print(f"Error: base_races_df is missing required columns. Needed: {required_base_cols}")
        return
    combined_df = pd.concat([base_races_df[required_base_cols], future_race_df], ignore_index=True)


    print("Recalculating features...")
    # Pass the original qualifying df for consistent name mapping if needed inside
    combined_featured_df = calculate_features(combined_df, base_qualifying_df)

    print("Isolating prediction rows...")
    predict_rows = combined_featured_df[
        (combined_featured_df["season"] == future_season) &
        (combined_featured_df["round"] == future_round)
    ].copy()

    if predict_rows.empty:
        print("Error: Could not find rows for the specified future race after feature calculation.")
        return

    # --- Model Check (ensure model object was passed) ---
    model_ready = False
    if model is not None:
         # Basic check if it looks like a fitted sklearn pipeline/model
         if hasattr(model, 'predict_proba') and hasattr(model, 'steps'):
             model_ready = True

    if model_ready:
        print(f"\nRunning prediction for {race_description}...")
        winner_probs_dict = predict_race_winner_probabilities(
            circuit_id, predict_rows, model, model_features_list
        )

        print(f"\nPredicted Win Probabilities ({race_description}):")
        if winner_probs_dict:
            # --- Prepare DataFrame for display and saving ---
            results_list = [
                {
                    "Driver": details["DriverFullName"],
                    "Grid": details["Grid"],
                    "Team": details["ConstructorName"],
                    "Probability_Num": details["Probability"], # Keep numeric
                }
                for driver_id, details in winner_probs_dict.items()
            ]
            results_df = pd.DataFrame(results_list)
            # -----------------------------------------------

            # --- Save to CSV ---
            if save_path:
                safe_filename = "".join(c if c.isalnum() else "_" for c in race_description)
                # Include season and round in filename for better organization
                csv_filename = os.path.join(save_path, f"{future_season}_R{future_round:02d}_{safe_filename}_{circuit_id}_predictions.csv")
                try:
                    results_df.to_csv(csv_filename, index=False, float_format='%.6f')
                    print(f"Predictions saved to: {csv_filename}")
                except Exception as e:
                    print(f"Error saving predictions to CSV: {e}")
            # --------------------

            # --- Prepare for Markdown Display ---
            display_df = results_df.copy()
            display_df["Probability"] = display_df["Probability_Num"].map("{:.2%}".format)
            display_df = display_df.drop(columns=["Probability_Num"])

            markdown_table = "| Driver             | Grid | Team                           | Probability |\n"
            markdown_table += "|--------------------|------|--------------------------------|-------------|\n"
            for index, row in display_df.iterrows():
                 markdown_table += (
                    f"| {row['Driver']:<18} | {row['Grid']:<4} |"
                    f" {row['Team']:<30} | {row['Probability']:>11} |\n"
                )
            display(Markdown(markdown_table))
            # ----------------------------------

        else:
            print("Prediction failed or returned no results.")
    else:
         print("Model was not provided or does not appear to be a valid trained model.")

print("Prediction and display wrapper function updated to save CSV.")



Prediction and display wrapper function updated to save CSV.


In [9]:
# Cell 8: Wrapper Function for Prediction and Display (with CSV Saving)

import os # Import os to handle file paths
import joblib # Needed if loading model inside, though we load outside now
from IPython.display import display, Markdown # Ensure this is imported

def predict_and_display_results(
    circuit_id,
    future_season,
    future_round,
    raw_grid_list,
    model, # Expecting the loaded model to be passed in
    base_races_df,
    base_qualifying_df,
    driver_map,
    constructor_map,
    team_rebrand_map,
    model_features_list,
    race_description="Future Race",
    save_path="predictions" # Folder to save CSV predictions
):
    """
    Prepares grid data, ADDS IT TO HISTORY, recalculates features,
    runs prediction using the provided model, displays results,
    and saves results to CSV.
    """
    # --- Create save directory if it doesn't exist ---
    if save_path and not os.path.exists(save_path):
        try:
            os.makedirs(save_path)
            print(f"Created directory: {save_path}")
        except OSError as e:
            print(f"Error creating directory {save_path}: {e}")
            save_path = None # Disable saving if directory creation fails
    # -------------------------------------------------

    print(f"--- Predicting for: {race_description} ({circuit_id}) ---")

    print("Preparing future grid data...")
    prepared_grid_list = prepare_grid_for_prediction(
        raw_grid_list, driver_map, constructor_map, team_rebrand_map
    )
    if prepared_grid_list is None:
        print("Grid preparation failed. Cannot predict.")
        return

    print("Creating temporary DataFrame for feature calculation...")
    future_race_df = pd.DataFrame(prepared_grid_list)
    future_race_df["season"] = future_season
    future_race_df["round"] = future_round
    future_race_df["circuitId"] = circuit_id
    future_race_df["date"] = pd.Timestamp.now() # Placeholder
    future_race_df["position"] = np.nan
    future_race_df["points"] = 0

    print("Combining with historical data...")
    # Ensure base_races_df has the necessary columns before concat
    required_base_cols = ['season', 'round', 'date', 'driverId', 'constructorId', 'circuitId', 'grid', 'position', 'points']
    if not all(col in base_races_df.columns for col in required_base_cols):
        print(f"Error: base_races_df is missing required columns. Needed: {required_base_cols}")
        return
    combined_df = pd.concat([base_races_df[required_base_cols], future_race_df], ignore_index=True)


    print("Recalculating features...")
    # Pass the original qualifying df for consistent name mapping if needed inside
    combined_featured_df = calculate_features(combined_df, base_qualifying_df)

    print("Isolating prediction rows...")
    predict_rows = combined_featured_df[
        (combined_featured_df["season"] == future_season) &
        (combined_featured_df["round"] == future_round)
    ].copy()

    if predict_rows.empty:
        print("Error: Could not find rows for the specified future race after feature calculation.")
        return

    # --- Model Check (ensure model object was passed) ---
    model_ready = False
    if model is not None:
         # Basic check if it looks like a fitted sklearn pipeline/model
         if hasattr(model, 'predict_proba') and hasattr(model, 'steps'):
             model_ready = True

    if model_ready:
        print(f"\nRunning prediction for {race_description}...")
        winner_probs_dict = predict_race_winner_probabilities(
            circuit_id, predict_rows, model, model_features_list
        )

        print(f"\nPredicted Win Probabilities ({race_description}):")
        if winner_probs_dict:
            # --- Prepare DataFrame for display and saving ---
            results_list = [
                {
                    "Driver": details["DriverFullName"],
                    "Grid": details["Grid"],
                    "Team": details["ConstructorName"],
                    "Probability_Num": details["Probability"], # Keep numeric
                }
                for driver_id, details in winner_probs_dict.items()
            ]
            results_df = pd.DataFrame(results_list)
            # -----------------------------------------------

            # --- Save to CSV ---
            if save_path:
                safe_filename = "".join(c if c.isalnum() else "_" for c in race_description)
                # Include season and round in filename for better organization
                csv_filename = os.path.join(save_path, f"{future_season}_R{future_round:02d}_{safe_filename}_{circuit_id}_predictions.csv")
                try:
                    results_df.to_csv(csv_filename, index=False, float_format='%.6f')
                    print(f"Predictions saved to: {csv_filename}")
                except Exception as e:
                    print(f"Error saving predictions to CSV: {e}")
            # --------------------

            # --- Prepare for Markdown Display ---
            display_df = results_df.copy()
            display_df["Probability"] = display_df["Probability_Num"].map("{:.2%}".format)
            display_df = display_df.drop(columns=["Probability_Num"])

            markdown_table = "| Driver             | Grid | Team                           | Probability |\n"
            markdown_table += "|--------------------|------|--------------------------------|-------------|\n"
            for index, row in display_df.iterrows():
                 markdown_table += (
                    f"| {row['Driver']:<18} | {row['Grid']:<4} |"
                    f" {row['Team']:<30} | {row['Probability']:>11} |\n"
                )
            display(Markdown(markdown_table))
            # ----------------------------------

        else:
            print("Prediction failed or returned no results.")
    else:
         print("Model was not provided or does not appear to be a valid trained model.")

print("Prediction and display wrapper function updated to save CSV.")



Prediction and display wrapper function updated to save CSV.


In [10]:
# --- Define Grid for 2025 Saudi Arabian GP ---

saudi_2025_raw_grid = [
    # McLaren Mercedes
    {'driver': 'Lando Norris',       'team': 'McLaren Mercedes',              'grid': 10},
    {'driver': 'Oscar Piastri',      'team': 'McLaren Mercedes',              'grid': 2},
    # Red Bull Racing Honda RBPT
    {'driver': 'Max Verstappen',     'team': 'Red Bull Racing Honda RBPT',    'grid': 1},
    {'driver': 'Yuki Tsunoda',        'team': 'Red Bull Racing Honda RBPT',    'grid': 8},
    # Ferrari
    {'driver': 'Charles Leclerc',    'team': 'Ferrari',                       'grid': 4},
    {'driver': 'Lewis Hamilton',     'team': 'Ferrari',                       'grid': 7},
    # Mercedes
    {'driver': 'George Russell',     'team': 'Mercedes',                      'grid': 3},
    {'driver': 'Kimi Antonelli',     'team': 'Mercedes',                      'grid': 5},
    # Alpine Renault
    {'driver': 'Pierre Gasly',       'team': 'Alpine Renault',                'grid': 9},
    {'driver': 'Jack Doohan',        'team': 'Alpine Renault',                'grid': 17},
    # Williams Mercedes
    {'driver': 'Alexander Albon',    'team': 'Williams Mercedes',             'grid': 11},
    {'driver': 'Carlos Sainz',       'team': 'Williams Mercedes',             'grid': 6},
    # Racing Bulls Honda RBPT
    {'driver': 'Liam Lawson',       'team': 'Racing Bulls Honda RBPT',       'grid': 12},
    {'driver': 'Isack Hadjar',       'team': 'Racing Bulls Honda RBPT',       'grid': 14},
    # Aston Martin Aramco Mercedes
    {'driver': 'Fernando Alonso',    'team': 'Aston Martin Aramco Mercedes', 'grid': 13},
    {'driver': 'Lance Stroll',       'team': 'Aston Martin Aramco Mercedes', 'grid': 16},
    # Kick Sauber Ferrari
    {'driver': 'Gabriel Bortoleto',  'team': 'Kick Sauber Ferrari',           'grid': 20},
    {'driver': 'Nico Hulkenberg',    'team': 'Kick Sauber Ferrari',           'grid': 18},
    # Haas Ferrari
    {'driver': 'Esteban Ocon',       'team': 'Haas Ferrari',                  'grid': 19},
    {'driver': 'Oliver Bearman',     'team': 'Haas Ferrari',                  'grid': 15},
]

future_race_circuit_saudi = "jeddah"
future_season_saudi = 2025
future_round_saudi = 5 # Assuming Saudi is round 5 in 2025 for this example
race_description_saudi = "2025 Saudi Arabian GP"

print(f"Grid defined for {race_description_saudi}")

Grid defined for 2025 Saudi Arabian GP


In [11]:
# Cell 10: Execute Prediction for 2025 Saudi Arabian GP

import joblib # Make sure joblib is imported

# --- Load the Saved Model ---
model_filename = "./joblogs/f1_winner_predictor_model.joblib"
loaded_model = None
try:
    loaded_model = joblib.load(model_filename)
    print(f"Loaded trained model from {model_filename}")
except FileNotFoundError:
    print(f"Error: Saved model file '{model_filename}' not found. Please train the model first (run Cell 5).")
except Exception as e:
    print(f"Error loading model: {e}")
# --------------------------

# --- Execute Prediction ---
# Check if model loaded successfully before proceeding
if loaded_model:
    predict_and_display_results(
        circuit_id=future_race_circuit_saudi,
        future_season=future_season_saudi,
        future_round=future_round_saudi,
        raw_grid_list=saudi_2025_raw_grid, # Defined in Cell 9
        model=loaded_model, # Use the loaded model
        base_races_df=races_df, # Original races df from Cell 2
        base_qualifying_df=qualifying_df, # Original qualifying df from Cell 2
        driver_map=latest_driver_name_to_id_map, # From Cell 3
        constructor_map=latest_constructor_name_to_id_map, # From Cell 3
        team_rebrand_map=TEAM_REBRAND_MAP, # From Cell 6
        model_features_list=features, # Defined in Cell 4
        race_description=race_description_saudi, # Defined in Cell 9
        save_path="predictions" # Specify folder to save CSV
    )
else:
    print("Prediction skipped because the model could not be loaded.")



Loaded trained model from ./joblogs/f1_winner_predictor_model.joblib
--- Predicting for: 2025 Saudi Arabian GP (jeddah) ---
Preparing future grid data...
Creating temporary DataFrame for feature calculation...
Combining with historical data...
Recalculating features...
Calculating features...
Features calculated.
Isolating prediction rows...

Running prediction for 2025 Saudi Arabian GP...

Predicted Win Probabilities (2025 Saudi Arabian GP):
Predictions saved to: predictions/2025_R05_2025_Saudi_Arabian_GP_jeddah_predictions.csv


| Driver             | Grid | Team                           | Probability |
|--------------------|------|--------------------------------|-------------|
| Max Verstappen     | 1    | Red Bull Racing-Honda RBPT     |      30.05% |
| Oscar Piastri      | 2    | McLaren-Mercedes               |      20.56% |
| George Russell     | 3    | Mercedes                       |      14.40% |
| Yuki Tsunoda       | 8    | Red Bull Racing-Honda RBPT     |       7.17% |
| Lewis Hamilton     | 7    | Ferrari                        |       5.77% |
| Charles Leclerc    | 4    | Ferrari                        |       5.59% |
| Kimi Antonelli     | 5    | Mercedes                       |       4.90% |
| Lando Norris       | 10   | McLaren-Mercedes               |       3.58% |
| Esteban Ocon       | 19   | Haas-Ferrari                   |       1.13% |
| Carlos Sainz       | 6    | Williams-Mercedes              |       1.12% |
| Pierre Gasly       | 9    | Alpine-Renault                 |       1.00% |
| Liam Lawson        | 12   | RB F1 Team                     |       0.77% |
| Gabriel Bortoleto  | 20   | Kick Sauber-Ferrari            |       0.74% |
| Jack Doohan        | 17   | Alpine-Renault                 |       0.74% |
| Isack Hadjar       | 14   | RB F1 Team                     |       0.71% |
| Nico Hulkenberg    | 18   | Kick Sauber-Ferrari            |       0.59% |
| Alexander Albon    | 11   | Williams-Mercedes              |       0.48% |
| Fernando Alonso    | 13   | Aston Martin Aramco-Mercedes   |       0.30% |
| Lance Stroll       | 16   | Aston Martin Aramco-Mercedes   |       0.25% |
| Oliver Bearman     | 15   | Haas-Ferrari                   |       0.18% |
