In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# -----------------------------------------------------------
# 1. Load processed data
# -----------------------------------------------------------

# The relative path is resolved against the current working directory.
df = pd.read_csv("../data/processed_quali_2024.csv")
print("Loaded:", df.shape)

# -----------------------------------------------------------
# 2. Identify all qualifying sessions (races)
# -----------------------------------------------------------

# Distinct session keys in ascending order; target_race_index below is a
# position in this list.
# NOTE(review): the train/test split treats ascending session_key as
# chronological order - confirm that holds for this dataset.
sessions = sorted(df["session_key"].unique())
print("Available session keys (races):")
print(sessions)

# Choose race index to predict
# Example: predict 5th race using races 1-4
target_race_index = 4  # 0-based index -> 5th race

# Validate the choice up front so a bad index fails here with a clear message
# instead of as a bare "list index out of range" or, for an empty training
# slice, much later inside model fitting. Negative indices stay usable
# (e.g. -1 = predict the last session from all earlier ones).
if not -len(sessions) <= target_race_index < len(sessions):
    raise IndexError(
        f"target_race_index={target_race_index} is out of range: "
        f"only {len(sessions)} session(s) available"
    )
if not sessions[:target_race_index]:
    raise ValueError(
        f"target_race_index={target_race_index} leaves no earlier sessions "
        "to train on"
    )
target_session = sessions[target_race_index]

# The emoji (U+1F52E, crystal ball) is written as an escape so the source
# stays ASCII and cannot be garbled by an encoding round-trip.
print(f"\n\U0001F52E Predicting session_key = {target_session}")
print("Training on: ", sessions[:target_race_index])

# -----------------------------------------------------------
# 3. Split data into train (earlier races) and test (target race)
# -----------------------------------------------------------

# Training rows: every lap whose session precedes the target in `sessions`.
train_df = df.loc[df["session_key"].isin(sessions[:target_race_index])]
# Test rows: laps of the target session only.
test_df = df.loc[df["session_key"] == target_session]

print("\nTrain size:", train_df.shape)
print("Test size:", test_df.shape)

# -----------------------------------------------------------
# 4. Select features and target
# -----------------------------------------------------------

# Model inputs, in the column order handed to the estimator.
# NOTE(review): the sector durations and sector_sum presumably add up to
# lap_duration, which would leak the target into the features - confirm.
features = [
    "lap_number",
    "duration_sector_1", "duration_sector_2", "duration_sector_3",
    "i1_speed", "i2_speed", "st_speed",
    "sector_sum", "avg_speed_est",
]

# Feature matrix / target vector pairs; the target is lap_duration.
X_train, y_train = train_df[features], train_df["lap_duration"]
X_test, y_test = test_df[features], test_df["lap_duration"]

# -----------------------------------------------------------
# 5. Train a simple baseline model
# -----------------------------------------------------------

# Random forest with 200 trees; the fixed random_state keeps repeated runs
# on the same data reproducible.
model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)

model.fit(X_train, y_train)

# -----------------------------------------------------------
# 6. Predict the target race laps
# -----------------------------------------------------------

# One prediction per row of X_test, in the same row order.
preds = model.predict(X_test)

# -----------------------------------------------------------
# 7. Compute metrics
# -----------------------------------------------------------

mae = mean_absolute_error(y_test, preds)
# RMSE is the square root of the mean squared error, which puts it back in
# the same units as lap_duration.
rmse = np.sqrt(mean_squared_error(y_test, preds))

# The emoji (U+1F4C9, chart with downwards trend) is written as an escape so
# the source stays ASCII and cannot be garbled by an encoding round-trip.
print("\n\U0001F4C9 Model Performance on Target Race")
print("------------------------------------")
print("MAE :", round(mae, 4))
print("RMSE:", round(rmse, 4))

# -----------------------------------------------------------
# 8. Show example predictions
# -----------------------------------------------------------

# Copy the two columns so the new columns below are added to an independent
# frame rather than to a selection of test_df.
comparison = test_df[["lap_number", "lap_duration"]].copy()
comparison["predicted_lap"] = preds
# Signed error: positive when the predicted lap_duration exceeds the
# recorded one.
comparison["error"] = comparison["predicted_lap"] - comparison["lap_duration"]

# The emoji (U+1F50D, magnifying glass) is written as an escape so the
# source stays ASCII and cannot be garbled by an encoding round-trip.
print("\n\U0001F50D Sample predictions:")
# Bare trailing expression: rendered as the cell output when run in a
# notebook; it has no visible effect when run as a plain script.
comparison.head(10)
