In [None]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv("data.csv")

# -----------------------------
# Use push laps only
# -----------------------------
df = df[df["IsPushLap"] == 1].copy()

# -----------------------------
# Convert lap / sector times to seconds
# -----------------------------
# The target below is built from LapTime_sec. Derive it from the raw LapTime
# column when the CSV does not already provide it (the circuit-median export
# cell performs the same conversion), so this cell also runs on a fresh kernel.
if "LapTime_sec" not in df.columns:
    df["LapTime_sec"] = pd.to_timedelta(
        df["LapTime"], errors="coerce"
    ).dt.total_seconds()

# Sector times are converted for inspection only; they are not part of the
# feature list defined further down.
for col in ["Sector1Time", "Sector2Time", "Sector3Time"]:
    if col in df.columns:
        df[col] = pd.to_timedelta(df[col], errors="coerce").dt.total_seconds()

# -----------------------------
# Q3 ONLY (most stable & accurate)
# -----------------------------
df = df[df["QualiSegment"] == "Q3"].copy()

# -----------------------------
# Create SESSION MEDIAN (for delta target)
# -----------------------------
# NOTE(review): the grouping keys do not include Year. If Event names repeat
# across seasons, this median pools several seasons (including the 2023+
# rows that are later held out for evaluation) - confirm this is intended.
df["SessionMedianLap"] = (
    df.groupby(["Event", "Session", "QualiSegment"])["LapTime_sec"]
      .transform("median")
)

# -----------------------------
# NEW TARGET: DELTA LAP TIME
# -----------------------------
df["LapDelta_sec"] = df["LapTime_sec"] - df["SessionMedianLap"]
TARGET = "LapDelta_sec"

# Rows without a usable target (unparseable lap time or missing group key)
# can neither be trained on nor scored: mean_absolute_error rejects NaN.
df = df.dropna(subset=[TARGET]).copy()

# -----------------------------
# Categorical clean-up
# -----------------------------
base_categorical = [
    "Driver", "Team", "Compound", "Event", "Session",
    "QualiSegment", "CircuitName", "Country",
    "TrackType", "LapSpeedClass"
]

# CatBoost only accepts integers or strings as categorical values; a NaN in
# any of these columns (or in an interaction built from them) aborts Pool
# construction. Make missing values an explicit category instead.
df[base_categorical] = df[base_categorical].fillna("Unknown").astype(str)

# -----------------------------
# Driver-Track interactions (HUGE gain)
# -----------------------------
df["Driver_Track"] = df["Driver"] + "_" + df["CircuitName"]
df["Team_Track"]   = df["Team"] + "_" + df["CircuitName"]

# -----------------------------
# Feature selection
# -----------------------------
# Same columns and order as before: base categoricals, then interactions.
categorical_features = base_categorical + ["Driver_Track", "Team_Track"]

numeric_features = [
    "TyreLife", "SpeedI1", "SpeedI2", "SpeedFL", "SpeedST",
    "TrackLength_m", "NumCorners", "CornerDensity",
    "AvgCornerSpacing_m", "AirTemp", "TrackTemp",
    "WindSpeed", "Altitude_m", "DRSZones"
]

features = categorical_features + numeric_features

X = df[features]
y = df[TARGET]

# -----------------------------
# Time-based split (no leakage)
# -----------------------------
# Three chronological partitions:
#   test  : 2023 and later - used ONLY for the final MAE below
#   valid : most recent season before 2023 - drives early stopping
#   train : every season before the validation season
# Early stopping / use_best_model must not look at the test rows; otherwise
# the number of trees is tuned on the very data the score is reported on.
pre_test_idx = df["Year"] <= 2022
test_idx     = df["Year"] >= 2023

valid_year = df.loc[pre_test_idx, "Year"].max()
valid_idx  = pre_test_idx & (df["Year"] == valid_year)
train_idx  = pre_test_idx & (df["Year"] < valid_year)

if not train_idx.any() or not test_idx.any():
    raise ValueError(
        "Time-based split needs at least two seasons up to 2022 "
        "(train + validation) and at least one season from 2023 (test)."
    )

X_train, X_valid, X_test = X.loc[train_idx], X.loc[valid_idx], X.loc[test_idx]
y_train, y_valid, y_test = y.loc[train_idx], y.loc[valid_idx], y.loc[test_idx]

train_pool = Pool(
    X_train, y_train,
    cat_features=categorical_features
)

valid_pool = Pool(
    X_valid, y_valid,
    cat_features=categorical_features
)

test_pool = Pool(
    X_test, y_test,
    cat_features=categorical_features
)

# -----------------------------
# Model (tuned for quali)
# -----------------------------
model = CatBoostRegressor(
    iterations=2500,
    learning_rate=0.03,
    depth=6,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    early_stopping_rounds=200,
    verbose=200
)

# -----------------------------
# Train
# -----------------------------
# The validation season (not the test seasons) selects the best iteration.
model.fit(
    train_pool,
    eval_set=valid_pool,
    use_best_model=True
)

# -----------------------------
# Evaluate
# -----------------------------
# First and only contact with the 2023+ rows.
y_pred = model.predict(test_pool)
mae = mean_absolute_error(y_test, y_pred)

print(f"\nðŸ”¥ Q3 DELTA MAE: {mae:.3f} seconds")

# -----------------------------
# Save model
# -----------------------------
model.save_model("quali_q3_delta_model.cbm")
print("âœ… Model saved as quali_q3_delta_model.cbm")


In [2]:
import pandas as pd

# -----------------------------
# Column groups used below
# -----------------------------
QUALI_SEGMENTS = ["Q1", "Q2", "Q3"]
CIRCUIT_CAT_COLS = ["CircuitName", "Country", "TrackType", "LapSpeedClass"]
CIRCUIT_NUM_COLS = [
    "SpeedI1", "SpeedI2", "SpeedFL", "SpeedST",
    "TrackLength_m", "NumCorners", "CornerDensity",
    "AvgCornerSpacing_m", "AirTemp", "TrackTemp",
    "WindSpeed", "Altitude_m", "DRSZones"
]

# -----------------------------
# Load original data, push laps only
# -----------------------------
push_laps = pd.read_csv("data.csv").loc[lambda laps: laps["IsPushLap"].eq(1)]

# -----------------------------
# Lap time in seconds, qualifying segments only
# -----------------------------
df = (
    push_laps
    .assign(
        LapTime_sec=lambda laps: pd.to_timedelta(
            laps["LapTime"], errors="coerce"
        ).dt.total_seconds()
    )
    .loc[lambda laps: laps["QualiSegment"].isin(QUALI_SEGMENTS)]
    .copy()
)

# -----------------------------
# Median lap time per (Event, QualiSegment)
# -----------------------------
session_medians = (
    df.groupby(["Event", "QualiSegment"])
      .agg(SessionMedianLap=("LapTime_sec", "median"))
      .reset_index()
)

# -----------------------------
# Circuit categorical features: first row seen for each Event
# -----------------------------
circuit_cat = df.drop_duplicates(subset="Event")[["Event"] + CIRCUIT_CAT_COLS]

# -----------------------------
# Circuit numeric features: per-Event median of each column
# -----------------------------
circuit_num = df.groupby("Event")[CIRCUIT_NUM_COLS].median().reset_index()

# -----------------------------
# Attach circuit descriptors to every (Event, QualiSegment) row
# -----------------------------
final_medians = session_medians
for circuit_table in (circuit_cat, circuit_num):
    final_medians = final_medians.merge(circuit_table, on="Event", how="left")

# -----------------------------
# Save and report
# -----------------------------
final_medians.to_csv("circuit_medians.csv", index=False)

print("âœ… circuit_medians.csv generated correctly")
print("Columns:", final_medians.columns.tolist())
print("Rows:", len(final_medians))


âœ… circuit_medians.csv generated correctly
Columns: ['Event', 'QualiSegment', 'SessionMedianLap', 'CircuitName', 'Country', 'TrackType', 'LapSpeedClass', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'TrackLength_m', 'NumCorners', 'CornerDensity', 'AvgCornerSpacing_m', 'AirTemp', 'TrackTemp', 'WindSpeed', 'Altitude_m', 'DRSZones']
Rows: 108
