In [None]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

# -----------------------------
# Column configuration
# -----------------------------
# Keys identifying one qualifying segment of one session; the fastest lap
# inside each such group is used as the reference ("pole") time.
group_cols = ["Year", "Event", "Session", "QualiSegment"]

categorical_features = [
    "Driver", "Team", "Compound", "Event", "Session",
    "QualiSegment", "CircuitName", "Country",
    "TrackType", "LapSpeedClass"
]

numeric_features = [
    "TyreLife", "SpeedI1", "SpeedI2", "SpeedFL", "SpeedST",
    "TrackLength_m", "NumCorners", "CornerDensity",
    "AvgCornerSpacing_m", "AirTemp", "TrackTemp",
    "WindSpeed", "Altitude_m", "DRSZones",
    "driver_avg_delta_last5",
    "team_avg_delta_last5",
    "driver_vs_teammate_gap"
]

features = categorical_features + numeric_features
target = "delta_to_pole"


# -----------------------------
# Helpers
# -----------------------------
def compute_pole_times(laps, keys):
    """Return the fastest lap per group.

    Args:
        laps: DataFrame with a ``LapTime_sec`` column and every column in
            ``keys``.
        keys: Column names to group by.

    Returns:
        DataFrame with one row per group: the ``keys`` columns plus
        ``PoleTime_sec`` (the group's minimum ``LapTime_sec``).
    """
    return (
        laps.groupby(keys)["LapTime_sec"]
        .min()
        .reset_index()
        .rename(columns={"LapTime_sec": "PoleTime_sec"})
    )


def lagged_group_mean(frame, key, value, window=5, order_by=("Year", "Event")):
    """Mean of the previous ``window`` values of ``value`` within each group.

    The current row is excluded (values are shifted by one inside the group
    before averaging), and both the shift and the rolling window are applied
    per group, so a window never contains rows from another group. Rows that
    do not yet have ``window`` earlier observations in their group get NaN.

    Args:
        frame: Input DataFrame. Its index must be unique, because the result
            is re-aligned to it.
        key: Column to group by.
        value: Column to average.
        window: Number of earlier observations to average.
        order_by: Columns that define the order of observations inside each
            group. Ties keep the row order of ``frame``.

    Returns:
        Series aligned to ``frame.index``.
    """
    # Multi-column sorts are stable in pandas; mergesort also keeps a
    # single-column sort stable so ties keep the caller's row order.
    ordered = frame.sort_values(list(order_by), kind="mergesort")
    lagged = ordered.groupby(key)[value].transform(
        lambda s: s.shift(1).rolling(window).mean()
    )
    return lagged.reindex(frame.index)


# Guarded so the helpers above can be imported without reading data.csv or
# training. Run as a script or notebook cell (__name__ == "__main__"), every
# name below is still defined at module level.
if __name__ == "__main__":
    # -----------------------------
    # Load data
    # -----------------------------
    df = pd.read_csv("data.csv")

    # Use only push laps
    df = df[df["IsPushLap"] == 1].copy()

    # -----------------------------
    # Create POLE TIME per session segment
    # -----------------------------
    pole_times = compute_pole_times(df, group_cols)
    df = df.merge(pole_times, on=group_cols, how="left")

    # -----------------------------
    # Target: delta to pole
    # -----------------------------
    df["delta_to_pole"] = df["LapTime_sec"] - df["PoleTime_sec"]

    # -----------------------------
    # Sort for rolling features
    # -----------------------------
    # NOTE(review): "Event" sorts by its value, which is only chronological
    # if the values happen to sort that way (e.g. a round number). Laps that
    # share Driver/Year/Event have no defined order here. Confirm whether
    # data.csv has a round/date/lap-number column that should be used.
    df = df.sort_values(["Driver", "Year", "Event"])

    # -----------------------------
    # Driver recent form (last 5)
    # -----------------------------
    df["driver_avg_delta_last5"] = lagged_group_mean(
        df, "Driver", "delta_to_pole"
    )

    # -----------------------------
    # Team recent form (last 5)
    # -----------------------------
    # The frame is sorted by Driver, so a team's rows are not contiguous.
    # Rolling over the whole frame after a grouped shift (the previous
    # approach) averaged values belonging to other teams; the helper keeps
    # the window inside each team and orders it by Year/Event.
    df["team_avg_delta_last5"] = lagged_group_mean(
        df, "Team", "delta_to_pole"
    )

    # -----------------------------
    # Driver vs teammate gap
    # -----------------------------
    df["team_session_avg"] = (
        df.groupby(["Year", "Event", "Session", "Team"])["delta_to_pole"]
        .transform("mean")
    )

    # NOTE(review): this feature is computed from the current lap's target
    # (delta_to_pole), so it leaks the label into the inputs and cannot be
    # known before the lap is driven. Kept so the saved model's feature set
    # is unchanged; consider replacing it with a lagged version.
    df["driver_vs_teammate_gap"] = (
        df["delta_to_pole"] - df["team_session_avg"]
    )

    # -----------------------------
    # Drop rows with missing model inputs
    # -----------------------------
    # Restricted to the columns the model actually uses, so a NaN in an
    # unrelated column no longer discards the row.
    df = df.dropna(subset=features + [target]).copy()

    X = df[features]
    y = df[target]

    # -----------------------------
    # Time-based split
    # -----------------------------
    train_idx = df["Year"] <= 2022
    test_idx = df["Year"] >= 2023

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    train_pool = Pool(
        X_train, y_train,
        cat_features=categorical_features
    )

    test_pool = Pool(
        X_test, y_test,
        cat_features=categorical_features
    )

    # -----------------------------
    # Train model
    # -----------------------------
    model = CatBoostRegressor(
        iterations=2000,
        learning_rate=0.03,
        depth=8,
        loss_function="MAE",
        eval_metric="MAE",
        random_seed=42,
        early_stopping_rounds=150,
        verbose=200
    )

    # NOTE(review): the 2023+ set drives early stopping / best-iteration
    # selection and is also the set the MAE below is reported on, so that
    # MAE is optimistic. A separate validation split would remove this.
    model.fit(
        train_pool,
        eval_set=test_pool,
        use_best_model=True
    )

    # -----------------------------
    # Evaluate
    # -----------------------------
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"\n Q-DELTA MAE: {mae:.3f} seconds")

    # -----------------------------
    # Save model
    # -----------------------------
    model.save_model("quali_model.cbm")
    # The literal previously held the UTF-8 bytes of the check mark decoded
    # as cp1252 (mojibake); restored to the intended character.
    print("✅ Model saved as quali_model.cbm")


0:	learn: 7.1574622	test: 7.5729522	best: 7.5729522 (0)	total: 268ms	remaining: 8m 56s
200:	learn: 0.9615712	test: 1.5465259	best: 1.5465259 (200)	total: 22.5s	remaining: 3m 21s
400:	learn: 0.7550828	test: 1.5085999	best: 1.5022362 (358)	total: 47.1s	remaining: 3m 7s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 1.502236163
bestIteration = 358

Shrink model to first 359 iterations.

🔥 Q-DELTA MAE: 1.502 seconds
✅ Model saved as quali_model.cbm
