In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv("data.csv")

# -----------------------------
# Use push laps only
# -----------------------------
df = df[df["IsPushLap"] == 1].copy()

# -----------------------------
# Convert lap time
# -----------------------------
df["LapTime_sec"] = pd.to_timedelta(
    df["LapTime"], errors="coerce"
).dt.total_seconds()

# -----------------------------
# Q3 only
# -----------------------------
df = df[df["QualiSegment"] == "Q3"].copy()

# -----------------------------
# Session median
# -----------------------------
df["SessionMedianLap"] = (
    df.groupby(["Event", "Session", "QualiSegment"])["LapTime_sec"]
      .transform("median")
)

# -----------------------------
# Target: delta
# -----------------------------
df["LapDelta_sec"] = df["LapTime_sec"] - df["SessionMedianLap"]
TARGET = "LapDelta_sec"

# -----------------------------
# Driver / Team track keys
# -----------------------------
df["Driver_Track"] = df["Driver"] + "_" + df["CircuitName"]
df["Team_Track"]   = df["Team"] + "_" + df["CircuitName"]

# -----------------------------
# ðŸ”¥ NEW: Historical performance features
# -----------------------------
driver_track_stats = (
    df.groupby("Driver_Track")["LapDelta_sec"]
      .agg(["mean", "std"])
      .reset_index()
      .rename(columns={
          "mean": "DriverTrackAvgDelta",
          "std": "DriverTrackStdDelta"
      })
)

team_track_stats = (
    df.groupby("Team_Track")["LapDelta_sec"]
      .agg(["mean", "std"])
      .reset_index()
      .rename(columns={
          "mean": "TeamTrackAvgDelta",
          "std": "TeamTrackStdDelta"
      })
)

df = df.merge(driver_track_stats, on="Driver_Track", how="left")
df = df.merge(team_track_stats, on="Team_Track", how="left")

# Fill cold-start cases
df["DriverTrackAvgDelta"].fillna(0, inplace=True)
df["DriverTrackStdDelta"].fillna(df["LapDelta_sec"].std(), inplace=True)
df["TeamTrackAvgDelta"].fillna(0, inplace=True)
df["TeamTrackStdDelta"].fillna(df["LapDelta_sec"].std(), inplace=True)

# -----------------------------
# Features
# -----------------------------
categorical_features = [
    "Driver", "Team", "Compound", "Event", "Session",
    "QualiSegment", "CircuitName", "Country",
    "TrackType", "LapSpeedClass",
    "Driver_Track", "Team_Track"
]

numeric_features = [
    "TyreLife",
    "SpeedI1", "SpeedI2", "SpeedFL", "SpeedST",
    "TrackLength_m", "NumCorners", "CornerDensity",
    "AvgCornerSpacing_m", "AirTemp", "TrackTemp",
    "WindSpeed", "Altitude_m", "DRSZones",
    "DriverTrackAvgDelta", "DriverTrackStdDelta",
    "TeamTrackAvgDelta", "TeamTrackStdDelta"
]

features = categorical_features + numeric_features

X = df[features]
y = df[TARGET]

# -----------------------------
# Time split
# -----------------------------
train_idx = df["Year"] <= 2022
test_idx  = df["Year"] >= 2023

train_pool = Pool(
    X[train_idx], y[train_idx],
    cat_features=categorical_features
)

test_pool = Pool(
    X[test_idx], y[test_idx],
    cat_features=categorical_features
)

# -----------------------------
# Model
# -----------------------------
model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=7,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    early_stopping_rounds=250,
    verbose=200
)

model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# -----------------------------
# Evaluate
# -----------------------------
pred = model.predict(X[test_idx])
mae = mean_absolute_error(y[test_idx], pred)
print(f"\nðŸ”¥ Q3 DELTA MAE: {mae:.3f} sec")

# -----------------------------
# Save
# -----------------------------
model.save_model("quali_q3_delta_model.cbm")
print("âœ… Model saved")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["DriverTrackAvgDelta"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["DriverTrackStdDelta"].fillna(df["LapDelta_sec"].std(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

0:	learn: 8.7856490	test: 8.0556811	best: 8.0556811 (0)	total: 243ms	remaining: 12m 7s
200:	learn: 3.0736000	test: 4.0439596	best: 4.0350754 (191)	total: 18.6s	remaining: 4m 18s
400:	learn: 2.5395063	test: 3.9397843	best: 3.9379964 (398)	total: 36.5s	remaining: 3m 56s
600:	learn: 2.2902455	test: 3.9224147	best: 3.9078406 (577)	total: 54.7s	remaining: 3m 38s
800:	learn: 2.1404440	test: 3.8865012	best: 3.8851645 (797)	total: 1m 12s	remaining: 3m 20s
1000:	learn: 2.0281709	test: 3.8861949	best: 3.8773100 (873)	total: 1m 30s	remaining: 3m 1s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 3.877310046
bestIteration = 873

Shrink model to first 874 iterations.

ðŸ”¥ Q3 DELTA MAE: 3.877 sec
âœ… Model saved
