In [2]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv("data.csv")

# -----------------------------
# Push laps only
# -----------------------------
df = df[df["IsPushLap"] == 1].copy()

# -----------------------------
# Convert lap time
# -----------------------------
df["LapTime_sec"] = pd.to_timedelta(
    df["LapTime"], errors="coerce"
).dt.total_seconds()

# -----------------------------
# Q3 only
# -----------------------------
df = df[df["QualiSegment"] == "Q3"].copy()

# -----------------------------
# Session median
# -----------------------------
df["SessionMedianLap"] = (
    df.groupby(["Event", "Session", "QualiSegment"])["LapTime_sec"]
      .transform("median")
)

# -----------------------------
# Target
# -----------------------------
df["LapDelta_sec"] = df["LapTime_sec"] - df["SessionMedianLap"]
TARGET = "LapDelta_sec"

# -----------------------------
# Train / Test split FIRST
# -----------------------------
train_df = df[df["Year"] <= 2022].copy()
test_df  = df[df["Year"] >= 2023].copy()

# -----------------------------
# Driver / Team track keys
# -----------------------------
for d in (train_df, test_df):
    d["Driver_Track"] = d["Driver"] + "_" + d["CircuitName"]
    d["Team_Track"]   = d["Team"] + "_" + d["CircuitName"]

# -----------------------------
# GLOBAL driver & team pace (train only)
# -----------------------------
driver_pace = (
    train_df.groupby("Driver")["LapDelta_sec"]
      .median()
      .rename("DriverMedianDelta")
)

team_pace = (
    train_df.groupby("Team")["LapDelta_sec"]
      .median()
      .rename("TeamMedianDelta")
)

# -----------------------------
# Track-specific performance (train only)
# -----------------------------
driver_track_stats = (
    train_df.groupby("Driver_Track")["LapDelta_sec"]
      .agg(["mean", "std"])
      .reset_index()
      .rename(columns={
          "mean": "DriverTrackAvgDelta",
          "std": "DriverTrackStdDelta"
      })
)

team_track_stats = (
    train_df.groupby("Team_Track")["LapDelta_sec"]
      .agg(["mean", "std"])
      .reset_index()
      .rename(columns={
          "mean": "TeamTrackAvgDelta",
          "std": "TeamTrackStdDelta"
      })
)

# -----------------------------
# Merge features (FIXED)
# -----------------------------
train_df = train_df.merge(driver_track_stats, on="Driver_Track", how="left")
train_df = train_df.merge(team_track_stats, on="Team_Track", how="left")
train_df = train_df.merge(driver_pace, on="Driver", how="left")
train_df = train_df.merge(team_pace, on="Team", how="left")

test_df = test_df.merge(driver_track_stats, on="Driver_Track", how="left")
test_df = test_df.merge(team_track_stats, on="Team_Track", how="left")
test_df = test_df.merge(driver_pace, on="Driver", how="left")
test_df = test_df.merge(team_pace, on="Team", how="left")

# -----------------------------
# Cold-start handling
# -----------------------------
global_std = train_df["LapDelta_sec"].std()

for d in (train_df, test_df):
    d["DriverTrackAvgDelta"].fillna(0, inplace=True)
    d["TeamTrackAvgDelta"].fillna(0, inplace=True)
    d["DriverTrackStdDelta"].fillna(global_std, inplace=True)
    d["TeamTrackStdDelta"].fillna(global_std, inplace=True)
    d["DriverMedianDelta"].fillna(0, inplace=True)
    d["TeamMedianDelta"].fillna(0, inplace=True)

# -----------------------------
# Features
# -----------------------------
categorical_features = [
    "Driver", "Team", "Compound", "Event", "Session",
    "QualiSegment", "CircuitName", "Country",
    "TrackType", "LapSpeedClass",
    "Driver_Track", "Team_Track"
]

numeric_features = [
    "TyreLife",
    "SpeedI1", "SpeedI2", "SpeedFL", "SpeedST",
    "TrackLength_m", "NumCorners", "CornerDensity",
    "AvgCornerSpacing_m", "AirTemp", "TrackTemp",
    "WindSpeed", "Altitude_m", "DRSZones",
    "DriverTrackAvgDelta", "DriverTrackStdDelta",
    "TeamTrackAvgDelta", "TeamTrackStdDelta",
    "DriverMedianDelta", "TeamMedianDelta"
]

features = categorical_features + numeric_features

# -----------------------------
# Pools
# -----------------------------
train_pool = Pool(
    train_df[features],
    train_df[TARGET],
    cat_features=categorical_features
)

test_pool = Pool(
    test_df[features],
    test_df[TARGET],
    cat_features=categorical_features
)

# -----------------------------
# Model
# -----------------------------
model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=7,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    early_stopping_rounds=250,
    verbose=200
)

model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# -----------------------------
# Evaluate
# -----------------------------
pred = model.predict(test_df[features])
mae = mean_absolute_error(test_df[TARGET], pred)
print(f"\nðŸ”¥ Q3 DELTA MAE: {mae:.3f} sec")

# -----------------------------
# Save
# -----------------------------
model.save_model("quali_q3_delta_model.cbm")
print("âœ… Model saved")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  d["DriverTrackAvgDelta"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  d["TeamTrackAvgDelta"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

0:	learn: 8.7775976	test: 8.0729111	best: 8.0729111 (0)	total: 229ms	remaining: 11m 25s
200:	learn: 3.0865333	test: 4.0215642	best: 4.0090290 (198)	total: 17.5s	remaining: 4m 3s
400:	learn: 2.5673744	test: 3.9175679	best: 3.9114071 (381)	total: 35.2s	remaining: 3m 48s
600:	learn: 2.2890896	test: 3.8961989	best: 3.8866450 (588)	total: 52.4s	remaining: 3m 29s
800:	learn: 2.1406328	test: 3.8829707	best: 3.8820723 (654)	total: 1m 9s	remaining: 3m 10s
1000:	learn: 2.0346970	test: 3.9043701	best: 3.8811794 (812)	total: 1m 26s	remaining: 2m 52s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 3.881179425
bestIteration = 812

Shrink model to first 813 iterations.

ðŸ”¥ Q3 DELTA MAE: 3.881 sec
âœ… Model saved
