In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

# -----------------------------
# Load final cleaned dataset
# -----------------------------
df = pd.read_csv("data.csv")

# -----------------------------
# Target
# -----------------------------
TARGET = "LapTime_sec"

# -----------------------------
# Drop leakage / ID columns
# -----------------------------
drop_cols = [
    "LapTime", "Time", "LapStartTime",
    "IsPersonalBest", "LapNumber"
]

X = df.drop(columns=[TARGET] + drop_cols)
y = df[TARGET]

# -----------------------------
# Feature groups
# -----------------------------
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
boolean_features = X.select_dtypes(include=["bool"]).columns.tolist()

# Ordinal feature
ordinal_features = ["QualiSegment"]
ordinal_order = [["Q1", "Q2", "Q3"]]

# Remove ordinal from categorical list
categorical_features = [c for c in categorical_features if c not in ordinal_features]

# -----------------------------
# Preprocessing pipeline
# -----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
        ("ord", OrdinalEncoder(categories=ordinal_order), ordinal_features),
        ("bool", "passthrough", boolean_features),
    ],
    remainder="drop"
)

# -----------------------------
# Fit & transform
# -----------------------------
X_processed = preprocessor.fit_transform(X)

print("‚úÖ Feature engineering complete")
print("Final feature matrix shape:", X_processed.shape)


In [4]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

df = pd.read_csv("data.csv")

# 1Ô∏è‚É£ Only push laps
df = df[df["IsPushLap"] == 1].copy()

# 2Ô∏è‚É£ Convert sector times to seconds
for col in ["Sector1Time", "Sector2Time", "Sector3Time"]:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

TARGET = "LapTime_sec"

drop_cols = ["LapTime", "Time", "LapStartTime"]

X = df.drop(columns=[TARGET] + drop_cols)
y = df[TARGET]

cat_features = X.select_dtypes(include=["object"]).columns.tolist()

# Time split
train_idx = df["Year"] <= 2022
test_idx  = df["Year"] >= 2023

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=8,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    early_stopping_rounds=150,
    verbose=200
)

model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    use_best_model=True
)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

print(f"\nüî• Push-lap MAE: {mae:.3f} seconds")


0:	learn: 13.5350071	test: 12.5994196	best: 12.5994196 (0)	total: 91ms	remaining: 3m 1s
200:	learn: 0.8683989	test: 1.6657126	best: 1.6657126 (200)	total: 31.9s	remaining: 4m 45s
400:	learn: 0.5586540	test: 1.3560426	best: 1.3560426 (400)	total: 57s	remaining: 3m 47s
600:	learn: 0.4125120	test: 1.2149498	best: 1.2149498 (600)	total: 1m 32s	remaining: 3m 34s
800:	learn: 0.3438640	test: 1.1610677	best: 1.1610677 (800)	total: 1m 56s	remaining: 2m 54s
1000:	learn: 0.3043748	test: 1.1303059	best: 1.1301333 (997)	total: 2m 27s	remaining: 2m 27s
1200:	learn: 0.2766603	test: 1.1133675	best: 1.1133228 (1199)	total: 2m 48s	remaining: 1m 52s
1400:	learn: 0.2534750	test: 1.1007054	best: 1.1007054 (1400)	total: 3m 9s	remaining: 1m 21s
1600:	learn: 0.2388874	test: 1.0903845	best: 1.0903472 (1587)	total: 3m 30s	remaining: 52.4s
1800:	learn: 0.2272505	test: 1.0831734	best: 1.0831734 (1800)	total: 3m 50s	remaining: 25.4s
1999:	learn: 0.2176057	test: 1.0812281	best: 1.0812281 (1999)	total: 4m 10s	remain

In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool

# Load cleaned data
df = pd.read_csv("data.csv")

# Use push laps only
df = df[df["IsPushLap"] == 1].copy()

# Features
categorical_features = [
    "Driver", "Team", "Compound", "Event", "Session",
    "QualiSegment", "CircuitName", "Country",
    "TrackType", "LapSpeedClass"
]

numeric_features = [
    "TyreLife", "SpeedI1", "SpeedI2", "SpeedFL", "SpeedST",
    "TrackLength_m", "NumCorners", "CornerDensity",
    "AvgCornerSpacing_m", "AirTemp", "TrackTemp",
    "WindSpeed", "Altitude_m", "DRSZones"
]

features = categorical_features + numeric_features
target = "LapTime_sec"

X = df[features]
y = df[target]

train_pool = Pool(
    X, y,
    cat_features=categorical_features
)

model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    loss_function="MAE",
    random_seed=42,
    verbose=200
)

model.fit(train_pool)

# SAVE MODEL
model.save_model("quali_model.cbm")

print("‚úÖ Model saved as quali_model.cbm")


0:	learn: 13.0304507	total: 265ms	remaining: 8m 49s
200:	learn: 2.5671297	total: 31.8s	remaining: 4m 44s
400:	learn: 2.1581875	total: 1m 1s	remaining: 4m 5s
600:	learn: 1.9803416	total: 1m 36s	remaining: 3m 45s
800:	learn: 1.8714931	total: 2m 3s	remaining: 3m 5s
1000:	learn: 1.7844094	total: 2m 35s	remaining: 2m 35s
1200:	learn: 1.7202574	total: 3m 1s	remaining: 2m
1400:	learn: 1.6766964	total: 3m 42s	remaining: 1m 35s
1600:	learn: 1.6437282	total: 4m 14s	remaining: 1m 3s
1800:	learn: 1.6172660	total: 4m 48s	remaining: 31.8s
1999:	learn: 1.5944487	total: 5m 20s	remaining: 0us
‚úÖ Model saved as quali_model.cbm


In [2]:
from catboost import CatBoostRegressor
import pandas as pd

model = CatBoostRegressor()
model.load_model("quali_model.cbm")

print("‚úÖ Model loaded successfully")


‚úÖ Model loaded successfully
