In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# ML libs
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [20]:
TRAIN_PATH = "/home/awail/PycharmProjects/kaggle/RoadAccident/train.csv"
TEST_PATH  = "/home/awail/PycharmProjects/kaggle/RoadAccident/test.csv"
OUTPUT_PATH = "/home/awail/PycharmProjects/kaggle/RoadAccident/submission_ensemble.csv"
RANDOM_STATE = 42

# ============================================================
# LOAD TRAINING DATA
# ============================================================
df = pd.read_csv(TRAIN_PATH)


In [6]:
# ============================================================
# PATHS + CONFIG
# ============================================================
TRAIN_PATH = "/home/awail/PycharmProjects/kaggle/RoadAccident/train.csv"
TEST_PATH  = "/home/awail/PycharmProjects/kaggle/RoadAccident/test.csv"
OUTPUT_PATH = "/home/awail/PycharmProjects/kaggle/RoadAccident/submission_ensemble.csv"
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [8]:
def apply_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    # assume columns used below exist in df (speed_limit, curvature, num_lanes, weather, lighting, holiday, school_season, time_of_day, num_reported_accidents, road_type, public_road)
    df = df.copy()
    df["speed_limit_curvature"] = df["speed_limit"] * df["curvature"]
    df["speed_limit_num_lanes"] = df["speed_limit"] * df["num_lanes"]
    df["curvature_num_lanes"] = df["curvature"] * df["num_lanes"]

    df["lighting_weather"] = df["lighting"].astype(str) + "_" + df["weather"].astype(str)
    df["lighting_numeric"] = df["lighting"].astype("category").cat.codes
    df["weather_numeric"] = df["weather"].astype("category").cat.codes

    df["curvature_x_speed_limit"] = df["curvature"] * df["speed_limit"]
    df["curvature_x_speed_limit_x_lighting"] = df["curvature"] * df["speed_limit"] * df["lighting_numeric"]
    df["curvature_x_lighting"] = df["curvature"] * df["lighting_numeric"]
    df["num_lanes_x_curvature_x_speed_limit"] = df["num_lanes"] * df["curvature"] * df["speed_limit"]
    df["curvature_x_speed_limit_x_num_reported_accidents"] = df["curvature"] * df["speed_limit"] * df["num_reported_accidents"]
    df["speed_limit_x_lighting"] = df["speed_limit"] * df["lighting_numeric"]
    df["curvature_x_speed_limit_x_weather"] = df["curvature"] * df["speed_limit"] * df["weather_numeric"]
    df["curvature_x_lighting_x_holiday"] = df["curvature"] * df["lighting_numeric"] * df["holiday"].astype(int)
    df["speed_limit_x_weather_foggy"] = np.where(df["weather"].astype(str).str.lower() == "foggy", df["speed_limit"], 0)

    df["school_time_holiday"] = (
        df["school_season"].astype(int)
        * df["time_of_day"].astype("category").cat.codes
        * df["holiday"].astype(int)
    )
    return df

In [9]:
# ============================================================
# LOAD TRAIN DATA + ENGINEER FEATURES
# ============================================================
df = pd.read_csv(TRAIN_PATH)
if "id" in df.columns:
    df = df.drop("id", axis=1)

df = apply_feature_engineering(df)

# ============================================================
# FINAL FEATURES (engineered + requested original features)
# ============================================================
engineered_cols = [
    "speed_limit_curvature", "speed_limit_num_lanes", "curvature_num_lanes",
    "lighting_weather", "lighting_numeric", "weather_numeric",
    "curvature_x_speed_limit", "curvature_x_speed_limit_x_lighting",
    "curvature_x_lighting", "num_lanes_x_curvature_x_speed_limit",
    "curvature_x_speed_limit_x_num_reported_accidents",
    "speed_limit_x_lighting", "curvature_x_speed_limit_x_weather",
    "curvature_x_lighting_x_holiday", "speed_limit_x_weather_foggy",
    "school_time_holiday"
]

# important original features the user wanted to keep
important_originals = [
    "curvature", "lighting", "speed_limit", "num_reported_accidents",
    "weather", "holiday", "road_type", "time_of_day", "public_road"
]

final_features = important_originals + engineered_cols

# ensure all features exist
missing = [c for c in final_features if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns in training data after FE: {missing}")

X_df = df[final_features].copy()
y = df["accident_risk"].values


In [10]:
# ============================================================
# PREPROCESSING (numerical scaling + one-hot for categorical)
# ============================================================
# choose categorical columns (we'll OHE them)
cat_cols = ["lighting", "weather", "road_type", "time_of_day", "holiday", "lighting_weather", "public_road"]
num_cols = [c for c in final_features if c not in cat_cols]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
], remainder="drop")

X = preprocessor.fit_transform(X_df)

# Build readable feature names for importances (best-effort)
try:
    feature_names = preprocessor.get_feature_names_out()
except Exception:
    # fallback: construct numeric names + expand categories with OHE feature names if possible
    num_out = num_cols
    # try to get categories from fitted OneHotEncoder
    ohe = None
    for name, trans, cols in preprocessor.transformers_:
        if name == "cat":
            ohe = trans
            cat_features = cols
            break
    if ohe is not None:
        cat_out = []
        categories = list(ohe.categories_)
        for col_name, cats in zip(cat_features, categories):
            cat_out += [f"{col_name}_{str(cat)}" for cat in cats]
    else:
        cat_out = [f"cat_{i}" for i in range(1)]
    feature_names = np.array(num_out + cat_out)

# ============================================================
# TRAIN/VAL SPLIT
# ============================================================
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)


In [None]:
# ============================================================
# MODEL: XGBoost
# ============================================================
xgb_model = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.004,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist",
    n_jobs=-1,
    eval_metric="rmse"   # ✅ moved here
)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=200,
   # early_stopping_rounds=100
)
xgb_val_pred = xgb_model.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_pred))
print(f"XGBoost  RMSE: {xgb_rmse:.6f}")


[0]	validation_0-rmse:0.16561
[200]	validation_0-rmse:0.09066
[400]	validation_0-rmse:0.06513
[600]	validation_0-rmse:0.05835
[800]	validation_0-rmse:0.05678
[1000]	validation_0-rmse:0.05641
[1200]	validation_0-rmse:0.05630
[1400]	validation_0-rmse:0.05627
[1600]	validation_0-rmse:0.05625


In [26]:
# ============================================================
# MODEL: LightGBM
# ============================================================
lgb_model = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.004,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    #
    #early_stopping_rounds=,
    verbose=100
)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="rmse",


)
lgb_val_pred = lgb_model.predict(X_val)
lgb_rmse = np.sqrt(mean_squared_error(y_val, lgb_val_pred))
print(f"LightGBM  RMSE: {lgb_rmse:.6f}")


[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.033274
[LightGBM] [Debug] init for col-wise cost 0.000004 seconds, init for row-wise cost 0.009033 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2367
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 43
[LightGBM] [Info] Start training from score 0.352605
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 3



LightGBM  RMSE: 0.062872


In [18]:
# ============================================================
# MODEL: CatBoost
# ============================================================
# CatBoost works with numpy arrays just fine; we'll pass the preprocessed numeric matrix.
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.004,
    depth=8,
    subsample=0.8,
    random_seed=RANDOM_STATE,
    verbose=200,
    early_stopping_rounds=100,
    loss_function="RMSE"
)

cat_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
)
cat_val_pred = cat_model.predict(X_val)
cat_rmse = np.sqrt(mean_squared_error(y_val, cat_val_pred))
print(f"CatBoost   RMSE: {cat_rmse:.6f}")


0:	learn: 0.1659063	test: 0.1656032	best: 0.1656032 (0)	total: 90.9ms	remaining: 3m 1s
200:	learn: 0.0916047	test: 0.0916098	best: 0.0916098 (200)	total: 4.5s	remaining: 40.3s
400:	learn: 0.0660182	test: 0.0661658	best: 0.0661658 (400)	total: 8.54s	remaining: 34s
600:	learn: 0.0589911	test: 0.0591837	best: 0.0591837 (600)	total: 12.8s	remaining: 29.8s
800:	learn: 0.0572205	test: 0.0574355	best: 0.0574355 (800)	total: 19s	remaining: 28.4s
1000:	learn: 0.0567169	test: 0.0569535	best: 0.0569535 (1000)	total: 23.4s	remaining: 23.4s
1200:	learn: 0.0565227	test: 0.0567776	best: 0.0567776 (1200)	total: 28s	remaining: 18.6s
1400:	learn: 0.0564172	test: 0.0566875	best: 0.0566875 (1400)	total: 32.2s	remaining: 13.8s
1600:	learn: 0.0563455	test: 0.0566322	best: 0.0566322 (1600)	total: 36.4s	remaining: 9.08s
1800:	learn: 0.0562893	test: 0.0565924	best: 0.0565924 (1800)	total: 40.1s	remaining: 4.43s
1999:	learn: 0.0562426	test: 0.0565629	best: 0.0565629 (1999)	total: 43.7s	remaining: 0us

bestTest 

In [24]:
# ============================================================
# ENSEMBLE: weighted average by inverse-RMSE
# ============================================================
rmses = np.array([xgb_rmse, lgb_rmse, cat_rmse])
weights = 1.0 / (rmses + 1e-12)           # inverse RMSE
weights = weights / weights.sum()        # normalized
print(f"\nModel weights (inverse-RMSE normalized): XGB={weights[0]:.3f}, LGB={weights[1]:.3f}, CAT={weights[2]:.3f}")

val_ensemble_pred = (weights[0] * xgb_val_pred +
                     weights[1] * lgb_val_pred +
                     weights[2] * cat_val_pred)
ensemble_rmse = np.sqrt(mean_squared_error(y_val, val_ensemble_pred))
print(f"Ensemble  RMSE on validation: {ensemble_rmse:.6f}")



Model weights (inverse-RMSE normalized): XGB=0.335, LGB=0.333, CAT=0.333
Ensemble  RMSE on validation: 0.056350


In [25]:
# ============================================================
# TRAINING DONE -> PREPARE TEST SET, PREDICT, SAVE
# ============================================================
test_df = pd.read_csv(TEST_PATH)
ids = test_df["id"]

test_df = apply_feature_engineering(test_df)

# ensure test has the same final features (if some categories missing in test, OneHotEncoder handle_unknown='ignore' will handle them)
missing_test = [c for c in final_features if c not in test_df.columns]
if missing_test:
    raise ValueError(f"Missing required columns in test data after FE: {missing_test}")

X_test_df = test_df[final_features].copy()
X_test = preprocessor.transform(X_test_df)

# single-model preds
xgb_test_pred = xgb_model.predict(X_test)
lgb_test_pred = lgb_model.predict(X_test)
cat_test_pred = cat_model.predict(X_test)

test_ensemble_pred = (weights[0] * xgb_test_pred +
                      weights[1] * lgb_test_pred +
                      weights[2] * cat_test_pred)

submission = pd.DataFrame({
    "id": ids,
    "accident_risk": test_ensemble_pred
})
submission.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Ensemble submission saved to: {OUTPUT_PATH}")





✅ Ensemble submission saved to: /home/awail/PycharmProjects/kaggle/RoadAccident/submission_ensemble.csv
