In [42]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline      import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, ElasticNetCV
from sklearn.neighbors     import KNeighborsRegressor
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import (
    GradientBoostingRegressor, AdaBoostRegressor,
    RandomForestRegressor, ExtraTreesRegressor
)
from xgboost               import XGBRegressor
import lightgbm as lgb
from sklearn.metrics       import mean_absolute_error, root_mean_squared_error
import pandas as pd, numpy as np
import json, joblib
from pathlib import Path
from sklearn.model_selection import KFold

In [43]:
PROJECT_ROOT = Path.cwd().resolve().parents[1]     
COMBINED_DIR   = PROJECT_ROOT / "combined_path"
OG_DIR = COMBINED_DIR / "new_test" / "original"
CELL_FILE_ADDITION = "original_0075_v2"


PARQUET_PATH    = OG_DIR / f"trips_{CELL_FILE_ADDITION}_with_predicted_distance.parquet"
#PARQUET_PATH = COMBINED_DIR / f'trips_0075_with_predicted_distance.parquet'


print("Reading  :", PARQUET_PATH)

SEED = 42
EXTENSION = CELL_FILE_ADDITION

df = pd.read_parquet(PARQUET_PATH)



Reading  : C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\trips_original_0075_v2_with_predicted_distance.parquet


In [44]:
df["dow"]        = df["begintrip_timestamp_london"].dt.dayofweek
df["month_idx"]  = (
    df["begintrip_timestamp_london"].dt.year * 12 +
    df["begintrip_timestamp_london"].dt.month
)
df["doy"] = df["begintrip_timestamp_london"].dt.dayofyear

dow_map = {0:"Monday", 1:"Tuesday", 2:"Wednesday", 3:"Thursday",
           4:"Friday", 5:"Saturday", 6:"Sunday"}

dow_map = {0:"Monday", 1:"Tuesday", 2:"Wednesday", 3:"Thursday",
           4:"Friday", 5:"Saturday", 6:"Sunday"}
df["dow_text"]       = df["begintrip_timestamp_london"].dt.dayofweek.map(dow_map)

In [45]:
# https://www.tomtom.com/newsroom/explainers-and-insights/london-is-the-worlds-slowest-city/

speed_grid = {
    "Sunday":    [19,20,22,22,24,26,25,25,23,21,19,18,17,16,16,16,16,16,16,17,19,20,20,22],
    "Monday":    [23,24,25,26,26,26,21,17,15,15,15,15,15,15,16,15,15,15,16,17,19,20,20,22],
    "Tuesday":   [23,25,25,26,26,26,21,16,14,14,14,14,14,14,15,14,14,14,14,16,18,19,19,21],
    "Wednesday": [23,24,25,26,26,26,21,16,14,14,14,14,14,14,14,14,14,14,14,16,18,19,19,20],
    "Thursday":  [22,24,25,26,26,25,21,16,14,14,14,14,14,14,14,14,13,13,14,15,17,18,18,19],
    "Friday":    [21,22,24,25,26,25,21,17,15,15,15,14,14,14,14,14,14,14,14,15,17,18,17,18],
    "Saturday":  [19,21,22,23,25,26,25,23,21,19,18,16,16,15,15,15,15,15,15,15,17,17,17,17]
}

speed_tbl = (
    pd.DataFrame(speed_grid,    # index = 0…23
                 index=range(24))
      .stack()
      .reset_index(name="kmh")         # columns: level_0, level_1, kmh
      .rename(columns={"level_0": "hour",   # 💡 rename here
                       "level_1": "dow_text"})
)

In [46]:
print(f"Loaded {len(df):,} rows")
df.head()

Loaded 147,498 rows


Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,driver_id_offline_online,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,km_pred,dow_text
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,2.789894,4.489888,398.700012,3.4857,3,24196,119,3.625828,Thursday
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,3.236049,5.207903,626.099976,5.0948,3,24196,119,5.318336,Thursday
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,1.491487,2.400309,302.100006,1.9474,3,24196,119,2.202951,Thursday
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,1.64228,2.642988,389.100006,2.6575,3,24196,119,2.887826,Thursday
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,8.20427,13.203459,1175.900024,7.8614,3,24196,119,8.588139,Thursday


Using the predicted km

In [47]:
PREDICTED_DISTANCE_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "km_pred",
    "hour", "dow",   "month_idx", "doy",
]

In [48]:
TARGET_TRUE = "duration_sec"
X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    df[PREDICTED_DISTANCE_FEATURES], df[TARGET_TRUE], test_size=0.20, random_state=SEED)

results = []

Baseline Information

In [49]:
mean_duration   = df["duration_sec"].mean()
median_duration = df["duration_sec"].median()
print(f"Mean duration   {mean_duration:,.4f}")
print(f"Median duration {median_duration:,.4f}")

# naïve constant model = predict training mean for every test row
const_pred = np.full_like(y_test_true, fill_value=y_test_true.median())
baseline_mae  = mean_absolute_error(y_test_true, const_pred)
baseline_rmse = root_mean_squared_error(y_test_true, const_pred)
print(f"Constant-median baseline  MAE {baseline_mae:.4f}  RMSE {baseline_rmse:.4f}")
results.append(("Constant-median baseline  MAE", baseline_mae, baseline_rmse))

Mean duration   1,081.3554
Median duration 909.3205
Constant-median baseline  MAE 520.9073  RMSE 741.5017


In [50]:
MODELS = {
    # "XGBoost"      : XGBRegressor(
    #     n_estimators=800,
    #     learning_rate=0.05,
    #     max_depth=8,
    #     subsample=0.8,
    #     objective="reg:squarederror",
    #     n_jobs=-1,
    #     random_state=SEED,
    # ),
    "LightGBM"     : lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
    ),
    "LightGBM quantile":lgb.LGBMRegressor(
        objective="quantile",
        alpha=0.5,                 # median
        n_estimators=800,
        learning_rate=0.05,
        num_leaves=63,
        min_data_in_leaf=50,
        subsample=0.8,
        random_state=SEED
    )
}

In [51]:
for name, model in MODELS.items():
    model.fit(X_train_true, y_train_true)
    pred = model.predict(X_test_true)
    mae  = mean_absolute_error(y_test_true, pred)
    rmse = root_mean_squared_error(y_test_true, pred)
    results.append((name+"PREDICTED DISTANCE", mae, rmse))
    print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")
    resid = pd.DataFrame({
    "abs_err": (y_test_true - pred).abs(),
    "km":      X_test_true["km_pred"].values,
    "hour":    X_test_true["hour"].values,
    })
    print(resid.nlargest(10, "abs_err"))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117998, number of used features: 9
[LightGBM] [Info] Start training from score 910.000000
LightGBM         MAE 191.5096 sec   RMSE 359.0806 sec
             abs_err        km  hour
127692  12645.436783  6.167392    14
66292   10022.415885  3.883019    14
145447   8485.464543  0.836624    16
67316    8039.759657       NaN    22
91696    7544.267048  1.866701     2
8464     7450.904915  5.188652     0
22052    7432.176164  6.826224    20
146492   7271.320078  4.719693    13
135240   7156.882658  1.648337    12
47412    5424.680273  4.120681    16
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644

Using only the ORSM Distance Feature

In [52]:
OSRM_DISTANCE_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",
    "hour", "dow",  "month_idx", "doy",
]

In [53]:
TARGET_TRUE = "duration_sec"
X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    df[OSRM_DISTANCE_FEATURES], df[TARGET_TRUE], test_size=0.20, random_state=SEED)

In [54]:
for name, model in MODELS.items():
    model.fit(X_train_true, y_train_true)
    pred = model.predict(X_test_true)
    mae  = mean_absolute_error(y_test_true, pred)
    rmse = root_mean_squared_error(y_test_true, pred)
    results.append((name+"OSRM DISTANCE", mae, rmse))
    print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117998, number of used features: 9
[LightGBM] [Info] Start training from score 910.000000
LightGBM         MAE 192.0907 sec   RMSE 357.7023 sec
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117998, number of used features: 9
[LightGBM] [Info] Start training from score 910.000000
LightGBM quantile  MAE 192.1519 sec   RMSE 357.6785 sec


Using Only Haversine Distance

In [55]:
HAVERSINE_DISTANCE_FEATURES = [
  #  "origin_row", "origin_col", "dest_row", "dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "haversine_km",
    "hour", "dow",   "month_idx", "doy",
]

In [56]:
TARGET_TRUE = "duration_sec"
X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    df[HAVERSINE_DISTANCE_FEATURES], df[TARGET_TRUE], test_size=0.20, random_state=SEED)

In [57]:
# for name, model in MODELS.items():
#     model.fit(X_train_true, y_train_true)
#     pred = model.predict(X_test_true)
#     mae  = mean_absolute_error(y_test_true, pred)
#     rmse = root_mean_squared_error(y_test_true, pred)
#     results.append((name+"HAVERSINE DISTANCE", mae, rmse))
#     print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")

USING NO DISTANCE JUST SECONDS PREDICTED

In [58]:
OSRM_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_sec",
    "hour", "dow",   "month_idx", "doy",
]

In [59]:
TARGET_TRUE = "duration_sec"
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()
work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows

X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    work[OSRM_FEATURES], work[TARGET_TRUE], test_size=0.20, random_state=SEED)

In [60]:
for name, model in MODELS.items():
    model.fit(X_train_true, y_train_true)
    pred = model.predict(X_test_true)
    mae  = mean_absolute_error(y_test_true, pred)
    rmse = root_mean_squared_error(y_test_true, pred)
    results.append((name+"OSRM SECONDS", mae, rmse))
    print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 909.000000
LightGBM         MAE 190.9998 sec   RMSE 350.1140 sec
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 909.000000
LightGBM quantile  MAE 191.0235 sec   RMSE 349.8539 sec


OSRM SECONDS and KM Predicted

In [61]:
OSRM_PRED_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_sec", "km_pred",
    "hour", "dow",   "month_idx", "doy",
]

In [62]:
TARGET_TRUE = "duration_sec"
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()
work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows

X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    work[OSRM_PRED_FEATURES], work[TARGET_TRUE], test_size=0.20, random_state=SEED)

In [63]:
for name, model in MODELS.items():
    model.fit(X_train_true, y_train_true)
    pred = model.predict(X_test_true)
    mae  = mean_absolute_error(y_test_true, pred)
    rmse = root_mean_squared_error(y_test_true, pred)
    results.append((name+"OSRM SECONDS & PREDICTED", mae, rmse))
    print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 10
[LightGBM] [Info] Start training from score 909.000000
LightGBM         MAE 190.6059 sec   RMSE 349.4547 sec
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 10
[LightGBM] [Info] Start training from score 909.000000
LightGBM quantile  MAE 190.6752 sec   RMSE 349.6670 sec


OSRM SECONDS and OSRM Predicted

In [64]:
OSRM_PRED_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_sec", "osrm_km",
    "hour", "dow",   "month_idx", "doy",
]

In [65]:
TARGET_TRUE = "duration_sec"
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()
work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows

X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    work[OSRM_PRED_FEATURES], work[TARGET_TRUE], test_size=0.20, random_state=SEED)

In [66]:
for name, model in MODELS.items():
    model.fit(X_train_true, y_train_true)
    pred = model.predict(X_test_true)
    mae  = mean_absolute_error(y_test_true, pred)
    rmse = root_mean_squared_error(y_test_true, pred)
    results.append((name+"OSRM SECONDS & OSRM KM", mae, rmse))
    print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 10
[LightGBM] [Info] Start training from score 909.000000
LightGBM         MAE 190.7554 sec   RMSE 349.5183 sec
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 10
[LightGBM] [Info] Start training from score 909.000000
LightGBM quantile  MAE 190.5008 sec   RMSE 349.4414 sec


In [67]:
OSRM_PRED_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_sec", "osrm_km", "km_pred",
    "hour", "dow",   "month_idx", "doy",
]

In [68]:
TARGET_TRUE = "duration_sec"
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()
work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows

X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    work[OSRM_PRED_FEATURES], work[TARGET_TRUE], test_size=0.20, random_state=SEED)

In [69]:
for name, model in MODELS.items():
    model.fit(X_train_true, y_train_true)
    pred = model.predict(X_test_true)
    mae  = mean_absolute_error(y_test_true, pred)
    rmse = root_mean_squared_error(y_test_true, pred)
    results.append((name+"OSRM SECONDS & OSRM KM & KM PRED", mae, rmse))
    print(f"{name:15}  MAE {mae:5.4f} sec   RMSE {rmse:5.4f} sec")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2155
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 11
[LightGBM] [Info] Start training from score 909.000000
LightGBM         MAE 190.4787 sec   RMSE 349.4277 sec
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2155
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 11
[LightGBM] [Info] Start training from score 909.000000
LightGBM quantile  MAE 190.6382 sec   RMSE 349.5403 sec


Using the OSRM Time Calculated by the OSRM Route

In [70]:
mask = df["osrm_sec"].notna()            # keep only rows with a value
coverage = mask.mean() * 100
print(f"OSRM coverage: {coverage:.1f}%  ({mask.sum():,}/{len(df):,})")

mae  = mean_absolute_error(df.loc[mask, "duration_sec"],
                           df.loc[mask, "osrm_sec"])
rmse = root_mean_squared_error(df.loc[mask, "duration_sec"],
                          df.loc[mask, "osrm_sec"])

print(f"MAE  {mae:.1f} sec   RMSE  {rmse:.1f} sec")
results.append(("OSRM Seconds no change", mae, rmse))

OSRM coverage: 100.0%  (147,482/147,498)
MAE  363.6 sec   RMSE  562.8 sec


Using a Congestion Ratio to Adjust OSRM Time Using the Predicted KM

In [71]:
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()

work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows
work["cong_ratio"] = work["duration_sec"] / work["osrm_sec"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "km_pred",
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "osrm_sec"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Ratio Duration OSRM MAE  {mae:.1f} sec   RMSE  {rmse:.1f} sec")
results.append(("Ratio OSRM Duration using Predicted km", mae, rmse))

# X_train already contains the columns in the order LightGBM saw them
fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 1.378498
Ratio Duration OSRM MAE  189.2 sec   RMSE  348.6 sec

🔎  Top features (gain normalised):
end_lng      14.55
begin_lng    14.26
begin_lat    13.72
end_lat      13.55
doy          11.51
km_pred       9.89
month_idx     8.88
hour          8.70
dow           4.94
Name: gain, dtype: float64


In [72]:
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()

work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows
work["cong_ratio"] = work["duration_sec"] / work["osrm_sec"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "km_pred", "osrm_km", "osrm_sec",
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "osrm_sec"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Ratio Duration All 3 OSRM MAE  {mae:.1f} sec   RMSE  {rmse:.1f} sec")
results.append(("Ratio OSRM Duration using all 3", mae, rmse))

# X_train already contains the columns in the order LightGBM saw them
fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2155
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 11
[LightGBM] [Info] Start training from score 1.378498
Ratio Duration All 3 OSRM MAE  189.0 sec   RMSE  347.4 sec

🔎  Top features (gain normalised):
begin_lat    12.97
end_lat      12.85
end_lng      12.57
begin_lng    12.42
doy          10.45
month_idx     8.65
hour          8.34
osrm_sec      7.31
osrm_km       5.08
km_pred       4.90
dow           4.46
Name: gain, dtype: float64


Using a Congestion Ratio to Adjust OSRM Time and only using the OSRM Distance

In [73]:
good = df["osrm_sec"].notna() & df["duration_sec"].notna() & df["km_pred"].notna()

work = df.loc[good].copy()                     # 140 k → e.g. 132 k rows
work["cong_ratio"] = work["duration_sec"] / work["osrm_sec"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
   # "origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",          # or haversine_km
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "osrm_sec"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Ratio Duration OSRM MAE  {mae:.1f} sec   RMSE  {rmse:.1f} sec")
results.append(("Ratio OSRM Duration using OSRM km", mae, rmse))


fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 1.378498
Ratio Duration OSRM MAE  189.1 sec   RMSE  346.8 sec

🔎  Top features (gain normalised):
begin_lng    14.21
begin_lat    14.19
end_lng      14.16
end_lat      13.95
doy          11.16
osrm_km      10.79
month_idx     8.52
hour          8.37
dow           4.67
Name: gain, dtype: float64


Using London Average Drive Speed and Predicted KM to Engineer Data

In [74]:
df = df.drop(columns="kmh", errors="ignore")
df = df.merge(speed_tbl, on=["hour", "dow_text"], how="left")

missing = df["kmh"].isna().mean()
print(f"Coverage after merge: {(1-missing):.5%}")

df["eng_sec_pred"] = df["km_pred"] / df["kmh"] * 3600
df["eng_sec_osrm"] = df["osrm_km"] / df["kmh"] * 3600


Coverage after merge: 100.00000%


In [75]:
mask = df["eng_sec_pred"].notna()            # keep only rows with a value
coverage = mask.mean() * 100
print(f"OSRM coverage: {coverage:.1f}%  ({mask.sum():,}/{len(df):,})")

mae  = mean_absolute_error(df.loc[mask, "duration_sec"],
                           df.loc[mask, "eng_sec_pred"])
rmse = root_mean_squared_error(df.loc[mask, "duration_sec"],
                          df.loc[mask, "eng_sec_pred"])

print(f"Engineered Seconds with Traffic and Predicted Distance no ML  {mae:.1f} sec   RMSE  {rmse:.1f} sec")
results.append(("Engineered Seconds with Traffic and Predicted Distance no ML ", mae, rmse))

OSRM coverage: 99.6%  (146,955/147,498)
Engineered Seconds with Traffic and Predicted Distance no ML  296.2 sec   RMSE  472.9 sec


Using London Average Drive Speed and OSRM KM to Engineer Data

In [76]:
mask = df["eng_sec_osrm"].notna()            # keep only rows with a value
coverage = mask.mean() * 100
print(f"OSRM coverage: {coverage:.1f}%  ({mask.sum():,}/{len(df):,})")

mae  = mean_absolute_error(df.loc[mask, "duration_sec"],
                           df.loc[mask, "eng_sec_osrm"])
rmse = root_mean_squared_error(df.loc[mask, "duration_sec"],
                          df.loc[mask, "eng_sec_osrm"])

print(f"Engineered Seconds with Traffic and OSRM Distance no ML  {mae:.1f} sec   RMSE  {rmse:.1f} sec")
results.append(("Engineered Seconds with Traffic and OSRM Distance no ML ", mae, rmse))

OSRM coverage: 100.0%  (147,482/147,498)
Engineered Seconds with Traffic and OSRM Distance no ML  283.5 sec   RMSE  467.0 sec


Engineered Time using Pred KM & Traffic - Feature includes Predicted KM


In [77]:
good = df["eng_sec_pred"].notna() & df["duration_sec"].notna()
work = df.loc[good].copy()
work["cong_ratio"] = work["duration_sec"] / work["eng_sec_pred"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "km_pred",          # or haversine_km
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "eng_sec_pred"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Duration model  MAE {mae:.1f} s   RMSE {rmse:.1f} s")
results.append(("Ratio Engineered Time using Pred KM & Traffic - Feature includes Predicted KM", mae, rmse))

fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 0.890117
Duration model  MAE 189.7 s   RMSE 347.5 s

🔎  Top features (gain normalised):
begin_lat    13.99
begin_lng    13.88
end_lat      13.36
end_lng      13.29
doy          11.62
km_pred      10.37
hour          9.85
month_idx     8.74
dow           4.89
Name: gain, dtype: float64


Ratio Engineered Time using OSRM KM & Traffic - Feature includes Predicted KM


In [78]:
good = df["eng_sec_osrm"].notna() & df["duration_sec"].notna()
work = df.loc[good].copy()
work["cong_ratio"] = work["duration_sec"] / work["eng_sec_osrm"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
   # "origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "km_pred",          # or haversine_km
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "eng_sec_osrm"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Duration model  MAE {mae:.1f} s   RMSE {rmse:.1f} s")
results.append(("Ratio Engineered Time using OSRM KM & Traffic - Feature includes Predicted KM", mae, rmse))

fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 147,482 clean rows (coverage 100.0%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117985, number of used features: 9
[LightGBM] [Info] Start training from score 0.950617
Duration model  MAE 206.3 s   RMSE 1306.5 s

🔎  Top features (gain normalised):
begin_lat    14.09
end_lng      13.42
end_lat      13.40
begin_lng    13.19
km_pred      11.89
doy          11.30
hour          9.39
month_idx     8.54
dow           4.78
Name: gain, dtype: float64


Ratio Engineered Time using Pred KM & Traffic - Feature includes OSRM KM

In [79]:
good = df["eng_sec_pred"].notna() & df["duration_sec"].notna()
work = df.loc[good].copy()
work["cong_ratio"] = work["duration_sec"] / work["eng_sec_pred"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
   # "origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",          # or haversine_km
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "eng_sec_pred"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Duration model  MAE {mae:.1f} s   RMSE {rmse:.1f} s")
results.append(("Ratio Engineered Time using Pred KM & Traffic - Feature includes OSRM KM", mae, rmse))

fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 0.890117
Duration model  MAE 189.8 s   RMSE 347.2 s

🔎  Top features (gain normalised):
begin_lat    13.84
begin_lng    13.60
end_lng      13.34
end_lat      13.15
doy          11.68
osrm_km      10.28
hour         10.22
month_idx     9.04
dow           4.85
Name: gain, dtype: float64


Ratio Engineered Time using OSRM KM & Traffic - Feature includes OSRM KM

In [80]:
good = df["eng_sec_osrm"].notna() & df["duration_sec"].notna()
work = df.loc[good].copy()
work["cong_ratio"] = work["duration_sec"] / work["eng_sec_osrm"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

# ── 2. Features & target ─────────────────────────────────────────────────
FEATURES_RATIO = [
  #  "origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "km_pred",          # or haversine_km
    "hour","dow", "month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

# ── 3. Train LightGBM on ratio ───────────────────────────────────────────
ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

# ── 4. Predict durations for the test split ──────────────────────────────
pred_ratio    = ratio_model.predict(X_test)
dur_pred_test = work.loc[X_test.index, "eng_sec_osrm"] * pred_ratio
dur_true_test = work.loc[X_test.index, "duration_sec"]

mae  = mean_absolute_error(dur_true_test, dur_pred_test)
rmse = root_mean_squared_error(dur_true_test, dur_pred_test)

print(f"Duration model  MAE {mae:.1f} s   RMSE {rmse:.1f} s")
results.append(("Ratio Engineered Time using OSRM KM & Traffic - Feature includes OSRM KM", mae, rmse))

fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

Training on 147,482 clean rows (coverage 100.0%)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117985, number of used features: 9
[LightGBM] [Info] Start training from score 0.950617
Duration model  MAE 206.3 s   RMSE 1306.5 s

🔎  Top features (gain normalised):
begin_lat    14.09
end_lng      13.42
end_lat      13.40
begin_lng    13.19
km_pred      11.89
doy          11.30
hour          9.39
month_idx     8.54
dow           4.78
Name: gain, dtype: float64


INCLUDES THE ERROR AND RATIO OF DISTANCES

In [81]:
BASE = [
    "begin_lat","begin_lng","end_lat","end_lng",
    "osrm_sec",
    "hour","dow", "month_idx","doy"
]

df["ratio_len"]   = df["km_pred"] / df["osrm_km"]
df["err_len_km"]  = (df["km_pred"] - df["osrm_km"]).abs()

FEATURES_DUR = BASE + ["ratio_len", "err_len_km"]
TARGET_DUR   = "duration_sec"

X_tr, X_te, y_tr, y_te = train_test_split(
        df[FEATURES_DUR], df[TARGET_DUR], test_size=0.2, random_state=SEED)

duration_model = lgb.LGBMRegressor(
        objective="regression_l1",
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=127,
        subsample=0.8,
        min_data_in_leaf=30,
        random_state=SEED,
).fit(X_tr, y_tr)


mae = mean_absolute_error(y_te, duration_model.predict(X_te))
rmse = root_mean_squared_error(y_te, duration_model.predict(X_te))

print(f"INCLUDES THE ERROR AND RATIO OF DISTANCES {mae:.1f} s   RMSE {rmse:.1f} s")
results.append(("Ratio of OSRM and Predicted and Error Bias", mae, rmse))

fi = (pd.Series(duration_model.feature_importances_,
                index=X_tr.columns,
                name="gain")
        .sort_values(ascending=False))

print("\n🔎  Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   # show top-20 %

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 117998, number of used features: 11
[LightGBM] [Info] Start training from score 910.000000
INCLUDES THE ERROR AND RATIO OF DISTANCES 190.6 s   RMSE 357.1 s

🔎  Top features (gain normalised):
begin_lng     11.78
begin_lat     11.62
end_lng       11.55
end_lat       11.45
doy           10.62
hour           8.31
osrm_sec       7.75
ratio_len      7.72
month_idx      7.53
err_len_km     7.41
dow            4.26
Name: gain, dtype: float64


In [82]:
import os
pd.set_option("display.max_colwidth", None)  # show full text in any column

leader = (
    pd.DataFrame(results, columns=["Model","MAE","RMSE"])
      .sort_values("MAE")
      .reset_index(drop=True)
)

leader
PROJECT_ROOT = Path.cwd().resolve().parents[0]   
out = PROJECT_ROOT / "duration" / "duration_results" / "v4" / f"leader_{SEED}_{EXTENSION}.csv"
os.makedirs(out.parent, exist_ok=True) 
leader.to_csv(out, index=False)
print("Wrote:", out)

Wrote: C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\models\duration\duration_results\v4\leader_42_original_0075_v2.csv
