In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline      import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, ElasticNetCV
from sklearn.neighbors     import KNeighborsRegressor
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import (
    GradientBoostingRegressor, AdaBoostRegressor,
    RandomForestRegressor, ExtraTreesRegressor
)
from xgboost               import XGBRegressor
import lightgbm as lgb
from sklearn.metrics       import mean_absolute_error, root_mean_squared_error
import pandas as pd, numpy as np
import json, joblib
from pathlib import Path
from sklearn.model_selection import KFold
from itertools import combinations

In [9]:
# ── Configuration: every path and constant used by the cells below ─────────
# Project root is taken as two directory levels above the current working
# directory, so this notebook must be launched from its own folder.
PROJECT_ROOT = Path.cwd().resolve().parents[1]
COMBINED_DIR   = PROJECT_ROOT / "combined_path"
OG_DIR = COMBINED_DIR / "new_test" / "original"
# Tag embedded in the input/output parquet file names.
CELL_FILE_ADDITION = "original_0075_v2"

# Input table (already carries the `km_pred` column) and the output table
# that will additionally carry the `sec_pred` column.
PARQUET_PATH    = OG_DIR / f"trips_{CELL_FILE_ADDITION}_with_predicted_distance.parquet"
PARQUET_OUT    = OG_DIR / f"trips_{CELL_FILE_ADDITION}_with_predicted_distance_time.parquet"

# Directory receiving the trained duration-model artefacts; created eagerly
# (including parents) so later dumps cannot fail on a missing folder.
DURATION_MODEL_DIR = PROJECT_ROOT / "models" / "duration" / "model_artifacts" / "duration_v4_original"
DURATION_MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("Reading  :", PARQUET_PATH)
print("Saving to :", DURATION_MODEL_DIR)

# Single seed shared by the K-fold shuffling and the LightGBM models.
SEED = 42



Reading  : C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\trips_original_0075_v2_with_predicted_distance.parquet
Saving to : C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\models\duration\model_artifacts\duration_v4_original


In [10]:
# Load the trip table produced by the distance-prediction step.
df = pd.read_parquet(PARQUET_PATH)

In [11]:
# Sanity check: report the row count and preview the first rows.
print(f"Loaded {len(df):,} rows")
df.head()

Loaded 147,498 rows


Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,pay_after_uber_cut,driver_id_offline_online,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,km_pred
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,6.62,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,2.789894,4.489888,398.700012,3.4857,3,24196,119,3.625828
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,7.1,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,3.236049,5.207903,626.099976,5.0948,3,24196,119,5.318336
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,4.53,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.491487,2.400309,302.100006,1.9474,3,24196,119,2.202951
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,4.68,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.64228,2.642988,389.100006,2.6575,3,24196,119,2.887826
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,14.74,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,8.20427,13.203459,1175.900024,7.8614,3,24196,119,8.588139


In [12]:
def build_duration_feature(df: pd.DataFrame,
                           osrm_col: str = "osrm_sec",
                           true_col: str = "duration_sec",
                           k: int = 5) -> "tuple[pd.DataFrame, lgb.LGBMRegressor]":
    """Add an out-of-fold trip-duration prediction column (`sec_pred`) to `df`.

    The model does not predict seconds directly. It learns the congestion
    ratio ``true_col / osrm_col`` with LightGBM (L1 objective) and multiplies
    the predicted ratio back by ``osrm_col`` to obtain seconds.

    Every usable row receives a prediction from a model that never saw that
    row (K-fold out-of-fold). A final model is then refitted on all usable
    rows and written, with its feature order and metadata, to the
    module-level ``DURATION_MODEL_DIR``.

    Parameters
    ----------
    df : pd.DataFrame
        Trip table. Must contain the feature columns listed in ``FEATURES``
        below, plus ``osrm_col`` and ``true_col``.
    osrm_col : str
        Column holding the routing-engine duration used as the ratio
        denominator.
    true_col : str
        Column holding the observed duration used as the ratio numerator and
        as the evaluation target.
    k : int
        Number of K-fold splits.

    Returns
    -------
    (pd.DataFrame, lgb.LGBMRegressor)
        A copy of `df` with a `sec_pred` column (NaN on rows that could not
        be used), and the final model fitted on all usable rows.

    Raises
    ------
    KeyError
        If a required column is absent from `df`.
    AssertionError
        If some usable row did not receive an out-of-fold prediction.

    Notes
    -----
    Relies on the module-level names ``SEED`` and ``DURATION_MODEL_DIR``.
    """
    FEATURES = [
        "begin_lat", "begin_lng", "end_lat", "end_lng",
        "osrm_km", "osrm_sec", "km_pred",
        "hour", "dow", "month_idx", "doy",
    ]

    # Fail early with one clear message instead of a KeyError mid-training.
    required = list(dict.fromkeys([*FEATURES, osrm_col, true_col]))
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"build_duration_feature: missing columns {missing}")

    # ── 1. subset & target ──────────────────────────
    # Honour the osrm_col / true_col arguments (previously hard-coded), and
    # drop non-positive denominators, which would otherwise give an infinite
    # or undefined ratio target.
    good = (
        df[osrm_col].notna()
        & df[true_col].notna()
        & df["km_pred"].notna()
        & (df[osrm_col] > 0)
    )
    work = df.loc[good].copy()
    work["cong_ratio"] = work[true_col] / work[osrm_col]

    X = work[FEATURES]
    y = work["cong_ratio"]
    osrm_values = work[osrm_col].to_numpy()

    # ── 2. K-fold OOF loop ─────────────────────────
    kf = KFold(n_splits=k, shuffle=True, random_state=SEED)

    # NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only
    # takes effect when `subsample_freq` > 0; with the default of 0 this
    # setting is presumably a no-op. Left unchanged to keep results
    # reproducible — confirm intent.
    params = dict(objective="regression_l1",
                  n_estimators=800,
                  learning_rate=0.05,
                  num_leaves=63,
                  subsample=0.8,
                  min_data_in_leaf=50,
                  max_depth=-1,
                  random_state=SEED)

    # Positional buffer: safe even when the DataFrame index is not unique,
    # unlike label-based `.loc` writes.
    oof_sec = np.full(len(work), np.nan)

    for fold, (tr_idx, va_idx) in enumerate(kf.split(work), 1):
        m = lgb.LGBMRegressor(**params)
        m.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        # predicted ratio × routing-engine seconds → predicted seconds
        oof_sec[va_idx] = m.predict(X.iloc[va_idx]) * osrm_values[va_idx]
        print(f"fold {fold}/{k} done")

    assert not np.isnan(oof_sec).any(), "OOF fill failed"
    work["sec_pred"] = oof_sec

    # ── 3. OOF evaluation ──────────────────────────
    mae = mean_absolute_error(work[true_col], work["sec_pred"])
    rmse = root_mean_squared_error(work[true_col], work["sec_pred"])
    print(f"OOF MAE = {mae:.6f} sec OOF RMSE = {rmse:.6f}  (over {len(work):,} rows)")

    # ── 4. final model on all usable rows + artefacts ──
    final_model = lgb.LGBMRegressor(**params)
    final_model.fit(X, y)

    joblib.dump(final_model, DURATION_MODEL_DIR / "duration_lgbm.joblib")
    final_model.booster_.save_model(str(DURATION_MODEL_DIR / "duration_lgbm.txt"))
    # Context managers so the JSON files are flushed and closed deterministically.
    with open(DURATION_MODEL_DIR / "duration_feature_order.json", "w") as fh:
        json.dump(FEATURES, fh)
    with open(DURATION_MODEL_DIR / "metadata.json", "w") as fh:
        json.dump({"k": k, "mae_sec": float(mae)}, fh, indent=2)

    print("✅  artefacts saved to", DURATION_MODEL_DIR)

    # ── 5. attach predictions to a copy of the input ──
    out = df.copy()
    # Reset first so a pre-existing `sec_pred` column (e.g. from re-running
    # the cell) cannot leave stale values on rows that were not scored.
    out["sec_pred"] = np.nan
    out.loc[good.to_numpy(), "sec_pred"] = oof_sec

    return out, final_model


In [13]:
# Train with 8-fold out-of-fold predictions. `df` is rebound to the returned
# copy that carries the new `sec_pred` column; `duration_model` is the model
# refitted on all usable rows.
df, duration_model = build_duration_feature(df, k=8)
df.head()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2155
[LightGBM] [Info] Number of data points in the train set: 128585, number of used features: 11
[LightGBM] [Info] Start training from score 1.377936
fold 1/8 done
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 128585, number of used features: 11
[LightGBM] [Info] Start training from score 1.377933
fold 2/8 done
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2153
[LightGBM] [Info] Number of data points in the train set: 128585, number of used features:

Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,driver_id_offline_online,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,km_pred,sec_pred
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,2.789894,4.489888,398.700012,3.4857,3,24196,119,3.625828,664.820351
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,3.236049,5.207903,626.099976,5.0948,3,24196,119,5.318336,955.559975
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.491487,2.400309,302.100006,1.9474,3,24196,119,2.202951,542.938647
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.64228,2.642988,389.100006,2.6575,3,24196,119,2.887826,690.397036
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,8.20427,13.203459,1175.900024,7.8614,3,24196,119,8.588139,1328.159043


In [14]:
# Persist the table, now including `sec_pred`, as zstd-compressed parquet.
df.to_parquet(PARQUET_OUT, compression="zstd")
print("Saved →", PARQUET_OUT)

Saved → C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\trips_original_0075_v2_with_predicted_distance_time.parquet
