In [140]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline      import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, ElasticNetCV
from sklearn.neighbors     import KNeighborsRegressor
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import (
    GradientBoostingRegressor, AdaBoostRegressor,
    RandomForestRegressor, ExtraTreesRegressor
)
from xgboost               import XGBRegressor
import lightgbm as lgb
from sklearn.metrics       import mean_absolute_error, root_mean_squared_error
import pandas as pd, numpy as np
from pathlib import Path

In [141]:
PROJECT_ROOT = Path.cwd().resolve().parents[1]     
COMBINED_DIR   = PROJECT_ROOT / "combined_path"
LARGER_DIR = COMBINED_DIR / "new_test" / "original"
CELL_FILE_ADDITION = "original_0075_v2"


PARQUET_PATH = LARGER_DIR / f"trips_with_price_duration_{CELL_FILE_ADDITION}_km_osrm.parquet"
SEED = 80
results = []

df = pd.read_parquet(PARQUET_PATH)
print(len(df))

147498


In [142]:
df["dow"]        = df["begintrip_timestamp_london"].dt.dayofweek
df["month_idx"]  = (
    df["begintrip_timestamp_london"].dt.year * 12 +
    df["begintrip_timestamp_london"].dt.month
)
df["doy"] = df["begintrip_timestamp_london"].dt.dayofyear

dow_map = {0:"Monday", 1:"Tuesday", 2:"Wednesday", 3:"Thursday",
           4:"Friday", 5:"Saturday", 6:"Sunday"}

df["dow_text"]       = df["begintrip_timestamp_london"].dt.dayofweek.map(dow_map)

In [143]:
print(f"Loaded {len(df):,} rows")
df.head()

Loaded 147,498 rows


Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,pay_after_uber_cut,driver_id_offline_online,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,dow_text
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,6.62,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,2.789894,4.489888,398.700012,3.4857,3,24196,119,Thursday
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,7.1,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,3.236049,5.207903,626.099976,5.0948,3,24196,119,Thursday
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,4.53,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.491487,2.400309,302.100006,1.9474,3,24196,119,Thursday
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,4.68,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.64228,2.642988,389.100006,2.6575,3,24196,119,Thursday
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,14.74,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,8.20427,13.203459,1175.900024,7.8614,3,24196,119,Thursday


# Test on Strength of OSRM Distance

Raw OSRM Distance

In [None]:
distance_results = []
mask = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) 
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
)
coverage = mask.mean() * 100
print(f"OSRM coverage: {coverage:.1f}%  ({mask.sum():,}/{len(df):,})")

mae  = mean_absolute_error(df.loc[mask, "trip_distance_km"],
                           df.loc[mask, "osrm_km"])
rmse = root_mean_squared_error(df.loc[mask, "trip_distance_km"],
                          df.loc[mask, "osrm_km"])

print(f"MAE  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM KM No Change", mae, rmse))

OSRM coverage: 99.6%  (146,956/147,498)
MAE  0.7 km   RMSE  1.9 km


In [145]:
DISTANCE_PREDICTION_FEATURES = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",
    "hour", "dow",  "month_idx", "doy",
]

In [146]:
TARGET_TRUE = "trip_distance_km"
X_train_true, X_test_true, y_train_true, y_test_true = train_test_split(
    df[DISTANCE_PREDICTION_FEATURES], df[TARGET_TRUE], test_size=0.20, random_state=SEED)

BASELINE MEDIAN STRENGTH

In [None]:
mean_distance  = df["trip_distance_km"].mean()
median_distance = df["trip_distance_km"].median()
print(f"Mean distance   {mean_distance:,.4f}")
print(f"Median distance {median_distance:,.4f}")

# naive constant model = predict training mean for every test row
const_pred = np.full_like(y_test_true, fill_value=y_test_true.median())
baseline_mae  = mean_absolute_error(y_test_true, const_pred)
baseline_rmse = root_mean_squared_error(y_test_true, const_pred)
print(f"Constant-median baseline  MAE {baseline_mae:.4f}  RMSE {baseline_rmse:.4f}")
distance_results.append(("Constant-median baseline  MAE", baseline_mae, baseline_rmse))

Mean distance   5.8305
Median distance 4.7610
Constant-median baseline  MAE 2.8732  RMSE 4.2533


OSRM in Features and Ratio Test

In [None]:
good = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) 
    & (df["osrm_km"] > 0.05)       
    & (df["trip_distance_km"] > 0.05) 
)

work = df.loc[good].copy()
print(len(df))
print(len(work))
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")


FEATURES_RATIO = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1", 
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


pred_ratio    = ratio_model.predict(X_test)
km_pred = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM in Feature, Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM in Feature, Ratio", mae, rmse))


fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20)) 

147498
146956
Training on 146,956 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 1.059785
OSRM in Feature, Ratio  0.6 km   RMSE  1.9 km

🔎  Top features (gain normalised):
end_lng      14.02
begin_lng    13.73
end_lat      13.05
begin_lat    12.98
doy          12.09
osrm_km      12.07
month_idx    10.43
hour          7.66
dow           3.97
Name: gain, dtype: float64


OSRM NOT IN FEATURE & RATIO

In [None]:
good = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) 
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
)

work = df.loc[good].copy()                    
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")


FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    #"osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)


ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


pred_ratio    = ratio_model.predict(X_test)
km_pred = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM not in Feature, Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM not in Feature, Ratio", mae, rmse))



fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))  




Training on 146,956 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 8
[LightGBM] [Info] Start training from score 1.059785
OSRM not in Feature, Ratio  0.6 km   RMSE  1.9 km

🔎  Top features (gain normalised):
end_lat      16.25
begin_lng    15.42
end_lng      15.24
begin_lat    14.56
doy          13.69
month_idx    11.61
hour          8.49
dow           4.73
Name: gain, dtype: float64


OSRM LOG IN FEATURE LIST AND RATIO

In [None]:
good = (     
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &     
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"])      
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
)

work = df.loc[good].copy()                    
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

work["log_osrm_km"] = np.log1p(work["osrm_km"])


FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "log_osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)


ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


pred_ratio    = ratio_model.predict(X_test)
km_pred = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM Log in Feature, Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM Log in Feature, Ratio", mae, rmse))



fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   




Training on 146,956 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 1.059785
OSRM Log in Feature, Ratio  0.6 km   RMSE  1.9 km

🔎  Top features (gain normalised):
end_lng        14.02
begin_lng      13.73
end_lat        13.05
begin_lat      12.98
doy            12.09
log_osrm_km    12.07
month_idx      10.43
hour            7.66
dow             3.97
Name: gain, dtype: float64


LOG RATIO, NO OSRM IN FEATURE LIST

In [None]:
good = (     
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &     
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"])      
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
)

work = df.loc[good].copy()                     
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

work["log_ratio"] = np.log(work["cong_ratio"])      

FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    #"osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "log_ratio"


X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


pred_ratio = np.exp(ratio_model.predict(X_test))     
km_pred    = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true    = work.loc[X_test.index, "trip_distance_km"]

mae = mean_absolute_error(km_true, km_pred)
rmse= root_mean_squared_error(km_true, km_pred)
print(f"OSRM not in Feature, Log Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM not in Feature, Log Ratio", mae, rmse))


fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 8
[LightGBM] [Info] Start training from score 0.058066
OSRM not in Feature, Log Ratio  0.6 km   RMSE  2.0 km

🔎  Top features (gain normalised):
end_lng      16.63
end_lat      16.10
begin_lng    15.90
begin_lat    15.32
doy          12.60
month_idx    11.03
hour          8.16
dow           4.25
Name: gain, dtype: float64


LOG RATIO, OSRM IN LIST

In [None]:
good = (     
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &     
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"])      
    & (df["osrm_km"] > 0.05)          
    & (df["trip_distance_km"] > 0.05) 
)

work = df.loc[good].copy()                    
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

work["log_ratio"] = np.log(work["cong_ratio"])       

FEATURES_RATIO = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "log_ratio"


X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


pred_ratio = np.exp(ratio_model.predict(X_test))     
km_pred    = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true    = work.loc[X_test.index, "trip_distance_km"]

mae = mean_absolute_error(km_true, km_pred)
rmse= root_mean_squared_error(km_true, km_pred)
print(f"OSRM in Feature, Log Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM in Feature, Log Ratio", mae, rmse))


fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))  



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 0.058066
OSRM in Feature, Log Ratio  0.6 km   RMSE  1.9 km

🔎  Top features (gain normalised):
end_lng      13.90
end_lat      13.48
begin_lng    13.42
begin_lat    12.90
osrm_km      12.87
doy          11.97
month_idx    10.11
hour          7.44
dow           3.91
Name: gain, dtype: float64


LOG RATIO, LOG OSRM IN Feature List

In [None]:
good = (     
    df["osrm_km"].notna() & df["trip_distance_km"].notna() &     
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"])      
    & (df["osrm_km"] > 0.05)           
    & (df["trip_distance_km"] > 0.05)
)

work = df.loc[good].copy()                    
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

work["log_ratio"] = np.log(work["cong_ratio"])       

work["log_osrm_km"] = np.log1p(work["osrm_km"])
FEATURES_RATIO = [
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "log_osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "log_ratio"


X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


pred_ratio = np.exp(ratio_model.predict(X_test))    
km_pred    = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true    = work.loc[X_test.index, "trip_distance_km"]

mae = mean_absolute_error(km_true, km_pred)
rmse= root_mean_squared_error(km_true, km_pred)
print(f"OSRM Log in Feature, Log Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM Log in Feature, Log Ratio", mae, rmse))

fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20)) 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 0.058066
OSRM Log in Feature, Log Ratio  0.6 km   RMSE  1.9 km

🔎  Top features (gain normalised):
end_lng        13.90
end_lat        13.48
begin_lng      13.42
begin_lat      12.90
log_osrm_km    12.87
doy            11.97
month_idx      10.11
hour            7.44
dow             3.91
Name: gain, dtype: float64


In [None]:
good = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() & df["osrm_sec"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) & np.isfinite(df["osrm_sec"]) 
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
    & (df["osrm_sec"] > 5)
)

work = df.loc[good].copy()                     
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")


FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km", "osrm_sec",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

pred_ratio    = ratio_model.predict(X_test)
km_pred = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM  km and sec , Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM km and sec Feature, Ratio", mae, rmse))


fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   


Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 10
[LightGBM] [Info] Start training from score 1.059593
OSRM  km and sec , Ratio  0.6 km   RMSE  2.0 km

🔎  Top features (gain normalised):
begin_lng    12.82
begin_lat    12.75
end_lat      12.35
end_lng      12.33
doy          11.23
osrm_km       9.52
month_idx     9.37
osrm_sec      8.97
hour          7.00
dow           3.67
Name: gain, dtype: float64


In [None]:
good = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() & df["osrm_sec"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) & np.isfinite(df["osrm_sec"]) 
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
    & (df["osrm_sec"] > 5)
)

work = df.loc[good].copy()                   
work["cong_ratio"] = work["trip_distance_km"] / work["osrm_km"]

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")

FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_sec",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "cong_ratio"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)

ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)

pred_ratio    = ratio_model.predict(X_test)
km_pred = work.loc[X_test.index, "osrm_km"] * pred_ratio
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM  sec , Ratio  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM sec in Feature, Ratio", mae, rmse))


fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20)) 

Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005851 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 1.059593
OSRM  sec , Ratio  0.6 km   RMSE  2.0 km

🔎  Top features (gain normalised):
end_lat      13.57
begin_lat    13.56
end_lng      13.26
doy          12.69
begin_lng    12.52
osrm_sec     12.36
month_idx     9.98
hour          7.75
dow           4.31
Name: gain, dtype: float64


In [None]:
good = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() & df["osrm_sec"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) & np.isfinite(df["osrm_sec"]) 
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
    & (df["osrm_sec"] > 5)
)

work = df.loc[good].copy()                    

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")


FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "trip_distance_km"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)


ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


km_pred = ratio_model.predict(X_test)
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM  km , Direct  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM km Feature, Direct", mae, rmse))



fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   




Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 9
[LightGBM] [Info] Start training from score 4.765678
OSRM  km , Direct  0.6 km   RMSE  2.0 km

🔎  Top features (gain normalised):
begin_lat    13.99
end_lat      13.69
end_lng      13.43
begin_lng    12.85
osrm_km      12.07
doy          12.07
month_idx    11.10
hour          7.22
dow           3.58
Name: gain, dtype: float64


In [None]:
good = (
    df["osrm_km"].notna() & df["trip_distance_km"].notna() & df["osrm_sec"].notna() &
    np.isfinite(df["osrm_km"]) & np.isfinite(df["trip_distance_km"]) & np.isfinite(df["osrm_sec"]) 
    & (df["osrm_km"] > 0.05)         
    & (df["trip_distance_km"] > 0.05) 
    & (df["osrm_sec"] > 5)
)

work = df.loc[good].copy()                    

print(f"Training on {len(work):,} clean rows "
      f"(coverage {len(work)/len(df):.1%})")


FEATURES_RATIO = [
    #"origin_row","origin_col","dest_row","dest_col",
    "begin_lat",  "begin_lng",  "end_lat",  "end_lng",
    "osrm_km", "osrm_sec",
    "hour","dow","month_idx","doy"
]
TARGET_RATIO = "trip_distance_km"

X_train, X_test, y_train, y_test = train_test_split(
    work[FEATURES_RATIO], work[TARGET_RATIO], test_size=0.2, random_state=SEED
)


ratio_model = lgb.LGBMRegressor(
        objective       = "regression_l1",  # MAE loss for time
        n_estimators    = 800,
        learning_rate   = 0.05,
        num_leaves      = 63,
        subsample       = 0.8,
        min_data_in_leaf= 50,
        max_depth       = -1,
        random_state    = SEED,
).fit(X_train, y_train)


km_pred = ratio_model.predict(X_test)
km_true = work.loc[X_test.index, "trip_distance_km"]

mae  = mean_absolute_error(km_true, km_pred)
rmse = root_mean_squared_error(km_true, km_pred)

print(f"OSRM  km sec, Direct  {mae:.1f} km   RMSE  {rmse:.1f} km")
distance_results.append(("OSRM km sec Feature, Direct", mae, rmse))



fi = (pd.Series(ratio_model.feature_importances_,
                index=X_train.columns,
                name="gain")
        .sort_values(ascending=False))

print("Top features (gain normalised):")
print((fi / fi.sum() * 100).round(2).head(20))   




Training on 146,955 clean rows (coverage 99.6%)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 117564, number of used features: 10
[LightGBM] [Info] Start training from score 4.765678
OSRM  km sec, Direct  0.6 km   RMSE  2.0 km

🔎  Top features (gain normalised):
begin_lat    13.05
end_lat      12.98
begin_lng    11.76
end_lng      11.28
doy          10.46
osrm_km      10.42
month_idx    10.30
osrm_sec      9.76
hour          6.58
dow           3.41
Name: gain, dtype: float64


In [158]:
leader = (
    pd.DataFrame(distance_results, columns=["Model","MAE","RMSE"])
      .sort_values("MAE")
      .reset_index(drop=True)
)
leader

import os
EXTENSION = "original_v2"
PROJECT_ROOT = Path.cwd().resolve().parents[0]   
out = PROJECT_ROOT / "distance" / "distance_results" / "v3" / f"leader_{SEED}_{EXTENSION}.csv"
os.makedirs(out.parent, exist_ok=True) 
leader.to_csv(out, index=False)
print("Wrote:", out)

Wrote: C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\models\distance\distance_results\v3\leader_80_original_v2.csv
