In [52]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline      import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, ElasticNetCV
from sklearn.neighbors     import KNeighborsRegressor
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import (
    GradientBoostingRegressor, AdaBoostRegressor,
    RandomForestRegressor, ExtraTreesRegressor
)
from xgboost               import XGBRegressor
import lightgbm as lgb
from sklearn.metrics       import mean_absolute_error, root_mean_squared_error
import pandas as pd, numpy as np
import json, joblib
from pathlib import Path
from sklearn.model_selection import KFold
from itertools import combinations

In [None]:
# --- Reproducibility -------------------------------------------------------
SEED = 10

# --- Input locations -------------------------------------------------------
# The project root is taken to be two directory levels above the current
# working directory, so this cell depends on where the kernel was started.
PROJECT_ROOT = Path.cwd().resolve().parents[1]
COMBINED_DIR = PROJECT_ROOT.joinpath("combined_path")
OG_DIR = COMBINED_DIR.joinpath("new_test", "original")

# Tag embedded in the input parquet name (and reused for output file names).
CELL_FILE_ADDITION = "original_0075_v2"
PARQUET_PATH = OG_DIR.joinpath(
    f"trips_{CELL_FILE_ADDITION}_with_predicted_distance_time.parquet"
)

# --- Load ------------------------------------------------------------------
df = pd.read_parquet(PARQUET_PATH)

CREATE ALL ADDITIONAL COLUMNS NEEDED FOR PREDICTION

In [54]:
# Derive calendar features from the trip start timestamp.
trip_start = df["begintrip_timestamp_london"]

# "hour" is listed in the model feature set used later in this notebook but was
# never derived in this cell. Create it only when the parquet does not already
# provide it, so an existing column is never overwritten.
if "hour" not in df.columns:
    df["hour"] = trip_start.dt.hour

df["dow"] = trip_start.dt.dayofweek
# Running month counter (year * 12 + month), so it keeps increasing across years.
df["month_idx"] = trip_start.dt.year * 12 + trip_start.dt.month
df["doy"] = trip_start.dt.dayofyear

print(f"Loaded {len(df):,} rows")
df.head()

Loaded 147,498 rows


Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,driver_id_offline_online,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,km_pred,sec_pred
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,2.789894,4.489888,398.700012,3.4857,3,24196,119,3.625828,664.820351
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,3.236049,5.207903,626.099976,5.0948,3,24196,119,5.318336,955.559975
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,1.491487,2.400309,302.100006,1.9474,3,24196,119,2.202951,542.938647
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,1.64228,2.642988,389.100006,2.6575,3,24196,119,2.887826,690.397036
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49441ef9f285092305b6,8.20427,13.203459,1175.900024,7.8614,3,24196,119,8.588139,1328.159043


# LOOP TO TEST ALL COMBINATIONS

GLOBAL VARIABLES

In [55]:
# Leaderboard rows, appended by the evaluation helpers as (label, MAE, RMSE).
RESULTS = []

# Candidate regressors evaluated on the untransformed target.
# Every active model pins random_state=SEED so the comparison is repeatable.
#
# NOTE(review): per the LightGBM parameter docs, bagging is only active when
# subsample_freq (bagging_freq) is > 0, so subsample=0.8 on the LightGBM
# models below is likely a no-op as configured — confirm before relying on it.
MODELS = {
    # linear & neighbors (disabled)
    # "LinearReg"   : make_pipeline(StandardScaler(), LinearRegression()),
    # "ElasticNet"  : make_pipeline(StandardScaler(),
    #                               ElasticNetCV(l1_ratio=[.1,.5,.9], cv=5)),
    # "KNN (k=8)"   : make_pipeline(StandardScaler(),
    #                               KNeighborsRegressor(n_neighbors=8)),
    # single tree (disabled)
    # "DecisionTree": DecisionTreeRegressor(max_depth=16, min_samples_leaf=4),
    # boosted trees
    # "GradientBoost": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=SEED,
    ),
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=400,
        learning_rate=0.1,
        subsample=0.8,
        max_depth=-1,
        random_state=SEED,  # previously unseeded, unlike the other models
    ),
    "LightGBM_regressionl1": lgb.LGBMRegressor(
        objective="regression_l1",
        n_estimators=800,
        learning_rate=0.05,
        num_leaves=63,
        subsample=0.8,
        min_data_in_leaf=50,
        max_depth=-1,
        random_state=SEED,
    ),
}

ALL BASE TESTS

In [56]:
def run_base_models(X_train_true, y_train_true, X_test_true, y_test_true, test_name):
    """Fit every estimator in MODELS on the untransformed target and score it.

    For each (name, model) pair in the module-level MODELS dict the model is
    fitted on the training split, evaluated on the test split, and a
    (label, MAE, RMSE) tuple is appended to the module-level RESULTS list.
    The estimators stored in MODELS are fitted in place.

    Parameters
    ----------
    X_train_true, y_train_true : training features and target.
    X_test_true, y_test_true   : held-out features and target.
    test_name : str
        Description of the feature combination; becomes part of the label.
    """
    for name, model in MODELS.items():
        model.fit(X_train_true, y_train_true)
        pred = model.predict(X_test_true)
        mae = mean_absolute_error(y_test_true, pred)
        rmse = root_mean_squared_error(y_test_true, pred)
        # Separate model and feature-set names; they used to be fused together
        # (e.g. "XGBoostnone"), which made the leaderboard hard to read.
        RESULTS.append((f"{name} + {test_name}", mae, rmse))
        print(f"{name:15}  MAE £{mae:5.2f}   RMSE £{rmse:5.2f}")

ALL LOG TESTS

In [57]:
def _score_fares(y_true, pred):
    """Return (MAE, RMSE) of `pred` against `y_true`."""
    return mean_absolute_error(y_true, pred), root_mean_squared_error(y_true, pred)


def _print_top_features(model, feature_names, top_n=20):
    """Print the `top_n` feature importances of a fitted LightGBM model as percentages.

    The model must have been built with importance_type="gain" for the
    "gain" label to be accurate.
    """
    fi = (pd.Series(model.feature_importances_,
                    index=feature_names,
                    name="gain")
            .sort_values(ascending=False))

    print("\n🔎  Top features (gain normalised):")
    print((fi / fi.sum() * 100).round(2).head(top_n))


def run_log_tests(X_train_log, y_train_log, X_test_log, y_test_log, test_name):
    """Fit models on the log1p-transformed target and score them on the original scale.

    Four results are appended to the module-level RESULTS list as
    (label, MAE, RMSE) tuples:
      1. LightGBM with the "fair" objective,
      2. XGBoost with squared-error objective,
      3. a fixed 0.75 / 0.25 blend of (1) and (2),
      4. LightGBM quantile regression at alpha=0.5 (median).

    Predictions are mapped back with expm1 before scoring, so the targets
    passed in must be log1p of the fare.

    Parameters
    ----------
    X_train_log, y_train_log : training features (DataFrame) and log1p target.
    X_test_log, y_test_log   : held-out features and log1p target.
    test_name : str
        Description of the feature combination; becomes part of the labels.
    """
    # Recover the true fares from the log target itself instead of looking
    # them up in the global `df`; computed once and shared by all models.
    y_true = np.expm1(y_test_log)

    # --- LightGBM, "fair" (robust) loss --------------------------------------
    # NOTE(review): per the LightGBM parameter docs, subsample only takes
    # effect when subsample_freq > 0, so it is likely a no-op here — confirm.
    lgb_log_tuned = lgb.LGBMRegressor(
        objective="fair",
        fair_c=1.0,
        n_estimators=800,
        learning_rate=0.05,
        subsample=0.8,
        num_leaves=63,
        min_data_in_leaf=50,
        max_depth=-1,
        random_state=SEED,
        importance_type="gain",   # so feature_importances_ really is gain
    ).fit(X_train_log, y_train_log)

    pred_lgb = np.expm1(lgb_log_tuned.predict(X_test_log))
    mae, rmse = _score_fares(y_true, pred_lgb)
    print("Log-LightGBM and "+test_name+" MAE £{:.4f}  RMSE £{:.4f}".format(
          mae, rmse))
    RESULTS.append(("Log-LightGBM and "+test_name, mae, rmse))
    _print_top_features(lgb_log_tuned, X_train_log.columns)

    # --- XGBoost on the log target -------------------------------------------
    xgb_log = XGBRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=SEED,
    )
    xgb_log.fit(X_train_log, y_train_log)

    pred_log_xgb = np.expm1(xgb_log.predict(X_test_log))
    mae_xgb, rmse_xgb = _score_fares(y_true, pred_log_xgb)
    # The whole message is one f-string; previously only the first fragment
    # was, so the metric placeholders were printed literally.
    print(f"Log-XGB {test_name} MAE £{mae_xgb:.4f}  RMSE £{rmse_xgb:.4f}")
    RESULTS.append(("Log-XGB + "+test_name, mae_xgb, rmse_xgb))

    # --- Fixed-weight blend of the two models above --------------------------
    blend = 0.75*pred_lgb + 0.25*pred_log_xgb
    mae, rmse = _score_fares(y_true, blend)
    print("Blend  MAE £{:.4f}".format(mae))
    print("Blend  RMSE £{:.4f}".format(rmse))
    RESULTS.append(("BLENDED Light GBM and XGBOOST + "+test_name, mae, rmse))

    # --- LightGBM quantile (median) regression on the log target -------------
    lgb_q = lgb.LGBMRegressor(
        objective="quantile",
        alpha=0.5,                 # median
        n_estimators=800,
        learning_rate=0.05,
        num_leaves=63,
        min_data_in_leaf=50,
        random_state=SEED,
        subsample=0.8,
        importance_type="gain",
    )
    lgb_q.fit(X_train_log, y_train_log)
    pred_q = np.expm1(lgb_q.predict(X_test_log))
    mae, rmse = _score_fares(y_true, pred_q)
    print("Light GBM Log Quantile Median Regression "+test_name+" MAE £{:.4f}  RMSE £{:.4f}".format(
          mae, rmse))
    RESULTS.append(("Light GBM Log Quantile Median Regression + "+test_name, mae, rmse))
    _print_top_features(lgb_q, X_train_log.columns)

In [58]:
# Features that are always used, plus optional ones tried in every combination.
BASE_FEATURES = [
    "begin_lat", "begin_lng", "end_lat", "end_lng",
    "hour", "dow", "month_idx", "doy",
]
OPTIONAL = ["km_pred", "sec_pred", "osrm_sec", "osrm_km"]

TARGET_TRUE = "pay_after_uber_cut"
TARGET_LOG = "log_fare"
df[TARGET_LOG] = np.log1p(df[TARGET_TRUE])

# Start from an empty leaderboard so re-running this cell does not append
# duplicate rows to RESULTS.
RESULTS.clear()

# Evaluate every subset of OPTIONAL (including the empty one) on top of BASE_FEATURES.
for r in range(len(OPTIONAL) + 1):
    for combo in combinations(OPTIONAL, r):
        test_name = "+".join(combo) if combo else "none"
        FEATURES = BASE_FEATURES + list(combo)
        print(test_name)

        # Keep only rows where every selected feature is present.
        good = df[FEATURES].notna().all(axis=1)
        work = df.loc[good].copy()

        # Split once, passing both targets, so the raw and log targets are
        # guaranteed to share exactly the same train/test rows.
        (X_train_true, X_test_true,
         y_train_true, y_test_true,
         y_train_log, y_test_log) = train_test_split(
            work[FEATURES], work[TARGET_TRUE], work[TARGET_LOG],
            test_size=0.20, random_state=SEED)
        X_train_log, X_test_log = X_train_true, X_test_true

        run_base_models(X_train_true, y_train_true, X_test_true, y_test_true, test_name)
        run_log_tests(X_train_log, y_train_log, X_test_log, y_test_log, test_name)

none
XGBoost          MAE £ 1.86   RMSE £ 3.33
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 117998, number of used features: 8
[LightGBM] [Info] Start training from score 9.232135
LightGBM         MAE £ 1.94   RMSE £ 3.34
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 117998, number of used features: 8
[LightGBM] [Info] Start training from score 7.790000
LightGBM_regressionl1  MAE £ 1.77   RMSE £ 3.41
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007752 seconds.
You can set

In [59]:
import os
pd.set_option("display.max_colwidth", None)  # show full text in any column

# Leaderboard of every evaluated model / feature combination, best MAE first.
leader = (
    pd.DataFrame(RESULTS, columns=["Model","MAE","RMSE"])
      .sort_values("MAE")
      .reset_index(drop=True)
)

# Use a dedicated name for the output root: this is one level above the
# working directory, whereas PROJECT_ROOT (used to locate the input data) is
# two levels above. Rebinding PROJECT_ROOT here silently changed its meaning
# for any cell run afterwards.
OUTPUT_ROOT = Path.cwd().resolve().parents[0]
out = OUTPUT_ROOT / "price" / "price_results" / "v4" / f"leader_{SEED}_{CELL_FILE_ADDITION}.csv"
os.makedirs(out.parent, exist_ok=True)
leader.to_csv(out, index=False)
print("Wrote:", out)

# Last expression of the cell so the leaderboard is actually displayed.
leader

Wrote: C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\models\price\price_results\v4\leader_10_original_0075_v2.csv
