In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline      import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, ElasticNetCV
from sklearn.neighbors     import KNeighborsRegressor
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import (
    GradientBoostingRegressor, AdaBoostRegressor,
    RandomForestRegressor, ExtraTreesRegressor
)
from xgboost               import XGBRegressor
import lightgbm as lgb
from sklearn.metrics       import mean_absolute_error, root_mean_squared_error
import pandas as pd, numpy as np
import json, joblib
from pathlib import Path
from sklearn.model_selection import KFold
from itertools import combinations

In [2]:
# --- Paths and run configuration ---------------------------------------------
# The project root is taken as two directory levels above the current working
# directory, so this cell assumes the kernel was started from the notebook's
# own folder (two levels below the root).
PROJECT_ROOT = Path.cwd().resolve().parents[1]
COMBINED_DIR = PROJECT_ROOT / "combined_path"
OG_DIR = COMBINED_DIR / "new_test" / "original"
CELL_FILE_ADDITION = "original_0075_v2"

# Input trips table and the output table written at the end of the notebook.
PARQUET_PATH = OG_DIR / f"trips_{CELL_FILE_ADDITION}_with_predicted_distance_time.parquet"
PARQUET_OUT = OG_DIR / f"trips_{CELL_FILE_ADDITION}_with_predicted_information.parquet"

# Directory that receives the fitted price models and their metadata.
PRICE_MODEL_DIR = PROJECT_ROOT / "models" / "price" / "model_artifacts" / "price_v4_original"
PRICE_MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("Reading  :", PARQUET_PATH)
print("Saving to :", PRICE_MODEL_DIR)

# Single seed shared by the CV splitter and both boosted models.
SEED = 42

# The trips table itself is loaded by the next cell; loading it here as well
# read the same parquet file twice.

Reading  : C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\trips_original_0075_v2_with_predicted_distance_time.parquet
Saving to : C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\models\price\model_artifacts\price_v4_original


In [3]:
# Load the trips table (with the previously predicted distance/time columns) from PARQUET_PATH.
df = pd.read_parquet(PARQUET_PATH)

In [4]:
# Sanity check: report the row count and show the first rows of the loaded table.
print(f"Loaded {len(df):,} rows")
df.head()

Loaded 147,498 rows


Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,driver_id_offline_online,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,km_pred,sec_pred
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,2.789894,4.489888,398.700012,3.4857,3,24196,119,3.625828,664.820351
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,3.236049,5.207903,626.099976,5.0948,3,24196,119,5.318336,955.559975
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.491487,2.400309,302.100006,1.9474,3,24196,119,2.202951,542.938647
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,1.64228,2.642988,389.100006,2.6575,3,24196,119,2.887826,690.397036
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,03d0deda558765c2fbf485c57117bf2a3537611151fb49...,8.20427,13.203459,1175.900024,7.8614,3,24196,119,8.588139,1328.159043


In [6]:
def predict_price_blended(models, df_new) -> np.ndarray:
    """Predict prices for new rows with the weighted LightGBM/XGBoost blend.

    Parameters
    ----------
    models : dict
        Must provide the keys ``"features"`` (list of column names),
        ``"lgbm"`` and ``"xgb"`` (fitted objects exposing ``.predict``), and
        ``"w_lgb"`` / ``"w_xgb"`` (blend weights).
    df_new : pandas.DataFrame
        Rows to score; must contain every column listed in
        ``models["features"]``. Extra columns are ignored.

    Returns
    -------
    np.ndarray
        ``w_lgb * expm1(lgbm prediction) + w_xgb * expm1(xgb prediction)``,
        one value per row of ``df_new``.

    Raises
    ------
    AssertionError
        If any feature column is absent; the message lists the missing names.
    """
    feature_cols = models["features"]

    # Name the absent columns so the caller can see what to add.
    missing = [col for col in feature_cols if col not in df_new.columns]
    assert not missing, f"Missing features for inference: {missing}"

    X = df_new[feature_cols]

    # Each model's raw output is mapped through expm1 before blending, so the
    # weights are applied to the back-transformed values, not the raw outputs.
    pred_lgb = np.expm1(models["lgbm"].predict(X))
    pred_xgb = np.expm1(models["xgb"].predict(X))
    return models["w_lgb"] * pred_lgb + models["w_xgb"] * pred_xgb


In [7]:
def build_price_feature(df: pd.DataFrame, k: int = 5):
    """Add an out-of-fold ``price_pred`` column and fit the final price models.

    Rows with a missing value in any feature or in ``pay_after_uber_cut`` are
    excluded from training and do not receive a new prediction. For the
    remaining rows, a LightGBM and an XGBoost regressor are trained per fold
    on ``log1p`` of the (non-negative clipped) target; their ``expm1``
    predictions are blended 0.75 / 0.25 to form the out-of-fold prediction.
    Both models are then refit on all usable rows and saved.

    Side effects: prints progress and OOF metrics, and writes
    ``price_lgbm_log.joblib``, ``price_xgb_log.joblib``,
    ``price_feature_order.json`` and ``metadata.json`` into the module-level
    ``PRICE_MODEL_DIR``. Also reads the module-level ``SEED``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips table containing the feature columns and ``pay_after_uber_cut``.
        It is not modified.
    k : int, default 5
        Number of shuffled ``KFold`` splits.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of ``df`` with ``price_pred`` filled for the usable rows, and a
        dict with keys ``"lgbm"``, ``"xgb"``, ``"features"``, ``"w_lgb"``,
        ``"w_xgb"`` holding the final models and blend settings.
    """
    FEATURES = [
        "begin_lat", "begin_lng", "end_lat", "end_lng",
        "sec_pred", "osrm_km",
        "hour", "dow", "month_idx", "doy",
    ]

    # NOTE(review): in LightGBM's sklearn API `subsample` is only applied when
    # `subsample_freq` > 0 (default 0), so as configured no row bagging takes
    # place for the LightGBM model -- confirm whether that is intended.
    LGB_PARAMS = dict(
        objective="fair",
        fair_c=1.0,
        n_estimators=800,
        learning_rate=0.05,
        subsample=0.8,
        num_leaves=63,
        min_data_in_leaf=50,
        max_depth=-1,
        random_state=SEED,
    )

    XGB_PARAMS = dict(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=SEED,
    )

    w_lgb = 0.75
    w_xgb = 0.25
    assert abs((w_lgb + w_xgb) - 1.0) < 1e-9, "Blend weights must sum to 1.0"

    target = "pay_after_uber_cut"
    need = FEATURES + [target]

    # Boolean row mask (aligned with df) of rows complete in features + target.
    good = df[need].notna().all(axis=1)
    work = df.loc[good].copy()

    # log target
    work["log_fare"] = np.log1p(work[target].clip(lower=0))

    X_all, y_all = work[FEATURES], work["log_fare"]

    # OOF container, filled by position so the result does not depend on the
    # DataFrame index being unique.
    oof = np.full(len(work), np.nan)

    kf = KFold(n_splits=k, shuffle=True, random_state=SEED)

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X_all), 1):
        Xtr, ytr = X_all.iloc[tr_idx], y_all.iloc[tr_idx]
        Xva = X_all.iloc[va_idx]

        # LGBM on log target
        mdl_lgb = lgb.LGBMRegressor(**LGB_PARAMS)
        mdl_lgb.fit(Xtr, ytr)
        pred_lgb = np.expm1(mdl_lgb.predict(Xva))

        # XGB on log target
        mdl_xgb = XGBRegressor(**XGB_PARAMS)
        mdl_xgb.fit(Xtr, ytr)
        pred_xgb = np.expm1(mdl_xgb.predict(Xva))

        # Blend in price space
        oof[va_idx] = w_lgb * pred_lgb + w_xgb * pred_xgb

        print(f"fold {fold}/{k} done")

    assert not np.isnan(oof).any(), "OOF fill failed!"

    mae = mean_absolute_error(work[target], oof)
    rmse = root_mean_squared_error(work[target], oof)
    print(f"OOF  MAE £{mae:.3f}   RMSE £{rmse:.3f}   on {len(work):,} rows")

    # Final models are refit on every usable row.
    final_lgb = lgb.LGBMRegressor(**LGB_PARAMS).fit(X_all, y_all)
    final_xgb = XGBRegressor(**XGB_PARAMS).fit(X_all, y_all)

    PRICE_MODEL_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(final_lgb, PRICE_MODEL_DIR / "price_lgbm_log.joblib")
    joblib.dump(final_xgb, PRICE_MODEL_DIR / "price_xgb_log.joblib")

    # Context managers ensure the JSON files are flushed and closed.
    with open(PRICE_MODEL_DIR / "price_feature_order.json", "w", encoding="utf-8") as fh:
        json.dump(FEATURES, fh)
    with open(PRICE_MODEL_DIR / "metadata.json", "w", encoding="utf-8") as fh:
        json.dump(
            {"k": k, "mae": float(mae), "rmse": float(rmse),
             "blend_w": {"lgbm": float(w_lgb), "xgb": float(w_xgb)},
             "model_type": "blend_log_space_models_price_space_blend"},
            fh, indent=2,
        )

    # Write the OOF predictions back through the row mask. Rows that failed the
    # completeness filter keep whatever `price_pred` they already had (NaN when
    # the column is new).
    out = df.copy()
    if "price_pred" not in out.columns:
        out["price_pred"] = np.nan
    out.loc[good.to_numpy(), "price_pred"] = oof

    models = {"lgbm": final_lgb, "xgb": final_xgb,
              "features": FEATURES, "w_lgb": w_lgb, "w_xgb": w_xgb}
    return out, models

In [8]:
# Run 8-fold CV: attaches the out-of-fold `price_pred` column to df and fits/saves the final models.
# NOTE(review): the second return value is the dict of *price* models (keys "lgbm", "xgb", ...),
# so the name `duration_model` is misleading -- consider renaming after checking downstream use.
df, duration_model = build_price_feature(df, k=8)
df.head()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 128585, number of used features: 10
[LightGBM] [Info] Start training from score 2.219904
fold 1/8 done
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1899
[LightGBM] [Info] Number of data points in the train set: 128585, number of used features: 10
[LightGBM] [Info] Start training from score 2.221269
fold 2/8 done
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=tru

Unnamed: 0,origin_row,origin_col,dest_row,dest_col,begin_lat,begin_lng,end_lat,end_lng,haversine_km,begintrip_timestamp_london,...,trip_distance_miles,trip_distance_km,osrm_sec,osrm_km,dow,month_idx,doy,km_pred,sec_pred,price_pred
0,5,12,7,7,51.440338,-0.159358,51.456711,-0.191571,2.880576,2016-04-28 17:23:20+01:00,...,2.789894,4.489888,398.700012,3.4857,3,24196,119,3.625828,664.820351,5.677524
1,6,7,9,11,51.445763,-0.1914,51.47443,-0.167369,3.596283,2016-04-28 17:50:48+01:00,...,3.236049,5.207903,626.099976,5.0948,3,24196,119,5.318336,955.559975,7.482545
2,10,11,12,8,51.479115,-0.16691,51.490761,-0.18379,1.744453,2016-04-28 18:10:50+01:00,...,1.491487,2.400309,302.100006,1.9474,3,24196,119,2.202951,542.938647,5.192629
3,11,7,13,6,51.487488,-0.191229,51.502617,-0.199705,1.781674,2016-04-28 18:24:59+01:00,...,1.64228,2.642988,389.100006,2.6575,3,24196,119,2.887826,690.397036,5.808538
4,5,13,13,8,51.444721,-0.148535,51.502235,-0.186893,6.925133,2016-04-28 21:06:02+01:00,...,8.20427,13.203459,1175.900024,7.8614,3,24196,119,8.588139,1328.159043,9.898572


In [9]:
# Persist the trips table to PARQUET_OUT as a zstd-compressed parquet file.
df.to_parquet(PARQUET_OUT, compression="zstd")
print("Saved →", PARQUET_OUT)

Saved → C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\trips_original_0075_v2_with_predicted_information.parquet
