In [2]:
import pandas as pd
from pathlib import Path
import itertools as it
import numpy as np

PROJECT_ROOT = Path.cwd().resolve().parents[0]     
COMBINED_DIR   = PROJECT_ROOT / "combined_path"
OG_DIR   = COMBINED_DIR / "new_test" / "original"
CELL_FILE_ADDITION = "original_0075_v2"

SRC_PARQUET      = OG_DIR / f"trips_with_price_duration_{CELL_FILE_ADDITION}_km_osrm.parquet"         
DEST_FACTOR      = 6                        
OUT_BASENAME     = OG_DIR / f"origin_dest_hour_lookup_price_{CELL_FILE_ADDITION}" 



def hour_to_group(h: int) -> str:
    if   6 <= h < 10:       return "early_morning"
    elif 10 <= h < 15:      return "mid_day"
    elif 15 <= h < 18:      return "afternoon"
    elif 18 <= h < 22:      return "evening"
    elif h >= 22 or h < 2:  return "night"
    else:                   return "late_night"          

TG_LABELS = [
    "early_morning", "mid_day", "afternoon",
    "evening", "night", "late_night"
]

print(f"Loading {SRC_PARQUET!r} …")
df = pd.read_parquet(SRC_PARQUET)

req = {"origin_row","origin_col","dest_row","dest_col","hour","pay_after_uber_cut"}
missing = req.difference(df.columns.str.lower())
if missing:
    raise ValueError(f"input file is missing columns: {sorted(missing)}")


df["time_group"]      = df["hour"].astype(int).apply(hour_to_group)
df["origin_super_row"] = (df["origin_row"] // DEST_FACTOR).astype("int16")
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).astype("int16")
df["dest_super_row"] = (df["dest_row"] // DEST_FACTOR).astype("int16")
df["dest_super_col"] = (df["dest_col"] // DEST_FACTOR).astype("int16")

MAX_COLS = 7
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")

print("Aggregating median price + ride count …")
agg = (
    df.groupby(
        ["origin_super_row","origin_super_col",
         "dest_super_row","dest_super_col",
         "time_group"],
        sort=False, observed=True
    )["pay_after_uber_cut"]
      .agg(median_price_usd="median", ride_count="size")
      .reset_index()
)

print(f"{len(agg):,} origin–dest–hour buckets produced.")

agg_origin = (
    df.groupby(
        ["origin_super_row","origin_super_col","time_group"],
        sort=False, observed=True
    )["pay_after_uber_cut"]
      .agg(median_price_usd="median", ride_count="size")
      .reset_index()
)
print(f"{len(agg_origin):,} origin-only buckets.\n")

print(agg_origin.head())


agg_origin_app = agg_origin.copy()
agg_origin_app["dest_super_row"] = pd.NA
agg_origin_app["dest_super_col"] = pd.NA

COLS = [
    "origin_super_row", "origin_super_col",
    "dest_super_row",   "dest_super_col",
    "time_group",
    "median_price_usd", "ride_count"
]
agg_origin_app = agg_origin_app[COLS]
agg            = agg[COLS]          


agg_combined = pd.concat([agg, agg_origin_app], ignore_index=True, sort=False)

print(f"Combined table has {len(agg_origin):,} rows "
      f"({len(agg)} detailed  +  {len(agg_origin_app)} origin-only).")


base = Path(OUT_BASENAME)
OUT_PARQUET = base.with_suffix(".parquet")
OUT_CSV = base.with_suffix(".csv")
OUT_NUMPY = base.with_suffix(".npy")

agg_origin.to_parquet(OUT_PARQUET, compression="zstd")
agg_combined.to_csv(OUT_CSV, index=False)
np.save(OUT_NUMPY, agg_combined.to_numpy())

print(f"Saved {base}.parquet  and  {base}.csv and {base}.npy")

print("Coverage & descriptive statistics")

n_orow = df["origin_super_row"].max() + 1
n_ocol = df["origin_super_col"].max() + 1
n_drow = df["dest_super_row"].max() + 1
n_dcol = df["dest_super_col"].max() + 1
TOTAL  = n_orow * n_ocol * n_drow * n_dcol * len(TG_LABELS)

missing_cnt = TOTAL - len(agg_combined)
cov_pct     = 100 * len(agg_combined) / TOTAL

print(f"universe size          : {TOTAL:,}")
print(f"populated buckets      : {len(agg_combined):,}  ({cov_pct:4.1f} %)")
print(f"missing buckets        : {missing_cnt:,}")

rc = agg_combined["ride_count"]
print("ride_count statistics")
print(f"min / 25% / 50% / 75% / max : "
      f"{rc.min():.0f} / {rc.quantile(.25):.0f} / {rc.median():.0f} / "
      f"{rc.quantile(.75):.0f} / {rc.max():.0f}")
print(f"mean +- std : {rc.mean():.2f} +- {rc.std():.2f}")

mp = agg_combined["median_price_usd"]
print("median_price_usd statistics")
print(f"min / 25% / 50% / 75% / max : "
      f"{mp.min():.2f} / {mp.quantile(.25):.2f} / {mp.median():.2f} / "
      f"{mp.quantile(.75):.2f} / {mp.max():.2f}")
print(f"mean +- std {mp.mean():.2f} +- {mp.std():.2f}")

print("Done.")

Loading WindowsPath('C:/Users/aless/OneDrive - Nexus365/Thesis/driver_data/combined_path/new_test/original/trips_with_price_duration_original_0075_v2_km_osrm.parquet') …
Aggregating median price + ride count …
3,721 origin–dest–hour buckets produced.
168 origin-only buckets.

   origin_super_row  origin_super_col time_group  median_price_usd  ride_count
0                 0                 2  afternoon              6.85         878
1                 1                 1  afternoon              7.25        1420
2                 1                 1    evening              6.92        2522
3                 0                 2    evening              6.29        1270
4                 2                 1    evening              7.18        4232
Combined table has 168 rows (3721 detailed  +  168 origin-only).
Saved C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\origin_dest_hour_lookup_price_original_0075_v2.parquet  and  C:\Users\aless\OneDrive - Nexus

In [3]:
SRC_PARQUET      = OG_DIR / f"trips_with_price_duration_{CELL_FILE_ADDITION}_km_osrm.parquet"         
DEST_FACTOR      = 6                        
OUT_BASENAME     = OG_DIR / f"origin_dest_hour_lookup_distance_{CELL_FILE_ADDITION}" 

print(f"Loading {SRC_PARQUET!r} …")
df = pd.read_parquet(SRC_PARQUET)

req = {
    "origin_row","origin_col","dest_row","dest_col",
    "hour","trip_distance_km"           
}
missing = req.difference(df.columns.str.lower())
if missing:
    raise ValueError(f"input file is missing columns: {sorted(missing)}")


df["time_group"]       = df["hour"].astype(int).apply(hour_to_group)
df["origin_super_row"] = (df["origin_row"] // DEST_FACTOR).astype("int16")
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).astype("int16")
df["dest_super_row"]   = (df["dest_row"]   // DEST_FACTOR).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).astype("int16")

MAX_COLS = 7
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")


print("Aggregating median distance + ride count …")
agg = (
    df.groupby(
        ["origin_super_row","origin_super_col",
         "dest_super_row","dest_super_col",
         "time_group"],
        sort=False, observed=True
    )["trip_distance_km"]               
      .agg(median_dist_km="median", ride_count="size")
      .reset_index()
)

print(f"{len(agg):,} origin–dest–hour buckets produced.")


agg_origin = (
    df.groupby(
        ["origin_super_row","origin_super_col","time_group"],
        sort=False, observed=True
    )["trip_distance_km"]                 
      .agg(median_dist_km="median", ride_count="size")
      .reset_index()
)
print(f"      → {len(agg_origin):,} origin-only buckets.\n")


TOTAL_RIDES = len(df)
agg["rf_global"] = agg["ride_count"] / TOTAL_RIDES    


agg_origin["rf_global"] = agg_origin["ride_count"] / TOTAL_RIDES


agg_origin_app = agg_origin.copy()
agg_origin_app["dest_super_row"] = pd.NA
agg_origin_app["dest_super_col"] = pd.NA


COLS = [
    "origin_super_row","origin_super_col",
    "dest_super_row","dest_super_col",
    "time_group",
    "median_dist_km","ride_count","rf_global"            
]
agg_origin_app = agg_origin_app[COLS]
agg            = agg.reindex(columns=COLS, fill_value=pd.NA)
agg_combined   = pd.concat([agg, agg_origin_app], ignore_index=True, sort=False)

print(f"Combined table has {len(agg_combined):,} rows "
      f"({len(agg)} detailed + {len(agg_origin_app)} origin-only).")


base = Path(OUT_BASENAME)
agg_combined.to_parquet(base.with_suffix(".parquet"), compression="zstd")
agg_combined.to_csv    (base.with_suffix(".csv"),     index=False)
np.save                (base.with_suffix(".npy"),     agg_combined.to_numpy())
print(f"Saved lookup tables to {base}.[parquet|csv|npy]")


print("Coverage & descriptive statistics")
n_orow = df["origin_super_row"].max() + 1
n_ocol = df["origin_super_col"].max() + 1
n_drow = df["dest_super_row"].max() + 1
n_dcol = df["dest_super_col"].max() + 1
TOTAL  = n_orow * n_ocol * n_drow * n_dcol * len(TG_LABELS)

cov_pct = 100 * len(agg_combined) / TOTAL
print(f"universe size     : {TOTAL:,}")
print(f"populated buckets : {len(agg_combined):,}  ({cov_pct:4.1f} %)")

print("\n   ride_count statistics")
print(agg_combined["ride_count"].describe(percentiles=[.25,.5,.75]).round(2))

print("\n   median_dist_km statistics")
print(agg_combined["median_dist_km"].describe(percentiles=[.25,.5,.75]).round(2))

print("\nDone.")


Loading WindowsPath('C:/Users/aless/OneDrive - Nexus365/Thesis/driver_data/combined_path/new_test/original/trips_with_price_duration_original_0075_v2_km_osrm.parquet') …
Aggregating median distance + ride count …
3,721 origin–dest–hour buckets produced.
      → 168 origin-only buckets.

Combined table has 3,889 rows (3721 detailed + 168 origin-only).
Saved lookup tables to C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\origin_dest_hour_lookup_distance_original_0075_v2.[parquet|csv|npy]
Coverage & descriptive statistics
universe size     : 4,704
populated buckets : 3,889  (82.7 %)

   ride_count statistics
count    3889.00
mean       75.85
std       307.04
min         1.00
25%         3.00
50%         9.00
75%        41.00
max      7698.00
Name: ride_count, dtype: float64

   median_dist_km statistics
count    3889.00
mean       11.43
std         6.41
min         1.16
25%         6.08
50%        10.83
75%        15.34
max        62.14
Name: median_

In [4]:
SRC_PARQUET      = OG_DIR / f"trips_with_price_duration_{CELL_FILE_ADDITION}_km_osrm.parquet"         
DEST_FACTOR      = 6                        
OUT_BASENAME     = OG_DIR / f"origin_dest_hour_lookup_demand_{CELL_FILE_ADDITION}"           

print(f"Loading {SRC_PARQUET!r} …")
df = pd.read_parquet(SRC_PARQUET)

REQUIRED = {"origin_row", "origin_col",
            "dest_row",   "dest_col",
            "hour",       "begintrip_timestamp_london"}
missing = REQUIRED.difference(df.columns.str.lower())
if missing:
    raise ValueError(f"Input file is missing columns: {sorted(missing)}")


df["pickup_date"] = (
    pd.to_datetime(df["begintrip_timestamp_london"])
      .dt.normalize()
)

N_DAYS_TOTAL = df["pickup_date"].nunique()
TOTAL_RIDES  = len(df)
print(f"      {TOTAL_RIDES:,} rides across {N_DAYS_TOTAL} calendar day(s)")


df["time_group"]       = df["hour"].astype(int).apply(hour_to_group)
df["origin_super_row"] = (df["origin_row"] // DEST_FACTOR).astype("int16")
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).astype("int16")
df["dest_super_row"]   = (df["dest_row"]   // DEST_FACTOR).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).astype("int16")

MAX_COLS = 7
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")

print("Aggregating origin-only buckets …")
agg_origin = (
    df.groupby(
        ["origin_super_row","origin_super_col","time_group"],
        sort=False, observed=True
    )
    .size()
    .reset_index(name="ride_count")
)

agg_origin["rides_per_day"] = agg_origin["ride_count"] / N_DAYS_TOTAL
agg_origin["rf_global"]     = agg_origin["ride_count"] / TOTAL_RIDES
agg_origin["rf_time_group"] = (
    agg_origin.groupby("time_group", observed=True)["ride_count"]
              .transform(lambda s: s / s.sum())
)


print(f"{len(agg_origin):,} origin-only rows.")

base = Path(OUT_BASENAME)
agg_origin.to_parquet(base.with_suffix(".parquet"), compression="zstd")
agg_origin.to_csv    (base.with_suffix(".csv"),     index=False)
np.save              (base.with_suffix(".npy"),     agg_origin.to_numpy())

print(f"Saved to {base}.[parquet|csv|npy]")


print("Coverage & descriptive stats")

n_or = df["origin_super_row"].max() + 1
n_oc = df["origin_super_col"].max() + 1
n_dr = df["dest_super_row"].max()   + 1
n_dc = df["dest_super_col"].max()   + 1
TOTAL_UNIVERSE = n_or * n_oc * n_dr * n_dc * len(TG_LABELS)

print(f"universe size     : {TOTAL_UNIVERSE:,}")
print(f"populated buckets : {len(agg_origin):,}  "
      f"({100*len(agg_origin)/TOTAL_UNIVERSE:4.1f} %)")

print("ride_count statistics")
print(agg_origin["ride_count"].describe(percentiles=[.25,.5,.75]).round(2))

print("rides_per_day statistics")
print(agg_origin["rides_per_day"].describe(percentiles=[.25,.5,.75]).round(2))



Loading WindowsPath('C:/Users/aless/OneDrive - Nexus365/Thesis/driver_data/combined_path/new_test/original/trips_with_price_duration_original_0075_v2_km_osrm.parquet') …
      147,498 rides across 2821 calendar day(s)
Aggregating origin-only buckets …
168 origin-only rows.
Saved to C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\origin_dest_hour_lookup_demand_original_0075_v2.[parquet|csv|npy]
Coverage & descriptive stats
universe size     : 4,704
populated buckets : 168  ( 3.6 %)
ride_count statistics
count     168.00
mean      877.96
std      1121.87
min        12.00
25%       252.00
50%       556.00
75%       945.50
max      7698.00
Name: ride_count, dtype: float64
rides_per_day statistics
count    168.00
mean       0.31
std        0.40
min        0.00
25%        0.09
50%        0.20
75%        0.34
max        2.73
Name: rides_per_day, dtype: float64


In [5]:
SRC_PARQUET      = OG_DIR / f"trips_with_price_duration_{CELL_FILE_ADDITION}_km_osrm.parquet"         
DEST_FACTOR      = 6                        
OUT_BASENAME     = OG_DIR / f"hour_lookup_demand_{CELL_FILE_ADDITION}"     


print(f"Loading {SRC_PARQUET!r} …")
df = pd.read_parquet(SRC_PARQUET)

REQUIRED = {"hour", "begintrip_timestamp_london"}
missing = REQUIRED.difference(df.columns.str.lower())
if missing:
    raise ValueError(f"Input file is missing columns: {sorted(missing)}")


df["pickup_date"] = (
    pd.to_datetime(df["begintrip_timestamp_london"])
      .dt.normalize()
)

N_DAYS_TOTAL = df["pickup_date"].nunique()
TOTAL_RIDES  = len(df)
print(f"      {TOTAL_RIDES:,} rides across {N_DAYS_TOTAL} calendar day(s)")


df["time_group"]       = df["hour"].astype(int).apply(hour_to_group)


print("Aggregating origin-only buckets …")
agg_origin = (
    df.groupby(
        ["hour"],
        sort=False, observed=True
    )
    .size()
    .reset_index(name="ride_count")
)

agg_origin["rides_per_day"] = agg_origin["ride_count"] / N_DAYS_TOTAL
agg_origin["rf_global"]     = agg_origin["ride_count"] / TOTAL_RIDES
agg_origin["rf_global_adjusted"] = agg_origin["rf_global"] * 10

print(f"      {len(agg_origin):,} origin-only rows.")


base = Path(OUT_BASENAME)
agg_origin.to_parquet(base.with_suffix(".parquet"), compression="zstd")
agg_origin.to_csv    (base.with_suffix(".csv"),     index=False)
np.save              (base.with_suffix(".npy"),     agg_origin.to_numpy())

print(f"Saved to {base}.[parquet|csv|npy]")



Loading WindowsPath('C:/Users/aless/OneDrive - Nexus365/Thesis/driver_data/combined_path/new_test/original/trips_with_price_duration_original_0075_v2_km_osrm.parquet') …
      147,498 rides across 2821 calendar day(s)
Aggregating origin-only buckets …
      24 origin-only rows.
Saved to C:\Users\aless\OneDrive - Nexus365\Thesis\driver_data\combined_path\new_test\original\hour_lookup_demand_original_0075_v2.[parquet|csv|npy]


In [6]:

SRC_PARQUET      = OG_DIR / f"trips_{CELL_FILE_ADDITION}_with_predicted_information.parquet"         
DEST_FACTOR      = 6                        
OUT_BASENAME     = OG_DIR / f"origin_dest_hour_lookup_rate_{CELL_FILE_ADDITION}"     



print(f"Loading {SRC_PARQUET!r} …")
df = pd.read_parquet(SRC_PARQUET)
print(df.head())

req_cols = {
    "origin_row", "origin_col", "dest_row", "dest_col",
    "hour", "pay_after_uber_cut",
    "sec_pred"
}
present = {c.lower() for c in df.columns}
if not (("sec_pred" in present)):
    raise ValueError("input file needs either 'trip_duration_min' or 'trip_duration_sec'")
missing = {c for c in req_cols if c not in present}
if missing:
    raise ValueError(f"input file is missing columns: {sorted(missing)}")


df["duration_min"] = df["sec_pred"] / 60.0

df["rate_usd_per_min"] = df["pay_after_uber_cut"] / df["duration_min"].clip(lower=0.5)


df["time_group"]       = df["hour"].astype(int).apply(hour_to_group)
df["origin_super_row"] = (df["origin_row"] // DEST_FACTOR).astype("int16")
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).astype("int16")
df["dest_super_row"]   = (df["dest_row"]   // DEST_FACTOR).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).astype("int16")
MAX_COLS = 7
df["origin_super_col"] = (df["origin_col"] // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")
df["dest_super_col"]   = (df["dest_col"]   // DEST_FACTOR).clip(upper=MAX_COLS-1).astype("int16")


print("Aggregating median rate and ride count")
agg = (
    df.groupby(
        ["origin_super_row","origin_super_col",
         "dest_super_row","dest_super_col",
         "time_group"],
        sort=False, observed=True
    )["rate_usd_per_min"]
      .agg(median_rate_usd="median", ride_count="size")
      .reset_index()
)

print(f"{len(agg):,} origin–dest–hour buckets produced")

agg_origin = (
    df.groupby(
        ["origin_super_row","origin_super_col","time_group"],
        sort=False, observed=True
    )["rate_usd_per_min"]
      .agg(median_rate_usd="median", ride_count="size")
      .reset_index()
)
print(f"{len(agg_origin):,} origin-only buckets.\n")
print(agg_origin.head())


agg_origin_app = agg_origin.copy()
agg_origin_app["dest_super_row"] = pd.NA
agg_origin_app["dest_super_col"] = pd.NA

COLS = [
    "origin_super_row","origin_super_col",
    "dest_super_row","dest_super_col",
    "time_group",
    "median_rate_usd","ride_count"
]
agg_combined = pd.concat(
    [agg[COLS], agg_origin_app[COLS]],
    ignore_index=True, sort=False
)

print(f"[3/5] Combined table has {len(agg_combined):,} rows "
      f"({len(agg)} detailed  +  {len(agg_origin_app)} origin-only).")


base = Path(OUT_BASENAME)
agg_combined.to_parquet(base.with_suffix(".parquet"), compression="zstd")
agg_combined.to_csv(base.with_suffix(".csv"), index=False)
np.save(base.with_suffix(".npy"), agg_combined.to_numpy())

print(f"Saved {base}.parquet / .csv / .npy")


print("Coverage & descriptive statistics")

n_orow = df["origin_super_row"].max() + 1
n_ocol = df["origin_super_col"].max() + 1
n_drow = df["dest_super_row"].max() + 1
n_dcol = df["dest_super_col"].max() + 1
UNIVERSE = n_orow * n_ocol * n_drow * n_dcol * len(TG_LABELS)

missing_cnt = UNIVERSE - len(agg_combined)
cov_pct      = 100 * len(agg_combined) / UNIVERSE
print(f"universe size       : {UNIVERSE:,}")
print(f"populated buckets   : {len(agg_combined):,}  ({cov_pct:4.1f} %)")
print(f"missing buckets     : {missing_cnt:,}")

rc = agg_combined["ride_count"]
print("ride_count statistics")
print(f"      min / 25% / 50% / 75% / max : "
      f"{rc.min():.0f} / {rc.quantile(.25):.0f} / {rc.median():.0f} / "
      f"{rc.quantile(.75):.0f} / {rc.max():.0f}")
print(f"      mean +- std                 : {rc.mean():.2f} +- {rc.std():.2f}")

rp = agg_combined["median_rate_usd"]
print("median_rate_usd statistics")
print(f"      min / 25% / 50% / 75% / max : "
      f"{rp.min():.2f} / {rp.quantile(.25):.2f} / {rp.median():.2f} / "
      f"{rp.quantile(.75):.2f} / {rp.max():.2f}")
print(f"      mean +- std                 : {rp.mean():.2f} +- {rp.std():.2f}")


Loading WindowsPath('C:/Users/aless/OneDrive - Nexus365/Thesis/driver_data/combined_path/new_test/original/trips_original_0075_v2_with_predicted_information.parquet') …
   origin_row  origin_col  dest_row  dest_col  begin_lat  begin_lng  \
0           5          12         7         7  51.440338  -0.159358   
1           6           7         9        11  51.445763  -0.191400   
2          10          11        12         8  51.479115  -0.166910   
3          11           7        13         6  51.487488  -0.191229   
4           5          13        13         8  51.444721  -0.148535   

     end_lat   end_lng  haversine_km begintrip_timestamp_london  ...  \
0  51.456711 -0.191571      2.880576  2016-04-28 17:23:20+01:00  ...   
1  51.474430 -0.167369      3.596283  2016-04-28 17:50:48+01:00  ...   
2  51.490761 -0.183790      1.744453  2016-04-28 18:10:50+01:00  ...   
3  51.502617 -0.199705      1.781674  2016-04-28 18:24:59+01:00  ...   
4  51.502235 -0.186893      6.925133  2016-0