In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pandas as pd
import os

# Build the Spark session. Each (key, value) pair below is applied to the
# builder, in order, before getOrCreate() is called.
_session_builder = SparkSession.builder.appName("Taxi vs Rideshare Profitability")
for _conf_key, _conf_value in [
    ("spark.sql.repl.eagerEval.enabled", False),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.sql.shuffle.partitions", "64"),
    ("spark.driver.memory", "6g"),
    ("spark.executor.memory", "6g"),
]:
    _session_builder = _session_builder.config(_conf_key, _conf_value)
spark = _session_builder.getOrCreate()

# Months covered by the analysis
months = ["2024-01","2024-02","2024-03","2024-04","2024-05","2024-06"]

# One parquet file per month for each service
yellow_files = [f"data/yellow/yellow_tripdata_{m}.parquet" for m in months]
fhvhv_files = [f"data/fhvhv/fhvhv_tripdata_{m}.parquet" for m in months]

# Read each service and tag its rows so they stay distinguishable after the merge
df_yellow = spark.read.parquet(*yellow_files)
df_yellow = df_yellow.withColumn("service_type", lit("yellow"))

df_fhvhv = spark.read.parquet(*fhvhv_files)
df_fhvhv = df_fhvhv.withColumn("service_type", lit("hv_fhv"))

# Merge by column name; a column that exists in only one source is kept and
# left null for rows coming from the other source
df = df_yellow.unionByName(df_fhvhv, allowMissingColumns=True)

# External price tables (joined in later for fuel/energy costs)
_csv_options = {"header": True, "inferSchema": True}
electricity = spark.read.csv("data/external/electricity.csv", **_csv_options)
fuel = spark.read.csv("data/external/fuel.csv", **_csv_options)



                                                                                



In [None]:

# Preprocess data
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_timestamp, coalesce, unix_timestamp, when, lit, date_format, hour, dayofweek, broadcast
from pyspark.sql.types import IntegerType, DoubleType
from pyspark import StorageLevel
from pyspark.sql import Window

# Reduce memory usage: keep only the columns the rest of the pipeline reads
_needed = [
    "service_type",
    "tpep_pickup_datetime","tpep_dropoff_datetime",
    "pickup_datetime","dropoff_datetime",
    "trip_distance","trip_miles","trip_time",
    "PULocationID","DOLocationID",
    "passenger_count","payment_type",
    "fare_amount","extra","tip_amount",
    "driver_pay","tips"
]
_available = df.columns
df = df.select([name for name in _needed if name in _available])

# Standardise timestamps: take whichever of the two source-specific columns
# is populated for the row
_pickup_raw = F.coalesce(F.col("tpep_pickup_datetime"), F.col("pickup_datetime"))
_dropoff_raw = F.coalesce(F.col("tpep_dropoff_datetime"), F.col("dropoff_datetime"))
df = df.withColumn("pickup_ts", F.to_timestamp(_pickup_raw))
df = df.withColumn("dropoff_ts", F.to_timestamp(_dropoff_raw))

# Remove rows with a null pickup or dropoff timestamp
df = df.where(F.col("pickup_ts").isNotNull() & F.col("dropoff_ts").isNotNull())

# Derive the pickup month as a "yyyy-MM" string
df = df.withColumn("month", F.date_format(F.col("pickup_ts"), "yyyy-MM"))

# Standardise distance and keep only non-null, strictly positive values
_distance = F.coalesce(F.col("trip_distance"), F.col("trip_miles")).cast(DoubleType())
df = df.withColumn("distance_mi", _distance)
df = df.where(F.col("distance_mi").isNotNull() & (F.col("distance_mi") > 0))

# Standardise trip time: use trip_time when present, otherwise the difference
# between the dropoff and pickup timestamps in seconds
_elapsed_s = (
    F.unix_timestamp(F.col("dropoff_ts")) - F.unix_timestamp(F.col("pickup_ts"))
).cast("double")
_trip_time = F.when(
    F.col("trip_time").isNotNull(), F.col("trip_time").cast("double")
).otherwise(_elapsed_s)
df = df.withColumn("trip_time_s", _trip_time)

# Keep only non-null, strictly positive durations
df = df.where(F.col("trip_time_s").isNotNull() & (F.col("trip_time_s") > 0))

# Deduplicate rows: two rows are duplicates when they agree on the month and
# on every column in dedupe_key.
dedupe_key = [c for c in ["service_type","pickup_ts","dropoff_ts","PULocationID","DOLocationID","distance_mi","trip_time_s"] if c in df.columns]

# If dedupe_key is empty, we won't deduplicate
if dedupe_key:
    # dropDuplicates keeps one row per key combination. Which of the
    # duplicates survives is arbitrary; no ordering is imposed, so no
    # per-partition sort is needed.
    df = df.dropDuplicates(["month"] + dedupe_key)

# Fixed parameters
CREDIT_CARD_FEE = 0.025              # share of revenue charged on yellow trips with payment_type 1
MAINTENANCE_COST_PER_MILE = 0.15     # per-mile maintenance rate for yellow trips
MAINTENANCE_COST_PER_MILE_HV = 0.15  # per-mile maintenance rate for every other service type

# Make sure a payment_type column exists (all-null integer when it is missing)
if "payment_type" not in df.columns:
    df = df.withColumn("payment_type", F.lit(None).cast(IntegerType()))

# Time features; is_weekend is true when pickup_dow is 1 or 7
df = df.withColumn("pickup_hour", F.hour(F.col("pickup_ts")))
df = df.withColumn("pickup_dow", F.dayofweek(F.col("pickup_ts")))
df = df.withColumn("is_weekend", F.col("pickup_dow").isin([1, 7]).cast("boolean"))

_is_yellow = F.col("service_type") == "yellow"


def _zero_if_null(name):
    """Return column `name` with nulls replaced by 0.0."""
    return F.coalesce(F.col(name), F.lit(0.0))


# Revenue: fare + extra + tip for yellow, driver pay + tips for everything else
rev_yellow = _zero_if_null("fare_amount") + _zero_if_null("extra") + _zero_if_null("tip_amount")
rev_hv = _zero_if_null("driver_pay") + _zero_if_null("tips")
df = df.withColumn(
    "revenue",
    F.when(_is_yellow, rev_yellow).otherwise(rev_hv).cast(DoubleType()),
)

# Maintenance cost: distance times the per-service rate
maint_rate = F.when(_is_yellow, F.lit(MAINTENANCE_COST_PER_MILE)).otherwise(
    F.lit(MAINTENANCE_COST_PER_MILE_HV)
)
df = df.withColumn(
    "expense_maintenance",
    (F.col("distance_mi") * maint_rate).cast(DoubleType()),
)

# Credit card fee: charged only on yellow trips whose payment_type is 1
_card_paid_yellow = _is_yellow & (F.col("payment_type") == 1)
_card_fee = (F.lit(CREDIT_CARD_FEE) * F.col("revenue")).cast(DoubleType())
df = df.withColumn(
    "expense_cc_processing",
    F.when(_card_paid_yellow, _card_fee).otherwise(F.lit(0.0)),
)

# Expenses before fuel: maintenance plus card processing
df = df.withColumn(
    "expenses_nonfuel",
    (F.col("expense_maintenance") + F.col("expense_cc_processing")).cast(DoubleType()),
)

# Keep trips of at least 0.1 miles
df = df.where(F.col("distance_mi") >= 0.1)

# Keep months inside the analysis window; "yyyy-MM" strings compare in
# chronological order
df = df.where(F.col("month").between("2024-01", "2024-06"))

# Keep trips lasting at least 60 seconds
df = df.where(F.col("trip_time_s") >= 60)

# Yellow trips must report a positive passenger count (a null count makes the
# condition null, so those rows are dropped too); other services pass through
if "passenger_count" in df.columns:
    _passengers_ok = F.when(
        F.col("service_type") == "yellow", F.col("passenger_count") > 0
    ).otherwise(True)
    df = df.where(_passengers_ok)

# Keep location IDs in the range 1..263 for whichever ID columns exist
for zone_col in ("PULocationID", "DOLocationID"):
    if zone_col not in df.columns:
        continue
    df = df.where(F.col(zone_col).between(1, 263))

# Non-negative money fields (a null amount counts as 0.0 and therefore passes)
money_ok = None
for _field in ["fare_amount", "extra", "tip_amount", "driver_pay", "tips"]:
    _non_negative = F.coalesce(F.col(_field), F.lit(0.0)) >= 0
    money_ok = _non_negative if money_ok is None else (money_ok & _non_negative)
df = df.where(money_ok)

# Minimum initial fare for yellow: at least 1.50, with a null fare counted as 0.0
_fare_ok = F.when(
    F.col("service_type") == "yellow",
    F.coalesce(F.col("fare_amount"), F.lit(0.0)) >= 1.50,
).otherwise(True)
df = df.where(_fare_ok)


# Pre-fuel profitability: trip duration in hours, net earnings before fuel,
# net earnings per active hour, and average speed in miles per hour
_trip_hours = F.col("trip_time_s") / 3600.0
df = df.withColumn("active_hours", _trip_hours.cast(DoubleType()))
df = df.withColumn(
    "net_before_fuel",
    (F.col("revenue") - F.col("expenses_nonfuel")).cast(DoubleType()),
)
df = df.withColumn(
    "net_per_hr_before_fuel",
    (F.col("net_before_fuel") / F.col("active_hours")).cast(DoubleType()),
)
df = df.withColumn("mph", (F.col("distance_mi") / _trip_hours).cast(DoubleType()))

# Outlier trim: drop trips whose distance, duration, or revenue exceeds the
# approximate 99.9th percentile for their service type.
#
# All three percentiles come from ONE aggregation over df, giving one row per
# service_type. The key is renamed so the join condition below is unambiguous
# and df keeps its original column order.
bounds = (
    df.groupBy("service_type")
      .agg(
          F.expr("percentile_approx(distance_mi, 0.999, 10000)").alias("p_d"),
          F.expr("percentile_approx(trip_time_s, 0.999, 10000)").alias("p_t"),
          F.expr("percentile_approx(revenue, 0.999, 10000)").alias("p_r"),
      )
      .withColumnRenamed("service_type", "_bound_service")
)

# bounds has one row per service type, so it is broadcast instead of shuffled.
# The left join plus coalesce(..., +inf) keeps a row untrimmed if its service
# type has no bound.
df = (
    df.join(broadcast(bounds),
            on=col("service_type") == col("_bound_service"),
            how="left")
      .filter( (col("distance_mi") <= F.coalesce(col("p_d"), lit(float("inf")))) &
               (col("trip_time_s") <= F.coalesce(col("p_t"), lit(float("inf")))) &
               (col("revenue")     <= F.coalesce(col("p_r"), lit(float("inf")))) )
      .drop("_bound_service", "p_d", "p_t", "p_r")
)

# Redistribute rows into 64 partitions keyed by service type and month
df = df.repartition(64, "service_type", "month")

# Drop trips with impossible speeds (negative or above 120 mph)
df = df.filter((col("mph") >= 0) & (col("mph") <= 120.0))


# Fuel and energy costs: keep only the join key and the price column, and
# drop rows where either is missing
fuel = fuel.select("month", "price_per_gallon").dropna()
electricity = electricity.select("month", "price_usd_per_kwh").dropna()

# Join fuel and electricity prices onto the trips by month. Both tables are
# broadcast, and the left joins keep trips whose month has no price row.
# NOTE(review): assumes each CSV has one row per month, with `month` formatted
# like the trips' "yyyy-MM" strings; repeated months would duplicate trips.
# Confirm against the files.
for _price_table in (fuel, electricity):
    df = df.join(broadcast(_price_table), on="month", how="left")

# Energy assumptions according to EPA and AFDC
MPG_FHV = 27.0
MPG_TAXI = 16.0

KWH_YELLOW = 0.30
KWH_FHV = 0.30

YELLOW_EV_PERCENT = 0.00  # Assuming no EVs in Yellow Taxi fleet
FHV_EV_PERCENT = 0.10     # Example share for HVFHV

_yellow_trip = F.col("service_type") == "yellow"

# Per-service parameters as columns: the yellow value for yellow trips, the
# FHV value for every other service type
df = df.withColumn(
    "mpg",
    F.when(_yellow_trip, F.lit(MPG_TAXI)).otherwise(F.lit(MPG_FHV)).cast(DoubleType()),
)
df = df.withColumn(
    "kwh_per_mile",
    F.when(_yellow_trip, F.lit(KWH_YELLOW)).otherwise(F.lit(KWH_FHV)).cast(DoubleType()),
)
df = df.withColumn(
    "ev_share",
    F.when(_yellow_trip, F.lit(YELLOW_EV_PERCENT))
    .otherwise(F.lit(FHV_EV_PERCENT)).cast(DoubleType()),
)

# Cost per mile for each drivetrain
gas_cpm = (F.col("price_per_gallon") / F.col("mpg")).cast(DoubleType())
ev_cpm = (F.col("price_usd_per_kwh") * F.col("kwh_per_mile")).cast(DoubleType())

# Blend gas and EV cost per mile by ev_share. A missing price makes the
# corresponding per-mile cost null, which is counted as 0.0 here.
_ev_weight = F.coalesce(F.col("ev_share"), F.lit(0.0))
_blended_cpm = (
    ((F.lit(1.0) - _ev_weight) * F.coalesce(gas_cpm, F.lit(0.0)))
    + (_ev_weight * F.coalesce(ev_cpm, F.lit(0.0)))
)
df = df.withColumn("energy_cost_per_mile", _blended_cpm)

# Fuel expense and the resulting net earnings, per trip and per active hour
df = df.withColumn(
    "expense_fuel",
    (F.col("distance_mi") * F.col("energy_cost_per_mile")).cast(DoubleType()),
)
df = df.withColumn(
    "net_after_fuel",
    (F.col("revenue") - F.col("expenses_nonfuel") - F.col("expense_fuel")).cast(DoubleType()),
)
df = df.withColumn(
    "net_per_hr_after_fuel",
    (F.col("net_after_fuel") / F.col("active_hours")).cast(DoubleType()),
)

# Keep only relevant columns
cols_keep = [
    # Metadata
    "service_type", "month", "pickup_ts", "dropoff_ts",
    "pickup_hour", "pickup_dow", "is_weekend",
    # Location IDs
    "PULocationID", "DOLocationID",
    # engineered trip metrics
    "distance_mi", "trip_time_s", "mph", "active_hours",
    # Feature engineering
    "revenue", "expenses_nonfuel", "expense_fuel",
    "net_before_fuel", "net_after_fuel",
    "net_per_hr_before_fuel", "net_per_hr_after_fuel",
    # Parameters
    # The fuel price column is named "price_per_gallon" in the fuel table; a
    # name that does not match is dropped by the existence filter below.
    "price_per_gallon", "price_usd_per_kwh",
    "energy_cost_per_mile", "ev_share", "mpg", "kwh_per_mile",
]

# Filter columns to keep only those that exist in the DataFrame, and report
# any requested name that is absent so a misspelling does not pass unnoticed
_missing_cols = [c for c in cols_keep if c not in df.columns]
if _missing_cols:
    print(f"[Note] Requested output columns not found and skipped: {_missing_cols}")
cols_keep = [c for c in cols_keep if c in df.columns]
df = df.select(*cols_keep)








In [None]:
# Analyze results and Geospatial Analysis
# ===============================
# ANALYSIS + GEOSPATIAL PREP
# ===============================
from pyspark.sql.functions import col, sum as ssum, count, round as sround, when, lit, expr
from pyspark.sql import functions as F

# ---- speed toggle for dev runs ----
# True  -> analyse a seeded sample with fraction 0.02 per service type
# False -> analyse the full data
FAST_MODE = False
df_base = (
    df.stat.sampleBy("service_type", {"yellow":0.02, "hv_fhv":0.02}, seed=7)
    if FAST_MODE
    else df
)

# ---------- 0) small helpers (no UDFs) ----------
# Aggregate expressions shared by every time-weighted summary
sum_net = ssum("net_after_fuel").alias("sum_net")
sum_hours = ssum("active_hours").alias("sum_hours")


def _time_weighted(frame, *keys):
    """Summarise `frame` per `keys`.

    Returns the grouping keys plus sum_net, sum_hours, trips (row count) and
    net_per_hr_TW = sum_net / sum_hours rounded to 2 decimals.
    """
    grouped = frame.groupBy(*keys).agg(sum_net, sum_hours, count("*").alias("trips"))
    return grouped.withColumn("net_per_hr_TW", sround(col("sum_net")/col("sum_hours"), 2))


# ---------- 1) overall time-weighted net $/hr ----------
overall = _time_weighted(df_base, "service_type")

# ---------- 2) monthly time-weighted net $/hr ----------
monthly = _time_weighted(df_base, "service_type", "month").orderBy("service_type", "month")

# ---------- 3) hour-of-day profile ----------
hod = _time_weighted(df_base, "service_type", "pickup_hour").orderBy("service_type", "pickup_hour")

# ---------- 4) weekday vs weekend ----------
# is_weekend is turned into a readable week_type label before export
_week_label = when(col("is_weekend"), lit("weekend")).otherwise(lit("weekday"))
wkend = (
    _time_weighted(df_base, "service_type", "is_weekend")
    .withColumn("week_type", _week_label)
    .select("service_type","week_type","trips","sum_net","sum_hours","net_per_hr_TW")
    .orderBy("service_type","week_type")
)

# ---------- 5) earnings composition (per trip averages) ----------
_composition_totals = df_base.groupBy("service_type").agg(
    ssum("revenue").alias("sum_rev"),
    ssum("expenses_nonfuel").alias("sum_nonfuel"),
    ssum("expense_fuel").alias("sum_fuel"),
    count("*").alias("n_trips"),
)
# Each per-trip average is the group total divided by the trip count,
# rounded to 2 decimals
comp = _composition_totals
comp = comp.withColumn("rev_per_trip", sround(col("sum_rev")/col("n_trips"), 2))
comp = comp.withColumn("nonfuel_per_trip", sround(col("sum_nonfuel")/col("n_trips"), 2))
comp = comp.withColumn("fuel_per_trip", sround(col("sum_fuel")/col("n_trips"), 2))
comp = comp.select("service_type","n_trips","rev_per_trip","nonfuel_per_trip","fuel_per_trip")

# ---------- 6) efficiency: revenue & net per mile (quantiles) ----------
# Only trips strictly longer than 0.1 miles contribute to the per-mile ratios.
# NOTE(review): preprocessing keeps distance_mi >= 0.1, so trips of exactly
# 0.1 miles are excluded only here. Confirm that is intended.
eff = df_base.filter(col("distance_mi") > 0.1)
eff = eff.withColumn("rev_per_mi", col("revenue")/col("distance_mi"))
eff = eff.withColumn("net_per_mi", col("net_after_fuel")/col("distance_mi"))

# Approximate quartiles per service type; each aggregate returns an array
# ordered [q25, q50, q75]
_quartiles = [0.25,0.5,0.75]
_eff_arrays = eff.groupBy("service_type").agg(
    F.percentile_approx("rev_per_mi", _quartiles, 1000).alias("rev_q"),
    F.percentile_approx("net_per_mi", _quartiles, 1000).alias("net_q"),
)
eff_q = _eff_arrays.select(
    "service_type",
    col("rev_q").getItem(0).alias("rev_per_mi_q25"),
    col("rev_q").getItem(1).alias("rev_per_mi_q50"),
    col("rev_q").getItem(2).alias("rev_per_mi_q75"),
    col("net_q").getItem(0).alias("net_per_mi_q25"),
    col("net_q").getItem(1).alias("net_per_mi_q50"),
    col("net_q").getItem(2).alias("net_per_mi_q75"),
)

# ---------- 7) pickup zone: time-weighted net $/hr + trips (choropleth) ----------
_zone_totals = df_base.groupBy("service_type","PULocationID").agg(
    sum_net, sum_hours, count("*").alias("trips")
)
zone = _zone_totals.withColumn("net_per_hr_TW", sround(col("sum_net")/col("sum_hours"), 2))
zone_y = zone.where(col("service_type")=="yellow")
zone_hv = zone.where(col("service_type")=="hv_fhv")

# ---------- 8) difference map: Yellow minus HVFHV net/hr by pickup zone ----------
# One row per pickup zone with a column per service, then their difference
_zone_wide = (
    zone.groupBy("PULocationID")
        .pivot("service_type", ["yellow","hv_fhv"])
        .agg(F.first("net_per_hr_TW"))
)
zone_pivot = _zone_wide.withColumn(
    "diff_yellow_minus_hv", sround(col("yellow") - col("hv_fhv"), 2)
)

# ---------- 9) pickup density map ----------
_most_trips_first = col("trips").desc()
zone_trips = (
    df_base.groupBy("service_type","PULocationID")
           .agg(count("*").alias("trips"))
           .orderBy("service_type", _most_trips_first)
)

# ---------- 10) top OD flows (for desire-lines) ----------
# Trip counts per (service, pickup zone, dropoff zone); the 200 busiest pairs
# are kept for each service
flows = (
    df_base.groupBy("service_type","PULocationID","DOLocationID")
           .count()
           .withColumnRenamed("count","trips")
)
flows_y_top = flows.where(col("service_type")=="yellow").orderBy(_most_trips_first).limit(200)
flows_hv_top = flows.where(col("service_type")=="hv_fhv").orderBy(_most_trips_first).limit(200)

# ---------- 11) airport corridors ----------
# (check your taxi_zones lookup if you want to confirm these IDs)
JFK_IDS = [132]   # JFK Airport
LGA_IDS = [138]   # LaGuardia Airport
# EWR Newark is LocationID 1 in TLC zones (optional): EWR_IDS = [1]

# Label each trip by its pickup zone: JFK, LGA, or OTHER for everything else
_airport_label = (
    when(col("PULocationID").isin(JFK_IDS), lit("JFK"))
    .when(col("PULocationID").isin(LGA_IDS), lit("LGA"))
    .otherwise(lit("OTHER"))
)

# Time-weighted net $/hr per service and airport label
_airport_totals = (
    df_base.withColumn("airport", _airport_label)
           .groupBy("service_type","airport")
           .agg(sum_net, sum_hours, count("*").alias("trips"))
)
airport_pickups = (
    _airport_totals
    .withColumn("net_per_hr_TW", sround(col("sum_net")/col("sum_hours"), 2))
    .orderBy("service_type","airport")
)

# ===============================
# WRITE ALL CSVs (pick what you need)
# ===============================
# Each result is collapsed to a single partition and written, with a header,
# to its own output path; an existing output at that path is overwritten.
_csv_outputs = [
    (overall, "plots/overall.csv"),
    (monthly, "plots/monthly.csv"),
    (hod, "plots/hour_of_day.csv"),
    (wkend, "plots/weekday_weekend.csv"),
    (comp, "plots/composition.csv"),
    (eff_q, "plots/efficiency_quantiles.csv"),
    (zone_y, "plots/zone_yellow.csv"),
    (zone_hv, "plots/zone_hvfhv.csv"),
    (zone_pivot, "plots/zone_diff_y_minus_hv.csv"),
    (zone_trips, "plots/zone_trips.csv"),
    (flows_y_top, "plots/flows_top200_yellow.csv"),
    (flows_hv_top, "plots/flows_top200_hvfhv.csv"),
    (airport_pickups, "plots/airport_pickups.csv"),
]
for _result, _target in _csv_outputs:
    _result.coalesce(1).write.mode("overwrite").csv(_target, header=True)


# ==========================================
# PLOTTING: analysis + geospatial (matplotlib / GeoPandas)
# ==========================================
import os, glob
import pandas as pd
import matplotlib.pyplot as plt

# ---------- helpers ----------
def load_csv(dirpath: str) -> pd.DataFrame:
    """Load a Spark-written CSV folder (with part-*.csv) into a pandas DF.

    Every ``*.csv`` file directly inside ``dirpath`` is read, in sorted
    filename order, and the pieces are concatenated with a fresh 0..n-1
    index. ``dirpath`` may also be the path of a single CSV file, which is
    then read directly.

    Args:
        dirpath: Folder containing the CSV part files, or one CSV file.

    Returns:
        The combined rows as a pandas DataFrame.

    Raises:
        FileNotFoundError: If ``dirpath`` is a folder with no ``*.csv`` files
            (or does not exist).
    """
    if os.path.isfile(dirpath):
        return pd.read_csv(dirpath)

    # Escape the folder name so glob metacharacters in it ("[", "*", "?") are
    # matched literally; only the trailing "*.csv" acts as a pattern.
    pattern = os.path.join(glob.escape(dirpath), "*.csv")
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(f"No CSV files found in {dirpath}")
    return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)

os.makedirs("plots/img", exist_ok=True)


def _style_axes(ax, title, xlabel, ylabel, grid_axis="both"):
    """Set the title and axis labels on `ax` and draw a faint grid along `grid_axis`."""
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, axis=grid_axis, alpha=0.3)


def _save_current_figure(path):
    """Tighten the layout, save the current figure to `path` at 150 dpi, and close it."""
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()


# ---------- 1) Overall (table print) ----------
# From here on the summary names are rebound to pandas frames read back from
# the CSVs written earlier.
overall = load_csv("plots/overall.csv")
print("\nOVERALL (time-weighted net/hr):")
print(overall)

# ---------- 2) Monthly net/hr (line chart) ----------
monthly = load_csv("plots/monthly.csv")
monthly_p = monthly.pivot(index="month", columns="service_type", values="net_per_hr_TW").sort_index()
ax = monthly_p.plot(marker="o")
_style_axes(ax, "Time-weighted Net $/hr by Month", "Month (2024)", "Net $/hr")
_save_current_figure("plots/img/monthly_net_per_hr.png")

# ---------- 3) Hour-of-day profile ----------
hod = load_csv("plots/hour_of_day.csv")
hod_p = hod.pivot(index="pickup_hour", columns="service_type", values="net_per_hr_TW").sort_index()
ax = hod_p.plot(marker="o")
_style_axes(ax, "Time-weighted Net $/hr by Hour of Day", "Pickup Hour", "Net $/hr")
_save_current_figure("plots/img/hour_of_day_net_per_hr.png")

# ---------- 4) Weekday vs Weekend (bar) ----------
wkend = load_csv("plots/weekday_weekend.csv")
wkend_p = wkend.pivot(index="service_type", columns="week_type", values="net_per_hr_TW")
ax = wkend_p.plot(kind="bar")
_style_axes(ax, "Time-weighted Net $/hr: Weekday vs Weekend", "", "Net $/hr", grid_axis="y")
_save_current_figure("plots/img/weekday_weekend_net_per_hr.png")

# ---------- 5) Earnings composition (stacked per-trip) ----------
comp = load_csv("plots/composition.csv")
comp = comp.set_index("service_type")[["rev_per_trip","nonfuel_per_trip","fuel_per_trip"]]
ax = comp.plot(kind="bar", stacked=True)
_style_axes(ax, "Per-trip Averages: Revenue vs Costs", "", "USD per trip", grid_axis="y")
_save_current_figure("plots/img/composition_per_trip.png")

# ---------- 6) Efficiency quantiles (bars with error bars) ----------
effq = load_csv("plots/efficiency_quantiles.csv")
effq = effq.set_index("service_type")
# Bar height is the median net per mile; the error bars reach down to q25
# and up to q75
median = effq["net_per_mi_q50"]
err_low = median - effq["net_per_mi_q25"]
err_hi = effq["net_per_mi_q75"] - median
ax = median.plot(kind="bar", yerr=[err_low, err_hi], capsize=3)
_style_axes(ax, "Net per Mile (q25–median–q75)", "", "USD per mile", grid_axis="y")
_save_current_figure("plots/img/net_per_mile_quantiles.png")

# ======================================================
# GEOSPATIAL (requires GeoPandas + shapely + fiona)
#   pip install geopandas shapely fiona pyproj
# ======================================================
# The whole map section sits inside the try: an ImportError raised anywhere in
# it (the imports or a later call) is reported as "GeoPandas not installed"
# and the maps are skipped. Any other exception propagates.
try:
    import geopandas as gpd
    from shapely.geometry import LineString

    TAXI_ZONES_PATH = "taxi_zones/taxi_zones.geojson"  # <- change if needed
    # NOTE(review): the merges below assume LocationID in this file has the
    # same type as the IDs in the exported CSVs. Confirm against the file.
    zones = gpd.read_file(TAXI_ZONES_PATH)[["LocationID","geometry","zone","borough"]]

    # ---------- 7) Zone choropleths: Yellow + HVFHV net/hr ----------
    zone_y  = load_csv("plots/zone_yellow.csv").rename(columns={"PULocationID":"LocationID"})
    zone_hv = load_csv("plots/zone_hvfhv.csv").rename(columns={"PULocationID":"LocationID"})

    # Left merges keep every zone polygon, including zones with no trips
    z_y = zones.merge(zone_y[["LocationID","net_per_hr_TW","trips"]], on="LocationID", how="left")
    z_h = zones.merge(zone_hv[["LocationID","net_per_hr_TW","trips"]], on="LocationID", how="left")

    ax = z_y.plot(column="net_per_hr_TW", legend=True)
    ax.set_title("Yellow: Net $/hr by Pickup Zone")
    plt.tight_layout()
    plt.savefig("plots/img/map_zone_yellow_net_per_hr.png", dpi=150)
    plt.close()

    ax = z_h.plot(column="net_per_hr_TW", legend=True)
    ax.set_title("HVFHV: Net $/hr by Pickup Zone")
    plt.tight_layout()
    plt.savefig("plots/img/map_zone_hvfhv_net_per_hr.png", dpi=150)
    plt.close()

    # ---------- 8) Difference map: Yellow − HVFHV ----------
    zone_diff = load_csv("plots/zone_diff_y_minus_hv.csv")
    zone_diff = zone_diff.rename(columns={"PULocationID":"LocationID", "diff_yellow_minus_hv":"diff_net_per_hr"})
    z_diff = zones.merge(zone_diff[["LocationID","diff_net_per_hr"]], on="LocationID", how="left")

    ax = z_diff.plot(column="diff_net_per_hr", legend=True)
    ax.set_title("Net $/hr Difference (Yellow − HVFHV) by Zone")
    plt.tight_layout()
    plt.savefig("plots/img/map_zone_diff.png", dpi=150)
    plt.close()

    # ---------- 9) Pickup density (trips) ----------
    zone_trips = load_csv("plots/zone_trips.csv").rename(columns={"PULocationID":"LocationID"})
    # choose a service to map (Yellow as default)
    zt_y = zones.merge(zone_trips[zone_trips.service_type=="yellow"][["LocationID","trips"]],
                       on="LocationID", how="left")
    ax = zt_y.plot(column="trips", legend=True)
    ax.set_title("Pickup Density (Trips) — Yellow")
    plt.tight_layout()
    plt.savefig("plots/img/map_zone_trips_yellow.png", dpi=150)
    plt.close()

    # ---------- 10) Desire lines (top flows) ----------
    flows_y  = load_csv("plots/flows_top200_yellow.csv")
    flows_hv = load_csv("plots/flows_top200_hvfhv.csv")

    # centroids lookup: LocationID -> centroid point of the zone polygon
    # NOTE(review): centroids are computed in the zones layer's own CRS;
    # confirm it is a projected CRS if exact centroid positions matter.
    zones_cent = zones.copy()
    zones_cent["centroid"] = zones_cent.geometry.centroid
    cent = zones_cent.set_index("LocationID")["centroid"].to_dict()

    def flows_to_gdf(df_flows):
        """Turn a flows table into a GeoDataFrame of straight lines.

        Each row of `df_flows` (PULocationID, DOLocationID, trips) becomes a
        LineString from the pickup-zone centroid to the dropoff-zone
        centroid. Rows whose pickup or dropoff ID has no centroid in `cent`
        are skipped. The result uses the CRS of `zones`.
        """
        rows = []
        for _, r in df_flows.iterrows():
            pu, do, trips = int(r["PULocationID"]), int(r["DOLocationID"]), int(r["trips"])
            if pu in cent and do in cent:
                line = LineString([cent[pu], cent[do]])
                rows.append({"PULocationID": pu, "DOLocationID": do, "trips": trips, "geometry": line})
        return gpd.GeoDataFrame(rows, geometry="geometry", crs=zones.crs)

    gdf_y  = flows_to_gdf(flows_y)
    gdf_hv = flows_to_gdf(flows_hv)

    # scale linewidths by trips (simple linear scaling): the busiest flow is
    # drawn at width 4.5, and widths approach 0.5 for the quietest flows
    lw_y  = (gdf_y["trips"] / gdf_y["trips"].max()) * 4 + 0.5
    lw_hv = (gdf_hv["trips"] / gdf_hv["trips"].max()) * 4 + 0.5

    base = zones.boundary.plot(linewidth=0.5)
    gdf_y.plot(ax=base, linewidth=lw_y)
    plt.title("Top 200 OD Flows — Yellow")
    plt.tight_layout()
    plt.savefig("plots/img/flows_yellow.png", dpi=150)
    plt.close()

    base = zones.boundary.plot(linewidth=0.5)
    gdf_hv.plot(ax=base, linewidth=lw_hv)
    plt.title("Top 200 OD Flows — HVFHV")
    plt.tight_layout()
    plt.savefig("plots/img/flows_hvfhv.png", dpi=150)
    plt.close()

    print("\nSaved maps to plots/img/*.png")

except ImportError:
    print("\n[Note] GeoPandas not installed — skipping map outputs.")
    print("Install with: pip install geopandas shapely fiona pyproj")

# ---------- 11) Airport pickups ----------
# This bar chart needs only pandas and matplotlib, so it is drawn outside the
# GeoPandas-gated section and is produced even when the maps are skipped.
ap = load_csv("plots/airport_pickups.csv")
ap_p = ap.pivot(index="service_type", columns="airport", values="net_per_hr_TW")
ax = ap_p.plot(kind="bar")
ax.set_title("Airport Pickup Net $/hr (Time-weighted)")
ax.set_xlabel("")
ax.set_ylabel("Net $/hr")
ax.grid(True, axis="y", alpha=0.3)
plt.tight_layout()
plt.savefig("plots/img/airport_pickups.png", dpi=150)
plt.close()



