# Refactor bus corridors

In [1]:
import datetime as dt

import dask.dataframe as dd
import dask_geopandas
import geopandas as gpd
import numpy as np
import pandas as pd

from siuba import *

import B1_bus_corridors as bus_corridors
import dask_utils
from A1_rail_ferry_brt import analysis_date
from shared_utils import rt_utils
from utilities import GCS_FILE_PATH


# analysis_date formatted with rt_utils.FULL_DATE_FMT; interpolated into the
# cached parquet file names that are read further down.
date_str = analysis_date.strftime(rt_utils.FULL_DATE_FMT)



In [4]:
# Operator whose cached views are loaded (presumably a Cal-ITP ID -- confirm).
itp_id = 101

FILE_PATH = f"{rt_utils.GCS_FILE_PATH}cached_views/"

# Tabular views: read with dask.dataframe.
trips = dd.read_parquet(f"{FILE_PATH}trips_{itp_id}_{date_str}.parquet")
stop_times = dd.read_parquet(f"{FILE_PATH}st_{itp_id}_{date_str}.parquet")

# Views read with dask_geopandas.
routelines = dask_geopandas.read_parquet(f"{FILE_PATH}routelines_{itp_id}_{date_str}.parquet")
stops = dask_geopandas.read_parquet(f"{FILE_PATH}stops_{itp_id}_{date_str}.parquet")

# Kept for reference: the disabled one-shot run that wrote the local parquet
# read back later in this notebook as `tiff`.
#gdf = bus_corridors.single_operator_hqta(routelines, trips, stop_times, stops)
#gdf.to_parquet("./data/182_bus2.parquet")


In [None]:
# Pare the shape_id-trip_id combos down to shape_id-route_id.
# Keep the longest route_length to use to get hqta segments.
route_shapes = dask_utils.select_needed_shapes_for_route_network(routelines, trips)

# Segment one route at a time, collecting the pieces in a list and concatenating
# once at the end. Re-concatenating inside the loop copies every previously
# accumulated row on each iteration (quadratic in the number of routes).
route_segments = []
for i in route_shapes.index:
    one_route = route_shapes[route_shapes.index == i]
    gdf = dask_utils.segment_route(one_route)
    route_segments.append(gdf)

# pd.concat raises on an empty list, so fall back to an empty GeoDataFrame,
# which is what the loop-and-concat version produced when there were no routes.
all_routes = pd.concat(route_segments) if route_segments else gpd.GeoDataFrame()


# Add HQTA segment ID
all_routes2 = dask_utils.add_segment_id(all_routes)

# Buffer each segment; required to spatially find stops within each segment.
# Open question from the author: is 50 a generous enough buffer for
# street/sidewalk width? (units depend on the CRS used inside add_buffer -- TODO confirm)
all_routes3 = dask_utils.add_buffer(all_routes2, buffer_size=50)

In [None]:
# Convert the buffered segments (pandas-backed) to a dask gdf with a single partition
hqta_segments = dask_geopandas.from_geopandas(all_routes3, npartitions=1)
# Join hqta segment to stops
segment_to_stop = bus_corridors.hqta_segment_to_stop(hqta_segments, stops)

# Within hqta segment, if there are multiple stops, keep stop with highest trip count
segment_to_stop_unique = bus_corridors.hqta_segment_keep_one_stop(segment_to_stop, stop_times)

# Get aggregated stops by departure_hour and stop_id
# NOTE(review): the next cell assigns `trips_by_stop_hour` again, so this result
# is replaced when the notebook is run top to bottom -- confirm which one is wanted.
trips_by_stop_hour = dask_utils.stop_times_aggregation_by_hour(stop_times)

In [None]:
#https://stackoverflow.com/questions/71688126/groupby-map-partitions-in-dask
# NOTE(review): the groupby runs separately inside each partition, so a
# (calitp_itp_id, stop_id) pair that spans partitions yields one row per
# partition rather than a single combined count. Despite the variable name, no
# hour column is part of the grouping here, and this assignment replaces the
# `trips_by_stop_hour` built in the previous cell -- confirm both are intended.
def _count_trips_per_stop(partition):
    """Count trip_id rows per (calitp_itp_id, stop_id) within one partition."""
    stop_keys = ["calitp_itp_id", "stop_id"]
    return partition.groupby(stop_keys, as_index=False).agg({"trip_id": "count"})


trips_by_stop_hour = stop_times.map_partitions(
    _count_trips_per_stop,
    meta={"calitp_itp_id": int, "stop_id": str, "trip_id": int},
)

In [None]:
# Reference output to compare against, read from the bus_corridors folder under GCS_FILE_PATH.
# Full location noted by the author:
# calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/182_bus.parquet
eric = gpd.read_parquet(f"{GCS_FILE_PATH}bus_corridors/182_bus.parquet")

In [None]:
# Refactored output saved locally (the path written by the disabled
# `gdf.to_parquet(...)` line in the loading cell).
# Plain string: the f-prefix had no placeholders to interpolate.
tiff = gpd.read_parquet("./data/182_bus2.parquet")

In [None]:
def stats(df):
    """
    Print a quick summary of an HQTA bus-corridor dataframe.

    Prints the columns, dtypes, row count, number of unique stop_id values,
    the sums of am_max_trips and pm_max_trips, and the number of unique
    hqta_segment_id values.

    Parameters
    ----------
    df : DataFrame with columns stop_id, am_max_trips, pm_max_trips
        and hqta_segment_id.

    Returns
    -------
    None; everything is written to stdout.
    """
    print(df.columns)
    print(df.dtypes)
    print(f"# obs: {len(df)}")
    print(f"# unique stops: {df.stop_id.nunique()}")
    print(f"sum am_max stops: {df.am_max_trips.sum()}")
    # Previously this line summed am_max_trips again, so the PM figure was
    # always a copy of the AM figure.
    print(f"sum pm_max stops: {df.pm_max_trips.sum()}")
    print(f"# unique hqta_segment_id: {df.hqta_segment_id.nunique()}")

    
def compare(eric, tiff):
    """Print `stats` for each dataframe, each under a banner naming its source."""
    labelled_frames = (
        ("***********Eric************", eric),
        ("***********Tiff************", tiff),
    )
    for banner, frame in labelled_frames:
        print(banner)
        stats(frame)

In [None]:
# Print the summary stats of both outputs one after the other for comparison.
compare(eric, tiff)

In [None]:
# Reduce each output to its distinct (stop_id, am_max_trips, pm_max_trips) rows
# so the two can be merged stop by stop.
keep_cols = ["stop_id", "am_max_trips", "pm_max_trips"]
eric2, tiff2 = (
    frame[keep_cols].drop_duplicates()
    for frame in (eric, tiff)
)

In [None]:
# Outer-join the two outputs on stop_id. validate="1:1" makes the merge raise
# if either side has more than one row per stop_id; the indicator column
# (_merge) records which side(s) each stop came from.
m1 = eric2.merge(
    tiff2,
    on="stop_id",
    how="outer",
    validate="1:1",
    indicator=True,
)

m1["_merge"].value_counts()

In [None]:
# Stops present in both outputs whose AM max trip counts differ.
# An empty result means both versions generated the same number of AM trips.
# A single combined mask is used: indexing the filtered frame with a second,
# full-length mask makes pandas warn that the boolean key will be reindexed.
m1[(m1._merge == "both") & (m1.am_max_trips_x != m1.am_max_trips_y)]

In [None]:
# Stops present in both outputs whose PM max trip counts differ.
# An empty result means both versions generated the same number of PM trips.
# A single combined mask is used: indexing the filtered frame with a second,
# full-length mask makes pandas warn that the boolean key will be reindexed.
m1[(m1._merge == "both") & (m1.pm_max_trips_x != m1.pm_max_trips_y)]

In [None]:
# Remove trips that only run during AM/PM peak, because those don't qualify as HQTA
def invalid_trips_only_peak_hours(df):
    """
    Return the unique trip_ids that are flagged as peak-only.

    A trip (grouped by calitp_itp_id + trip_id) is flagged when it has at
    least one "AM Peak" row, at least one "PM Peak" row, and no row with any
    other time_of_day value.

    NOTE(review): the filter requires BOTH peaks to be present. A trip seen
    only in "AM Peak" (or only in "PM Peak") is therefore not flagged, which
    may not match the "only run during AM/PM peak" wording -- confirm intent.

    Parameters
    ----------
    df : dask or pandas DataFrame with columns calitp_itp_id, trip_id and
        time_of_day. Any time_of_day value other than "AM Peak" / "PM Peak"
        counts as "other".

    Returns
    -------
    list of the unique flagged trip_id values (empty if none qualify).
    """
    trip_cols = ["calitp_itp_id", "trip_id"]

    # Vectorised comparisons instead of a Python-level lambda per row.
    in_am_peak = df.time_of_day == "AM Peak"
    in_pm_peak = df.time_of_day == "PM Peak"

    flagged = df.assign(
        is_am_peak=in_am_peak.astype(int),
        is_pm_peak=in_pm_peak.astype(int),
        is_other=(~(in_am_peak | in_pm_peak)).astype(int),
    )

    # Max of a 0/1 flag per trip == "does the trip have any row of this kind".
    # String aggregation names avoid passing numpy callables to agg.
    by_trip = (
        flagged.groupby(trip_cols)
        .agg({"is_am_peak": "max", "is_pm_peak": "max", "is_other": "max"})
        .reset_index()
    )

    only_peak = by_trip[
        (by_trip.is_am_peak == 1)
        & (by_trip.is_pm_peak == 1)
        & (by_trip.is_other == 0)
    ].trip_id

    # Dask collections must be materialised; pandas objects are used as they are.
    if hasattr(only_peak, "compute"):
        only_peak = only_peak.compute()

    # Return list of trip_ids that are invalid
    return list(only_peak.unique())

In [None]:
# NOTE(review): `trip_stops_by_hour` is not defined in any visible cell (the
# earlier cells define `trips_by_stop_hour`), so this raises NameError on a
# fresh Restart & Run All unless it is created elsewhere. The function needs
# calitp_itp_id, trip_id and time_of_day columns -- confirm the intended input.
invalid_trips = invalid_trips_only_peak_hours(trip_stops_by_hour)