# HQ Transit Corridor/Major Transit Stop Debugging

* fix https://github.com/cal-itp/data-analyses/issues/1838
* fix analysis_name duplicates in final step

In [4]:
import geopandas as gpd
import lookback_wrappers
import numpy as np
import pandas as pd

from update_vars import (
    AM_PEAK,
    GCS_FILE_PATH,
    EXPORT_PATH,
    HQ_TRANSIT_THRESHOLD,
    MS_TRANSIT_THRESHOLD,
    PM_PEAK,
    PROJECT_CRS,
    SEGMENT_BUFFER_METERS,
    analysis_date,
)

from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

In [4]:
# Load the hqta_segments parquet under GCS_FILE_PATH through the GCSGeoPandas helper.
hqta_segments = gcsgp.read_parquet(f"{GCS_FILE_PATH}hqta_segments.parquet")

In [35]:
# Per-stop arrival counts. `route_id` is renamed up front so that, once merged
# with segment data, it stays distinguishable from the segment's own route column.
max_arrivals_by_stop = pd.read_parquet(f"{GCS_FILE_PATH}max_arrivals_by_stop.parquet")
max_arrivals_by_stop = max_arrivals_by_stop.rename(columns={"route_id": "route_id_stop_freq"})

In [8]:
# Spot-check one stop: the table holds one row per route_dir serving the stop,
# so a single stop_id can appear several times with identical trip counts.
max_arrivals_by_stop[max_arrivals_by_stop.stop_id == '2620-a1']

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count,route_id
16,ba208824d3e906d2a4f36d46ca821b3b,2620-a1,19,10482_1,22.0,6.33,5.5,41.0,2,10482
16,ba208824d3e906d2a4f36d46ca821b3b,2620-a1,19,10486_1,22.0,6.33,5.5,41.0,2,10486


In [10]:
# hqta_segments[hqta_segments.route_id == '20480']

In [11]:
def hqta_segment_to_stop(hqta_segments: gpd.GeoDataFrame, stops: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Find which stops fall inside which hqta segments.

    Stops are spatially joined (predicate: intersects) to the segments, then
    the matched stop_ids are merged back onto `hqta_segments` so the result
    keeps the segment columns (including the segment geometry) instead of
    the stop's point geometry.

    Args:
        hqta_segments: segments with at least `hqta_segment_id`,
            `segment_sequence` and a geometry column.
        stops: stops with at least `stop_id` and a geometry column.

    Returns:
        `hqta_segments` columns plus `stop_id`; segments with no
        intersecting stop are dropped (inner joins throughout).
    """
    join_keys = ["hqta_segment_id", "segment_sequence"]

    # Only the stop id and its point are needed for the spatial match
    stop_points = stops[["stop_id", "geometry"]]
    matched = gpd.sjoin(stop_points, hqta_segments, how="inner", predicate="intersects")
    matched = matched.drop(columns=["index_right"])

    # Keep just the segment keys + stop_id; this discards the stop point geometry
    stop_lookup = matched[join_keys + ["stop_id"]]

    # Bring the stop_ids back onto the segments so the segment geometry is retained
    return pd.merge(hqta_segments, stop_lookup, on=join_keys)

## fix #1838 at `hqta_segment_keep_one_stop`?

In [27]:
# Load a saved segment-to-stop table to experiment on
# (presumably the output of hqta_segment_to_stop -- TODO confirm how it was written).
test = gcsgp.read_parquet(f"{GCS_FILE_PATH}test_segment_to_stop.parquet")

In [30]:
# Check which columns are available before merging; note the segment's route is `route_id_seg`.
test.columns

Index(['schedule_gtfs_dataset_key', 'route_id_seg', 'route_key',
       'segment_sequence', 'geometry', 'hqta_segment_id', 'route_direction',
       'fwd_azimuth', 'fwd_azimuth_360', 'circuitous_segment', 'stop_id'],
      dtype='object')

In [36]:
# Reproduce the first step of hqta_segment_keep_one_stop: attach stop frequencies
# to each segment/stop row. The join is on feed + stop only, so a stop served by
# several routes contributes one row per route_dir regardless of the segment's route.
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
segment_to_stop_frequencies = pd.merge(test, max_arrivals_by_stop, on=stop_cols)

In [37]:
# Row/column count after the merge (rows can exceed `test` because of the per-route_dir fan-out).
segment_to_stop_frequencies.shape

(60778, 19)

In [38]:
# Confirm both route columns survive the merge: `route_id_seg` (segment) and `route_id_stop_freq` (stop).
segment_to_stop_frequencies.columns

Index(['schedule_gtfs_dataset_key', 'route_id_seg', 'route_key',
       'segment_sequence', 'geometry', 'hqta_segment_id', 'route_direction',
       'fwd_azimuth', 'fwd_azimuth_360', 'circuitous_segment', 'stop_id',
       'am_max_trips', 'route_dir', 'pm_max_trips', 'am_max_trips_hr',
       'pm_max_trips_hr', 'n_trips', 'route_dir_count', 'route_id_stop_freq'],
      dtype='object')

In [None]:
def hqta_segment_keep_one_stop(hqta_segments: gpd.GeoDataFrame, stop_frequencies: pd.DataFrame) -> gpd.GeoDataFrame:
    """
    Reduce each hqta segment to a single representative stop.

    Several stops can land in one segment. The stop with the largest
    `n_trips` is kept; ties on `n_trips` are broken by the higher
    `am_max_trips_hr`, then the higher `pm_max_trips_hr`.

    Args:
        hqta_segments: segment-to-stop rows carrying
            `schedule_gtfs_dataset_key`, `stop_id` and `hqta_segment_id`.
        stop_frequencies: per-stop trip counts carrying the same stop keys
            plus `n_trips`, `am_max_trips_hr` and `pm_max_trips_hr`.

    Returns:
        One row per `hqta_segment_id`, with a fresh 0..n-1 index.
    """
    stop_keys = ["schedule_gtfs_dataset_key", "stop_id"]
    tie_break_cols = ["hqta_segment_id", "am_max_trips_hr", "pm_max_trips_hr"]

    # Every (segment, stop) pair picks up that stop's frequency rows
    candidates = pd.merge(hqta_segments, stop_frequencies, on=stop_keys)
    # TODO check route_id here?

    # Two-step "find the max, then inner merge" pattern (kept from the dask
    # version, where sorting on multiple columns was not possible).
    # max_trips_by_group is defined elsewhere; presumably it returns the
    # max `n_trips` per `hqta_segment_id` -- confirm against its definition.
    busiest = max_trips_by_group(
        candidates, group_cols=["hqta_segment_id"], max_col="n_trips"
    )

    # Keep only the rows that hit the segment's max trip count
    top_candidates = pd.merge(
        candidates, busiest, on=["hqta_segment_id", "n_trips"], how="inner"
    )
    # Several stops may share that max; collapse exact AM/PM duplicates first
    top_candidates = top_candidates.drop_duplicates(subset=tie_break_cols)

    # Remaining ties: order each segment by AM then PM trips (both descending)
    # and take the first row per segment
    ranked = top_candidates.sort_values(tie_break_cols, ascending=[True, False, False])
    one_per_segment = ranked.drop_duplicates(subset="hqta_segment_id")

    return one_per_segment.reset_index(drop=True)

## Fix `analysis_name` duplicates

In [1]:
import intake
# Open the intake catalog(s) matching *.yml (path is relative to the notebook's working directory).
catalog = intake.open_catalog("*.yml")

In [9]:
# Read the major_stop_bus catalog entry and reproject it to PROJECT_CRS.
major_stop_bus = catalog.major_stop_bus().read().to_crs(PROJECT_CRS)

In [10]:
# Spot-check one stop: it appears once per (primary, secondary) feed key pairing.
major_stop_bus[major_stop_bus.stop_id == '2158-a2']

Unnamed: 0,schedule_gtfs_dataset_key_primary,schedule_gtfs_dataset_key_secondary,stop_id,geometry,hqta_type
5738,ba208824d3e906d2a4f36d46ca821b3b,2187bd8e29f5c69e2e4402f5d9b59d62,2158-a2,POINT (207559.784 -437116.932),major_stop_bus
5746,ba208824d3e906d2a4f36d46ca821b3b,ba208824d3e906d2a4f36d46ca821b3b,2158-a2,POINT (207559.784 -437116.932),major_stop_bus


In [6]:
# Load the exported ca_hq_transit_stops parquet from EXPORT_PATH to inspect the final output.
stops = gcsgp.read_parquet(f'{EXPORT_PATH}ca_hq_transit_stops.parquet')

In [12]:
from segment_speed_utils import helpers

In [15]:
# List every column the feed-key -> organization crosswalk exposes for analysis_date.
helpers.import_schedule_gtfs_key_organization_crosswalk(
        analysis_date,
        # Column subset left disabled so the full column list is shown below:
        # columns=["schedule_gtfs_dataset_key", "organization_name", "organization_source_record_id", "base64_url"],
    ).columns

Index(['schedule_gtfs_dataset_key', 'name', 'schedule_source_record_id',
       'base64_url', 'organization_source_record_id', 'organization_name',
       'caltrans_district', 'counties_served', 'hq_city', 'hq_county',
       'is_public_entity', 'is_publicly_operating', 'funding_sources',
       'on_demand_vehicles_at_max_service', 'vehicles_at_max_service',
       'number_of_state_counties', 'primary_uza_name', 'density',
       'number_of_counties_with_service', 'state_admin_funds_expended',
       'service_area_sq_miles', 'population', 'service_area_pop',
       'subrecipient_type', 'primary_uza_code', 'reporter_type',
       'organization_type', 'voms_pt', 'voms_do', 'year'],
      dtype='object')

In [8]:
# Spot-check the same stop in the final export: rows repeat across agency_primary,
# hqta_type and route_id, which is the duplication being investigated.
stops[stops.stop_id == '2158-a2']

Unnamed: 0,agency_primary,hqta_type,stop_id,route_id,hqta_details,agency_secondary,base64_url_primary,base64_url_secondary,avg_trips_per_peak_hr,mpo,geometry,plan_name
12145,City of Duarte,hq_corridor_bus,2158-a2,20707,corridor_frequent_stop,,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,,4.0,,POINT (-117.75141 34.05940),
12146,City of Duarte,hq_corridor_bus,2158-a2,20480,corridor_frequent_stop,,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,,4.0,,POINT (-117.75141 34.05940),
12614,City of Duarte,major_stop_bus,2158-a2,20707,intersection_2_bus_routes_same_operator,Foothill Transit,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,4.0,,POINT (-117.75141 34.05940),
12615,City of Duarte,major_stop_bus,2158-a2,20480,intersection_2_bus_routes_same_operator,City of Duarte,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,4.0,,POINT (-117.75141 34.05940),
23834,Foothill Transit,hq_corridor_bus,2158-a2,20707,corridor_frequent_stop,,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,,4.0,,POINT (-117.75141 34.05940),
23835,Foothill Transit,hq_corridor_bus,2158-a2,20480,corridor_frequent_stop,,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,,4.0,,POINT (-117.75141 34.05940),
24303,Foothill Transit,major_stop_bus,2158-a2,20707,intersection_2_bus_routes_same_operator,Foothill Transit,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,4.0,,POINT (-117.75141 34.05940),
24304,Foothill Transit,major_stop_bus,2158-a2,20480,intersection_2_bus_routes_different_operators,OmniTrans,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,aHR0cHM6Ly93d3cub21uaXRyYW5zLm9yZy9nb29nbGUvZ2...,4.0,,POINT (-117.75141 34.05940),
