# HQ Transit Corridor/Major Transit Stop Debugging

* fix https://github.com/cal-itp/data-analyses/issues/1838
* fix analysis_name duplicates in final step

In [1]:
import geopandas as gpd
import lookback_wrappers
import numpy as np
import pandas as pd

from update_vars import (
    AM_PEAK,
    GCS_FILE_PATH,
    EXPORT_PATH,
    HQ_TRANSIT_THRESHOLD,
    MS_TRANSIT_THRESHOLD,
    PM_PEAK,
    PROJECT_CRS,
    SEGMENT_BUFFER_METERS,
    analysis_date,
)

from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
# Shared GCSGeoPandas client; later cells call gcsgp.read_parquet(...) on it
# to load the geospatial parquet files.
gcsgp = GCSGeoPandas()

## fix #1838 at `hqta_segment_keep_one_stop`?

In [4]:
# Load the HQTA segments from GCS_FILE_PATH for inspection.
hqta_segments = gcsgp.read_parquet(f"{GCS_FILE_PATH}hqta_segments.parquet")

In [35]:
# Per-stop arrival frequencies. The stop-side route column is renamed so it
# stays distinguishable from the segment-side `route_id_seg` after merging.
max_arrivals_by_stop = (
    pd.read_parquet(f"{GCS_FILE_PATH}max_arrivals_by_stop.parquet")
    .rename(columns={'route_id': 'route_id_stop_freq'})
)

In [8]:
# Example stop for the issue: the saved output lists it once per route_dir
# (two routes), each row carrying the same trip counts.
# NOTE(review): the saved output still shows `route_id`, so this cell appears
# to have last run before the rename to `route_id_stop_freq` — re-run to refresh.
max_arrivals_by_stop[max_arrivals_by_stop.stop_id == '2620-a1']

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count,route_id
16,ba208824d3e906d2a4f36d46ca821b3b,2620-a1,19,10482_1,22.0,6.33,5.5,41.0,2,10482
16,ba208824d3e906d2a4f36d46ca821b3b,2620-a1,19,10486_1,22.0,6.33,5.5,41.0,2,10486


In [10]:
# hqta_segments[hqta_segments.route_id == '20480']

In [27]:
# Segment-to-stop table saved for debugging; used below as the segment-side
# input when replicating the merge in hqta_segment_keep_one_stop.
# NOTE(review): `test` is a vague name; consider something like
# `segment_to_stop` if this notebook is kept.
test = gcsgp.read_parquet(f"{GCS_FILE_PATH}test_segment_to_stop.parquet")

In [30]:
# Check which route column the segment side carries (it is `route_id_seg`).
test.columns

Index(['schedule_gtfs_dataset_key', 'route_id_seg', 'route_key',
       'segment_sequence', 'geometry', 'hqta_segment_id', 'route_direction',
       'fwd_azimuth', 'fwd_azimuth_360', 'circuitous_segment', 'stop_id'],
      dtype='object')

In [36]:
# Replicate the first merge inside hqta_segment_keep_one_stop so the
# intermediate result can be inspected: join on feed key + stop_id only,
# with no route condition.
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
segment_to_stop_frequencies = pd.merge(test, max_arrivals_by_stop, on=stop_cols)

In [37]:
# Size of the merged segment/stop frequency table.
segment_to_stop_frequencies.shape

(60778, 19)

In [38]:
# Both `route_id_seg` and `route_id_stop_freq` are present after the merge,
# so segment route and stop-frequency route can be compared row by row.
segment_to_stop_frequencies.columns

Index(['schedule_gtfs_dataset_key', 'route_id_seg', 'route_key',
       'segment_sequence', 'geometry', 'hqta_segment_id', 'route_direction',
       'fwd_azimuth', 'fwd_azimuth_360', 'circuitous_segment', 'stop_id',
       'am_max_trips', 'route_dir', 'pm_max_trips', 'am_max_trips_hr',
       'pm_max_trips_hr', 'n_trips', 'route_dir_count', 'route_id_stop_freq'],
      dtype='object')

In [None]:
def hqta_segment_keep_one_stop(hqta_segments: gpd.GeoDataFrame, stop_frequencies: pd.DataFrame) -> gpd.GeoDataFrame:
    """
    Reduce the segment/stop pairing to a single stop per HQTA segment.

    Several stops can fall inside one segment. The stop with the largest
    `n_trips` is kept; ties are broken by higher `am_max_trips_hr`, then
    higher `pm_max_trips_hr`.

    Parameters
    ----------
    hqta_segments
        Segment rows, joined to `stop_frequencies` on
        `schedule_gtfs_dataset_key` and `stop_id`.
    stop_frequencies
        Stop-level frequency rows sharing those two key columns.

    Returns
    -------
    Frame in which each `hqta_segment_id` appears exactly once, with a
    fresh 0..n-1 index.
    """
    segment_key = "hqta_segment_id"
    ranking_cols = [segment_key, "am_max_trips_hr", "pm_max_trips_hr"]

    # Attach frequencies to every (segment, stop) pairing.
    # TODO check route_id here?
    # NOTE(review): the join uses feed key + stop_id only; if stop_frequencies
    # holds several rows per stop (e.g. one per route), rows from another
    # route can attach to a segment — confirm whether that is intended.
    stops_per_segment = pd.merge(
        hqta_segments,
        stop_frequencies,
        on=["schedule_gtfs_dataset_key", "stop_id"],
    )

    # Multi-column sorting isn't available in dask, so look up the top
    # n_trips per segment first and inner-merge back onto it.
    # (max_trips_by_group is defined outside this notebook cell.)
    top_trips = max_trips_by_group(
        stops_per_segment, group_cols=[segment_key], max_col="n_trips"
    )
    top_stops = pd.merge(
        stops_per_segment,
        top_trips,
        on=[segment_key, "n_trips"],
        how="inner",
    )

    # Several stops may tie on n_trips within a segment; collapse rows that
    # share the same segment / AM / PM hourly values before ranking.
    top_stops = top_stops.drop_duplicates(subset=ranking_cols)

    # Rank within each segment: AM trips descending, then PM trips
    # descending, and keep the first row per segment.
    ranked = top_stops.sort_values(ranking_cols, ascending=[True, False, False])
    one_stop_per_segment = ranked.drop_duplicates(subset=segment_key)

    return one_stop_per_segment.reset_index(drop=True)

## Fix `analysis_name` duplicates

In [2]:
import intake
# Relative glob: the catalog .yml files are looked up from the current
# working directory, so the notebook must be run from the folder holding them.
catalog = intake.open_catalog("*.yml")
from shared_utils import portfolio_utils, catalog_utils
from calitp_data_analysis.sql import query_sql

In [3]:
# Preview rail/BRT/ferry stops. The saved output has no analysis_name column
# and repeats a stop once per route_id.
catalog.rail_brt_ferry_stops().read().to_crs(PROJECT_CRS).head(3)

Unnamed: 0,schedule_gtfs_dataset_key_primary,stop_id,stop_name,route_id,route_type,hqta_type,geometry
0,1165b1474df778cb0fc3ba9246e32035,ABE,Aberdeen Amtrak Station,88,2,major_stop_rail,POINT (3634080.474 1021286.801)
1,1165b1474df778cb0fc3ba9246e32035,ABE,Aberdeen Amtrak Station,42985,2,major_stop_rail,POINT (3634080.474 1021286.801)
2,1165b1474df778cb0fc3ba9246e32035,ABQ,Albuquerque Alvarado Transportation Center,51,2,major_stop_rail,POINT (1212593.823 -240549.356)


In [4]:
# Preview branching major bus stops. Unlike the other inputs previewed here,
# the saved output already includes an `analysis_name` column.
catalog.major_stop_bus_branching().read().to_crs(PROJECT_CRS).head(3)

Unnamed: 0,stop_id,geometry,analysis_name,schedule_gtfs_dataset_key_primary,schedule_gtfs_dataset_key_secondary,hqta_type
0,14551,POINT (-209523.625 -28229.210),City and County of San Francisco,1451f537bdcefd0e8ba827d12c4ef4b8,1451f537bdcefd0e8ba827d12c4ef4b8,major_stop_bus
1,14552,POINT (-209365.490 -28354.280),City and County of San Francisco,1451f537bdcefd0e8ba827d12c4ef4b8,1451f537bdcefd0e8ba827d12c4ef4b8,major_stop_bus
2,14553,POINT (-209819.351 -28006.267),City and County of San Francisco,1451f537bdcefd0e8ba827d12c4ef4b8,1451f537bdcefd0e8ba827d12c4ef4b8,major_stop_bus


In [5]:
# Preview stops inside HQ corridors. The saved output has only a primary
# dataset key (no secondary key, no analysis_name).
catalog.stops_in_hq_corr().read().to_crs(PROJECT_CRS).head(3)

Unnamed: 0,schedule_gtfs_dataset_key_primary,stop_id,hqta_type,geometry
0,b8f8c5887cd49382593456b4898f0538,5726744,hq_corridor_bus,POINT (152057.740 -458962.417)
1,b8f8c5887cd49382593456b4898f0538,8318419,hq_corridor_bus,POINT (152064.856 -459487.252)
2,30416ad0ba0c171c814b13c2bb363bfe,190,hq_corridor_bus,POINT (152068.594 -458963.893)


In [6]:
# Major bus stops, reprojected to PROJECT_CRS; kept in a variable because the
# cells below filter it and extract its dataset keys.
major_stop_bus = catalog.major_stop_bus().read().to_crs(PROJECT_CRS)

In [7]:
# Example stop: the saved output has two rows with the same primary key but
# different secondary dataset keys.
major_stop_bus[major_stop_bus.stop_id == '2158-a2']

Unnamed: 0,schedule_gtfs_dataset_key_primary,schedule_gtfs_dataset_key_secondary,stop_id,geometry,hqta_type
5738,ba208824d3e906d2a4f36d46ca821b3b,2187bd8e29f5c69e2e4402f5d9b59d62,2158-a2,POINT (207559.784 -437116.932),major_stop_bus
5746,ba208824d3e906d2a4f36d46ca821b3b,ba208824d3e906d2a4f36d46ca821b3b,2158-a2,POINT (207559.784 -437116.932),major_stop_bus


In [8]:
# Look up analysis_name (aliased to `agency`) for each current GTFS dataset
# key, skipping datasets that have no analysis_name.
query = """
    SELECT
    key AS schedule_gtfs_dataset_key,
    analysis_name AS agency,
    FROM
    cal-itp-data-infra.mart_transit_database.dim_gtfs_datasets
    WHERE _is_current = TRUE
    AND analysis_name IS NOT NULL
    """

# NOTE: `df` is reassigned by a later cell (the portfolio_utils lookup), so
# this result is only available until that cell runs.
df = query_sql(query)

In [9]:
# Inspect the dataset key -> agency lookup returned by the query.
df

Unnamed: 0,schedule_gtfs_dataset_key,agency
0,372a06b593e1716d1c911b1d1d35bedd,City of Solvang
1,f376bbdbbc9eb991df0b76ad83cdf686,San Luis Obispo Regional Transit Authority
2,535a5f615a37a8d9e911b893935a1446,FlixBus and Greyhound
3,c1bb6ee2e72a860c33430f7e19a6cab3,San Diego International Airport
4,2b3d19b86c7fd4dd69c6d0248b669ee4,City and County of San Francisco
...,...,...
213,bfccbb6bd59ab08d812ab782149c57b2,Yosemite Area Regional Transportation System
214,bfe67a45c3c09a0b9a4414dcce7d7804,Riverside Transit Agency
215,7ee71e62eafbb6d39ad97e72f9cd0ac2,"San Diego Metropolitan Transit System, Airport..."
216,b0f74fdaf964317b28da2431091c3575,City of West Covina


In [10]:
# Distinct primary dataset keys found in major_stop_bus, renamed to the
# generic key column name before being passed to the portfolio_utils lookup.
major_stop_bus_datasets = (
    major_stop_bus.copy()[['schedule_gtfs_dataset_key_primary']]
    .rename(columns={'schedule_gtfs_dataset_key_primary': 'schedule_gtfs_dataset_key'})
    .drop_duplicates()
)

# Alternative lookup of analysis_name for those keys on analysis_date.
# NOTE: this overwrites the `df` produced by the SQL query cell above.
df = portfolio_utils.standardize_operator_info_for_exports(
    major_stop_bus_datasets,
    analysis_date,
)[['schedule_gtfs_dataset_key', 'analysis_name']]

In [11]:
# Inspect the key -> analysis_name mapping. In the saved output one
# analysis_name appears under two different dataset keys.
df

Unnamed: 0,schedule_gtfs_dataset_key,analysis_name
0,57df9e6a9b06d6132b68ec87a1f33a13,San Mateo County Transit District
1,1451f537bdcefd0e8ba827d12c4ef4b8,City and County of San Francisco
2,360d2cbc743d547a60d78e8df45a1bda,Mission Bay Transportation Management Agency
3,f3c9434e3dccc3548b8c3a967d734cd3,"Golden Gate Bridge, Highway and Transportation..."
4,cf834f31d57989d22f357a644e062400,Alameda-Contra Costa Transit District
5,6bd0f26d162555d6720b116958fd46f0,Presidio Trust
6,6a2313555ab8020a49c5ea434cb22ba7,San Francisco International Airport
7,6a7787500def36facc40271eac4e080a,Alameda-Contra Costa Transit District
8,16d960978e3d6f2112b9e442f8152786,Stanford University
9,259794729086bec3fbeda36fb14ec373,Santa Clara Valley Transportation Authority


In [12]:
# Load the exported HQ transit stops from EXPORT_PATH to check the final result.
stops = gcsgp.read_parquet(f'{EXPORT_PATH}ca_hq_transit_stops.parquet')

In [13]:
# Same example stop in the export: the saved output has one row per
# (hqta_type, route_id) combination, four rows in total.
stops[stops.stop_id == '2158-a2']

Unnamed: 0,agency_primary,hqta_type,stop_id,route_id,hqta_details,agency_secondary,base64_url_primary,base64_url_secondary,avg_trips_per_peak_hr,mpo,geometry,plan_name
17863,Foothill Transit,hq_corridor_bus,2158-a2,20480,corridor_frequent_stop,,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,,4.0,,POINT (-117.75141 34.05940),
17864,Foothill Transit,hq_corridor_bus,2158-a2,20707,corridor_frequent_stop,,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,,4.0,,POINT (-117.75141 34.05940),
18332,Foothill Transit,major_stop_bus,2158-a2,20480,intersection_2_bus_routes_same_operator,Foothill Transit,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,4.0,,POINT (-117.75141 34.05940),
18333,Foothill Transit,major_stop_bus,2158-a2,20707,intersection_2_bus_routes_same_operator,Foothill Transit,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,aHR0cHM6Ly9mb290aGlsbDNyZHBhcnR5LnJpZGVyYWxlcn...,4.0,,POINT (-117.75141 34.05940),


In [15]:
# stops.explore()

In [16]:
# Load the exported HQ transit areas from EXPORT_PATH.
areas = gcsgp.read_parquet(f'{EXPORT_PATH}ca_hq_transit_areas.parquet')

In [18]:
# List the distinct primary agency names present in the exported areas.
areas.agency_primary.unique()

array(['Alameda-Contra Costa Transit District',
       'Anaheim Transportation Network',
       'City and County of San Francisco', 'City of Burbank',
       'City of Fresno', 'City of Gardena', 'City of Glendale',
       'City of Los Angeles', 'City of Montebello', 'City of Norwalk',
       'City of Pasadena', 'City of Santa Monica', 'City of Santa Rosa',
       'City of Union City', 'Eastern Contra Costa Transit Authority',
       'Emeryville Transportation Management Agency', 'Foothill Transit',
       'Long Beach Transit', 'Los Angeles County',
       'Los Angeles County Metropolitan Transportation Authority',
       'Los Angeles World Airports', 'Marin County Transit District',
       'OmniTrans', 'Orange County Transportation Authority',
       'Riverside Transit Agency', 'Sacramento Regional Transit District',
       'San Diego Metropolitan Transit System, Airport, Flagship Cruises',
       'San Francisco International Airport',
       'San Joaquin Regional Transit District',
  

In [19]:
# Row count of the exported stops.
len(stops)

48178

In [20]:
# Row count of the exported areas.
len(areas)

19701

In [22]:
# areas.explore()