In [1]:
import os
# 10**12 bytes, i.e. 1 TB (decimal). Presumably raises the BigQuery bytes-scanned cap read by calitp -- TODO confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000)

In [2]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers

import sys
sys.path.append('./scripts/')

In [3]:
from update_vars import (analysis_date, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD, SHARED_STOP_THRESHOLD)

In [4]:
import create_aggregate_stop_frequencies as casf

In [5]:
import importlib
importlib.reload(casf)

<module 'create_aggregate_stop_frequencies' from '/home/jovyan/data-analyses/high_quality_transit_areas/./scripts/create_aggregate_stop_frequencies.py'>

In [7]:
analysis_date

'2025-05-14'

In [10]:
# Load scheduled stop_times for the analysis date and tag each row with its route_dir.
stop_times = helpers.import_scheduled_stop_times(
    analysis_date,
    get_pandas=True,
).pipe(casf.add_route_dir, analysis_date)

# Prepared stop_times feed every frequency aggregation below.
st_prepped = casf.prep_stop_times(stop_times)

## HCD Y-Branching

* combine single route frequencies with collinearity screen...
* actually, these routes need to pass _both_ a collinearity screen and a non-collinearity screen: 8+ stops shared and also 8+ stops not shared (meaningful service on all 3 legs of the Y)

In [11]:
# Peak-hour arrivals per stop, aggregated one route_dir at a time (single_route_dir=True).
max_arrivals_by_stop_single = casf.stop_times_aggregation_max_by_stop(
    st_prepped,
    analysis_date,
    single_route_dir=True,
)

In [12]:
def get_explode_singles(
    single_route_aggregation: pd.DataFrame,
    ms_precursor_threshold: int | float
) -> pd.DataFrame:
    single_qual = (single_route_aggregation.query('am_max_trips_hr >= @ms_precursor_threshold & pm_max_trips_hr >= @ms_precursor_threshold')
                   .explode('route_dir')
                   .sort_values(['schedule_gtfs_dataset_key','stop_id', 'route_dir'])[['schedule_gtfs_dataset_key','stop_id', 'route_dir']]
                  )
    return single_qual

In [13]:
# get_explode_singles already explodes route_dir to one row per (feed, stop, route_dir),
# so a second .explode('route_dir') here would be a no-op.
singles_explode = get_explode_singles(max_arrivals_by_stop_single, MS_TRANSIT_THRESHOLD)

In [14]:
from tqdm import tqdm
tqdm.pandas()

In [15]:
# Accumulator passed into casf.accumulate_share_count for every (feed, stop) group.
# Presumably the helper mutates this dict in place, counting stops shared by each route_dir pair -- TODO confirm.
# Downstream cells treat the keys as '__'-delimited strings that start with the feed key,
# and compare the values against SHARED_STOP_THRESHOLD.
share_counts = {}
singles_explode.groupby(['schedule_gtfs_dataset_key', 'stop_id']).progress_apply(casf.accumulate_share_count, share_counts=share_counts)

100%|██████████| 19059/19059 [00:02<00:00, 7119.23it/s]


In [17]:
# Keep only the pairs whose shared-stop count reaches the threshold.
qualify_dict = {
    pair_key: shared_count
    for pair_key, shared_count in share_counts.items()
    if shared_count >= SHARED_STOP_THRESHOLD
}
# The feed key is the first '__'-delimited token of each pair key.
feeds_to_filter = np.unique([pair_key.split('__')[0] for pair_key in qualify_dict])

In [19]:
feeds_to_filter

array(['0666caf3ec1ecc96b74f4477ee4bc939',
       '3364ec074ca85001da3abd78be2ae521',
       '3c275e5acf8974e1afd765bd3011424c',
       '4b317fc27dde351e12253d46cedd8df0',
       '7cc0cb1871dfd558f11a2885c145d144',
       'c499f905e33929a641f083dad55c521e',
       'ecd018ad66f497fb8f188ed5a71b284b',
       'fb467982dcc77a7f9199bebe709bb700'], dtype='<U32')

In [20]:
## TODO integrate into script

def get_trips_with_route_dir(analysis_date):
    """
    Load scheduled trips for analysis_date, keep bus trips only, and add a
    route_dir column of the form '<route_id>_<direction_id>'.

    analysis_date: date passed straight to helpers.import_scheduled_trips.

    Returns the filtered trips frame with direction_id normalised to a
    string ('0' when missing) and the extra route_dir column.
    """
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["feed_key", "gtfs_dataset_key", "trip_id",
                   "route_id", "direction_id", "route_type",
                   "shape_array_key", "route_short_name", "name"],
        get_pandas = True
    )
    # GTFS route_type 3 = bus, 11 = trolleybus.
    # Copy so the column assignments below write to an independent frame
    # rather than a slice of the loaded one (avoids SettingWithCopyWarning).
    trips = trips[trips['route_type'].isin(['3', '11'])].copy()

    # A missing direction_id is treated as direction 0.
    trips['direction_id'] = trips['direction_id'].fillna(0).astype(int).astype(str)
    trips['route_dir'] = trips[['route_id', 'direction_id']].agg('_'.join, axis=1)

    return trips

In [22]:
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns=['shape_array_key', 'geometry'],
)

# Bus trips for the candidate feeds only, one row per (feed, shape, route_dir).
trips = get_trips_with_route_dir(analysis_date)
trips = trips[trips['schedule_gtfs_dataset_key'].isin(feeds_to_filter)].drop_duplicates(
    subset=['schedule_gtfs_dataset_key', 'shape_array_key', 'route_dir']
)

# Crosswalk between feed_key (used by stops) and schedule_gtfs_dataset_key.
feeds = trips[['feed_key', 'schedule_gtfs_dataset_key']].drop_duplicates()
stops = helpers.import_scheduled_stops(
    analysis_date,
    columns=['feed_key', 'stop_id', 'geometry'],
)

* dissolve shapes by route+dir, pick longest
* alternative to try: dissolve all shapes by route+dir into a single geometry instead of picking the longest
* find non-overlap by previously qualifying pair

In [23]:
# Attach trip attributes to each shape and record the line length before any buffering.
shapes = (
    shapes
    .merge(trips, on='shape_array_key')
    .assign(length=lambda gdf: gdf.geometry.length)
)

In [24]:
# Buffer distance applied to each route shape before overlaying (CRS units; presumably meters -- TODO confirm).
BRANCHING_OVERLAY_BUFFER = 20
# A line buffered on both sides is roughly 2 * buffer wide, so buffered area ~= line length * AREA_MULTIPLIER.
AREA_MULTIPLIER = BRANCHING_OVERLAY_BUFFER * 2

TARGET_METERS_DIFFERENCE = 5000 #  5km per route
# The length target expressed as buffered area, for comparison against overlay areas.
TARGET_AREA_DIFFERENCE = TARGET_METERS_DIFFERENCE * AREA_MULTIPLIER

In [25]:
# Buffer with the same constant that AREA_MULTIPLIER / TARGET_AREA_DIFFERENCE are derived from.
# The previous name, OVERLAY_BUFFER, is never defined in this notebook and raises NameError on a fresh kernel.
shapes.geometry = shapes.buffer(BRANCHING_OVERLAY_BUFFER)
# Area of the buffered polygons (vectorised, no per-row lambda).
shapes = shapes.assign(area = shapes.geometry.area)

In [26]:
# Longest shape length per (feed, route_dir).
max_by_route_dir = (
    shapes
    .groupby(['schedule_gtfs_dataset_key', 'route_dir'])['length']
    .max()
    .reset_index()
)

In [27]:
# Inner-join on the max length to keep only the longest shape per (feed, route_dir);
# drop_duplicates breaks ties between shapes of identical length.
shapes = shapes.merge(
    max_by_route_dir,
    on=['schedule_gtfs_dataset_key', 'route_dir', 'length'],
).drop_duplicates(
    subset=['schedule_gtfs_dataset_key', 'route_dir', 'length'],
)

In [28]:
# shapes.explore(column='length')

In [29]:
feeds_to_filter

array(['0666caf3ec1ecc96b74f4477ee4bc939',
       '3364ec074ca85001da3abd78be2ae521',
       '3c275e5acf8974e1afd765bd3011424c',
       '4b317fc27dde351e12253d46cedd8df0',
       '7cc0cb1871dfd558f11a2885c145d144',
       'c499f905e33929a641f083dad55c521e',
       'ecd018ad66f497fb8f188ed5a71b284b',
       'fb467982dcc77a7f9199bebe709bb700'], dtype='<U32')

In [30]:
# Pick one candidate feed to spot-check the overlap screen; change the index to inspect another feed.
gtfs_dataset_key = feeds_to_filter[0]
gtfs_dataset_key

'0666caf3ec1ecc96b74f4477ee4bc939'

In [31]:
def evaluate_overlaps(gtfs_dataset_key: str, show_map: bool = False) -> list:
    """
    Screen one feed's qualifying route_dir pairs for meaningful non-shared service.

    Reads the module-level qualify_dict, shapes and TARGET_AREA_DIFFERENCE.
    For each unordered pair, the two buffered shapes are overlaid with a
    symmetric difference. The pair passes only if BOTH routes have a
    non-overlapping area above TARGET_AREA_DIFFERENCE.

    gtfs_dataset_key: feed key; the first '__'-delimited token of qualify_dict keys.
    show_map: when True, display an interactive map for every evaluated pair.

    Returns the list of passing pairs, each a two-item list of route_dir strings.
    Pairs with fewer than two shapes available are reported and skipped.
    """
    this_feed_qual = {key.split(gtfs_dataset_key)[1][2:]:qualify_dict[key] for key in qualify_dict.keys() if key.split('__')[0] == gtfs_dataset_key}
    qualify_pairs = [tuple(key.split('__')) for key in this_feed_qual.keys()]

    # (a, b) and (b, a) describe the same pair, so de-duplicate as unordered sets.
    qualify_sets = [set(x) for x in qualify_pairs]
    qualify_sets = set(map(frozenset, qualify_sets))

    unique_qualify_pairs_possible = [list(x) for x in qualify_sets]

    unique_qualify_pairs = []
    for pair in unique_qualify_pairs_possible:
        print(f'{pair}...', end='')
        these_shapes = shapes.query('route_dir.isin(@pair) & schedule_gtfs_dataset_key == @gtfs_dataset_key')
        if len(these_shapes) < 2:
            # Without both shapes there is nothing to overlay.
            print('skipped, fewer than two shapes found for this pair')
            continue
        first_row = these_shapes.iloc[0:1][['schedule_gtfs_dataset_key', 'route_dir', 'shape_array_key', 'geometry']]
        sym_diff = first_row.overlay(these_shapes.iloc[1:2][['route_dir', 'geometry']], how='symmetric_difference')
        sym_diff = sym_diff.assign(area = sym_diff.geometry.area,
                              route_dir = sym_diff.route_dir_1.fillna(sym_diff.route_dir_2))
        area_ratios = sym_diff['area'] / TARGET_AREA_DIFFERENCE
        if len(area_ratios) > 0:
            ratio_text = ' and '.join(f'{ratio:.2f}' for ratio in area_ratios) + ' times area target'
        else:
            ratio_text = 'no non-overlapping area'
        # Require a non-shared remainder for each of the two routes: a route lying
        # entirely inside the other leaves fewer than two rows and must not pass.
        passed = len(sym_diff) == 2 and bool((sym_diff['area'] > TARGET_AREA_DIFFERENCE).all())
        if passed:
            print(f'passed, {ratio_text}')
            if show_map:
                # Build the base map only when it will actually be displayed.
                m = these_shapes.explore(color='gray', tiles='CartoDB Positron')
                display(sym_diff.explore(column='route_dir', m=m, tiles='CartoDB Positron'))
            unique_qualify_pairs += [pair]
        else:
            print(f'failed, {ratio_text}')
            if show_map: display(these_shapes.explore(column='route_dir', tiles='CartoDB Positron'))

    return unique_qualify_pairs

In [32]:
# Spot-check the overlap screen on the single feed selected above; prints one pass/fail line per pair.
unique_qualify_pairs = evaluate_overlaps(gtfs_dataset_key, show_map=False)

['204-13191_1', '754-13191_1']...failed, 0.02 and 0.22 times area target
['720-13191_1', '20-13191_1']...failed, 0.39 and 0.15 times area target
['14-13191_1', '78-13191_1']...passed, 4.80 and 4.71 times area target
['251-13191_0', '60-13191_0']...passed, 5.81 and 6.71 times area target
['165-13191_0', '164-13191_0']...passed, 6.07 and 5.85 times area target
['720-13191_0', '20-13191_0']...failed, 0.03 and 0.28 times area target
['251-13191_1', '60-13191_1']...passed, 6.68 and 6.15 times area target
['40-13191_0', '45-13191_0']...passed, 5.16 and 3.37 times area target
['94-13191_0', '90-13191_0']...passed, 3.86 and 7.11 times area target
['78-13191_1', '76-13191_1']...passed, 4.53 and 4.48 times area target
['210-13191_0', '40-13191_0']...passed, 5.23 and 5.40 times area target
['53-13191_0', '18-13191_1']...passed, 3.58 and 5.06 times area target
['70-13191_0', '78-13191_0']...passed, 4.02 and 4.18 times area target
['224-13191_1', '152-13191_0']...passed, 5.05 and 5.71 times area ta

## Adding stops

In [33]:
max_arrivals_by_stop_single.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
0,0139b1253130b33adcd4b3a4490530d2,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,2,[D1_0],4.0,0.67,1.0,6.0,1
1,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0],6.0,0.67,1.5,8.0,1
2,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,2,[D2_0],3.0,0.67,0.75,5.0,1


In [34]:
def find_stops_this_pair(feed_stops: pd.DataFrame, one_feed_pair: list) -> pd.DataFrame:
    """
    Find the stops served by more than one route_dir of the given pair.

    feed_stops: one feed's stop rows with a list-like (or scalar) route_dir column.
    one_feed_pair: the route_dir values forming the pair.

    Returns one row per (schedule_gtfs_dataset_key, stop_id), where route_dir
    holds the number of matching rows, keeping only rows with a count above 1.
    """
    exploded = feed_stops.explode(column='route_dir')
    in_pair = exploded[exploded['route_dir'].isin(one_feed_pair)]
    counts = (
        in_pair
        .groupby(['schedule_gtfs_dataset_key', 'stop_id'])[['route_dir']]
        .count()
        .reset_index()
    )
    return counts[counts['route_dir'] > 1]

In [35]:
def find_stops_this_feed(gtfs_dataset_key: str,
                         max_arrivals_by_stop_single: pd.DataFrame,
                         unique_qualify_pairs: list) -> pd.DataFrame:
    """
    Collect the shared stops for every passing pair in one feed.

    gtfs_dataset_key: feed to process.
    max_arrivals_by_stop_single: per-stop, per-route_dir aggregation for all feeds.
    unique_qualify_pairs: route_dir pairs that passed the overlap screen.

    Uses the module-level feeds (feed_key crosswalk) and stops tables.
    Returns the stops rows joined to the shared-stop counts, or None when
    unique_qualify_pairs is empty.
    """
    is_this_feed = max_arrivals_by_stop_single['schedule_gtfs_dataset_key'] == gtfs_dataset_key
    feed_stops = max_arrivals_by_stop_single[is_this_feed]

    stop_dfs = [find_stops_this_pair(feed_stops, pair) for pair in unique_qualify_pairs]
    if not stop_dfs:
        # no passing pairs for this feed
        return None

    # stops are keyed by feed_key, so bring it in via the feeds crosswalk first
    shared_stops = pd.concat(stop_dfs).merge(feeds, on = 'schedule_gtfs_dataset_key')
    return stops.merge(shared_stops, on = ['feed_key', 'stop_id'])

In [36]:
# Run the overlap screen feed by feed and gather the stops shared by each passing pair.
hcd_branching_stops = []
for gtfs_dataset_key in feeds_to_filter:
    unique_qualify_pairs = evaluate_overlaps(gtfs_dataset_key, show_map=False)
    this_feed_stops = find_stops_this_feed(
        gtfs_dataset_key,
        max_arrivals_by_stop_single,
        unique_qualify_pairs,
    )
    hcd_branching_stops.append(this_feed_stops)
# Feeds without a passing pair contribute None, which pd.concat drops silently.
hcd_branching_stops = pd.concat(hcd_branching_stops)

['204-13191_1', '754-13191_1']...failed, 0.02 and 0.22 times area target
['720-13191_1', '20-13191_1']...failed, 0.39 and 0.15 times area target
['14-13191_1', '78-13191_1']...passed, 4.80 and 4.71 times area target
['251-13191_0', '60-13191_0']...passed, 5.81 and 6.71 times area target
['165-13191_0', '164-13191_0']...passed, 6.07 and 5.85 times area target
['720-13191_0', '20-13191_0']...failed, 0.03 and 0.28 times area target
['251-13191_1', '60-13191_1']...passed, 6.68 and 6.15 times area target
['40-13191_0', '45-13191_0']...passed, 5.16 and 3.37 times area target
['94-13191_0', '90-13191_0']...passed, 3.86 and 7.11 times area target
['78-13191_1', '76-13191_1']...passed, 4.53 and 4.48 times area target
['210-13191_0', '40-13191_0']...passed, 5.23 and 5.40 times area target
['53-13191_0', '18-13191_1']...passed, 3.58 and 5.06 times area target
['70-13191_0', '78-13191_0']...passed, 4.02 and 4.18 times area target
['224-13191_1', '152-13191_0']...passed, 5.05 and 5.71 times area ta

In [37]:
# hcd_branching_stops.explore()