In [1]:
import os
# 1_000_000_000_000 bytes = 10**12 bytes = 1 TB (decimal).
# NOTE(review): presumably raises the BigQuery bytes-scanned cap used by the calitp
# tooling, and is set before the data helpers are imported — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

In [2]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_dates, gtfs_utils_v2
import folium
import itertools

In [49]:
from update_vars import (analysis_date, AM_PEAK, PM_PEAK, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, AM_PEAK, PM_PEAK, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD, SHARED_STOP_THRESHOLD)

In [31]:
import create_aggregate_stop_frequencies as casf

In [41]:
import importlib
# The module was imported as `casf`; the bare name `create_aggregate_stop_frequencies`
# is never bound in this notebook, so reloading by that name raises NameError on a
# fresh kernel. Reload through the alias instead.
importlib.reload(casf)

<module 'create_aggregate_stop_frequencies' from '/home/jovyan/data-analyses/high_quality_transit_areas/create_aggregate_stop_frequencies.py'>

In [6]:
# Integer hours covered by each peak window; the end hour itself is excluded.
am_peak_hrs, pm_peak_hrs = (
    list(range(peak[0].hour, peak[1].hour)) for peak in (AM_PEAK, PM_PEAK)
)
both_peaks_hrs = [*am_peak_hrs, *pm_peak_hrs]

In [7]:
analysis_date

'2025-05-14'

In [8]:
# Scheduled stop_times for the analysis date.
# NOTE(review): get_pandas=True presumably returns pandas rather than dask — confirm.
stop_times = helpers.import_scheduled_stop_times(analysis_date, get_pandas=True)

In [9]:
# NOTE(review): this overwrites `stop_times` with the output of casf.add_route_dir, so
# re-running the cell feeds already-processed rows back in. Presumably it adds the
# `route_dir` column used further down — confirm against the casf module.
stop_times = casf.add_route_dir(stop_times, analysis_date)

In [10]:
# Direct call; equivalent to stop_times.pipe(casf.prep_stop_times).
st_prepped = casf.prep_stop_times(stop_times)

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


## HCD Y-Branching

* Combine single-route frequencies with the collinearity screen.
* These routes actually need to pass _both_ a collinearity screen and a non-collinearity screen: 8+ stops shared and also 8+ stops not shared, so that there is meaningful service on all 3 legs of the Y.

In [22]:
# Same aggregation run twice on the prepared stop_times: once with
# single_route_dir=True and once with single_route_dir=False.
max_arrivals_by_stop_single = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=True
)
max_arrivals_by_stop_multi = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=False
)

In [23]:
# Both frequency thresholds are passed together as one tuple.
multi_only_explode = casf.get_explode_multiroute_only(
    max_arrivals_by_stop_single,
    max_arrivals_by_stop_multi,
    (HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD),
)

In [24]:
multi_only_explode.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_dir
0,0139b1253130b33adcd4b3a4490530d2,0a2a817a-d35d-434d-b039-6a78d79d7602,91673676-969b-4c0a-a233-158c98c522dc_0
1,0139b1253130b33adcd4b3a4490530d2,0a2a817a-d35d-434d-b039-6a78d79d7602,T6_0
2,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T11x_0


In [27]:
def get_explode_singles(
    single_route_aggregation: pd.DataFrame,
    frequency_thresholds: tuple
) -> pd.DataFrame:
    """
    Return one row per (feed, stop, route_dir) for stops that qualify on their own.

    A row of single_route_aggregation is kept when both am_max_trips_hr and
    pm_max_trips_hr are at least the smallest value in frequency_thresholds.
    The route_dir column is then exploded (one row per list element), and the
    result is sorted by, and restricted to, schedule_gtfs_dataset_key, stop_id
    and route_dir. The original index labels are preserved.
    """
    key_cols = ['schedule_gtfs_dataset_key', 'stop_id', 'route_dir']
    # Referenced by name inside the query string below.
    ms_precursor_threshold = min(frequency_thresholds)
    qualifying = single_route_aggregation.query(
        'am_max_trips_hr >= @ms_precursor_threshold & pm_max_trips_hr >= @ms_precursor_threshold'
    )
    exploded = qualifying.explode('route_dir')
    ordered = exploded.sort_values(key_cols)
    return ordered[key_cols]

In [30]:
# `single_frequencies` is not defined anywhere in this notebook (it only existed as
# leftover kernel state), so Restart & Run All fails on this cell.
# NOTE(review): max_arrivals_by_stop_single (the single_route_dir=True aggregation)
# is assumed to be the intended input — confirm.
# get_explode_singles already explodes route_dir, so no further .explode() is needed.
singles_explode = get_explode_singles(
    max_arrivals_by_stop_single,
    (HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD),
)

In [32]:
from tqdm import tqdm
tqdm.pandas()

In [66]:
# `share_counts` is filled in place: casf.accumulate_share_count receives the dict as a
# keyword argument for every (feed, stop) group. progress_apply is the tqdm-wrapped
# groupby apply registered by tqdm.pandas().
# NOTE(review): judging by the qualify_dict output, keys look like
# '<feed key>__<route_dir>__<route_dir>' and values are shared-stop counts — confirm in casf.
share_counts = {}
singles_explode.groupby(['schedule_gtfs_dataset_key', 'stop_id']).progress_apply(casf.accumulate_share_count, share_counts=share_counts)

In [68]:
# Keep only the share_counts entries whose count reaches SHARED_STOP_THRESHOLD.
qualify_dict = {
    pair_key: shared_count
    for pair_key, shared_count in share_counts.items()
    if shared_count >= SHARED_STOP_THRESHOLD
}

In [141]:
# Feed keys = the text before the first '__' of each qualifying key; np.unique
# de-duplicates and sorts them.
# NOTE(review): this exact assignment is repeated in a later cell; one copy is redundant.
feeds_to_filter = np.unique([key.split('__')[0] for key in qualify_dict.keys()])

In [91]:
qualify_dict

{'0666caf3ec1ecc96b74f4477ee4bc939__20-13191_1__720-13191_1': 16,
 '0666caf3ec1ecc96b74f4477ee4bc939__720-13191_1__20-13191_1': 16,
 '0666caf3ec1ecc96b74f4477ee4bc939__14-13191_1__76-13191_1': 9,
 '0666caf3ec1ecc96b74f4477ee4bc939__14-13191_1__78-13191_1': 9,
 '0666caf3ec1ecc96b74f4477ee4bc939__76-13191_1__14-13191_1': 9,
 '0666caf3ec1ecc96b74f4477ee4bc939__76-13191_1__78-13191_1': 13,
 '0666caf3ec1ecc96b74f4477ee4bc939__78-13191_1__14-13191_1': 9,
 '0666caf3ec1ecc96b74f4477ee4bc939__78-13191_1__76-13191_1': 13,
 '0666caf3ec1ecc96b74f4477ee4bc939__210-13191_1__40-13191_1': 19,
 '0666caf3ec1ecc96b74f4477ee4bc939__40-13191_1__210-13191_1': 19,
 '0666caf3ec1ecc96b74f4477ee4bc939__180-13191_1__217-13191_1': 29,
 '0666caf3ec1ecc96b74f4477ee4bc939__217-13191_1__180-13191_1': 29,
 '0666caf3ec1ecc96b74f4477ee4bc939__204-13191_0__754-13191_0': 17,
 '0666caf3ec1ecc96b74f4477ee4bc939__754-13191_0__204-13191_0': 17,
 '0666caf3ec1ecc96b74f4477ee4bc939__111-13191_1__40-13191_1': 8,
 '0666caf3ec1ecc9

In [69]:
# For one qualifying pair, count how often each stop_id appears among rows for the two
# route_dirs: a count of 1 means the stop is served by only one of them.
# NOTE(review): the filter is on route_dir only, not on schedule_gtfs_dataset_key, so
# identically named route_dirs in another feed would be counted too.
(singles_explode.query('route_dir.isin(["20-13191_1", "720-13191_1"])').stop_id.value_counts() == 1).value_counts()
#  True = number of unshared stops; False = number of shared stops

True     60
False    16
Name: stop_id, dtype: int64

* Local/rapid overlap: there are many unshared stops, but that does not offer riders a meaningful choice, because one route is still a spatial subset of the other.
* A stop-count screen cannot tell this case apart, so a spatial methodology is needed here...

In [143]:
# Unique feed keys: the part of each qualifying key that precedes the first '__'.
feeds_to_filter = np.unique([pair_key.split('__')[0] for pair_key in qualify_dict])

In [144]:
feeds_to_filter

array(['0666caf3ec1ecc96b74f4477ee4bc939',
       '3364ec074ca85001da3abd78be2ae521',
       '3c275e5acf8974e1afd765bd3011424c',
       '4b317fc27dde351e12253d46cedd8df0',
       '7cc0cb1871dfd558f11a2885c145d144',
       'c499f905e33929a641f083dad55c521e',
       'ecd018ad66f497fb8f188ed5a71b284b',
       'fb467982dcc77a7f9199bebe709bb700'], dtype='<U32')

In [83]:
helpers.import_scheduled_shapes?

[0;31mSignature:[0m
[0mhelpers[0m[0;34m.[0m[0mimport_scheduled_shapes[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0manalysis_date[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilters[0m[0;34m:[0m [0mtuple[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0mlist[0m [0;34m=[0m [0;34m[[0m[0;34m'shape_array_key'[0m[0;34m,[0m [0;34m'geometry'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mget_pandas[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcrs[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'EPSG:3310'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mUnion[0m[0;34m[[0m[0mgeopandas[0m[0;34m.[0m[0mgeodataframe[0m[0;34m.[0m[0mGeoDataFrame[0m[0;34m,[0m [0mdask_geopandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mGeoDataFrame[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Import 

In [276]:
## TODO integrate into script

def get_trips_with_route_dir(analysis_date):
    """
    Load scheduled trips for analysis_date and add a `route_dir` column.

    Only trips whose route_type is '3' or '11' are kept (bus and trolleybus in
    the GTFS spec; the original note calls this "bus only"). Missing
    direction_id values are treated as 0, direction_id is converted to a
    string, and route_dir is built as '<route_id>_<direction_id>'.

    Returns a new DataFrame; the frame returned by the helper is not modified.
    """
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["feed_key", "gtfs_dataset_key", "trip_id",
                 "route_id", "direction_id", "route_type",
                 "shape_array_key", "route_short_name", "name"],
        get_pandas=True,
    )
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not trigger SettingWithCopyWarning.
    bus_trips = trips[trips['route_type'].isin(['3', '11'])].copy()  # bus only

    bus_trips['direction_id'] = (
        bus_trips['direction_id'].fillna(0).astype(int).astype(str)
    )
    # Vectorized string concatenation instead of a row-wise agg('_'.join, axis=1).
    bus_trips['route_dir'] = bus_trips['route_id'].str.cat(
        bus_trips['direction_id'], sep='_'
    )

    return bus_trips

In [277]:
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns=['shape_array_key', 'geometry'],
)

# One representative trip per (feed, shape, route_dir), limited to the feeds in feeds_to_filter.
trips = get_trips_with_route_dir(analysis_date)
trips = trips.query("schedule_gtfs_dataset_key.isin(@feeds_to_filter)")
trips = trips.drop_duplicates(subset=['schedule_gtfs_dataset_key', 'shape_array_key', 'route_dir'])

In [278]:
trips.head(3)

Unnamed: 0,feed_key,schedule_gtfs_dataset_key,trip_id,route_id,direction_id,route_type,shape_array_key,route_short_name,name,route_dir
1671,9e8eed3cc17c4fdde369dd07bcb45faa,3c275e5acf8974e1afd765bd3011424c,11273689|25712684:T5|10:36:00,11273689,0,3,bcd4f310183437ac0b20a2eb6426c0d5,,Stanford Schedule,11273689_0
1686,9e8eed3cc17c4fdde369dd07bcb45faa,3c275e5acf8974e1afd765bd3011424c,11273689|25817216:T11|12:14:00,11273689,1,3,78a42d5496baf388fc40fa027f76d3f0,,Stanford Schedule,11273689_1
1700,9e8eed3cc17c4fdde369dd07bcb45faa,3c275e5acf8974e1afd765bd3011424c,14602095|25712661:T4|16:07:00,14602095,1,3,97bea6ce476801b6e456c0e88b3cfba6,RP PM,Stanford Schedule,14602095_1


In [291]:
trips.drop_duplicates(subset=['name'])

Unnamed: 0,feed_key,schedule_gtfs_dataset_key,trip_id,route_id,direction_id,route_type,shape_array_key,route_short_name,name,route_dir
1671,9e8eed3cc17c4fdde369dd07bcb45faa,3c275e5acf8974e1afd765bd3011424c,11273689|25712684:T5|10:36:00,11273689,0,3,bcd4f310183437ac0b20a2eb6426c0d5,,Stanford Schedule,11273689_0
3466,f6774d861953d4f4cdcffec95e2652c7,4b317fc27dde351e12253d46cedd8df0,450090,1,1,3,1e993477fc281851b7ab7c2ee3c87cb7,1,Culver City Schedule,1_1
7348,8510daa6c8576e648fcbd4f92ea73a51,3364ec074ca85001da3abd78be2ae521,17536579,AIR,1,3,bf88d35e133e57ca36ae69c981aca5d9,,San Diego Schedule,AIR_1
12323,35702a19aac0ed4d2a616627483d3850,c499f905e33929a641f083dad55c521e,8735020,10,1,3,43789e566d08d8ee428d834c6c07c529,10,Bay Area 511 AC Transit Schedule,10_1
17323,98a10e85e7dfe3ff1caa761d1bc34606,fb467982dcc77a7f9199bebe709bb700,3749742,Express 101,0,3,c16a64094bac37f9057d868c39cd006f,Express 101,Bay Area 511 Santa Clara Transit Schedule,Express 101_0
19236,a0024a6c86c0039081d5354b929ee347,7cc0cb1871dfd558f11a2885c145d144,11708336_M21,1,0,3,f817420b11a48d033253443b372eac68,1,Bay Area 511 Muni Schedule,1_0
35181,8d9623a1823a27925b7e2f00e44fc5bb,0666caf3ec1ecc96b74f4477ee4bc939,10010007510813-DEC24,10-13191,1,3,f45cb04e61e982437431a7d384e70a9c,10/48,LA Metro Bus Schedule,10-13191_1
91781,cd299184726656597ae2cdb4f4e81e4a,ecd018ad66f497fb8f188ed5a71b284b,12070891,1,0,3,27128e796c0bbcee787e24b5281dfbe8,1,OCTA Schedule,1_0


* Group shapes by route+dir and pick the longest.
* Also try: group shapes by route+dir and dissolve them into a single geometry.
* Then find the non-overlapping portions for each previously qualifying pair.

In [279]:
# Attach trip attributes to each shape, then record the line length in CRS units.
# NOTE(review): `shapes` is overwritten, so re-running this cell merges trips in again.
shapes = (
    shapes
    .merge(trips, on='shape_array_key')
    .assign(length=lambda merged: merged.geometry.length)
)

In [299]:
# Turn each route line into a polygon by buffering it OVERLAY_BUFFER CRS units on each side.
# NOTE(review): OVERLAY_BUFFER is assigned in a later cell of this notebook, so this
# cell fails on Restart & Run All until that config cell is moved above it.
# NOTE(review): not idempotent — re-running buffers the already-buffered polygons again.
shapes.geometry = shapes.buffer(OVERLAY_BUFFER)
# Vectorized GeoSeries.area instead of a per-row lambda.
shapes = shapes.assign(area = shapes.geometry.area)

In [280]:
# Longest shape length per (feed, route_dir); the column is selected by label
# rather than by attribute access.
max_by_route_dir = (
    shapes
    .groupby(['schedule_gtfs_dataset_key', 'route_dir'])['length']
    .max()
    .reset_index()
)

In [281]:
# The inner merge keeps only shapes whose length equals the per-(feed, route_dir) maximum;
# drop_duplicates then keeps a single row when several shapes tie for longest.
shapes = shapes.merge(
    max_by_route_dir,
    on=['schedule_gtfs_dataset_key', 'route_dir', 'length'],
).drop_duplicates(subset=['schedule_gtfs_dataset_key', 'route_dir', 'length'])

In [282]:
# shapes.explore(column='length')

In [292]:
feeds_to_filter

array(['0666caf3ec1ecc96b74f4477ee4bc939',
       '3364ec074ca85001da3abd78be2ae521',
       '3c275e5acf8974e1afd765bd3011424c',
       '4b317fc27dde351e12253d46cedd8df0',
       '7cc0cb1871dfd558f11a2885c145d144',
       'c499f905e33929a641f083dad55c521e',
       'ecd018ad66f497fb8f188ed5a71b284b',
       'fb467982dcc77a7f9199bebe709bb700'], dtype='<U32')

In [355]:
# Pick one feed to inspect; index 7 is the last of the 8 feeds listed in feeds_to_filter.
# NOTE(review): a positional index depends on np.unique's sort order and on which feeds
# qualify on the analysis date — selecting by key value would be more stable.
gtfs_dataset_key = feeds_to_filter[7]
gtfs_dataset_key

'fb467982dcc77a7f9199bebe709bb700'

In [356]:
# Entries of qualify_dict that belong to the selected feed, re-keyed without the
# leading '<feed key>__' so that each key is '<route_dir>__<route_dir>'.
this_feed_qual = {
    full_key.split(gtfs_dataset_key)[1][2:]: shared_count
    for full_key, shared_count in qualify_dict.items()
    if full_key.split('__')[0] == gtfs_dataset_key
}
# Each remaining key split into a (route_dir, route_dir) tuple.
qualify_pairs = [tuple(pair_key.split('__')) for pair_key in this_feed_qual]

In [357]:
# (a, b) and (b, a) collapse to the same frozenset, leaving one entry per unordered pair.
qualify_sets = {frozenset(pair) for pair in qualify_pairs}

In [358]:
# Iteration order of a set of frozensets of strings depends on string hash
# randomization, so it changes between kernel sessions. Sorting both inside each pair
# and across pairs makes the loop order and printed output reproducible.
unique_qualify_pairs = sorted(sorted(pair) for pair in qualify_sets)

In [360]:
# Parameters for the spatial (symmetric-difference) screen.
# NOTE(review): OVERLAY_BUFFER is already used by the earlier shapes.buffer(...) cell;
# this config cell should sit above it so the notebook runs top to bottom.
# NOTE(review): units are those of the shapes CRS — presumably meters (the helper
# defaults to EPSG:3310) — confirm.

# Buffer distance applied on each side of a route line.
OVERLAY_BUFFER = 20
# A line of length L buffered by b on each side covers roughly L * 2b of area
# (ignoring end caps), so multiplying a length by 2b converts it to an area.
AREA_MULTIPLIER = OVERLAY_BUFFER * 2

TARGET_METERS_DIFFERENCE = 5000 #  5km per route
# The same target expressed as buffered area, comparable with overlay areas.
TARGET_AREA_DIFFERENCE = TARGET_METERS_DIFFERENCE * AREA_MULTIPLIER

In [None]:
# Spatial screen: for every candidate pair of route_dirs in the selected feed, overlay the
# two buffered shapes and require that EACH route keeps more than TARGET_AREA_DIFFERENCE
# of area that the other route does not cover.
for pair in unique_qualify_pairs:
    print(f'{pair}...', end='')
    these_shapes = shapes.query('route_dir.isin(@pair) & schedule_gtfs_dataset_key == @gtfs_dataset_key')
    if len(these_shapes) < 2:
        # At least one route_dir has no shape available, so there is nothing to overlay.
        print(f'skipped, only {len(these_shapes)} shape(s) found')
        continue
    first_row = these_shapes.iloc[0:1][['schedule_gtfs_dataset_key', 'route_dir', 'shape_array_key', 'geometry']]
    second_row = these_shapes.iloc[1:2][['route_dir', 'geometry']]
    # Symmetric difference yields at most one row per input shape: the part of that
    # shape lying outside the other. A shape fully inside the other yields no row.
    sym_diff = first_row.overlay(second_row, how='symmetric_difference')
    sym_diff = sym_diff.assign(area = sym_diff.geometry.area,
                               route_dir = sym_diff.route_dir_1.fillna(sym_diff.route_dir_2))
    unshared_areas = sym_diff['area'].tolist()
    ratio_text = ' and '.join(f'{area / TARGET_AREA_DIFFERENCE:.2f}' for area in unshared_areas)
    # Both routes must diverge. Requiring exactly two rows avoids the KeyError that
    # positional lookups raised when one route was a spatial subset of the other, and
    # stops an empty result from passing vacuously.
    both_diverge = (
        len(unshared_areas) == 2
        and all(area > TARGET_AREA_DIFFERENCE for area in unshared_areas)
    )
    if both_diverge:
        print(f'passed, {ratio_text} times area target')
        m = these_shapes.explore(color='gray', tiles='CartoDB Positron')
        display(sym_diff.explore(column='route_dir', m=m, tiles='CartoDB Positron'))
    else:
        if len(unshared_areas) == 2:
            print(f'failed, {ratio_text} times area target')
        else:
            print(f'failed, {len(unshared_areas)} of 2 routes have any area outside the other')
        display(these_shapes.explore(column='route_dir', tiles='CartoDB Positron'))