In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

In [67]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_dates, gtfs_utils_v2
import folium
import itertools

In [3]:
from update_vars import (analysis_date, AM_PEAK, PM_PEAK, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, AM_PEAK, PM_PEAK, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD)

In [4]:
import sjoin_stops_to_segments

In [91]:
import create_aggregate_stop_frequencies

In [120]:
import importlib
importlib.reload(create_aggregate_stop_frequencies)

<module 'create_aggregate_stop_frequencies' from '/home/jovyan/data-analyses/high_quality_transit_areas/create_aggregate_stop_frequencies.py'>

In [121]:
analysis_date

'2024-10-21'

In [122]:
stop_times = helpers.import_scheduled_stop_times(
    analysis_date,
    get_pandas = True,
)

In [123]:
stop_times = create_aggregate_stop_frequencies.add_route_dir(stop_times, analysis_date)

In [105]:
# stop_times >> head(2)

In [124]:
st_prepped = stop_times.pipe(create_aggregate_stop_frequencies.prep_stop_times)

## multi logic

In [127]:
multi_test2 = create_aggregate_stop_frequencies.stop_times_aggregation_max_by_stop(st_prepped, analysis_date, single_route_dir=False)

## single logic

In [129]:
single_test2 = create_aggregate_stop_frequencies.stop_times_aggregation_max_by_stop(st_prepped, analysis_date, single_route_dir=True)

## create count of shared stops between each route_dir

In [126]:
min_freq = min([HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD])

In [146]:
def get_explode_multiroute_only(
    single_route_aggregation: pd.DataFrame,
    multi_route_aggregation: pd.DataFrame,
    min_freqency: int
) -> pd.DataFrame:
    '''
    Narrow the input to the compute-intensive collinearity screen.

    Rows are kept from each aggregation only when both am_max_trips_hr and
    pm_max_trips_hr exceed min_freqency. Rows that pass in the multi-route
    aggregation but have no match (on schedule_gtfs_dataset_key and stop_id)
    among the single-route qualifiers are returned with one row per
    route_dir, sorted by feed, stop and route_dir.

    Prints how many multi-route-only rows were found.
    '''
    feed_stop_cols = ['schedule_gtfs_dataset_key', 'stop_id']

    single_qual = (
        single_route_aggregation
        >> filter(_.am_max_trips_hr > min_freqency, _.pm_max_trips_hr > min_freqency)
    )
    multi_qual = (
        multi_route_aggregation
        >> filter(_.am_max_trips_hr > min_freqency, _.pm_max_trips_hr > min_freqency)
    )
    #  drop anything that already passes on a single route_dir
    multi_only = multi_qual >> anti_join(_, single_qual, on=feed_stop_cols)
    print(f'{multi_only.shape[0]} stops may qualify with multi-route aggregation')

    exploded = multi_only[feed_stop_cols + ['route_dir']].explode('route_dir')
    #  sorting crucial for next step
    return exploded.sort_values(feed_stop_cols + ['route_dir'])

In [147]:
multi_only_explode = get_explode_multiroute_only(single_test2, multi_test2, min([HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD]))

4560 stops may qualify with multi-route aggregation


In [148]:
multi_only_explode

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_dir
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,0e85fd4c-5258-4256-9852-4a96554aadb7_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T11x_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T1_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T2_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T3_0
...,...,...,...
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,3_0
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,4_1
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_0
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_1


In [205]:
def accumulate_share_count(route_dir_exploded: pd.DataFrame, counts: dict = None):
    '''
    For use via pd.DataFrame.groupby.apply, one call per (feed, stop) group.
    Accumulate the number of times each route_dir shares stops with
    each other route_dir in a dictionary.

    Keys are '<schedule_gtfs_dataset_key>__<route_dir>__<other_dir>' and both
    orderings of a pair are counted. Two route_dirs of the same route are
    never paired, which avoids edge cases like AC Transit 45.

    route_dir_exploded: rows for one stop, with columns
        schedule_gtfs_dataset_key and route_dir.
    counts: dictionary to update in place. When omitted, the notebook-level
        share_counts global is used and must already exist.

    Returns None; the result is the mutation of the dictionary.
    '''
    global share_counts
    target = share_counts if counts is None else counts

    rt_dir = route_dir_exploded.route_dir.to_numpy()
    schedule_gtfs_dataset_key = route_dir_exploded.schedule_gtfs_dataset_key.iloc[0]
    #  route_dir values look like '<route_id>_<direction>' (e.g. 'T11x_0'); split on
    #  the LAST underscore so a route id containing underscores is not truncated
    #  and mistaken for a different route with the same prefix
    routes = [route_dir.rsplit('_', 1)[0] for route_dir in rt_dir]
    for route_dir, route in zip(rt_dir, routes):
        for other_dir, other_route in zip(rt_dir, routes):
            #  same route covers both "same route_dir" and "opposite direction"
            if other_route == route:
                continue
            key = schedule_gtfs_dataset_key+'__'+route_dir+'__'+other_dir
            target[key] = target.get(key, 0) + 1

In [206]:
#  accumulate_share_count adds to this global dict, so reset it in the same cell;
#  otherwise re-running the cell would add to the counts from the previous run
share_counts = {}
#  one call per (feed, stop) group; the useful result is the content of share_counts
multi_only_explode.groupby(['schedule_gtfs_dataset_key', 'stop_id']).apply(accumulate_share_count)

In [207]:
# share_counts

### Which threshold?

* 8 catches Muni 48 and 66, which are somewhat marginal but not an edge case per se

In [208]:
SHARED_STOP_THRESHOLD = 8
#  keep only the route_dir pairs that share at least SHARED_STOP_THRESHOLD stops
qualify = {
    pair_key: n_shared
    for pair_key, n_shared in share_counts.items()
    if n_shared >= SHARED_STOP_THRESHOLD
}

In [209]:
# qualify

## lookup function/filtering steps

* 

edge cases:

[AC Transit 45](https://www.actransit.org/sites/default/files/timetable_files/45-2023_12_03.pdf) _Opposite directions share a same-direction loop._ __Solved__ by preventing the same route from being compared with itself in the opposite direction.

[SDMTS 944/945](https://www.sdmts.com/sites/default/files/routes/pdf/944.pdf) _Shared frequent stops are few, and these routes are isolated._ __Solved__ by once again applying the `SHARED_STOP_THRESHOLD` after aggregation (by ensuring at least one route_dir at each stop has >= `SHARED_STOP_THRESHOLD` frequent stops). This is a complex typology that includes a loop route: each pair of [944, 945, 945A(946)] has >= threshold shared stops... but not actually in the same spots!

In [216]:
feeds_to_filter = np.unique([key.split('__')[0] for key in qualify.keys()])

In [217]:
feeds_no_qualify = np.unique([key.split('__')[0] for key in share_counts.keys() if key.split('__')[0] not in feeds_to_filter])

In [218]:
from calitp_data_analysis.tables import tbls

In [219]:
feeds_no_qualify = tbls.mart_transit_database.dim_gtfs_service_data() >> filter(_.gtfs_dataset_key.isin(feeds_no_qualify)) >> distinct(_.name, _.gtfs_dataset_key) >> collect()

In [225]:
#  feed_names is used by the next cell, so it has to be defined on a fresh kernel
feed_names = (
    tbls.mart_transit_database.dim_gtfs_service_data()
    >> filter(_.gtfs_dataset_key.isin(feeds_to_filter))
    >> distinct(_.name, _.gtfs_dataset_key)
    >> collect()
)

In [226]:
feed_names_filtered = feed_names >> filter(_.name.str.contains('Muni'))
display(feed_names_filtered)
dataset_key = feed_names_filtered.gtfs_dataset_key.iloc[0]

Unnamed: 0,name,gtfs_dataset_key
0,Muni Metro Rail – Bay Area 511 Muni Schedule,7cc0cb1871dfd558f11a2885c145d144
14,Muni Bus – Bay Area 511 Muni Schedule,7cc0cb1871dfd558f11a2885c145d144


In [227]:
# dataset_key = '015d67d5b75b5cf2b710bbadadfb75f5' #  Marin
# dataset_key = '3c62ad6ee589d56eca915ce291a5df0a' #  Yolobus 42A and 42B share 5+ stops so they match, which isn't desirable.
# dataset_key = '70c8a8b71c815224299523bf2115924a' #  SacRT
# dataset_key = '63029a23cb0e73f2a5d98a345c5e2e40' #  Elk Grove
# dataset_key = 'f1b35a50955aeb498533c1c6fdafbe44' #  LBT

In [228]:
#  qualify keys are '<dataset_key>__<route_dir>__<other_dir>'; split once on the
#  separator rather than on the dataset_key text, then drop the feed prefix
this_feed_qual = {
    key.split('__', 1)[1]: n_shared
    for key, n_shared in qualify.items()
    if key.split('__', 1)[0] == dataset_key
}

In [229]:
this_feed_qual

{'18_1__31_1': 8,
 '31_1__18_1': 8,
 '9R_1__9_1': 18,
 '9_1__9R_1': 18,
 '5R_0__5_0': 17,
 '5R_1__5_1': 15,
 '5_0__5R_0': 17,
 '5_1__5R_1': 15,
 'PH_0__PM_0': 11,
 'PM_0__PH_0': 11,
 '48_1__66_1': 8,
 '66_1__48_1': 8}

In [230]:
qualify_pairs = [tuple(key.split('__')) for key in this_feed_qual.keys()]

In [231]:
qualify_pairs

[('18_1', '31_1'),
 ('31_1', '18_1'),
 ('9R_1', '9_1'),
 ('9_1', '9R_1'),
 ('5R_0', '5_0'),
 ('5R_1', '5_1'),
 ('5_0', '5R_0'),
 ('5_1', '5R_1'),
 ('PH_0', 'PM_0'),
 ('PM_0', 'PH_0'),
 ('48_1', '66_1'),
 ('66_1', '48_1')]

In [232]:
#  flatten every (route_dir, other_dir) pair into one 1-D array of route_dirs;
#  a single pass also works when qualify_pairs is empty
arr = np.array([route_dir for pair in qualify_pairs for route_dir in pair], dtype=str)

In [233]:
any_appearance = np.unique(arr)

In [234]:
any_appearance

array(['18_1', '31_1', '48_1', '5R_0', '5R_1', '5_0', '5_1', '66_1',
       '9R_1', '9_1', 'PH_0', 'PM_0'], dtype='<U4')

In [235]:
#  only need to check stops that qualify as multi-route only
#  (multi_only is local to get_explode_multiroute_only; its exploded return value,
#  multi_only_explode, carries the same feed/stop pairs)
stops_to_eval = multi_only_explode >> filter(_.schedule_gtfs_dataset_key == dataset_key) >> distinct(_.stop_id)
st_to_eval = st_prepped >> filter(_.schedule_gtfs_dataset_key == dataset_key,
                                  _.stop_id.isin(stops_to_eval.stop_id),
                                  _.route_dir.isin(any_appearance)
                                 )

In [236]:
#  cut down problem space by checking if stops still could qual after filtering for any appearance
#  NOTE(review): am_peak_hrs and pm_peak_hrs are not defined or imported in the
#  visible cells -- presumably sequences of peak hours; confirm where they come from
min_rows = min_freq * (len(am_peak_hrs) + len(pm_peak_hrs))

In [237]:
st_could_qual = (st_to_eval >> group_by(_.stop_id)
 >> mutate(could_qualify = _.shape[0] >= min_rows)
 >> ungroup()
 >> filter(_.could_qualify)
)

In [238]:
# one_stop = st_could_qual >> filter(_.stop_id == '23585') #  Yolobus 23017 knocked out in last step
# one_stop = st_could_qual >> filter(_.stop_id == '1677') #  PCH/Redondo EB

In [239]:
def check_stop(this_stop_route_dirs, qualify_pairs, verbose=True):
    '''
    Find the longest leading run of route_dirs at a stop in which every
    pair of route_dirs appears in qualify_pairs.

    Starting from the full sequence, the last route_dir is dropped until
    all remaining 2-combinations qualify. Only prefixes are tried, so the
    caller's ordering decides which route_dirs are given up first.

    this_stop_route_dirs: iterable of route_dir values; it is copied, not mutated.
    qualify_pairs: iterable of (route_dir, other_dir) pairs that may be
        aggregated together. Pairs are looked up in the order produced by
        itertools.combinations, so both orderings should be present.
    verbose: print one progress line per attempt (the default, as before);
        pass False when applying over many stops.

    Returns the qualifying list of route_dirs, or [] when fewer than two
    route_dirs remain.
    '''
    remaining = list(this_stop_route_dirs)
    #  built once per call instead of scanning the pair list for every combination
    allowed = {tuple(pair) for pair in qualify_pairs}
    while len(remaining) > 1:
        if verbose:
            print(f'attempting {remaining}... ', end='')
        if all(pair in allowed for pair in itertools.combinations(remaining, 2)):
            if verbose:
                print('matched!')
            return remaining
        if verbose:
            print('subsetting...')
        remaining.pop()
    #  also reached for empty input, which used to report a misleading 'matched!'
    if verbose:
        print('exhausted!')
    return []

In [240]:
check_stop(['no', 'nyet', 'bazz', 'fizz', 'buzz'], qualify_pairs)

attempting ['no', 'nyet', 'bazz', 'fizz', 'buzz']... subsetting...
attempting ['no', 'nyet', 'bazz', 'fizz']... subsetting...
attempting ['no', 'nyet', 'bazz']... subsetting...
attempting ['no', 'nyet']... subsetting...
exhausted!


[]

In [241]:
def filter_qualifying_stops(one_stop_df):
    '''
    For use via groupby('stop_id').apply on stop_times rows.

    Adds route_dir_count (rows per route_dir at this stop), sorts by it in
    descending order, asks check_stop which route_dirs may be aggregated
    together (using the notebook-level qualify_pairs) and returns only the
    rows belonging to those route_dirs.
    '''
    with_counts = (
        one_stop_df
        >> group_by(_.route_dir)
        >> mutate(route_dir_count = _.shape[0])
        >> ungroup()
    )
    one_stop_df = with_counts >> arrange(-_.route_dir_count)

    #  distinct preserves sort order
    ranked = one_stop_df >> distinct(_.route_dir, _.route_dir_count)
    this_stop_route_dirs = ranked.route_dir.to_numpy()

    aggregation_ok_route_dirs = check_stop(this_stop_route_dirs, qualify_pairs)
    return one_stop_df >> filter(_.route_dir.isin(aggregation_ok_route_dirs))

In [242]:
df2 = st_could_qual.groupby('stop_id').apply(filter_qualifying_stops)

attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['9_1', '9R_1']... matched!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['48_1', '18_1']... subsetting...
exhausted!
attempting ['31_1', '18_1']... matched!
attempting ['5R_1', '5R_0', '31_1', '5_0', '5_1']... subsetting...
attempting ['5R_1', '5R_0', '31_1', '5_0']... subsetting...
attempting ['5R_1', '5R_0', '31_1']... subsetting...
attempting ['5R_1', '5R_0']... subsetting...
exhausted!
exhausted!
attempting ['5R_0', '5_0']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_0', '5_0']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_0', '5_0']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_1

In [243]:
df2 = df2.reset_index(drop=True)

In [244]:
df2.stop_id.unique()

array(['13059', '13062', '13064', '13066', '13244', '13570', '13572',
       '13925', '14211', '14212', '14213', '14215', '14218', '14221',
       '14222', '14228', '14231', '14232', '14732', '14734', '14736',
       '14737', '14742', '14746', '14747', '14748', '14749', '14751',
       '14753', '14809', '14842', '14843', '14895', '15658', '15685',
       '16047', '16049', '16051', '16058', '16059', '16063', '16068',
       '16069', '16072', '16073', '16075', '16139', '16141', '16143',
       '16145', '16147', '16149', '16151', '16153', '16339', '16344',
       '16383', '16385', '16386', '16388', '16565', '16568', '16570',
       '16575', '16584', '17204', '17264', '17297', '17720', '17721',
       '17727', '17728', '17729', '17730'], dtype=object)

In [245]:
# df2

In [246]:
#  NOTE(review): multi_cols is not defined or imported in the visible cells --
#  confirm where it comes from before relying on Restart & Run All
trips_per_peak_multi = gtfs_schedule_wrangling.stop_arrivals_per_stop(
    df2,
    group_cols = multi_cols,
    count_col = "trip_id",
    route_dir_array = True
).rename(columns = {"n_arrivals": "n_trips"})

In [247]:
#  NOTE(review): last_bit is not defined or imported in the visible cells -- confirm its source
#  keep rows whose AM and PM hourly maxima both exceed min_freq
df2_final = last_bit(trips_per_peak_multi) >> filter(_.am_max_trips_hr > min_freq, _.pm_max_trips_hr > min_freq)

In [248]:
df2 >> count(_.stop_id)

Unnamed: 0,stop_id,n
0,13059,41
1,13062,41
2,13064,41
3,13066,41
4,13244,56
...,...,...
69,17721,35
70,17727,35
71,17728,36
72,17729,35


In [249]:
df2_final >> count(_.stop_id)

Unnamed: 0,stop_id,n
0,13059,1
1,13062,1
2,13064,1
3,13066,1
4,13244,1
...,...,...
69,17721,1
70,17727,1
71,17728,1
72,17729,1


In [250]:
short_routes = df2_final.explode('route_dir') >> count(_.route_dir) >> filter(_.n < SHARED_STOP_THRESHOLD)

In [251]:
#  build the lookup once instead of re-listing short_routes for every element of every row
short_route_dirs = set(short_routes.route_dir)
#  True when every route_dir serving the stop is a short route
df2_final['all_short'] = df2_final.route_dir.map(lambda x: all(y in short_route_dirs for y in x))

In [252]:
df2_final >> filter(_.all_short) #  stops where _every_ shared route has less than SHARED_STOP_THRESHOLD frequent stops (even after aggregation)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count,all_short


In [253]:
#  drop stops where every route is short; ~ is the boolean NOT (unary minus is arithmetic negation)
df2_final = df2_final >> filter(~_.all_short)

## Map single result

In [210]:
stops = helpers.import_scheduled_stops(
    analysis_date,
    get_pandas = True,
    crs = PROJECT_CRS
)

In [254]:
gdf = stops >> inner_join(_, df2, on=['stop_id', 'feed_key']) >> inner_join(_, df2_final, on = ['schedule_gtfs_dataset_key', 'stop_id']) >> distinct(_.stop_id, _.geometry)

In [259]:
gdf.explore()

In [256]:
df2_final >> filter(_.stop_id == '40010')

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count,all_short


## Map overall results

In [211]:
#  NOTE(review): multi_only is only assigned inside get_explode_multiroute_only, so it is
#  undefined at notebook scope on a fresh kernel. The join is on stop_id alone --
#  confirm it cannot match stops from a different feed.
gdf = stops >> inner_join(_, multi_only, on = ['stop_id']) >> select(_.stop_id, _.geometry)

In [212]:
#  NOTE(review): single_qual is only assigned inside get_explode_multiroute_only, so it is
#  undefined at notebook scope on a fresh kernel. The join is on stop_id alone --
#  confirm it cannot match stops from a different feed.
gdf2 = stops >> inner_join(_, single_qual, on = ['stop_id']) >> select(_.stop_id, _.geometry)

In [213]:
gdf2.geometry = gdf2.buffer(400)

In [214]:
gdf = gdf.overlay(gdf2, how='difference')

In [215]:
# gdf.explore()