In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

In [2]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_dates, gtfs_utils_v2
import folium

In [3]:
from update_vars import (analysis_date, AM_PEAK, PM_PEAK, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, AM_PEAK, PM_PEAK, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD)

In [4]:
import sjoin_stops_to_segments

In [5]:
analysis_date

'2024-10-21'

In [6]:
imported_st = helpers.import_scheduled_stop_times(
    analysis_date,
    get_pandas = True,
)

In [7]:
import importlib

importlib.reload(sjoin_stops_to_segments)

<module 'sjoin_stops_to_segments' from '/home/jovyan/data-analyses/high_quality_transit_areas/sjoin_stops_to_segments.py'>

In [8]:
# # (1) Aggregate stop times - by stop_id, find max trips in AM/PM peak
# # takes 1 min
# max_arrivals_by_stop = imported_st.pipe(sjoin_stops_to_segments.prep_stop_times).pipe(sjoin_stops_to_segments.stop_times_aggregation_max_by_stop, analysis_date)

In [9]:
# # (1) Aggregate stop times - by stop_id, find max trips in AM/PM peak
# # takes 1 min
# max_arrivals_by_stop_single = imported_st.pipe(
#     sjoin_stops_to_segments.stop_times_aggregation_max_by_stop, analysis_date, single_route_dir=True)

## multi logic

In [271]:
trips = helpers.import_scheduled_trips(
    analysis_date,
    columns = ["feed_key", "gtfs_dataset_key", "trip_id",
               "route_id", "direction_id"],
    get_pandas = True
)

trips = imported_st.merge(
    trips,
    on = ["feed_key", "trip_id"]
)

In [272]:
trips.direction_id = trips.direction_id.fillna(0).astype(int).astype(str)
trips['route_dir'] = trips[['route_id', 'direction_id']].agg('_'.join, axis=1)

In [273]:
st_prepped = trips.pipe(sjoin_stops_to_segments.prep_stop_times)

In [603]:
multi_cols = ["schedule_gtfs_dataset_key", "stop_id", "peak"]
# cols = ["schedule_gtfs_dataset_key", "stop_id", "peak",
#        "route_id", "direction_id"]

In [275]:
trips_per_peak_multi = gtfs_schedule_wrangling.stop_arrivals_per_stop(
    st_prepped,
    group_cols = multi_cols,
    count_col = "trip_id",
    route_dir_array = True
).rename(columns = {"n_arrivals": "n_trips"})

In [276]:
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
trips_per_hour_cols = ["peak"]

In [277]:
def last_bit(trips_per_peak_period):

    am_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'am_peak']
                .rename(columns = {"n_trips": "am_max_trips"})
                .drop(columns="peak")
               )
    pm_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'pm_peak']
                .rename(columns = {"n_trips": "pm_max_trips"})
                .drop(columns=["peak", "route_dir"])
               )

    max_trips_by_stop = pd.merge(
        am_trips, 
        pm_trips,
        on = stop_cols,
        how = "left"
    )
    #  divide by length of peak to get trips/hr, keep n_trips a raw sum
    max_trips_by_stop = max_trips_by_stop.assign(
        am_max_trips_hr = (max_trips_by_stop.am_max_trips.fillna(0) / len(am_peak_hrs)).astype(int),
        pm_max_trips_hr = (max_trips_by_stop.pm_max_trips.fillna(0) / len(pm_peak_hrs)).astype(int),
        n_trips = (max_trips_by_stop.am_max_trips.fillna(0) + 
                   max_trips_by_stop.pm_max_trips.fillna(0)),
        route_dir_count = max_trips_by_stop.route_dir.map(lambda x: x.size)
    )
    
    return max_trips_by_stop

In [278]:
am_peak_hrs = list(range(AM_PEAK[0].hour, AM_PEAK[1].hour))
pm_peak_hrs = list(range(PM_PEAK[0].hour, PM_PEAK[1].hour))

In [279]:
multi_qual = last_bit(trips_per_peak_multi)

In [280]:
min_freq = min([HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD])

In [282]:
multi_qual = multi_qual >> filter(_.am_max_trips_hr > min_freq, _.pm_max_trips_hr > min_freq, _.route_dir_count > 1)

In [283]:
multi_qual

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,39,"[0e85fd4c-5258-4256-9852-4a96554aadb7_0, T11x_...",58.0,13,14,97.0,9
179,0139b1253130b33adcd4b3a4490530d2,98d2a60c-86b1-45d6-b5d6-39b273c9eb46,16,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0, 79b4a...",24.0,5,6,40.0,5
180,0139b1253130b33adcd4b3a4490530d2,999ff07b-4a27-4c80-9a1c-e868038ce097,24,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0, D1_0,...",44.0,8,11,68.0,7
296,015d67d5b75b5cf2b710bbadadfb75f5,40103,12,"[17_0, 17_1, 61_0, 61_1]",17.0,4,4,29.0,4
297,015d67d5b75b5cf2b710bbadadfb75f5,40113,30,"[17_0, 22_0, 22_1, 36_0, 36_1, 61_1, 71_0, 71_1]",48.0,10,12,78.0,8
...,...,...,...,...,...,...,...,...,...
79047,fb746afc72ff40405cfefa6d23ab58a0,53129,28,"[A_0, B_0, E_0, M_0]",42.0,9,10,70.0,4
79049,fb746afc72ff40405cfefa6d23ab58a0,53163,20,"[5_AM_0, A_0, B_0, C_0, E_0, F_0]",27.0,6,6,47.0,6
79093,fc6cd27871cce0092a08ccf68fb240a2,132744,28,"[4556_0, 4557_0, 4558_0, 4559_0]",44.0,9,11,72.0,4
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,13,"[11_0, 12_0, 14_0, 1_0, 2_0, 3_0, 4_1]",22.0,4,5,35.0,7


## single logic

In [284]:
# cols = ["schedule_gtfs_dataset_key", "stop_id", "peak"]
cols = ["schedule_gtfs_dataset_key", "stop_id", "peak",
       "route_id", "direction_id"]

In [285]:
trips_per_peak_single = gtfs_schedule_wrangling.stop_arrivals_per_stop(
    st_prepped,
    group_cols = cols,
    count_col = "trip_id",
    route_dir_array = True
).rename(columns = {"n_arrivals": "n_trips"})

In [286]:
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
trips_per_hour_cols = ["peak"]

In [287]:
def last_bit(trips_per_peak_period):

    am_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'am_peak']
                .rename(columns = {"n_trips": "am_max_trips"})
                .drop(columns="peak")
               )
    pm_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'pm_peak']
                .rename(columns = {"n_trips": "pm_max_trips"})
                .drop(columns=["peak", "route_dir"])
               )

    max_trips_by_stop = pd.merge(
        am_trips, 
        pm_trips,
        on = stop_cols,
        how = "left"
    )
    #  divide by length of peak to get trips/hr, keep n_trips a raw sum
    max_trips_by_stop = max_trips_by_stop.assign(
        am_max_trips_hr = (max_trips_by_stop.am_max_trips.fillna(0) / len(am_peak_hrs)).astype(int),
        pm_max_trips_hr = (max_trips_by_stop.pm_max_trips.fillna(0) / len(pm_peak_hrs)).astype(int),
        n_trips = (max_trips_by_stop.am_max_trips.fillna(0) + 
                   max_trips_by_stop.pm_max_trips.fillna(0)),
        route_dir_count = max_trips_by_stop.route_dir.map(lambda x: x.size)
    )
    
    return max_trips_by_stop

In [288]:
df_single = last_bit(trips_per_peak_single)

In [289]:
df_single

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_id_x,direction_id_x,am_max_trips,route_dir,route_id_y,direction_id_y,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
0,0139b1253130b33adcd4b3a4490530d2,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,D1,0,2,[D1_0],D1,0,4.0,0,1,6.0,1
1,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0],0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0,6.0,0,1,8.0,1
2,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0],D2,0,3.0,0,0,5.0,1
3,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0],c6726149-9979-4ebb-85f6-0be90402266c,0,4.0,0,1,6.0,1
4,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,D2,0,2,[D2_0],0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0,6.0,0,1,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200142,ff1bc5dde661d62c877165421e9ca257,exp_20,ROUTEA,1,5,[ROUTEA_1],ROUTEA,1,8.0,1,2,13.0,1
200143,ff1bc5dde661d62c877165421e9ca257,exp_21,ROUTEA,1,5,[ROUTEA_1],ROUTEA,1,8.0,1,2,13.0,1
200144,ff1bc5dde661d62c877165421e9ca257,exp_22,ROUTEA,1,5,[ROUTEA_1],ROUTEA,1,8.0,1,2,13.0,1
200145,ff1bc5dde661d62c877165421e9ca257,exp_23,ROUTEA,1,5,[ROUTEA_1],ROUTEA,1,8.0,1,2,13.0,1


In [290]:
single_qual = df_single >> filter(_.am_max_trips_hr > min_freq, _.pm_max_trips_hr > min_freq)

## create count of shared stops between each route_dir

In [291]:
multi_only = multi_qual >> anti_join(_, single_qual, on=['schedule_gtfs_dataset_key', 'stop_id'])

In [292]:
# multi_only

In [908]:
multi_only_explode = (multi_only[['schedule_gtfs_dataset_key', 'stop_id', 'route_dir']]
.explode('route_dir')
.sort_values(['schedule_gtfs_dataset_key','stop_id', 'route_dir']))

In [909]:
multi_only_explode

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_dir
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,0e85fd4c-5258-4256-9852-4a96554aadb7_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T11x_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T1_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T2_0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T3_0
...,...,...,...
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,3_0
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,4_1
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_0
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_1


In [910]:
xy = multi_only_explode.loc[96,:].route_dir.to_numpy()

In [911]:
xy

array(['0e85fd4c-5258-4256-9852-4a96554aadb7_0', 'T11x_0', 'T1_0', 'T2_0',
       'T3_0', 'T4_0', 'T5_0', 'T6_0',
       'e430d571-76bd-45d4-8b01-76e3ef7c3ae1_0'], dtype=object)

In [902]:
def test_share_count(df):
    global share_counts
    xy = df.route_dir.to_numpy()
    schedule_gtfs_dataset_key = df.schedule_gtfs_dataset_key.iloc[0]
    for route_dir in xy:
        route = route_dir.split('_')[0] #  don't compare opposite dirs of same route, leads to edge cases like AC Transit 45
        other_dirs = [x for x in xy if x != route_dir and x.split('_')[0] != route]
        for other_dir in other_dirs:
            key = schedule_gtfs_dataset_key+'__'+route_dir+'__'+other_dir
            if key in share_counts.keys():
                share_counts[key] += 1
            else:
                share_counts[key] = 1

In [912]:
share_counts = {}

In [913]:
multi_only_explode.groupby(['schedule_gtfs_dataset_key', 'stop_id']).apply(test_share_count)

In [980]:
# share_counts

### Which threshold?

* 8 catches Muni 48 and 66, which are somewhat marginal but not an edge case per se

In [1173]:
SHARED_STOP_THRESHOLD = 8
qualify = {key: share_counts[key] for key in share_counts.keys() if share_counts[key] >= SHARED_STOP_THRESHOLD}

In [1174]:
# qualify

## explore potential stops (not yet filtered)

In [1175]:
stops = helpers.import_scheduled_stops(
    analysis_date,
    get_pandas = True,
    crs = PROJECT_CRS
)

In [305]:
gdf = stops >> inner_join(_, multi_only, on = ['stop_id']) >> select(_.stop_id, _.geometry)

In [306]:
gdf2 = stops >> inner_join(_, single_qual, on = ['stop_id']) >> select(_.stop_id, _.geometry)

In [307]:
gdf2.geometry = gdf2.buffer(400)

In [308]:
gdf = gdf.overlay(gdf2, how='difference')

In [309]:
# gdf.explore()

## lookup function/filtering steps

* 

edge cases:

[AC Transit 45](https://www.actransit.org/sites/default/files/timetable_files/45-2023_12_03.pdf) _Opposite directions share a same-direction loop._ __Solved__ by preventing the same route from being compared with itself in the opposite direction.

[SDMTS 944/945](https://www.sdmts.com/sites/default/files/routes/pdf/944.pdf) _Shared frequent stops are few, and these routes are isolated._ __Solved__ by once again applying the `SHARED_STOP_THRESHOLD` after aggregation (by ensuring at least one route_dir at each stop has >= `SHARED_STOP_THRESHOLD` frequent stops). Complex typology including a loop route, each pair of [944, 945, 945A(946)] has >= threshold... but not actually in the same spots!

In [1129]:
feeds_to_filter = np.unique([key.split('__')[0] for key in qualify.keys()])

In [1130]:
feeds_no_qualify = np.unique([key.split('__')[0] for key in share_counts.keys() if key.split('__')[0] not in feeds_to_filter])

In [1131]:
from calitp_data_analysis.tables import tbls

In [1132]:
feeds_no_qualify = tbls.mart_transit_database.dim_gtfs_service_data() >> filter(_.gtfs_dataset_key.isin(feeds_no_qualify)) >> distinct(_.name, _.gtfs_dataset_key) >> collect()

In [1133]:
feed_names = (tbls.mart_transit_database.dim_gtfs_service_data() >> filter(_.gtfs_dataset_key.isin(feeds_to_filter))
 >> distinct(_.name, _.gtfs_dataset_key)
 >> collect()
)

In [1176]:
feed_names_filtered = feed_names >> filter(_.name.str.contains('Muni'))
display(feed_names_filtered)
dataset_key = feed_names_filtered.gtfs_dataset_key.iloc[0]

Unnamed: 0,name,gtfs_dataset_key
0,Muni Metro Rail – Bay Area 511 Muni Schedule,7cc0cb1871dfd558f11a2885c145d144
14,Muni Bus – Bay Area 511 Muni Schedule,7cc0cb1871dfd558f11a2885c145d144


In [1177]:
# dataset_key = '015d67d5b75b5cf2b710bbadadfb75f5' #  Marin
# dataset_key = '3c62ad6ee589d56eca915ce291a5df0a' #  Yolobus 42A and 42B share 5+ stops so they match, which isn't desirable.
# dataset_key = '70c8a8b71c815224299523bf2115924a' #  SacRT
# dataset_key = '63029a23cb0e73f2a5d98a345c5e2e40' #  Elk Grove
# dataset_key = 'f1b35a50955aeb498533c1c6fdafbe44' #  LBT

In [1178]:
this_feed_qual = {key.split(dataset_key)[1][2:]:qualify[key] for key in qualify.keys() if key.split('__')[0] == dataset_key}

In [1179]:
this_feed_qual

{'18_1__31_1': 8,
 '31_1__18_1': 8,
 '9R_1__9_1': 18,
 '9_1__9R_1': 18,
 '5R_0__5_0': 17,
 '5R_1__5_1': 15,
 '5_0__5R_0': 17,
 '5_1__5R_1': 15,
 'PH_0__PM_0': 11,
 'PM_0__PH_0': 11,
 '48_1__66_1': 8,
 '66_1__48_1': 8}

In [1180]:
qualify_pairs = [tuple(key.split('__')) for key in this_feed_qual.keys()]

In [1181]:
qualify_pairs

[('18_1', '31_1'),
 ('31_1', '18_1'),
 ('9R_1', '9_1'),
 ('9_1', '9R_1'),
 ('5R_0', '5_0'),
 ('5R_1', '5_1'),
 ('5_0', '5R_0'),
 ('5_1', '5R_1'),
 ('PH_0', 'PM_0'),
 ('PM_0', 'PH_0'),
 ('48_1', '66_1'),
 ('66_1', '48_1')]

In [1182]:
arr = np.array(qualify_pairs[0])
for pair in qualify_pairs[1:]: arr = np.append(arr, np.array(pair))

In [1183]:
any_appearance = np.unique(arr)

In [1184]:
any_appearance

array(['18_1', '31_1', '48_1', '5R_0', '5R_1', '5_0', '5_1', '66_1',
       '9R_1', '9_1', 'PH_0', 'PM_0'], dtype='<U4')

In [1185]:
#  only need to check stops that qualify as multi-route only
stops_to_eval = multi_only >> filter(_.schedule_gtfs_dataset_key == dataset_key) >> distinct(_.stop_id)
st_to_eval = st_prepped >> filter(_.schedule_gtfs_dataset_key == dataset_key,
                                  _.stop_id.isin(stops_to_eval.stop_id),
                                  _.route_dir.isin(any_appearance)
                                 )

In [1186]:
#  cut down problem space by checking if stops still could qual after filtering for any appearance
min_rows = min_freq * (len(am_peak_hrs) + len(pm_peak_hrs))

In [1187]:
st_could_qual = (st_to_eval >> group_by(_.stop_id)
 >> mutate(could_qualify = _.shape[0] >= min_rows)
 >> ungroup()
 >> filter(_.could_qualify)
)

In [1188]:
# one_stop = st_could_qual >> filter(_.stop_id == '23585') #  Yolobus 23017 knocked out in last step
# one_stop = st_could_qual >> filter(_.stop_id == '1677') #  PCH/Redondo EB

In [1189]:
def check_stop(this_stop_route_dirs, qualify_pairs):
    #  check if all possible combinations included
    this_stop_route_dirs = list(this_stop_route_dirs)
    if len(this_stop_route_dirs) == 1:
        print('exhausted!')
        return []
    print(f'attempting {this_stop_route_dirs}... ', end='')
    stop_route_dir_pairs = list(itertools.combinations(this_stop_route_dirs, 2))
    checks = np.array([True if rt_dir in qualify_pairs else False for rt_dir in stop_route_dir_pairs])
    if checks.all():
        print(f'matched!')
        return this_stop_route_dirs
    else:
        print('subsetting...')
        this_stop_route_dirs.pop(-1)
        return check_stop(this_stop_route_dirs, qualify_pairs)

In [1190]:
check_stop(['no', 'nyet', 'bazz', 'fizz', 'buzz'], qualify_pairs)

attempting ['no', 'nyet', 'bazz', 'fizz', 'buzz']... subsetting...
attempting ['no', 'nyet', 'bazz', 'fizz']... subsetting...
attempting ['no', 'nyet', 'bazz']... subsetting...
attempting ['no', 'nyet']... subsetting...
exhausted!


[]

In [1191]:
def filter_qualifying_stops(one_stop_df):

    one_stop_df = (one_stop_df >> group_by(_.route_dir)
                >> mutate(route_dir_count = _.shape[0]) >> ungroup()
                >> arrange(-_.route_dir_count)
               )
    this_stop_route_dirs = (one_stop_df >> distinct(_.route_dir, _.route_dir_count)).route_dir.to_numpy() #  preserves sort order
    aggregation_ok_route_dirs = check_stop(this_stop_route_dirs, qualify_pairs)
    return one_stop_df >> filter(_.route_dir.isin(aggregation_ok_route_dirs))

In [1192]:
df2 = st_could_qual.groupby('stop_id').apply(filter_qualifying_stops)

attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['9_1', '9R_1']... matched!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
exhausted!
attempting ['31_1', '18_1']... matched!
attempting ['31_1', '18_1']... matched!
attempting ['48_1', '18_1']... subsetting...
exhausted!
attempting ['31_1', '18_1']... matched!
attempting ['5R_1', '5R_0', '31_1', '5_0', '5_1']... subsetting...
attempting ['5R_1', '5R_0', '31_1', '5_0']... subsetting...
attempting ['5R_1', '5R_0', '31_1']... subsetting...
attempting ['5R_1', '5R_0']... subsetting...
exhausted!
exhausted!
attempting ['5R_0', '5_0']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_0', '5_0']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_0', '5_0']... matched!
attempting ['5R_1', '5_1']... matched!
attempting ['5R_1

In [1193]:
df2 = df2.reset_index(drop=True)

In [1194]:
df2.stop_id.unique()

array(['13059', '13062', '13064', '13066', '13244', '13570', '13572',
       '13925', '14211', '14212', '14213', '14215', '14218', '14221',
       '14222', '14228', '14231', '14232', '14732', '14734', '14736',
       '14737', '14742', '14746', '14747', '14748', '14749', '14751',
       '14753', '14809', '14842', '14843', '14895', '15658', '15685',
       '16047', '16049', '16051', '16058', '16059', '16063', '16068',
       '16069', '16072', '16073', '16075', '16139', '16141', '16143',
       '16145', '16147', '16149', '16151', '16153', '16339', '16344',
       '16383', '16385', '16386', '16388', '16565', '16568', '16570',
       '16575', '16584', '17204', '17264', '17297', '17720', '17721',
       '17727', '17728', '17729', '17730'], dtype=object)

In [1195]:
# df2

In [1196]:
trips_per_peak_multi = gtfs_schedule_wrangling.stop_arrivals_per_stop(
    df2,
    group_cols = multi_cols,
    count_col = "trip_id",
    route_dir_array = True
).rename(columns = {"n_arrivals": "n_trips"})

In [1197]:
df2_final = last_bit(trips_per_peak_multi) >> filter(_.am_max_trips_hr > min_freq, _.pm_max_trips_hr > min_freq)

In [1198]:
df2 >> count(_.stop_id)

Unnamed: 0,stop_id,n
0,13059,41
1,13062,41
2,13064,41
3,13066,41
4,13244,56
...,...,...
69,17721,35
70,17727,35
71,17728,36
72,17729,35


In [1199]:
df2_final >> count(_.stop_id)

Unnamed: 0,stop_id,n
0,13059,1
1,13062,1
2,13064,1
3,13066,1
4,13244,1
...,...,...
69,17721,1
70,17727,1
71,17728,1
72,17729,1


In [1200]:
short_routes = df2_final.explode('route_dir') >> count(_.route_dir) >> filter(_.n < SHARED_STOP_THRESHOLD)

In [1201]:
df2_final['all_short'] = df2_final.route_dir.map(lambda x: np.array([True if y in list(short_routes.route_dir) else False for y in x]).all())

In [1202]:
df2_final >> filter(_.all_short) #  stops where _every_ shared route has less than SHARED_STOP_THRESHOLD frequent stops (even after aggregation)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count,all_short


In [1203]:
df2_final = df2_final >> filter(-_.all_short)

In [1204]:
gdf = stops >> inner_join(_, df2, on=['stop_id', 'feed_key']) >> inner_join(_, df2_final, on = ['schedule_gtfs_dataset_key', 'stop_id']) >> distinct(_.stop_id, _.geometry)

In [1206]:
# gdf.explore()

In [1066]:
df2_final >> filter(_.stop_id == '40010')

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
53,baeeb157e85a901e47b828ef9fe75091,40010,14,"[944_0, 945_1, 946_0]",16,4,4,30,3
