In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

In [2]:
import geopandas as gpd
import pandas as pd
from siuba import *
import numpy as np

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_dates, gtfs_utils_v2
import folium

In [3]:
from update_vars import (analysis_date, AM_PEAK, PM_PEAK, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, AM_PEAK, PM_PEAK, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD)

In [4]:
import sjoin_stops_to_segments

In [5]:
analysis_date

'2024-10-21'

In [6]:
imported_st = helpers.import_scheduled_stop_times(
    analysis_date,
    get_pandas = True,
)

In [7]:
import importlib

importlib.reload(sjoin_stops_to_segments)

<module 'sjoin_stops_to_segments' from '/home/jovyan/data-analyses/high_quality_transit_areas/sjoin_stops_to_segments.py'>

In [8]:
# # (1) Aggregate stop times - by stop_id, find max trips in AM/PM peak
# # takes 1 min
# max_arrivals_by_stop = imported_st.pipe(sjoin_stops_to_segments.prep_stop_times).pipe(sjoin_stops_to_segments.stop_times_aggregation_max_by_stop, analysis_date)

In [9]:
# # (1) Aggregate stop times - by stop_id, find max trips in AM/PM peak
# # takes 1 min
# max_arrivals_by_stop_single = imported_st.pipe(
#     sjoin_stops_to_segments.stop_times_aggregation_max_by_stop, analysis_date, single_route_dir=True)

## multi logic

In [10]:
trips = helpers.import_scheduled_trips(
    analysis_date,
    columns = ["feed_key", "gtfs_dataset_key", "trip_id",
               "route_id", "direction_id"],
    get_pandas = True
)

trips = imported_st.merge(
    trips,
    on = ["feed_key", "trip_id"]
)

In [11]:
trips.direction_id = trips.direction_id.fillna(0).round(0).astype(str)
trips['route_dir'] = trips[['route_id', 'direction_id']].agg('_'.join, axis=1)

In [12]:
st_prepped = trips.pipe(sjoin_stops_to_segments.prep_stop_times)

In [13]:
cols = ["schedule_gtfs_dataset_key", "stop_id", "peak"]
# cols = ["schedule_gtfs_dataset_key", "stop_id", "peak",
#        "route_id", "direction_id"]

In [14]:
trips_per_peak_multi = gtfs_schedule_wrangling.stop_arrivals_per_stop(
    st_prepped,
    group_cols = cols,
    count_col = "trip_id",
    route_dir_array = True
).rename(columns = {"n_arrivals": "n_trips"})

In [15]:
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
trips_per_hour_cols = ["peak"]

In [16]:
def last_bit(trips_per_peak_period):

    am_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'am_peak']
                .rename(columns = {"n_trips": "am_max_trips"})
                .drop(columns="peak")
               )
    pm_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'pm_peak']
                .rename(columns = {"n_trips": "pm_max_trips"})
                .drop(columns=["peak", "route_dir"])
               )

    max_trips_by_stop = pd.merge(
        am_trips, 
        pm_trips,
        on = stop_cols,
        how = "left"
    )
    #  divide by length of peak to get trips/hr, keep n_trips a raw sum
    max_trips_by_stop = max_trips_by_stop.assign(
        am_max_trips_hr = (max_trips_by_stop.am_max_trips.fillna(0) / len(am_peak_hrs)).astype(int),
        pm_max_trips_hr = (max_trips_by_stop.pm_max_trips.fillna(0) / len(pm_peak_hrs)).astype(int),
        n_trips = (max_trips_by_stop.am_max_trips.fillna(0) + 
                   max_trips_by_stop.pm_max_trips.fillna(0)),
        route_dir_count = max_trips_by_stop.route_dir.map(lambda x: x.size)
    )
    
    return max_trips_by_stop

In [17]:
am_peak_hrs = list(range(AM_PEAK[0].hour, AM_PEAK[1].hour))
pm_peak_hrs = list(range(PM_PEAK[0].hour, PM_PEAK[1].hour))

In [18]:
multi_qual = last_bit(trips_per_peak_multi)

In [19]:
min_freq = min([HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD])

In [20]:
multi_qual

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
0,0139b1253130b33adcd4b3a4490530d2,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,2,[D1_0.0],4.0,0,1,6.0,1
1,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,8,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0, D2_...",13.0,2,3,21.0,3
2,0139b1253130b33adcd4b3a4490530d2,04a2c417-05bf-4f95-bfb6-dd9cec701f11,5,"[D3_0.0, c6726149-9979-4ebb-85f6-0be90402266c_...",8.0,1,2,13.0,2
3,0139b1253130b33adcd4b3a4490530d2,05d0285f-813a-4ea9-82e0-3b8d1127e8e0,3,[T2_0.0],5.0,1,1,8.0,1
4,0139b1253130b33adcd4b3a4490530d2,07fe70a4-21dd-4bcf-9adf-ed96f0daebbc,2,[D1_0.0],4.0,0,1,6.0,1
...,...,...,...,...,...,...,...,...,...
79257,ff1bc5dde661d62c877165421e9ca257,exp_20,5,[ROUTEA_1.0],8.0,1,2,13.0,1
79258,ff1bc5dde661d62c877165421e9ca257,exp_21,5,[ROUTEA_1.0],8.0,1,2,13.0,1
79259,ff1bc5dde661d62c877165421e9ca257,exp_22,5,[ROUTEA_1.0],8.0,1,2,13.0,1
79260,ff1bc5dde661d62c877165421e9ca257,exp_23,5,[ROUTEA_1.0],8.0,1,2,13.0,1


In [21]:
multi_qual = multi_qual >> filter(_.am_max_trips_hr > min_freq, _.pm_max_trips_hr > min_freq, _.route_dir_count > 1)

In [22]:
multi_qual

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,39,"[0e85fd4c-5258-4256-9852-4a96554aadb7_0.0, T11...",58.0,13,14,97.0,9
179,0139b1253130b33adcd4b3a4490530d2,98d2a60c-86b1-45d6-b5d6-39b273c9eb46,16,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0, 79b...",24.0,5,6,40.0,5
180,0139b1253130b33adcd4b3a4490530d2,999ff07b-4a27-4c80-9a1c-e868038ce097,24,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0, D1_...",44.0,8,11,68.0,7
296,015d67d5b75b5cf2b710bbadadfb75f5,40103,12,"[17_0.0, 17_1.0, 61_0.0, 61_1.0]",17.0,4,4,29.0,4
297,015d67d5b75b5cf2b710bbadadfb75f5,40113,30,"[17_0.0, 22_0.0, 22_1.0, 36_0.0, 36_1.0, 61_1....",48.0,10,12,78.0,8
...,...,...,...,...,...,...,...,...,...
79047,fb746afc72ff40405cfefa6d23ab58a0,53129,28,"[A_0.0, B_0.0, E_0.0, M_0.0]",42.0,9,10,70.0,4
79049,fb746afc72ff40405cfefa6d23ab58a0,53163,20,"[5_AM_0.0, A_0.0, B_0.0, C_0.0, E_0.0, F_0.0]",27.0,6,6,47.0,6
79093,fc6cd27871cce0092a08ccf68fb240a2,132744,28,"[4556_0.0, 4557_0.0, 4558_0.0, 4559_0.0]",44.0,9,11,72.0,4
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,13,"[11_0.0, 12_0.0, 14_0.0, 1_0.0, 2_0.0, 3_0.0, ...",22.0,4,5,35.0,7


## single logic

In [23]:
# cols = ["schedule_gtfs_dataset_key", "stop_id", "peak"]
cols = ["schedule_gtfs_dataset_key", "stop_id", "peak",
       "route_id", "direction_id"]

In [24]:
trips_per_peak_single = gtfs_schedule_wrangling.stop_arrivals_per_stop(
    st_prepped,
    group_cols = cols,
    count_col = "trip_id",
    route_dir_array = True
).rename(columns = {"n_arrivals": "n_trips"})

In [25]:
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
trips_per_hour_cols = ["peak"]

In [26]:
def last_bit(trips_per_peak_period):

    am_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'am_peak']
                .rename(columns = {"n_trips": "am_max_trips"})
                .drop(columns="peak")
               )
    pm_trips = (trips_per_peak_period[trips_per_peak_period.peak == 'pm_peak']
                .rename(columns = {"n_trips": "pm_max_trips"})
                .drop(columns=["peak", "route_dir"])
               )

    max_trips_by_stop = pd.merge(
        am_trips, 
        pm_trips,
        on = stop_cols,
        how = "left"
    )
    #  divide by length of peak to get trips/hr, keep n_trips a raw sum
    max_trips_by_stop = max_trips_by_stop.assign(
        am_max_trips_hr = (max_trips_by_stop.am_max_trips.fillna(0) / len(am_peak_hrs)).astype(int),
        pm_max_trips_hr = (max_trips_by_stop.pm_max_trips.fillna(0) / len(pm_peak_hrs)).astype(int),
        n_trips = (max_trips_by_stop.am_max_trips.fillna(0) + 
                   max_trips_by_stop.pm_max_trips.fillna(0)),
        route_dir_count = max_trips_by_stop.route_dir.map(lambda x: x.size)
    )
    
    return max_trips_by_stop

In [27]:
df_single = last_bit(trips_per_peak_single)

In [28]:
df_single

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_id_x,direction_id_x,am_max_trips,route_dir,route_id_y,direction_id_y,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
0,0139b1253130b33adcd4b3a4490530d2,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,D1,0.0,2,[D1_0.0],D1,0.0,4.0,0,1,6.0,1
1,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0],0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,6.0,0,1,8.0,1
2,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0],D2,0.0,3.0,0,0,5.0,1
3,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,2,[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0],c6726149-9979-4ebb-85f6-0be90402266c,0.0,4.0,0,1,6.0,1
4,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,D2,0.0,2,[D2_0.0],0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,6.0,0,1,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200142,ff1bc5dde661d62c877165421e9ca257,exp_20,ROUTEA,1.0,5,[ROUTEA_1.0],ROUTEA,1.0,8.0,1,2,13.0,1
200143,ff1bc5dde661d62c877165421e9ca257,exp_21,ROUTEA,1.0,5,[ROUTEA_1.0],ROUTEA,1.0,8.0,1,2,13.0,1
200144,ff1bc5dde661d62c877165421e9ca257,exp_22,ROUTEA,1.0,5,[ROUTEA_1.0],ROUTEA,1.0,8.0,1,2,13.0,1
200145,ff1bc5dde661d62c877165421e9ca257,exp_23,ROUTEA,1.0,5,[ROUTEA_1.0],ROUTEA,1.0,8.0,1,2,13.0,1


In [29]:
single_qual = df_single >> filter(_.am_max_trips_hr > min_freq, _.pm_max_trips_hr > min_freq)

In [30]:
multi_only = multi_qual >> anti_join(_, single_qual, on=['schedule_gtfs_dataset_key', 'stop_id'])

In [31]:
multi_only

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,39,"[0e85fd4c-5258-4256-9852-4a96554aadb7_0.0, T11...",58.0,13,14,97.0,9
179,0139b1253130b33adcd4b3a4490530d2,98d2a60c-86b1-45d6-b5d6-39b273c9eb46,16,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0, 79b...",24.0,5,6,40.0,5
180,0139b1253130b33adcd4b3a4490530d2,999ff07b-4a27-4c80-9a1c-e868038ce097,24,"[0ad6c6aa-1939-45a0-a3a8-02ebe8e19092_0.0, D1_...",44.0,8,11,68.0,7
296,015d67d5b75b5cf2b710bbadadfb75f5,40103,12,"[17_0.0, 17_1.0, 61_0.0, 61_1.0]",17.0,4,4,29.0,4
297,015d67d5b75b5cf2b710bbadadfb75f5,40113,30,"[17_0.0, 22_0.0, 22_1.0, 36_0.0, 36_1.0, 61_1....",48.0,10,12,78.0,8
...,...,...,...,...,...,...,...,...,...
79047,fb746afc72ff40405cfefa6d23ab58a0,53129,28,"[A_0.0, B_0.0, E_0.0, M_0.0]",42.0,9,10,70.0,4
79049,fb746afc72ff40405cfefa6d23ab58a0,53163,20,"[5_AM_0.0, A_0.0, B_0.0, C_0.0, E_0.0, F_0.0]",27.0,6,6,47.0,6
79093,fc6cd27871cce0092a08ccf68fb240a2,132744,28,"[4556_0.0, 4557_0.0, 4558_0.0, 4559_0.0]",44.0,9,11,72.0,4
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,13,"[11_0.0, 12_0.0, 14_0.0, 1_0.0, 2_0.0, 3_0.0, ...",22.0,4,5,35.0,7


In [32]:
multi_only[['schedule_gtfs_dataset_key', 'stop_id', 'route_dir']].explode('route_dir')

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_dir
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,0e85fd4c-5258-4256-9852-4a96554aadb7_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T11x_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T1_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T2_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T3_0.0
...,...,...,...
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,3_0.0
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,4_1.0
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_0.0
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_1.0


In [33]:
test = (multi_only[['schedule_gtfs_dataset_key', 'stop_id', 'route_dir']]
.explode('route_dir')
.sort_values(['schedule_gtfs_dataset_key','stop_id', 'route_dir']))

In [34]:
# test = test.head(5000)

In [35]:
test

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_dir
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,0e85fd4c-5258-4256-9852-4a96554aadb7_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T11x_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T1_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T2_0.0
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,T3_0.0
...,...,...,...
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,3_0.0
79116,fe4aab1717eca5a2935c32c85a35a5bf,115,4_1.0
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_0.0
79246,ff1bc5dde661d62c877165421e9ca257,LO_19,ROUTEA_1.0


In [36]:
xy = test.loc[96,:].route_dir.to_numpy()

In [37]:
xy

array(['0e85fd4c-5258-4256-9852-4a96554aadb7_0.0', 'T11x_0.0', 'T1_0.0',
       'T2_0.0', 'T3_0.0', 'T4_0.0', 'T5_0.0', 'T6_0.0',
       'e430d571-76bd-45d4-8b01-76e3ef7c3ae1_0.0'], dtype=object)

In [38]:
def test_share_count(df):
    global share_counts
    xy = df.route_dir.to_numpy()
    schedule_gtfs_dataset_key = df.schedule_gtfs_dataset_key.iloc[0]
    for route_dir in xy:
        other_dirs = [x for x in xy if x != route_dir]
        for other_dir in other_dirs:
            key = schedule_gtfs_dataset_key+'__'+route_dir+'__'+other_dir
            if key in share_counts.keys():
                share_counts[key] += 1
            else:
                share_counts[key] = 1

In [39]:
share_counts = {}

In [40]:
test.groupby(['schedule_gtfs_dataset_key', 'stop_id']).apply(test_share_count)

In [41]:
# share_counts

In [42]:
qualify = {key: share_counts[key] for key in share_counts.keys() if share_counts[key] >= 5}

In [43]:
qualify

{'015d67d5b75b5cf2b710bbadadfb75f5__22_1.0__36_0.0': 6,
 '015d67d5b75b5cf2b710bbadadfb75f5__22_1.0__36_1.0': 6,
 '015d67d5b75b5cf2b710bbadadfb75f5__36_0.0__22_1.0': 6,
 '015d67d5b75b5cf2b710bbadadfb75f5__36_0.0__36_1.0': 7,
 '015d67d5b75b5cf2b710bbadadfb75f5__36_1.0__22_1.0': 6,
 '015d67d5b75b5cf2b710bbadadfb75f5__36_1.0__36_0.0': 7,
 '015d67d5b75b5cf2b710bbadadfb75f5__17_1.0__36_1.0': 6,
 '015d67d5b75b5cf2b710bbadadfb75f5__36_1.0__17_1.0': 6,
 '015d67d5b75b5cf2b710bbadadfb75f5__22_1.0__23_1.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__22_1.0__68_1.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__23_1.0__22_1.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__23_1.0__68_1.0': 9,
 '015d67d5b75b5cf2b710bbadadfb75f5__68_1.0__22_1.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__68_1.0__23_1.0': 9,
 '015d67d5b75b5cf2b710bbadadfb75f5__23_1.0__35_0.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__23_1.0__36_1.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__35_0.0__23_1.0': 8,
 '015d67d5b75b5cf2b710bbadadfb75f5__35_0.0__36_1

## explore potential stops (not yet filtered)

In [44]:
stops = helpers.import_scheduled_stops(
    analysis_date,
    get_pandas = True,
    crs = PROJECT_CRS
)

In [45]:
gdf = stops >> inner_join(_, multi_only, on = ['stop_id']) >> select(_.stop_id, _.geometry)

In [46]:
gdf2 = stops >> inner_join(_, single_qual, on = ['stop_id']) >> select(_.stop_id, _.geometry)

In [47]:
gdf2.geometry = gdf2.buffer(400)

In [48]:
gdf = gdf.overlay(gdf2, how='difference')

In [49]:
# gdf.explore()

## lookup function/filtering steps

In [55]:
feeds_to_filter = np.unique([key.split('__')[0] for key in qualify.keys()])

In [126]:
from calitp_data_analysis.tables import tbls

In [133]:
df = tbls.mart_transit_database.dim_gtfs_service_data() >> filter(_.gtfs_dataset_key.isin(feeds_to_filter)) >> distinct(_.name, _.gtfs_dataset_key) >> collect()

In [135]:
df >> filter(_.name.str.contains('Yolo'))

Unnamed: 0,name,gtfs_dataset_key
10,Yolobus – Yolobus Schedule,3c62ad6ee589d56eca915ce291a5df0a


In [56]:
feeds_to_filter

array(['015d67d5b75b5cf2b710bbadadfb75f5',
       '0666caf3ec1ecc96b74f4477ee4bc939',
       '09e16227fc42c4fe90204a9d11581034',
       '1770249a5a2e770ca90628434d4934b1',
       '1c7027faabfeec976ea388973100bcf3',
       '239f3baf3dd3b9e9464f66a777f9897d',
       '2f1c266fc20f9875777fb752af32a66e',
       '364d59b3aea55aec2962a0b3244a40e0',
       '3bda4652977200408690059ef2ec4b4d',
       '3c275e5acf8974e1afd765bd3011424c',
       '3c62ad6ee589d56eca915ce291a5df0a',
       '40ead758629da2ad8a74dbc687652e5a',
       '48e137bc977da88970393f629c18432c',
       '4a3f5fe2e305cc696684d11226ba9878',
       '4c6b107352b318297bb39173c796f357',
       '5456c80d420043e15c8eb7368a8a4d89',
       '55a01ef72af21906934ae8ffb4786e86',
       '587e730fac4db21d54037e0f12b0dd5d',
       '62cae2cb469ba696ca1b29a4cd274b96',
       '68aa06a25a32c83eb38c20c43977feff',
       '6f307d834fda131d6e32fa0bf1585b09',
       '70c8a8b71c815224299523bf2115924a',
       '73105f2d1cabc8170ab066d96863c5d5',
       '749

In [137]:
# dataset_key = '015d67d5b75b5cf2b710bbadadfb75f5' #  Marin
dataset_key = '3c62ad6ee589d56eca915ce291a5df0a' #  Yolobus

In [138]:
this_feed_qual = {key.split(dataset_key)[1][2:]:qualify[key] for key in qualify.keys() if key.split('__')[0] == dataset_key}

In [139]:
this_feed_qual

{'240_0.0__40_0.0': 11,
 '240_0.0__41_0.0': 10,
 '240_0.0__42A_0.0': 5,
 '40_0.0__240_0.0': 11,
 '40_0.0__41_0.0': 10,
 '40_0.0__42A_0.0': 5,
 '41_0.0__240_0.0': 10,
 '41_0.0__40_0.0': 10,
 '42A_0.0__240_0.0': 5,
 '42A_0.0__40_0.0': 5,
 '240_0.0__42B_0.0': 6,
 '40_0.0__42B_0.0': 6,
 '41_0.0__42B_0.0': 6,
 '42B_0.0__240_0.0': 6,
 '42B_0.0__40_0.0': 6,
 '42B_0.0__41_0.0': 6,
 '230_1.0__37_0.0': 6,
 '230_1.0__42B_0.0': 6,
 '230_1.0__43AM_0.0': 6,
 '37_0.0__230_1.0': 6,
 '37_0.0__42A_0.0': 8,
 '37_0.0__42B_0.0': 9,
 '37_0.0__43AM_0.0': 6,
 '42A_0.0__37_0.0': 8,
 '42A_0.0__42B_0.0': 5,
 '42B_0.0__230_1.0': 6,
 '42B_0.0__37_0.0': 9,
 '42B_0.0__42A_0.0': 5,
 '42B_0.0__43AM_0.0': 6,
 '43AM_0.0__230_1.0': 6,
 '43AM_0.0__37_0.0': 6,
 '43AM_0.0__42B_0.0': 6,
 '240_0.0__37_0.0': 9,
 '37_0.0__240_0.0': 9,
 '37_0.0__40_0.0': 9,
 '37_0.0__41_0.0': 8,
 '40_0.0__37_0.0': 9,
 '41_0.0__37_0.0': 8}

In [140]:
list_pairs = [key.split('__') for key in this_feed_qual.keys()]

In [141]:
list_pairs

[['240_0.0', '40_0.0'],
 ['240_0.0', '41_0.0'],
 ['240_0.0', '42A_0.0'],
 ['40_0.0', '240_0.0'],
 ['40_0.0', '41_0.0'],
 ['40_0.0', '42A_0.0'],
 ['41_0.0', '240_0.0'],
 ['41_0.0', '40_0.0'],
 ['42A_0.0', '240_0.0'],
 ['42A_0.0', '40_0.0'],
 ['240_0.0', '42B_0.0'],
 ['40_0.0', '42B_0.0'],
 ['41_0.0', '42B_0.0'],
 ['42B_0.0', '240_0.0'],
 ['42B_0.0', '40_0.0'],
 ['42B_0.0', '41_0.0'],
 ['230_1.0', '37_0.0'],
 ['230_1.0', '42B_0.0'],
 ['230_1.0', '43AM_0.0'],
 ['37_0.0', '230_1.0'],
 ['37_0.0', '42A_0.0'],
 ['37_0.0', '42B_0.0'],
 ['37_0.0', '43AM_0.0'],
 ['42A_0.0', '37_0.0'],
 ['42A_0.0', '42B_0.0'],
 ['42B_0.0', '230_1.0'],
 ['42B_0.0', '37_0.0'],
 ['42B_0.0', '42A_0.0'],
 ['42B_0.0', '43AM_0.0'],
 ['43AM_0.0', '230_1.0'],
 ['43AM_0.0', '37_0.0'],
 ['43AM_0.0', '42B_0.0'],
 ['240_0.0', '37_0.0'],
 ['37_0.0', '240_0.0'],
 ['37_0.0', '40_0.0'],
 ['37_0.0', '41_0.0'],
 ['40_0.0', '37_0.0'],
 ['41_0.0', '37_0.0']]

In [142]:
arr = np.array(list_pairs[0])
for pair in list_pairs[1:]: arr = np.append(arr, np.array(pair))

In [143]:
any_appearance = np.unique(arr)

In [144]:
any_appearance

array(['230_1.0', '240_0.0', '37_0.0', '40_0.0', '41_0.0', '42A_0.0',
       '42B_0.0', '43AM_0.0'], dtype='<U8')

In [146]:
#  only need to check stops that qualify as multi-route only
stops_to_eval = multi_only >> filter(_.schedule_gtfs_dataset_key == dataset_key) >> distinct(_.stop_id)
st_to_eval = st_prepped >> filter(_.schedule_gtfs_dataset_key == dataset_key,
                                  _.stop_id.isin(stops_to_eval.stop_id),
                                  _.route_dir.isin(any_appearance)
                                 )

In [147]:
am_peak_hrs

[6, 7, 8]

In [148]:
pm_peak_hrs

[15, 16, 17, 18]

In [149]:
#  cut down problem space by checking if stops still could qual after filtering for any appearance
min_rows = min_freq * (len(am_peak_hrs) + len(pm_peak_hrs))

In [150]:
st_could_qual = (st_to_eval >> group_by(_.stop_id)
 >> mutate(could_qualify = _.shape[0] >= min_rows)
 >> ungroup()
 >> filter(_.could_qualify)
)

In [151]:
one_stop = st_could_qual >> filter(_.stop_id == st_could_qual.stop_id.iloc[0])

In [152]:
one_stop

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour,schedule_gtfs_dataset_key,route_id,direction_id,route_dir,peak,could_qualify
687897,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,d86af250-e0a8-41b4-89f2-66a86d85a3e4,23284,3,0.0,61871.0,61871.0,17.0,17.0,3c62ad6ee589d56eca915ce291a5df0a,240,0.0,240_0.0,pm_peak,True
687941,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,bd84aba3-587c-4e9e-9a6b-33c77e88a508,23284,3,0.0,65471.0,65471.0,18.0,18.0,3c62ad6ee589d56eca915ce291a5df0a,240,0.0,240_0.0,pm_peak,True
687985,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,23d29a29-0ea9-4057-9bcb-39fbde5a675d,23284,3,0.0,58271.0,58271.0,16.0,16.0,3c62ad6ee589d56eca915ce291a5df0a,240,0.0,240_0.0,pm_peak,True
688113,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,1b9ef026-9a8a-4fed-a9dc-7555540ea9e3,23284,4,0.0,56467.0,56467.0,15.0,15.0,3c62ad6ee589d56eca915ce291a5df0a,40,0.0,40_0.0,pm_peak,True
688381,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,c6e95305-9dab-4788-acea-405482d235f2,23284,5,0.0,30067.0,30067.0,8.0,8.0,3c62ad6ee589d56eca915ce291a5df0a,41,0.0,41_0.0,am_peak,True
688616,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,728eba68-9984-4ff4-a3ef-e6eb5241749b,23284,3,0.0,54671.0,54671.0,15.0,15.0,3c62ad6ee589d56eca915ce291a5df0a,240,0.0,240_0.0,pm_peak,True
689245,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,05149ed0-20df-4e67-9080-263a6ba77289,23284,3,0.0,25871.0,25871.0,7.0,7.0,3c62ad6ee589d56eca915ce291a5df0a,240,0.0,240_0.0,am_peak,True
689299,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,b7e14d1c-6f97-4956-8ede-0588055e1a6c,23284,3,0.0,29471.0,29471.0,8.0,8.0,3c62ad6ee589d56eca915ce291a5df0a,240,0.0,240_0.0,am_peak,True
689463,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,8c2951c3-74fe-47c2-85d8-2ed279e41f0c,23284,242,0.0,57849.0,57849.0,16.0,16.0,3c62ad6ee589d56eca915ce291a5df0a,42A,0.0,42A_0.0,pm_peak,True
689527,8c7bcf478cd7ef49c6e1582872dae998,America/Los_Angeles,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...,08643ee7-074f-4c2a-81eb-a75864cf0cf6,23284,242,0.0,26829.0,26829.0,7.0,7.0,3c62ad6ee589d56eca915ce291a5df0a,42A,0.0,42A_0.0,am_peak,True


In [153]:
this_stop_route_dirs = one_stop.route_dir.unique()
this_stop_route_dirs

array(['240_0.0', '40_0.0', '41_0.0', '42A_0.0'], dtype=object)

In [113]:
list(this_stop_route_dirs)

[['36_1.0', '17_1.0']]

In [117]:
if this_stop_route_dirs.shape[0] == 2:
    #  quickly evaluate when only 2 route_dir
    stop_qualifies = list(this_stop_route_dirs) in list_pairs

In [118]:
stop_qualifies

True

In [123]:
gdf = stops >> filter(_.feed_key == '4e3d4b0f0aa4fe278fc86950a98612f0', _.stop_id == '44002') >> select(_.stop_id, _.geometry)

In [125]:
# gdf.explore() #  Marin Transit 36 and 17, hey it makes sense!