In [1]:
import pandas as pd
import geopandas as gpd

from siuba import *

In [2]:
import create_aggregate_stop_frequencies as casf

In [3]:
from update_vars import analysis_date

In [4]:
# Echo the analysis date imported from update_vars so the run records which date it used.
analysis_date

'2025-11-05'

In [5]:
# Load stop_times and trips for the analysis date, attach route/direction, then
# run the module's stop_times prep step.
st, trips = casf.get_st_trips(analysis_date)  # includes lookback
st_prepped = casf.prep_stop_times(
    casf.add_route_dir(trips=trips, stop_times=st, analysis_date=analysis_date)
)

# Same aggregation run twice on the prepped stop_times; only single_route_dir differs.
# presumably True aggregates per route_dir and False across all route_dirs at a stop
# -- TODO confirm against casf.stop_times_aggregation_max_by_stop.
max_arrivals_by_stop_single = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=True
)
max_arrivals_by_stop_multi = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=False
)

{'2025-08-20': ['eTrans Schedule', 'Roseville Transit GMV Schedule'], '2025-09-24': ['San Juan Capistrano Trolley Schedule', 'Culver City Schedule'], '2025-10-15': ['Yolobus Schedule', 'Go West Schedule', 'Bay Area 511 Angel Island-Tiburon Ferry Schedule', 'El Monte Schedule', 'Nevada County Schedule']}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stop_times["peak"] = stop_times["arrival_hour"].map(peaks_dict)


In [69]:
def get_explode_multiroute_only(
    single_route_aggregation: pd.DataFrame,
    multi_route_aggregation: pd.DataFrame,
    frequency_thresholds: tuple,
) -> pd.DataFrame:
    """
    Narrow down the stops that need the compute-intensive collinearity screen.

    Steps:
      1. Single-route stops meeting the *highest* threshold in both AM and PM
         already qualify on their own and are set aside. The highest threshold
         is used on purpose: a stop that only meets the lower threshold as
         single-route might still meet the higher one once routes are combined.
      2. Multi-route stops meeting the *lowest* threshold in both AM and PM are
         the candidates.
      3. Candidates that also appear in (1) are dropped (anti-join on
         schedule_gtfs_dataset_key + stop_id).
      4. The remaining candidates are exploded to one row per route_dir, keeping
         only route_dirs that run at least hourly (AM and PM) at that stop in
         the single-route aggregation.

    Parameters
    ----------
    single_route_aggregation, multi_route_aggregation:
        Frames with schedule_gtfs_dataset_key, stop_id, route_dir,
        am_max_trips_hr and pm_max_trips_hr columns.
    frequency_thresholds:
        Trips-per-hour thresholds; only their min and max are used.

    Returns
    -------
    DataFrame with columns schedule_gtfs_dataset_key, stop_id, route_dir,
    sorted by those three columns.
    """
    stop_keys = ["schedule_gtfs_dataset_key", "stop_id"]
    upper_threshold = max(frequency_thresholds)
    lower_threshold = min(frequency_thresholds)

    single = single_route_aggregation
    multi = multi_route_aggregation

    # (1) stops that already qualify at the highest threshold as single-route
    single_meets_upper = (single.am_max_trips_hr >= upper_threshold) & (
        single.pm_max_trips_hr >= upper_threshold
    )
    single_qualifiers = single[single_meets_upper]

    # (2) stops with any chance of qualifying under multi-route aggregation
    multi_meets_lower = (multi.am_max_trips_hr >= lower_threshold) & (
        multi.pm_max_trips_hr >= lower_threshold
    )
    multi_qualifiers = multi[multi_meets_lower]

    # (3) anti-join in plain pandas: left-merge with an indicator column and keep
    # only the rows that found no partner among the single-route qualifiers
    tagged = multi_qualifiers.merge(
        single_qualifiers[stop_keys], on=stop_keys, how="left", indicator=True
    )
    unmatched = tagged[tagged["_merge"] == "left_only"]
    multi_only = unmatched.drop(columns=["_merge"]).reset_index(drop=True)

    # (4) only consider route_dir that run at least hourly when doing multi-route
    # aggregation, should reduce edge cases
    runs_hourly = (single.am_max_trips_hr >= 1) & (single.pm_max_trips_hr >= 1)
    hourly_route_dirs = single[runs_hourly].explode("route_dir")[["route_dir", *stop_keys]]

    candidates = multi_only[[*stop_keys, "route_dir"]].explode("route_dir")
    candidates = candidates.merge(hourly_route_dirs, on=["route_dir", *stop_keys])

    # sorting crucial for next step
    return candidates.sort_values([*stop_keys, "route_dir"])

In [70]:
# Run the notebook-local get_explode_multiroute_only (defined above) on the
# single- and multi-route aggregations with the HQ and MS transit thresholds.
multi_only_explode = get_explode_multiroute_only(
    max_arrivals_by_stop_single, max_arrivals_by_stop_multi, (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)
)

In [71]:
# Run the module's own casf.get_explode_multiroute_only on the same inputs as a reference result.
# NOTE(review): the `_siu` suffix suggests the module version is siuba-based -- confirm in casf.
multi_only_explode_siu = casf.get_explode_multiroute_only(
    max_arrivals_by_stop_single, max_arrivals_by_stop_multi, (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)
)

In [72]:
# Regression check: the notebook-local result must equal the module's result (per DataFrame.equals).
multi_only_explode.equals(multi_only_explode_siu)

True

In [66]:
# Inspect the result: one row per (schedule_gtfs_dataset_key, stop_id, route_dir).
multi_only_explode

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,route_dir
0,0089bd1b0a2b78a8590d8749737d7146,40103,17_0
1,0089bd1b0a2b78a8590d8749737d7146,40103,17_1
2,0089bd1b0a2b78a8590d8749737d7146,40113,17_0
3,0089bd1b0a2b78a8590d8749737d7146,40113,22_1
4,0089bd1b0a2b78a8590d8749737d7146,40113,36_0
...,...,...,...
23316,f8e4fa18131802bf978177326377241d,882962,1_0
23317,f8e4fa18131802bf978177326377241d,882962,30R_0
23318,f8e4fa18131802bf978177326377241d,882975,30R_1
23319,f8e4fa18131802bf978177326377241d,882977,30R_1


In [13]:
# Threshold pair for the step-by-step cells; they only take min()/max() of it, so order does not matter.
frequency_thresholds = (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)

In [14]:
# siuba filter: keep single-route rows whose AM and PM max trips/hr both reach the
# higher of the two thresholds.
single_qual_max = max_arrivals_by_stop_single >> filter(
    _.am_max_trips_hr >= max(frequency_thresholds), _.pm_max_trips_hr >= max(frequency_thresholds)
)

In [40]:
# Plain-pandas filter: keep multi-route rows whose AM and PM max trips/hr both
# reach the lower of the two thresholds.
multi_qual = max_arrivals_by_stop_multi[
    max_arrivals_by_stop_multi.am_max_trips_hr.ge(min(frequency_thresholds))
    & max_arrivals_by_stop_multi.pm_max_trips_hr.ge(min(frequency_thresholds))
]

In [41]:
# siuba anti_join: keep multi_qual rows with no (schedule_gtfs_dataset_key, stop_id)
# match in single_qual_max. This is the reference result for the pandas version.
multi_only = multi_qual >> anti_join(_, single_qual_max, on=["schedule_gtfs_dataset_key", "stop_id"])

In [42]:
# Inspect the siuba result; the recorded output keeps non-contiguous index labels (11, 12, 46, ...).
multi_only

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
11,0089bd1b0a2b78a8590d8749737d7146,40103,12,"[17_0, 17_1, 61_0, 61_1]",17.0,4.00,4.25,29.0,4
12,0089bd1b0a2b78a8590d8749737d7146,40113,30,"[17_0, 22_0, 22_1, 36_0, 36_1, 61_1, 71_0, 71_1]",47.0,10.00,11.75,77.0,8
46,0089bd1b0a2b78a8590d8749737d7146,40158,11,"[219_0, 219_1, 36_0]",22.0,3.67,5.50,33.0,3
53,0089bd1b0a2b78a8590d8749737d7146,40170,12,"[22_0, 36_0, 71_0]",24.0,4.00,6.00,36.0,3
55,0089bd1b0a2b78a8590d8749737d7146,40172,14,"[17_1, 22_1, 36_0, 36_1]",20.0,4.67,5.00,34.0,4
...,...,...,...,...,...,...,...,...,...
75927,f8e4fa18131802bf978177326377241d,882956,14,"[2_0, 30R_1, 501_0, 502_0, 504_0]",14.0,4.67,3.50,28.0,5
75931,f8e4fa18131802bf978177326377241d,882962,12,"[1_0, 30R_0]",20.0,4.00,5.00,32.0,2
75943,f8e4fa18131802bf978177326377241d,882975,9,[30R_1],12.0,3.00,3.00,21.0,1
75944,f8e4fa18131802bf978177326377241d,882977,9,[30R_1],12.0,3.00,3.00,21.0,1


In [50]:
# Anti-join in plain pandas: left-merge against the single-route qualifiers with an
# indicator column, keep rows found only on the left, then drop the helper column
# and renumber the index.
df = (
    multi_qual.merge(
        single_qual_max[["schedule_gtfs_dataset_key", "stop_id"]],
        on=["schedule_gtfs_dataset_key", "stop_id"],
        how="left",
        indicator=True,
    )
    .loc[lambda merged: merged["_merge"] == "left_only"]
    .drop(columns=["_merge"])
    .reset_index(drop=True)
)

In [58]:
# Compare the pandas and siuba anti-join results with both indexes reset, because
# multi_only is displayed with non-contiguous index labels that would otherwise differ.
df.reset_index(drop=True).equals(multi_only.reset_index(drop=True))

True

In [53]:
# Inspect the pandas anti-join result.
# NOTE(review): the recorded output's index runs to 24573 although the frame has 14410
# rows, so it appears to predate the reset_index(drop=True) in the defining cell -- re-run to refresh.
df

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
0,0089bd1b0a2b78a8590d8749737d7146,40103,12,"[17_0, 17_1, 61_0, 61_1]",17.0,4.00,4.25,29.0,4
1,0089bd1b0a2b78a8590d8749737d7146,40113,30,"[17_0, 22_0, 22_1, 36_0, 36_1, 61_1, 71_0, 71_1]",47.0,10.00,11.75,77.0,8
2,0089bd1b0a2b78a8590d8749737d7146,40158,11,"[219_0, 219_1, 36_0]",22.0,3.67,5.50,33.0,3
3,0089bd1b0a2b78a8590d8749737d7146,40170,12,"[22_0, 36_0, 71_0]",24.0,4.00,6.00,36.0,3
4,0089bd1b0a2b78a8590d8749737d7146,40172,14,"[17_1, 22_1, 36_0, 36_1]",20.0,4.67,5.00,34.0,4
...,...,...,...,...,...,...,...,...,...
24570,f8e4fa18131802bf978177326377241d,882956,14,"[2_0, 30R_1, 501_0, 502_0, 504_0]",14.0,4.67,3.50,28.0,5
24571,f8e4fa18131802bf978177326377241d,882962,12,"[1_0, 30R_0]",20.0,4.00,5.00,32.0,2
24572,f8e4fa18131802bf978177326377241d,882975,9,[30R_1],12.0,3.00,3.00,21.0,1
24573,f8e4fa18131802bf978177326377241d,882977,9,[30R_1],12.0,3.00,3.00,21.0,1


In [55]:
# Row count, columns and dtypes of the pandas result, for comparison with multi_only.info().
# NOTE(review): the recorded output shows "Int64Index ... 0 to 24574", which looks stale
# relative to the reset_index(drop=True) in the cell that builds df -- re-run to refresh.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14410 entries, 0 to 24574
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  14410 non-null  object 
 1   stop_id                    14410 non-null  object 
 2   am_max_trips               14410 non-null  int64  
 3   route_dir                  14410 non-null  object 
 4   pm_max_trips               14410 non-null  float64
 5   am_max_trips_hr            14410 non-null  float64
 6   pm_max_trips_hr            14410 non-null  float64
 7   n_trips                    14410 non-null  float64
 8   route_dir_count            14410 non-null  int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 1.1+ MB


In [56]:
# Row count, columns and dtypes of the siuba result, for comparison with df.info().
multi_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14410 entries, 11 to 75957
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  14410 non-null  object 
 1   stop_id                    14410 non-null  object 
 2   am_max_trips               14410 non-null  int64  
 3   route_dir                  14410 non-null  object 
 4   pm_max_trips               14410 non-null  float64
 5   am_max_trips_hr            14410 non-null  float64
 6   pm_max_trips_hr            14410 non-null  float64
 7   n_trips                    14410 non-null  float64
 8   route_dir_count            14410 non-null  int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 1.1+ MB


In [54]:
# Re-display of the siuba result (same frame as shown earlier), presumably for a
# side-by-side look against df -- this cell can be dropped once the comparison is done.
multi_only

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
11,0089bd1b0a2b78a8590d8749737d7146,40103,12,"[17_0, 17_1, 61_0, 61_1]",17.0,4.00,4.25,29.0,4
12,0089bd1b0a2b78a8590d8749737d7146,40113,30,"[17_0, 22_0, 22_1, 36_0, 36_1, 61_1, 71_0, 71_1]",47.0,10.00,11.75,77.0,8
46,0089bd1b0a2b78a8590d8749737d7146,40158,11,"[219_0, 219_1, 36_0]",22.0,3.67,5.50,33.0,3
53,0089bd1b0a2b78a8590d8749737d7146,40170,12,"[22_0, 36_0, 71_0]",24.0,4.00,6.00,36.0,3
55,0089bd1b0a2b78a8590d8749737d7146,40172,14,"[17_1, 22_1, 36_0, 36_1]",20.0,4.67,5.00,34.0,4
...,...,...,...,...,...,...,...,...,...
75927,f8e4fa18131802bf978177326377241d,882956,14,"[2_0, 30R_1, 501_0, 502_0, 504_0]",14.0,4.67,3.50,28.0,5
75931,f8e4fa18131802bf978177326377241d,882962,12,"[1_0, 30R_0]",20.0,4.00,5.00,32.0,2
75943,f8e4fa18131802bf978177326377241d,882975,9,[30R_1],12.0,3.00,3.00,21.0,1
75944,f8e4fa18131802bf978177326377241d,882977,9,[30R_1],12.0,3.00,3.00,21.0,1
