In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

from siuba import *

In [2]:
import create_aggregate_stop_frequencies as casf

In [3]:
from update_vars import analysis_date

In [4]:
analysis_date

'2025-11-05'

In [5]:
st, trips = casf.get_st_trips(analysis_date)  # includes lookback
st_prepped = casf.add_route_dir(trips=trips, stop_times=st, analysis_date=analysis_date).pipe(casf.prep_stop_times)

max_arrivals_by_stop_single = st_prepped.pipe(
    casf.stop_times_aggregation_max_by_stop, analysis_date, single_route_dir=True
)

max_arrivals_by_stop_multi = st_prepped.pipe(
    casf.stop_times_aggregation_max_by_stop, analysis_date, single_route_dir=False
)

{'2025-08-20': ['eTrans Schedule', 'Roseville Transit GMV Schedule'], '2025-09-24': ['San Juan Capistrano Trolley Schedule', 'Culver City Schedule'], '2025-10-15': ['Yolobus Schedule', 'Go West Schedule', 'Bay Area 511 Angel Island-Tiburon Ferry Schedule', 'El Monte Schedule', 'Nevada County Schedule']}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stop_times["peak"] = stop_times["arrival_hour"].map(peaks_dict)


In [6]:
multi_only_explode = casf.get_explode_multiroute_only(
    max_arrivals_by_stop_single, max_arrivals_by_stop_multi, (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)
)

## feed_level_filter (done)

## collinear_filter_feed (done)

In [10]:
am_peak_hrs = list(range(casf.AM_PEAK[0].hour, casf.AM_PEAK[1].hour))
pm_peak_hrs = list(range(casf.PM_PEAK[0].hour, casf.PM_PEAK[1].hour))
both_peaks_hrs = am_peak_hrs + pm_peak_hrs
peaks_dict = {key: "am_peak" for key in am_peak_hrs} | {key: "pm_peak" for key in pm_peak_hrs}

In [11]:
share_counts = {}
multi_only_explode.groupby(["schedule_gtfs_dataset_key", "stop_id"]).apply(
    casf.accumulate_share_count, share_counts=share_counts
)
qualify_dict = {key: share_counts[key] for key in share_counts.keys() if share_counts[key] >= casf.SHARED_STOP_THRESHOLD}

In [12]:
feeds_to_filter = np.unique([key.split("__")[0] for key in qualify_dict.keys()])

In [13]:
frequency_thresholds = (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)

In [14]:
feeds_to_filter[1]

'076e30b080fdc5501151bd3fb0a37b9e'

In [26]:
st_could_qual, qualify_pairs = casf.feed_level_filter(
    feeds_to_filter[1], multi_only_explode, qualify_dict, st_prepped, frequency_thresholds
)

In [27]:
st_could_qual.head(3)

Unnamed: 0,feed_key,trip_id,stop_id,arrival_hour,schedule_gtfs_dataset_key,analysis_date,name,route_id,direction_id,route_type,analysis_name,route_dir,peak,could_qualify
2082024,cbae8712910fbb4c24027c69d481cc9e,10248,6849,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True
2082027,cbae8712910fbb4c24027c69d481cc9e,10248,5460,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True
2082029,cbae8712910fbb4c24027c69d481cc9e,10248,6855,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True


In [28]:
qualify_pairs

[('041_1', '042_1'),
 ('042_1', '041_1'),
 ('901_0', '902_0'),
 ('901_1', '902_1'),
 ('902_0', '901_0'),
 ('902_1', '901_1'),
 ('041_0', '042_0'),
 ('042_0', '041_0'),
 ('048_1', '049_1'),
 ('049_1', '048_1'),
 ('048_0', '049_0'),
 ('049_0', '048_0')]

## filter qualifying stops

In [97]:
def filter_qualifying_stops(one_stop_st: pd.DataFrame, qualify_pairs: list) -> pd.DataFrame:
    """
    Given stop_times for a single stop, and list of route_dir pairs that can be aggregated,
    filter this stop's stop_times to routes that can be aggregated
    """
    one_stop_st = (
        one_stop_st
        >> group_by(_.route_dir)
        >> mutate(route_dir_count=_.shape[0])
        >> ungroup()
        >> arrange(-_.route_dir_count)
    )
    this_stop_route_dirs = (
        one_stop_st >> distinct(_.route_dir, _.route_dir_count)
    ).route_dir.to_numpy()  # preserves sort order
    aggregation_ok_route_dirs = casf.check_stop(this_stop_route_dirs, qualify_pairs)
    print(aggregation_ok_route_dirs)
    return one_stop_st >> filter(_.route_dir.isin(aggregation_ok_route_dirs))

In [89]:
    # st_qual_filter_1 = st_could_qual.groupby("stop_id", group_keys=False).apply(
    #     filter_qualifying_stops, qualify_pairs=qualify_pairs
    # )
    # st_qual_filter_1 = st_qual_filter_1.reset_index(drop=True)

In [91]:
df = st_could_qual[st_could_qual.stop_id == '6849']

In [99]:
df2 = filter_qualifying_stops(df, qualify_pairs)

['902_0', '901_0']


In [100]:
one_stop_st = (
    df
    >> group_by(_.route_dir)
    >> mutate(route_dir_count=_.shape[0])
    >> ungroup()
    >> arrange(-_.route_dir_count)
)

In [102]:
one_stop_st.shape

(24, 15)

In [101]:
one_stop_st

Unnamed: 0,feed_key,trip_id,stop_id,arrival_hour,schedule_gtfs_dataset_key,analysis_date,name,route_id,direction_id,route_type,analysis_name,route_dir,peak,could_qualify,route_dir_count
2082024,cbae8712910fbb4c24027c69d481cc9e,10248,6849,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True,12
2082983,cbae8712910fbb4c24027c69d481cc9e,11039,6849,15.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,pm_peak,True,12
2083341,cbae8712910fbb4c24027c69d481cc9e,11313,6849,18.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,pm_peak,True,12
2083926,cbae8712910fbb4c24027c69d481cc9e,11115,6849,16.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,901,0,3,Monterey-Salinas Transit,901_0,pm_peak,True,12
2084452,cbae8712910fbb4c24027c69d481cc9e,10075,6849,6.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True,12
2087599,cbae8712910fbb4c24027c69d481cc9e,10031,6849,6.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True,12
2088038,cbae8712910fbb4c24027c69d481cc9e,10151,6849,7.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True,12
2088346,cbae8712910fbb4c24027c69d481cc9e,11296,6849,18.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,901,0,3,Monterey-Salinas Transit,901_0,pm_peak,True,12
2089312,cbae8712910fbb4c24027c69d481cc9e,11062,6849,15.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,901,0,3,Monterey-Salinas Transit,901_0,pm_peak,True,12
2089451,cbae8712910fbb4c24027c69d481cc9e,11159,6849,16.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,901,0,3,Monterey-Salinas Transit,901_0,pm_peak,True,12


In [103]:
this_stop_route_dirs = (
    one_stop_st >> distinct(_.route_dir, _.route_dir_count)
).route_dir.to_numpy()

In [104]:
this_stop_route_dirs

array(['902_0', '901_0'], dtype=object)