In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

from siuba import *

In [2]:
import create_aggregate_stop_frequencies as casf

In [3]:
from update_vars import analysis_date

In [4]:
analysis_date

'2025-11-05'

In [5]:
# Load stop_times and trips for the analysis date (includes lookback).
st, trips = casf.get_st_trips(analysis_date)

# Run add_route_dir first, then hand its result to prep_stop_times.
st_prepped = casf.prep_stop_times(
    casf.add_route_dir(trips=trips, stop_times=st, analysis_date=analysis_date)
)

# Same aggregation run twice; only the single_route_dir flag differs.
max_arrivals_by_stop_single = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=True
)
max_arrivals_by_stop_multi = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=False
)

{'2025-08-20': ['eTrans Schedule', 'Roseville Transit GMV Schedule'], '2025-09-24': ['San Juan Capistrano Trolley Schedule', 'Culver City Schedule'], '2025-10-15': ['Yolobus Schedule', 'Go West Schedule', 'Bay Area 511 Angel Island-Tiburon Ferry Schedule', 'El Monte Schedule', 'Nevada County Schedule']}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stop_times["peak"] = stop_times["arrival_hour"].map(peaks_dict)


In [6]:
# Inputs: both per-stop aggregations plus the (HQ, MS) transit threshold pair.
multi_only_explode = casf.get_explode_multiroute_only(
    max_arrivals_by_stop_single,
    max_arrivals_by_stop_multi,
    (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD),
)

## feed_level_filter (done)

## collinear_filter_feed (done)

In [7]:
# Whole hours in each peak window; the window's end hour is excluded (range semantics).
am_peak_hrs = [*range(casf.AM_PEAK[0].hour, casf.AM_PEAK[1].hour)]
pm_peak_hrs = [*range(casf.PM_PEAK[0].hour, casf.PM_PEAK[1].hour)]
both_peaks_hrs = [*am_peak_hrs, *pm_peak_hrs]

# Lookup from hour -> peak label; a PM entry would win if an hour were in both windows.
peaks_dict = {
    **dict.fromkeys(am_peak_hrs, "am_peak"),
    **dict.fromkeys(pm_peak_hrs, "pm_peak"),
}

In [8]:
# Start from an empty dict each run so re-executing this cell does not double count.
share_counts = {}

# The apply() result is discarded; share_counts is handed to the callback and read afterwards,
# so the callback is presumably expected to populate it in place.
multi_only_explode.groupby(["schedule_gtfs_dataset_key", "stop_id"]).apply(
    casf.accumulate_share_count, share_counts=share_counts
)

# Keep only the entries whose count reaches the shared-stop threshold.
qualify_dict = {
    key: count
    for key, count in share_counts.items()
    if count >= casf.SHARED_STOP_THRESHOLD
}

In [9]:
# Unique (sorted) prefixes of the qualify_dict keys, i.e. the text before the first "__".
feeds_to_filter = np.unique([key.partition("__")[0] for key in qualify_dict])

In [10]:
# (HQ, MS) transit threshold pair; passed to casf.feed_level_filter further down.
frequency_thresholds = (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)

In [77]:
feeds_to_filter[2]

'11865a0a240039d4479ded2595c832a5'

In [84]:
# Run the feed-level filter for one feed, yielding that feed's candidate stop_times
# and the list of route_dir pairs (qualify_pairs) used by the later filtering step.
# NOTE(review): the feed is chosen by a hard-coded position (index 3) in feeds_to_filter,
# while the preceding cell displays index 2 -- confirm which feed is intended.
st_could_qual, qualify_pairs = casf.feed_level_filter(
    feeds_to_filter[3], multi_only_explode, qualify_dict, st_prepped, frequency_thresholds
)

In [85]:
st_could_qual.head(3)

Unnamed: 0,feed_key,trip_id,stop_id,arrival_hour,schedule_gtfs_dataset_key,analysis_date,name,route_id,direction_id,route_type,analysis_name,route_dir,peak,could_qualify
540127,2cfb90968db794e9e8e25d6cf2f1ad58,11902365_M11,14823,8.0,1451f537bdcefd0e8ba827d12c4ef4b8,2025-11-05,Bay Area 511 Muni Schedule,8,0,3,City and County of San Francisco,8_0,am_peak,True
540130,2cfb90968db794e9e8e25d6cf2f1ad58,11902365_M11,15462,8.0,1451f537bdcefd0e8ba827d12c4ef4b8,2025-11-05,Bay Area 511 Muni Schedule,8,0,3,City and County of San Francisco,8_0,am_peak,True
540141,2cfb90968db794e9e8e25d6cf2f1ad58,11902365_M11,16061,8.0,1451f537bdcefd0e8ba827d12c4ef4b8,2025-11-05,Bay Area 511 Muni Schedule,8,0,3,City and County of San Francisco,8_0,am_peak,True


In [94]:
st_could_qual.stop_id.unique()

array(['14823', '15462', '16061', '16054', '16066', '15470', '16055',
       '17263', '13454', '18133', '18134', '16198', '13513', '13439',
       '16159', '16139', '13304', '16467', '13463', '13437', '16165',
       '14072', '17527', '13249', '13452', '16247', '16149', '16143',
       '13252', '13482', '13517', '16161', '16141', '13450', '16199',
       '17976', '13477', '16114', '16121', '14371', '13505', '16245',
       '16737', '13470', '16200', '16246', '16147', '16011', '16117',
       '13459', '13593', '13306', '13474', '16160', '16151', '16145',
       '13461', '17526', '13565', '13467', '17525', '16157', '16155',
       '13486', '14373', '16120', '13472', '16112', '18035', '13456',
       '16163', '16153', '13484', '16358', '16371', '17026', '14798',
       '16361', '16369', '13772', '14889', '16854', '16845', '14969',
       '14970', '16348', '14795', '16357', '16850', '16366', '16852',
       '14938', '16847', '14890', '17300', '14899', '14894', '13066',
       '17204', '130

In [86]:
qualify_pairs

[('18_1', '31_1'),
 ('31_1', '18_1'),
 ('8_0', '9_0'),
 ('9_0', '8_0'),
 ('48_1', '66_1'),
 ('66_1', '48_1')]

## filter qualifying stops

In [100]:
def filter_qualifying_stops(one_stop_st: pd.DataFrame, qualify_pairs: list) -> pd.DataFrame:
    """
    Keep only the stop_times rows at one stop whose route_dir may be aggregated.

    Parameters
    ----------
    one_stop_st : pd.DataFrame
        stop_times for a single stop; must contain `route_dir` and `trip_id`.
    qualify_pairs : list
        route_dir pairs that can be aggregated; forwarded unchanged to `casf.check_stop`.

    Returns
    -------
    pd.DataFrame
        The stop's stop_times with an added `route_dir_count` column, ordered by
        descending count and restricted to the route_dirs returned by `casf.check_stop`.
    """
    # Number of non-null trip_id rows per route_dir at this stop.
    trips_per_route_dir = (
        one_stop_st.groupby("route_dir")["trip_id"]
        .count()
        .reset_index(name="route_dir_count")
    )
    ranked_st = one_stop_st.merge(trips_per_route_dir, on="route_dir")
    ranked_st = ranked_st.sort_values("route_dir_count", ascending=False)

    # One entry per route_dir, busiest first, which is the order check_stop receives.
    one_row_per_route_dir = ranked_st.drop_duplicates(subset=["route_dir", "route_dir_count"])
    this_stop_route_dirs = one_row_per_route_dir["route_dir"].to_numpy()

    aggregation_ok_route_dirs = casf.check_stop(this_stop_route_dirs, qualify_pairs)
    print(aggregation_ok_route_dirs)

    keep_mask = ranked_st["route_dir"].isin(aggregation_ok_route_dirs)
    return ranked_st[keep_mask]

In [101]:
    # st_qual_filter_1 = st_could_qual.groupby("stop_id", group_keys=False).apply(
    #     filter_qualifying_stops, qualify_pairs=qualify_pairs
    # )
    # st_qual_filter_1 = st_qual_filter_1.reset_index(drop=True)

In [102]:
df = st_could_qual[st_could_qual.stop_id == '16371']

In [103]:
df2 = casf.filter_qualifying_stops(df, qualify_pairs)

In [104]:
df3 = filter_qualifying_stops(df, qualify_pairs)

['9_0', '8_0']


In [105]:
# Put both results into the same row order with a fresh 0..n-1 index,
# so the equality check that follows is not affected by row ordering.
df2 = df2.sort_values(["route_dir_count", "route_id", "trip_id"], ignore_index=True)
df3 = df3.sort_values(["route_dir_count", "route_id", "trip_id"], ignore_index=True)

In [106]:
# df2 comes from casf.filter_qualifying_stops and df3 from the notebook-local copy;
# True means both produce identical output for this stop.
df2.equals(df3)

True