In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

from siuba import *

In [2]:
import create_aggregate_stop_frequencies as casf

In [3]:
from update_vars import analysis_date

In [4]:
# Display the analysis date imported from update_vars so the run is self-describing
analysis_date

'2025-11-05'

In [5]:
# Load stop_times and trips for the analysis date (includes lookback), then attach
# route direction and run casf's stop_times prep before aggregating.
st, trips = casf.get_st_trips(analysis_date)
st_prepped = casf.prep_stop_times(
    casf.add_route_dir(trips=trips, stop_times=st, analysis_date=analysis_date)
)

# Same aggregation twice; the two results differ only in the single_route_dir flag.
max_arrivals_by_stop_single = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=True
)
max_arrivals_by_stop_multi = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=False
)

{'2025-08-20': ['eTrans Schedule', 'Roseville Transit GMV Schedule'], '2025-09-24': ['San Juan Capistrano Trolley Schedule', 'Culver City Schedule'], '2025-10-15': ['Yolobus Schedule', 'Go West Schedule', 'Bay Area 511 Angel Island-Tiburon Ferry Schedule', 'El Monte Schedule', 'Nevada County Schedule']}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stop_times["peak"] = stop_times["arrival_hour"].map(peaks_dict)


In [6]:
# Build the exploded multi-route-only table from the single- and multi-route aggregations.
multi_only_explode = casf.get_explode_multiroute_only(
    max_arrivals_by_stop_single,
    max_arrivals_by_stop_multi,
    (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD),  # frequency thresholds
)

## feed_level_filter

In [9]:
def feed_level_filter(
    gtfs_dataset_key: str,
    multi_only_explode: pd.DataFrame,
    qualify_dict: dict,
    st_prepped: pd.DataFrame,
    frequency_thresholds: tuple,
    peak_hours: list = None,
) -> tuple:
    """
    For a single feed, filter potential stop_times to evaluate based on if their route_dir
    appears at all in qualifying route_dir dict, recheck if there's any chance those stops
    could qualify. Further shrinks problem space for check_stop lookup step

    Parameters
    ----------
    gtfs_dataset_key : feed identifier; matched against the text before the first "__"
        in each qualify_dict key and against the schedule_gtfs_dataset_key column.
    multi_only_explode : frame with schedule_gtfs_dataset_key and stop_id columns; only
        its stops for this feed are evaluated.
    qualify_dict : dict keyed by "<feed>__<route_dir>__<route_dir>" strings.
    st_prepped : stop_times frame with schedule_gtfs_dataset_key, stop_id and route_dir
        columns. It is not modified.
    frequency_thresholds : numeric thresholds; the smallest one sets the minimum row count.
    peak_hours : hours counted as peak. Defaults to the notebook-level both_peaks_hrs,
        which must be defined before the function is called without this argument.

    Returns
    -------
    (st_could_qual, qualify_pairs) : the stop_times rows (with a could_qualify column)
        for stops that still have enough rows to possibly qualify, and the list of
        route_dir tuples found for this feed. When the feed has no keys in qualify_dict
        the frame is empty and the list is [].

    Relies on the siuba verbs (filter, distinct, group_by, mutate, ungroup) and `_`
    brought in by the notebook's `from siuba import *`.
    """
    #  keep only this feed's keys and drop the "<feed>__" prefix, leaving the route_dirs.
    #  NB: never bind a local named `_` in here, it would shadow siuba's `_`
    qualify_pairs = []
    for key in qualify_dict:
        feed, sep, route_dirs = key.partition("__")
        if feed == gtfs_dataset_key:
            qualify_pairs.append(tuple(route_dirs.split("__")))

    #  nothing qualifies for this feed: return an empty result instead of failing on
    #  qualify_pairs[0]
    if not qualify_pairs:
        no_rows = st_prepped.iloc[:0].assign(could_qualify=pd.Series(dtype=bool))
        return no_rows, qualify_pairs

    #  every route_dir that shows up in any qualifying pair
    any_appearance = np.unique([route_dir for pair in qualify_pairs for route_dir in pair])

    if peak_hours is None:
        peak_hours = both_peaks_hrs

    #  only need to check stops that qualify as multi-route only
    stops_to_eval = multi_only_explode >> filter(_.schedule_gtfs_dataset_key == gtfs_dataset_key) >> distinct(_.stop_id)
    feed_stop_times = st_prepped >> filter(
        _.schedule_gtfs_dataset_key == gtfs_dataset_key,
        _.stop_id.isin(stops_to_eval.stop_id),
    )
    st_to_eval = feed_stop_times >> filter(_.route_dir.isin(any_appearance))
    #  cut down problem space by checking if stops still could qual after filtering for any appearance
    min_rows = min(frequency_thresholds) * len(peak_hours)
    st_could_qual = (
        st_to_eval
        >> group_by(_.stop_id)
        >> mutate(could_qualify=_.shape[0] >= min_rows)
        >> ungroup()
        >> filter(_.could_qualify)
    )
    return st_could_qual, qualify_pairs

## collinear_filter_feed (done)

In [10]:
# Whole hours in each peak window: start hour inclusive, end hour exclusive.
am_peak_hrs = [*range(casf.AM_PEAK[0].hour, casf.AM_PEAK[1].hour)]
pm_peak_hrs = [*range(casf.PM_PEAK[0].hour, casf.PM_PEAK[1].hour)]
both_peaks_hrs = [*am_peak_hrs, *pm_peak_hrs]
# hour -> peak label lookup; pm entries are applied last
peaks_dict = {
    **dict.fromkeys(am_peak_hrs, "am_peak"),
    **dict.fromkeys(pm_peak_hrs, "pm_peak"),
}

In [11]:
# share_counts is handed to casf.accumulate_share_count for every (feed, stop) group;
# the apply result itself is discarded, so the dict is expected to be filled as a side effect.
share_counts = {}
multi_only_explode.groupby(["schedule_gtfs_dataset_key", "stop_id"]).apply(
    casf.accumulate_share_count, share_counts=share_counts
)
# keep only the entries whose count reaches the shared-stop threshold
qualify_dict = {
    pair_key: n_shared
    for pair_key, n_shared in share_counts.items()
    if n_shared >= casf.SHARED_STOP_THRESHOLD
}

In [12]:
# Distinct feed keys: the text before the first "__" of every qualifying key.
feeds_to_filter = np.unique([pair_key.partition("__")[0] for pair_key in qualify_dict])

In [13]:
# Thresholds passed to the feed-level filter below, taken from the casf module constants
frequency_thresholds = (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)

In [14]:
# Feed at index 1 of feeds_to_filter; used as the worked example in the next cell
feeds_to_filter[1]

'076e30b080fdc5501151bd3fb0a37b9e'

In [15]:
# NOTE: this runs the implementation in the casf module, not the feed_level_filter
# defined in this notebook.
st_could_qual, qualify_pairs = casf.feed_level_filter(
    feeds_to_filter[1],  # example feed
    multi_only_explode,
    qualify_dict,
    st_prepped,
    frequency_thresholds,
)

In [16]:
# Preview the first three stop_times rows returned for the example feed
st_could_qual.head(3)

Unnamed: 0,feed_key,trip_id,stop_id,arrival_hour,schedule_gtfs_dataset_key,analysis_date,name,route_id,direction_id,route_type,analysis_name,route_dir,peak,could_qualify
2082024,cbae8712910fbb4c24027c69d481cc9e,10248,6849,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True
2082027,cbae8712910fbb4c24027c69d481cc9e,10248,5460,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True
2082029,cbae8712910fbb4c24027c69d481cc9e,10248,6855,8.0,076e30b080fdc5501151bd3fb0a37b9e,2025-11-05,Monterey Salinas Schedule,902,0,3,Monterey-Salinas Transit,902_0,am_peak,True


In [16]:
# Show the route_dir pairs returned for the example feed
qualify_pairs

[('041_1', '042_1'),
 ('042_1', '041_1'),
 ('901_0', '902_0'),
 ('901_1', '902_1'),
 ('902_0', '901_0'),
 ('902_1', '901_1'),
 ('041_0', '042_0'),
 ('042_0', '041_0'),
 ('048_1', '049_1'),
 ('049_1', '048_1'),
 ('048_0', '049_0'),
 ('049_0', '048_0')]