In [11]:
import pandas as pd
import geopandas as gpd
import numpy as np

from siuba import *

In [2]:
import create_aggregate_stop_frequencies as casf

In [3]:
from update_vars import analysis_date

In [4]:
# Display the analysis_date value imported from update_vars.
analysis_date

'2025-11-05'

In [5]:
# Load stop_times and trips for the analysis date (includes lookback), attach
# route_dir to the stop_times, then run the module's stop_times prep step.
st, trips = casf.get_st_trips(analysis_date)
st_prepped = casf.prep_stop_times(
    casf.add_route_dir(trips=trips, stop_times=st, analysis_date=analysis_date)
)

# Per-stop maximum arrivals, aggregated once with single_route_dir=True ...
max_arrivals_by_stop_single = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=True
)

# ... and once with single_route_dir=False.
max_arrivals_by_stop_multi = casf.stop_times_aggregation_max_by_stop(
    st_prepped, analysis_date, single_route_dir=False
)

{'2025-08-20': ['eTrans Schedule', 'Roseville Transit GMV Schedule'], '2025-09-24': ['San Juan Capistrano Trolley Schedule', 'Culver City Schedule'], '2025-10-15': ['Yolobus Schedule', 'Go West Schedule', 'Bay Area 511 Angel Island-Tiburon Ferry Schedule', 'El Monte Schedule', 'Nevada County Schedule']}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stop_times["peak"] = stop_times["arrival_hour"].map(peaks_dict)


In [6]:
# Combine the single-route and multi-route aggregations with the (HQ, MS) threshold pair.
# NOTE(review): judging by the helper's name, the result holds stops that qualify only
# when routes are combined, exploded to one row per route_dir — confirm against casf.
multi_only_explode = casf.get_explode_multiroute_only(
    max_arrivals_by_stop_single, max_arrivals_by_stop_multi, (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)
)

## feed_level_filter

In [7]:
def feed_level_filter(
    gtfs_dataset_key: str,
    multi_only_explode: pd.DataFrame,
    qualify_dict: dict,
    st_prepped: pd.DataFrame,
    frequency_thresholds: tuple,
) -> tuple[pd.DataFrame, list]:
    """
    For a single feed, narrow stop_times down to rows that could still qualify.

    Keeps only stop_times whose route_dir appears at least once in the feed's
    qualifying route_dir combinations, then drops stops that no longer have
    enough rows to possibly qualify. This shrinks the problem space for the
    per-stop lookup step that follows.

    Parameters
    ----------
    gtfs_dataset_key : feed to evaluate.
    multi_only_explode : frame with schedule_gtfs_dataset_key and stop_id columns;
        only stops listed here for this feed are evaluated.
    qualify_dict : dict keyed by "<gtfs_dataset_key>__<route_dir>__<route_dir>..."
        strings. Only the keys are used here.
    st_prepped : stop_times with schedule_gtfs_dataset_key, stop_id and route_dir columns.
    frequency_thresholds : frequency thresholds; the smallest one sets the row floor.

    Returns
    -------
    (st_could_qual, qualify_pairs)
        st_could_qual is the filtered stop_times with a boolean could_qualify column;
        qualify_pairs is the list of route_dir tuples parsed from this feed's keys.
        If the feed has no keys in qualify_dict, an empty frame (same columns plus
        could_qualify) and an empty list are returned.

    Notes
    -----
    Reads the notebook-level `both_peaks_hrs` list at call time, so the cell that
    defines it must have been run before this function is called.
    """
    # Strip the feed prefix by position. Splitting on the feed key itself would
    # truncate the remainder if that text happened to recur later in the key.
    feed_prefix = f"{gtfs_dataset_key}__"
    qualify_pairs = [
        tuple(key[len(feed_prefix):].split("__"))
        for key in qualify_dict
        if key.startswith(feed_prefix)
    ]
    if not qualify_pairs:
        # No qualifying route_dir combination for this feed: nothing can qualify.
        no_rows = st_prepped.iloc[0:0].assign(could_qualify=pd.Series(dtype=bool))
        return no_rows, qualify_pairs

    # Every route_dir that shows up in at least one qualifying combination.
    any_appearance = np.unique([route_dir for pair in qualify_pairs for route_dir in pair])

    #  only need to check stops that qualify as multi-route only
    stops_to_eval = multi_only_explode >> filter(_.schedule_gtfs_dataset_key == gtfs_dataset_key) >> distinct(_.stop_id)
    st_feed = st_prepped >> filter(
        _.schedule_gtfs_dataset_key == gtfs_dataset_key,
        _.stop_id.isin(stops_to_eval.stop_id),
    )
    st_to_eval = st_feed >> filter(_.route_dir.isin(any_appearance))

    #  cut down problem space by checking if stops still could qual after filtering for any appearance
    min_rows = min(frequency_thresholds) * len(both_peaks_hrs)
    st_could_qual = (
        st_to_eval
        >> group_by(_.stop_id)
        >> mutate(could_qualify=_.shape[0] >= min_rows)
        >> ungroup()
        >> filter(_.could_qualify)
    )
    return st_could_qual, qualify_pairs

## collinear_filter_feed

In [21]:
# Whole hours in each peak window: start hour inclusive, end hour exclusive.
am_peak_hrs = [*range(casf.AM_PEAK[0].hour, casf.AM_PEAK[1].hour)]
pm_peak_hrs = [*range(casf.PM_PEAK[0].hour, casf.PM_PEAK[1].hour)]
both_peaks_hrs = [*am_peak_hrs, *pm_peak_hrs]
# Hour -> peak label lookup; a PM entry would win if an hour were in both windows.
peaks_dict = {
    **dict.fromkeys(am_peak_hrs, "am_peak"),
    **dict.fromkeys(pm_peak_hrs, "pm_peak"),
}

In [78]:
def collinear_filter_feed(
    gtfs_dataset_key: str,
    multi_only_explode: pd.DataFrame,
    qualify_dict: dict,
    st_prepped: pd.DataFrame,
    frequency_thresholds: tuple,
) -> "pd.DataFrame | None":
    """
    Apply collinearity filtering steps to one feed.

    Steps:
      1. casf.feed_level_filter narrows stop_times to rows that could qualify
         (note: this calls the module's version, not a notebook-local one).
      2. casf.filter_qualifying_stops is applied per stop_id.
      3. The surviving stop_times are re-aggregated and only stops meeting the
         lowest frequency threshold in both the AM and PM peak are kept.
      4. Stops where every listed route_dir has fewer than
         casf.SHARED_STOP_THRESHOLD remaining stops are displayed and dropped.

    Parameters
    ----------
    gtfs_dataset_key : feed to evaluate.
    multi_only_explode, qualify_dict, st_prepped : passed through to
        casf.feed_level_filter unchanged.
    frequency_thresholds : frequency thresholds; the smallest one is the cutoff.

    Returns
    -------
    The aggregated per-stop frame after both filters, or None when no stop_times
    remain after step 2.

    Notes
    -----
    Reads the notebook-level `analysis_date` and uses IPython's `display` to show
    the dropped stops.
    """
    st_could_qual, qualify_pairs = casf.feed_level_filter(
        gtfs_dataset_key, multi_only_explode, qualify_dict, st_prepped, frequency_thresholds
    )
    st_qual_filter_1 = (
        st_could_qual.groupby("stop_id", group_keys=False)
        .apply(casf.filter_qualifying_stops, qualify_pairs=qualify_pairs)
        .reset_index(drop=True)
    )
    if st_qual_filter_1.empty:
        return None

    trips_per_peak_qual_1 = casf.stop_times_aggregation_max_by_stop(
        st_qual_filter_1, analysis_date, single_route_dir=False
    )

    min_frequency = min(frequency_thresholds)
    meets_both_peaks = (trips_per_peak_qual_1.am_max_trips_hr >= min_frequency) & (
        trips_per_peak_qual_1.pm_max_trips_hr >= min_frequency
    )
    # .copy() so the all_short column below is added to an independent frame
    # rather than to a slice of the aggregation result (SettingWithCopyWarning).
    trips_per_peak_qual_1 = trips_per_peak_qual_1[meets_both_peaks].copy()

    # Number of remaining stops listing each route_dir.
    stops_per_route = (
        trips_per_peak_qual_1.explode("route_dir")[["route_dir", "stop_id"]]
        .groupby("route_dir")
        .count()
        .reset_index()
    )
    # Built once as a set; membership is then checked per stop without rescanning.
    short_route_dirs = set(
        stops_per_route.loc[stops_per_route.stop_id < casf.SHARED_STOP_THRESHOLD, "route_dir"]
    )
    trips_per_peak_qual_1["all_short"] = trips_per_peak_qual_1.route_dir.map(
        lambda route_dirs: all(route_dir in short_route_dirs for route_dir in route_dirs)
    ).astype(bool)
    #  stops where _every_ shared route has less than SHARED_STOP_THRESHOLD frequent stops (even after aggregation)
    display(trips_per_peak_qual_1[trips_per_peak_qual_1.all_short])
    trips_per_peak_qual_2 = trips_per_peak_qual_1[~trips_per_peak_qual_1.all_short].drop(columns=["all_short"])

    return trips_per_peak_qual_2

In [8]:
# share_counts starts empty and is handed to accumulate_share_count for every
# (feed, stop) group; presumably the helper fills it in place — confirm in casf.
share_counts = {}
multi_only_explode.groupby(["schedule_gtfs_dataset_key", "stop_id"]).apply(
    casf.accumulate_share_count, share_counts=share_counts
)
# Keep only entries whose count reaches SHARED_STOP_THRESHOLD.
qualify_dict = {
    share_key: n_shared
    for share_key, n_shared in share_counts.items()
    if n_shared >= casf.SHARED_STOP_THRESHOLD
}

In [12]:
# Unique feed keys: the text before the first "__" of each qualify_dict key.
feeds_to_filter = np.unique([share_key.partition("__")[0] for share_key in qualify_dict])

In [13]:
# (HQ, MS) threshold pair passed to the filter functions as frequency_thresholds.
frequency_thresholds = (casf.HQ_TRANSIT_THRESHOLD, casf.MS_TRANSIT_THRESHOLD)

In [15]:
# Peek at the first feed key in feeds_to_filter.
feeds_to_filter[0]

'0089bd1b0a2b78a8590d8749737d7146'

In [79]:
# Module (casf) implementation, run on the second feed in feeds_to_filter.
df = casf.collinear_filter_feed(feeds_to_filter[1], multi_only_explode, qualify_dict, st_prepped, frequency_thresholds)

In [80]:
# Notebook-local collinear_filter_feed on the same feed, for comparison with the casf result in df.
df2 = collinear_filter_feed(feeds_to_filter[1], multi_only_explode, qualify_dict, st_prepped, frequency_thresholds)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count,all_short
2,076e30b080fdc5501151bd3fb0a37b9e,2206,9,"[901_1, 902_1]",15,3.0,3.75,24,2,True
59,076e30b080fdc5501151bd3fb0a37b9e,6725,9,"[901_1, 902_1]",15,3.0,3.75,24,2,True
60,076e30b080fdc5501151bd3fb0a37b9e,6728,9,"[901_1, 902_1]",15,3.0,3.75,24,2,True
61,076e30b080fdc5501151bd3fb0a37b9e,6734,9,"[901_1, 902_1]",15,3.0,3.75,24,2,True
62,076e30b080fdc5501151bd3fb0a37b9e,6770,9,"[901_1, 902_1]",16,3.0,4.0,25,2,True
69,076e30b080fdc5501151bd3fb0a37b9e,6915,9,"[901_1, 902_1]",15,3.0,3.75,24,2,True


In [81]:
# Check that the casf result (df) and the notebook-local result (df2) are identical.
df.equals(df2)

True

In [68]:
# Display the frame returned by casf.collinear_filter_feed.
df

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,route_dir,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips,route_dir_count
0,076e30b080fdc5501151bd3fb0a37b9e,0013,9,"[041_1, 042_1]",15,3.00,3.75,24,2
1,076e30b080fdc5501151bd3fb0a37b9e,1997,11,"[041_0, 042_0]",13,3.67,3.25,24,2
3,076e30b080fdc5501151bd3fb0a37b9e,3125,9,"[041_1, 042_1]",15,3.00,3.75,24,2
4,076e30b080fdc5501151bd3fb0a37b9e,3187,11,"[041_0, 042_0]",13,3.67,3.25,24,2
5,076e30b080fdc5501151bd3fb0a37b9e,3191,11,"[041_0, 042_0]",13,3.67,3.25,24,2
...,...,...,...,...,...,...,...,...,...
64,076e30b080fdc5501151bd3fb0a37b9e,6819,9,"[901_0, 902_0]",15,3.00,3.75,24,2
65,076e30b080fdc5501151bd3fb0a37b9e,6846,10,"[901_0, 902_0]",14,3.33,3.50,24,2
66,076e30b080fdc5501151bd3fb0a37b9e,6849,10,"[901_0, 902_0]",14,3.33,3.50,24,2
67,076e30b080fdc5501151bd3fb0a37b9e,6855,9,"[901_0, 902_0]",15,3.00,3.75,24,2


In [53]:
# Count rows per route_dir after exploding df2's route_dir lists (siuba's count
# puts the tally in column n), keeping route_dirs below SHARED_STOP_THRESHOLD.
short_routes = (
    df2.explode("route_dir") >> count(_.route_dir) >> filter(_.n < casf.SHARED_STOP_THRESHOLD)
)

In [54]:
# Display the route_dirs that fall below the threshold.
short_routes

Unnamed: 0,route_dir,n
9,901_1,6
11,902_1,6


In [61]:
# Plain-pandas tally: non-null stop_id count per route_dir after exploding df2's route_dir lists.
df2.explode("route_dir")[['route_dir', 'stop_id']].groupby('route_dir').count().reset_index()

Unnamed: 0,route_dir,stop_id
0,041_0,21
1,041_1,16
2,042_0,21
3,042_1,16
4,048_0,8
5,048_1,9
6,049_0,8
7,049_1,9
8,901_0,9
9,901_1,6
