# Stop Summary

For each stop, I want a set of metrics: headway and frequency (all day and by time-of-day period, such as peak), plus the number of routes served.

**For a stop** = aggregate to the stop level, where a stop is identified by `feed_key` (or operator `name`) together with `stop_id`.

Skip the route-direction frequencies, since several routes can serve the same stop. 

For a bus stop, what matters is when a bus physically visits the stop, regardless of its route, direction, or any other characteristic. Counting these "arrivals" is what lets us derive frequency and headway.

In [1]:
import pandas as pd
import geopandas as gpd
import google.auth

from shared_utils import time_helpers 


# GCS folder that holds every table read in this notebook.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ahsc_grant"

# Suffix shared by the stoptimes / trips / stop_locations file names.
analysis_date = '06_02_2025'

# Application-default Google credentials; `credentials` is used by prep_stops.
credentials, project = google.auth.default()

In [2]:
# One-time conversion of the stop_times CSV export into a parquet, kept for reference.
# The snippet is wrapped in a string literal so it does not execute; because the
# string is the only expression in the cell, the notebook just echoes it back.
'''
# get stoptimes as a parquet (drop the interval BQ columns and index)
# _interval columns are used in BQ, more for storing data in specific time intervals, but not useful in a parquet
# data types can be set upon reading in a csv (since csvs get confused)

pd.read_csv(
    f"{GCS_FILE_PATH}/stoptimes_{analysis_date}.csv",
    dtype = {
        "trip_id": "str",
        "stop_id": "str"
    }
).drop(
    columns = ["Unnamed: 0", "key", "_gtfs_key", 
               "arrival_time_interval", "departure_time_interval"]
).to_parquet(f"{GCS_FILE_PATH}/stoptimes_{analysis_date}.parquet")
'''

'\n# get stoptimes as a parquet (drop the interval BQ columns and index)\n# _interval columns are used in BQ, more for storing data in specific time intervals, but not useful in a parquet\n# data types can be set upon reading in a csv (since csvs get confused)\n\npd.read_csv(\n    f"{GCS_FILE_PATH}/stoptimes_{analysis_date}.csv",\n    dtype = {\n        "trip_id": "str",\n        "stop_id": "str"\n    }\n).drop(\n    columns = ["Unnamed: 0", "key", "_gtfs_key", \n               "arrival_time_interval", "departure_time_interval"]\n).to_parquet(f"{GCS_FILE_PATH}/stoptimes_{analysis_date}.parquet")\n'

In [3]:
def prep_trips(analysis_date):
    """
    Read the trips table for one date and label each trip with a time-of-day bin.

    Only the columns needed downstream are requested from the parquet
    (parquets can be read column-by-column).

    Args:
        analysis_date: date string forming the suffix of the file name,
            i.e. `trips_{analysis_date}.parquet` under GCS_FILE_PATH.

    Returns:
        DataFrame with the selected trip columns plus `time_of_day`.

    Note:
        `time_of_day` is derived from `trip_first_departure_datetime_pacific`,
        so it describes when the trip starts.
    """
    trips = pd.read_parquet(
        f"{GCS_FILE_PATH}/trips_{analysis_date}.parquet",
        columns = ["feed_key", "name", "trip_id",
                   "trip_instance_key", "route_id", "direction_id",
                   "trip_first_departure_datetime_pacific"]
    )

    # gtfs_schedule_wrangling.get_trip_time_buckets relies on a helper that reads
    # a filepath this notebook does not use, so only its essential step is kept:
    # categorize each trip's first departure into a time_of_day bin.
    #
    # Series.apply hands each value straight to the helper. A row-wise
    # DataFrame.apply(axis=1) would build a full Series per trip just to read one
    # field, and returns a DataFrame instead of a Series when there are no rows.
    trips = trips.assign(
        time_of_day = trips.trip_first_departure_datetime_pacific.apply(
            time_helpers.categorize_time_of_day
        )
    )

    return trips


def prep_stops(analysis_date):
    """
    Load stop point locations for one date and add a combined id + name column.

    Args:
        analysis_date: date string forming the suffix of the file name,
            i.e. `stop_locations_{analysis_date}.parquet` under GCS_FILE_PATH.

    Returns:
        GeoDataFrame with feed_key, stop_id, stop_name, geometry and
        `stop_combo_col` (stop_id immediately followed by stop_name).
    """
    stops_path = f"{GCS_FILE_PATH}/stop_locations_{analysis_date}.parquet"
    wanted_cols = ["feed_key", "stop_id", "stop_name", "geometry"]

    # NOTE(review): the token attribute of the default credentials is handed to
    # the filesystem layer -- confirm this is the intended form (vs. passing the
    # credentials object itself).
    stop_gdf = gpd.read_parquet(
        stops_path,
        columns = wanted_cols,
        storage_options = {'token': credentials.token},
    )

    # Plain concatenation: no separator is placed between the id and the name.
    combo = stop_gdf.stop_id + stop_gdf.stop_name

    return stop_gdf.assign(stop_combo_col = combo)


def stop_summary_stats(
    df,
    stops: gpd.GeoDataFrame,
    group_cols: list
):
    """
    Aggregate trip-stop rows to `group_cols` and derive frequency and headway.

    Frequency is based on the number of scheduled arrivals, counted as the
    non-null `stop_sequence` values per group. The number of distinct trips
    and routes is stored alongside, but neither feeds into frequency or headway.

    Args:
        df: table holding `group_cols` plus trip_instance_key, stop_sequence
            and route_id.
        stops: stop locations, merged back on feed_key + stop_id.
        group_cols: columns to aggregate over. Include "time_of_day" to get
            per-period stats; otherwise a 24-hour span is used.

    Returns:
        `stops` inner-merged with the aggregated stats: n_trips, n_arrivals,
        n_routes, duration (hours), frequency (arrivals per hour) and
        headway (60 / frequency, i.e. minutes).
    """
    # Named aggregation yields the final column names directly.
    summary = (
        df.groupby(group_cols)
        .agg(
            n_trips = ("trip_instance_key", "nunique"),
            n_arrivals = ("stop_sequence", "count"),  # preferred over n_trips
            n_routes = ("route_id", "nunique"),
        )
        .reset_index()
    )

    # Hours covered by each row: the length of the time-of-day bin when that
    # column is present, otherwise the whole day.
    if "time_of_day" in summary.columns:
        hours = summary.time_of_day.map(time_helpers.HOURS_BY_TIME_OF_DAY)
    else:
        hours = 24

    summary = summary.assign(
        duration = hours,
        frequency = lambda t: t.n_arrivals.divide(t.duration),
        headway = lambda t: 60 / t.frequency,
    )

    # The groupby collapsed the many trip-stop rows down to stop-level rows,
    # so the point geometry can now be attached with a cheap merge.
    # Inner keeps only stops present on both sides; a left merge would need
    # the stats of unmatched stops filled with zeros.
    return pd.merge(
        stops,
        summary,
        on = ["feed_key", "stop_id"],
        how = "inner",
    )

In [4]:
# The parquet produced by the conversion snippet above loads much faster than
# the CSV; only the columns needed to count arrivals are read.
stop_times = pd.read_parquet(
    f"{GCS_FILE_PATH}/stoptimes_{analysis_date}.parquet",
    columns = ["feed_key", "trip_id", "stop_id", "stop_sequence"],
)

trips = prep_trips(analysis_date)
stops = prep_stops(analysis_date)

# Attach trip attributes (route_id, trip_instance_key, time_of_day, ...) to
# every stop_time row; a left merge keeps stop_times that have no matching trip.
# NOTE(review): assumes (feed_key, trip_id) is unique in trips -- duplicates
# would multiply stop_time rows and inflate arrival counts. Confirm.
df = stop_times.merge(
    trips,
    on = ["feed_key", "trip_id"],
    how = "left",
)

In [5]:
# One helper produces both summaries; only the groupby columns differ.
#  - all day: operator-stop (every row of the day goes into the average)
#  - time-of-day: operator-stop plus the time_of_day column

daily_stats = stop_summary_stats(
    df = df,
    stops = stops,
    group_cols = ["feed_key", "stop_id"],
)

timeofday_stats = stop_summary_stats(
    df = df,
    stops = stops,
    group_cols = ["feed_key", "stop_id", "time_of_day"],
)

In [6]:
# Preview the first rows and the headway distribution of each summary table,
# all-day first and time-of-day second.
for stats_table in (daily_stats, timeofday_stats):
    display(stats_table.head(2))
    display(stats_table.headway.describe())

Unnamed: 0,feed_key,stop_id,stop_name,geometry,stop_combo_col,n_trips,n_arrivals,n_routes,duration,frequency,headway
0,ace4e22d6f2c299c36eba89ccb650b1b,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,Grace and Laughter Apartments,POINT (-119.39065 36.55368),00eb15cb-1430-4964-b8ae-ca6183e1d0efGrace and ...,12,12,1,24,0.5,120.0
1,ace4e22d6f2c299c36eba89ccb650b1b,02a30e39-496f-45d4-ba1c-ac8f3c66b621,El Monte Way and Randle,POINT (-119.37038 36.54438),02a30e39-496f-45d4-ba1c-ac8f3c66b621El Monte W...,36,36,3,24,1.5,40.0


count    85773.000000
mean       107.035954
std        200.085699
min          0.005705
25%         26.181818
50%         51.428571
75%         96.000000
max       1440.000000
Name: headway, dtype: float64

Unnamed: 0,feed_key,stop_id,stop_name,geometry,stop_combo_col,time_of_day,n_trips,n_arrivals,n_routes,duration,frequency,headway
0,ace4e22d6f2c299c36eba89ccb650b1b,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,Grace and Laughter Apartments,POINT (-119.39065 36.55368),00eb15cb-1430-4964-b8ae-ca6183e1d0efGrace and ...,AM Peak,3,3,1,3,1.0,60.0
1,ace4e22d6f2c299c36eba89ccb650b1b,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,Grace and Laughter Apartments,POINT (-119.39065 36.55368),00eb15cb-1430-4964-b8ae-ca6183e1d0efGrace and ...,Midday,5,5,1,5,1.0,60.0


count    383228.000000
mean         60.572669
std          61.994445
min           0.004152
25%          20.000000
50%          37.500000
75%          60.000000
max         300.000000
Name: headway, dtype: float64

In [None]:
# Headways of two hours or more are expected mostly in off-peak periods;
# count them per time_of_day to check that this holds the majority of the time.
timeofday_stats.loc[
    timeofday_stats.headway.ge(120), "time_of_day"
].value_counts()

In [None]:
# Same breakdown for headways of an hour or more.
timeofday_stats.loc[
    timeofday_stats.headway.ge(60), "time_of_day"
].value_counts()

## Prefer `n_arrivals` (counting stop_sequence) as opposed to `n_trips`.

Example below

In [7]:
# Number of distinct (feed_key, stop_id) pairs present in the stop_times merge.
df.loc[:, ["feed_key", "stop_id"]].drop_duplicates().shape

(85774, 2)

In [8]:
# Keep the stops whose distinct-trip count is smaller than their arrival count.
# That can happen when one trip visits the same stop more than once, e.g. a bus
# that starts and ends at the same plaza: stop_sequence 1 and 10, same trip_id.
# The bus really does call at that stop twice, and frequency/headway should
# reflect it.

visit_counts = (
    df.groupby(["feed_key", "stop_id"])
    .agg(
        trip_instance_key = ("trip_instance_key", "nunique"),
        stop_sequence = ("stop_sequence", "count"),
    )
    .reset_index()
)

stops_revisited = visit_counts[
    visit_counts.trip_instance_key < visit_counts.stop_sequence
]

stops_revisited

Unnamed: 0,feed_key,stop_id,trip_instance_key,stop_sequence
0,03d99a455df3ecf0fa8c558553f899d0,10782544,28,37
3,03d99a455df3ecf0fa8c558553f899d0,10788543,9,13
4,03d99a455df3ecf0fa8c558553f899d0,10788544,9,14
5,03d99a455df3ecf0fa8c558553f899d0,10803597,27,37
6,03d99a455df3ecf0fa8c558553f899d0,10832412,13,18
...,...,...,...,...
85453,fbb49fb80fac3a81394c3b2836bf1c8b,2000,197,210
85513,fbb49fb80fac3a81394c3b2836bf1c8b,4000,92,104
85695,fe2c3f21e06fddaf1b339054d30ff47c,22,28,30
85696,fe2c3f21e06fddaf1b339054d30ff47c,23,27,30


In [9]:
# Share of all (feed_key, stop_id) pairs that are revisited within a trip.
len(stops_revisited) / len(df[["feed_key", "stop_id"]].drop_duplicates())

0.012626203744724509

In [10]:
# Profile the stops whose all-day headway exceeds 500 minutes.
daily_stats.loc[daily_stats.headway.gt(500)].describe()

Unnamed: 0,n_trips,n_arrivals,n_routes,duration,frequency,headway
count,3037.0,3037.0,3037.0,3037.0,3037.0,3037.0
mean,1.602898,1.603227,1.061574,24.0,0.066801,1005.676655
std,0.489378,0.489309,0.24042,0.0,0.020388,352.302297
min,1.0,1.0,1.0,24.0,0.041667,720.0
25%,1.0,1.0,1.0,24.0,0.041667,720.0
50%,2.0,2.0,1.0,24.0,0.083333,720.0
75%,2.0,2.0,1.0,24.0,0.083333,1440.0
max,2.0,2.0,2.0,24.0,0.083333,1440.0


In [14]:
# All-day stats for one example stop.
daily_stats.loc[
    daily_stats.feed_key.eq("058a3b43698f803c8686f7e72a9efd4c")
    & daily_stats.stop_id.eq("121914")
]

Unnamed: 0,feed_key,stop_id,stop_name,geometry,stop_combo_col,n_trips,n_arrivals,n_routes,duration,frequency,headway
77896,058a3b43698f803c8686f7e72a9efd4c,121914,Amtrak,POINT (-120.65477 35.27628),121914Amtrak,22,22,1,24,0.916667,65.454545


In [15]:
# Time-of-day stats for the same example stop, one row per period.
timeofday_stats.loc[
    timeofday_stats.feed_key.eq("058a3b43698f803c8686f7e72a9efd4c")
    & timeofday_stats.stop_id.eq("121914")
]

Unnamed: 0,feed_key,stop_id,stop_name,geometry,stop_combo_col,time_of_day,n_trips,n_arrivals,n_routes,duration,frequency,headway
347063,058a3b43698f803c8686f7e72a9efd4c,121914,Amtrak,POINT (-120.65477 35.27628),121914Amtrak,AM Peak,3,3,1,3,1.0,60.0
347064,058a3b43698f803c8686f7e72a9efd4c,121914,Amtrak,POINT (-120.65477 35.27628),121914Amtrak,Early AM,1,1,1,3,0.333333,180.0
347065,058a3b43698f803c8686f7e72a9efd4c,121914,Amtrak,POINT (-120.65477 35.27628),121914Amtrak,Evening,2,2,1,4,0.5,120.0
347066,058a3b43698f803c8686f7e72a9efd4c,121914,Amtrak,POINT (-120.65477 35.27628),121914Amtrak,Midday,7,7,1,5,1.4,42.857143
347067,058a3b43698f803c8686f7e72a9efd4c,121914,Amtrak,POINT (-120.65477 35.27628),121914Amtrak,PM Peak,9,9,1,5,1.8,33.333333
