# Stop Summary

Exploratory work that looks into the number of routes, the number of trips, and the headway (in minutes) at each stop.

In [1]:
# Install shared_utils (the source of `time_helpers`, imported below) into the kernel environment.
# NOTE(review): the version is unpinned, so a re-run may pick up a different release — consider pinning.
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import geopandas as gpd
import google.auth
import gcsfs
import fsspec
from segment_speed_utils.project_vars import PUBLIC_GCS

from shared_utils import time_helpers 


# Notebook-wide configuration used by the loaders below.
# GCS_FILE_PATH: bucket folder holding the trips / stop_times / stop_locations parquet inputs.
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/ahsc_grant'
# analysis_date: date suffix embedded in each input filename.
analysis_date = "06_02_2025"
# Application-default Google credentials; `credentials.token` is passed to the stops reader.
# NOTE(review): `project` is not referenced anywhere in the visible notebook.
credentials, project = google.auth.default()

In [3]:
# Notebook-wide pandas display settings, all set through the same API.
# Show up to 100 columns and format floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
# No row or column-width truncation: displayed frames/series are printed in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
def prep_trips(analysis_date: str) -> pd.DataFrame:
    """
    Load the trips table for `analysis_date` and tag each trip with a
    time-of-day category.

    Parameters
    ----------
    analysis_date : str
        Date suffix embedded in the parquet filename (e.g. "06_02_2025").

    Returns
    -------
    pd.DataFrame
        The selected trip columns plus `time_of_day`, obtained by passing each
        trip's `trip_first_departure_datetime_pacific` value to
        `time_helpers.categorize_time_of_day`.
    """
    trips = pd.read_parquet(
        f"{GCS_FILE_PATH}/trips_{analysis_date}.parquet",
        columns = ["feed_key", "name", "trip_id", 
                "trip_instance_key", "route_id", "direction_id", 
                "trip_first_departure_datetime_pacific"]
    )

    # Only one column feeds the categorization, so map over that column
    # element-wise instead of DataFrame.apply(axis=1), which builds a full
    # row Series for every trip just to read a single value from it.
    trips = trips.assign(
        time_of_day = trips.trip_first_departure_datetime_pacific.map(
            time_helpers.categorize_time_of_day
        ),
    )

    return trips

In [5]:
def prep_stops(analysis_date: str) -> gpd.GeoDataFrame:
    """
    Load stop locations for `analysis_date` and retain only the stops that
    fall within a California county polygon.

    County polygons are fetched as GeoJSON from an ArcGIS REST endpoint and
    reprojected to the stops' CRS before the spatial join.

    Returns the stop columns (feed_key, stop_id, stop_name, geometry) plus
    `stop_combo_col`, which is stop_id and stop_name concatenated.
    """
    stop_cols = ["feed_key", "stop_id", "stop_name", "geometry"]

    all_stops = gpd.read_parquet(
        f"{GCS_FILE_PATH}/stop_locations_{analysis_date}.parquet",
        columns = stop_cols,
        storage_options={'token': credentials.token}
    )

    # California county boundaries, brought into the same CRS as the stops
    # so the spatial predicate compares like with like.
    county_url = "https://services1.arcgis.com/jUJYIo9tSA7EHvfZ/arcgis/rest/services/California_County_Boundaries/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
    counties = gpd.read_file(county_url).to_crs(all_stops.crs)

    # Inner join + "within": a stop survives only if it sits inside a county polygon.
    stops_in_ca = gpd.sjoin(all_stops, counties, how="inner", predicate="within")

    # Discard every column the join brought over from the county layer.
    stops_in_ca = stops_in_ca[stop_cols]

    return stops_in_ca.assign(
        stop_combo_col = stops_in_ca.stop_id + stops_in_ca.stop_name
    )

In [6]:
def stop_summary_stats(
    df: pd.DataFrame,
    stops: gpd.GeoDataFrame,
    group_cols: list
) -> gpd.GeoDataFrame:
    """
    Aggregate trip-stop rows into one summary row per `group_cols` combination.

    For each group this counts:
    - number_of_trips: distinct trip_instance_key values
    - number_of_arrivals: non-null stop_sequence rows (scheduled arrivals);
      this is the count that drives frequency and headway
    - number_of_routes: distinct route_id values (informational only)

    frequency is arrivals per hour over the group's duration, and
    headway_minutes is 60 / frequency.

    Returns the summary joined to `stops` on feed_key + stop_id, so each row
    carries the stop's point geometry.
    """
    stop_counts = (
        df.groupby(group_cols)
        .agg(
            number_of_trips = ("trip_instance_key", "nunique"),
            number_of_arrivals = ("stop_sequence", "count"),
            number_of_routes = ("route_id", "nunique"),
        )
        .reset_index()
    )

    # Hours covered by each row: the length of its time-of-day bin when the
    # data is split by time of day, otherwise the full 24-hour day.
    if "time_of_day" in stop_counts.columns:
        service_hours = stop_counts.time_of_day.map(
            time_helpers.HOURS_BY_TIME_OF_DAY
        )
    else:
        service_hours = 24

    stop_counts["duration"] = service_hours
    stop_counts["frequency"] = stop_counts.number_of_arrivals / stop_counts.duration
    stop_counts["headway_minutes"] = 60 / stop_counts.frequency

    # The groupby collapsed trip-stop rows down to stop-level rows; attach the
    # stop attributes (name, geometry, stop_combo_col) from `stops`.
    # `stops` is the left frame so its geometry column is carried through.
    stops_with_stats = pd.merge(
        stops,
        stop_counts,
        on = ["feed_key", "stop_id"],
        how = "inner"
    )

    return stops_with_stats

In [7]:
# Load stop_times for the analysis date, keeping only the columns needed to
# link to trips (feed_key, trip_id) and stops (feed_key, stop_id), plus
# stop_sequence, which is counted as the number of arrivals.
stop_times = pd.read_parquet(
    f"{GCS_FILE_PATH}/stoptimes_{analysis_date}.parquet",
    columns = ["feed_key", "trip_id", "stop_id", "stop_sequence"]
)

In [8]:
# Build the two lookup tables: trips tagged with time_of_day, and stops
# restricted to California county boundaries.
trips = prep_trips(analysis_date)
stops = prep_stops(analysis_date)

In [9]:
# Attach trip attributes to every stop_times row.
# how="left" keeps stop_times rows even when no trip matches; those rows get
# nulls in the trip columns.
df = pd.merge(
    stop_times,
    trips, # brings in route_id, trip_instance_key, direction_id and time_of_day
    on = ["feed_key", "trip_id"],
    how = "left"
)

In [10]:
# All day
daily_stats = stop_summary_stats(
    df, stops, group_cols = ["feed_key", "stop_id"])

# Time-of-Day
timeofday_stats = stop_summary_stats(
    df, stops, group_cols = ["feed_key", "stop_id", "time_of_day"])

In [11]:
# Distribution of all-day trip counts per stop.
# NOTE(review): the recorded max (54142 trips at one stop) is far above the 75th percentile (56) — worth investigating.
display(daily_stats.number_of_trips.describe())

count   84082.00
mean       49.75
std       600.18
min         1.00
25%        15.00
50%        28.00
75%        56.00
max     54142.00
Name: number_of_trips, dtype: float64

In [12]:
# Distribution of trip counts per stop within each time-of-day bin.
# NOTE(review): the recorded max (11285) is an extreme outlier relative to the 75th percentile (12).
display(timeofday_stats.number_of_trips.describe())

count   378038.00
mean        11.07
std        118.07
min          1.00
25%          3.00
50%          6.00
75%         12.00
max      11285.00
Name: number_of_trips, dtype: float64

In [13]:
# Distribution of the number of distinct routes serving each stop over the whole day.
display(daily_stats.number_of_routes.describe())

count   84082.00
mean        1.38
std         0.91
min         1.00
25%         1.00
50%         1.00
75%         1.00
max        19.00
Name: number_of_routes, dtype: float64

In [14]:
# Distribution of the number of distinct routes serving each stop within each time-of-day bin.
display(timeofday_stats.number_of_routes.describe())

count   378038.00
mean         1.26
std          0.66
min          1.00
25%          1.00
50%          1.00
75%          1.00
max         16.00
Name: number_of_routes, dtype: float64

In [15]:
# Distribution of all-day headways (60 / arrivals per hour over 24 hours).
# A stop with a single daily arrival yields the 1440-minute maximum.
# NOTE(review): the recorded min (0.01 minutes) is not a plausible headway — likely tied to the trip-count outlier.
display(daily_stats.headway_minutes.describe())

count   84082.00
mean      100.91
std       191.98
min         0.01
25%        25.71
50%        51.43
75%        96.00
max      1440.00
Name: headway_minutes, dtype: float64

In [16]:
# Distribution of headways within each time-of-day bin.
# NOTE(review): the recorded min rounds to 0.00 minutes, which is not a plausible headway — worth investigating.
display(timeofday_stats.headway_minutes.describe())

count   378038.00
mean        59.35
std         60.64
min          0.00
25%         20.00
50%         36.00
75%         60.00
max        300.00
Name: headway_minutes, dtype: float64

In [17]:
# Number of stop / time-of-day rows with a headway of two hours or more, by time-of-day bin.
timeofday_stats[timeofday_stats.headway_minutes >= 120].time_of_day.value_counts()

Evening     18307
Early AM    14349
Owl          5542
PM Peak      5493
Midday       4133
AM Peak      3722
Name: time_of_day, dtype: int64

In [18]:
# Number of stop / time-of-day rows with a headway of one hour or more, by time-of-day bin.
timeofday_stats[timeofday_stats.headway_minutes >= 60].time_of_day.value_counts()

Early AM    36387
Evening     31381
PM Peak     24612
AM Peak     22873
Midday      22482
Owl          8506
Name: time_of_day, dtype: int64

In [19]:
# Keep only the AM Peak rows of the time-of-day summary.
am_peak_stats = timeofday_stats[timeofday_stats["time_of_day"] == "AM Peak"]

In [23]:
# Attach each stop's AM Peak headway to its all-day summary row.
am_peak_stats = am_peak_stats.rename(
    columns={"headway_minutes": "am_peak_headway_minutes"}
)
# Join on feed_key + stop_id, the key both summary tables were built on,
# rather than stop_combo_col. stop_combo_col is stop_id + stop_name, so it is
# null whenever stop_name is null, and pandas matches null merge keys to each
# other: every such stop in a feed would pair with every other one, duplicating
# rows and assigning headways to the wrong stops.
# how="left" keeps stops with no AM Peak service; their headway is NaN.
am_peak_stats_final = pd.merge(
    daily_stats,
    am_peak_stats[["feed_key", "stop_id", "am_peak_headway_minutes"]],
    on=["feed_key", "stop_id"],
    how="left",
)

In [25]:
# Distribution of AM Peak headways across all stops in the daily table.
# The recorded count (80798) is below the 84082 daily rows: stops with no
# AM Peak row are left as NaN by the left merge and are excluded by describe().
am_peak_stats_final['am_peak_headway_minutes'].describe()

count   80798.00
mean       40.82
std        37.48
min         0.01
25%        16.36
50%        30.00
75%        60.00
max       180.00
Name: am_peak_headway_minutes, dtype: float64

In [26]:
# Public bucket folder that receives the exported GeoJSON files.
GCS__PUBLIC_FILE_PATH = f"{PUBLIC_GCS}stop_route_trip_headway_data"

def export_gdf(gdf, filename: str):
    """
    Write `gdf` as GeoJSON to the public GCS folder.

    The frame is serialized with `gdf.to_json()` and written to
    `{GCS__PUBLIC_FILE_PATH}/{filename}.geojson` through fsspec; the
    destination path is printed once the write completes.
    """
    destination = f"{GCS__PUBLIC_FILE_PATH}/{filename}.geojson"

    # Serialize before opening the destination so a serialization failure
    # never touches the remote file.
    payload = gdf.to_json()

    with fsspec.open(destination, 'w') as geojson_file:
        geojson_file.write(payload)

    print(f"Saved to {destination}")

In [27]:
# Publish the three summary tables as GeoJSON; the second argument is the
# output filename without its .geojson extension.
export_gdf(am_peak_stats_final, "am_peak_stats")
export_gdf(daily_stats, "daily_stats")
export_gdf(timeofday_stats, "timeofday_stats")

Saved to gs://calitp-publish-data-analysis/stop_route_trip_headway_data/am_peak_stats.geojson
Saved to gs://calitp-publish-data-analysis/stop_route_trip_headway_data/daily_stats.geojson
Saved to gs://calitp-publish-data-analysis/stop_route_trip_headway_data/timeofday_stats.geojson
