In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2
from segment_speed_utils import (helpers, 
                                 gtfs_schedule_wrangling, 
                                )

import pandas as pd
import geopandas as gpd 

import datetime as dt
import time
import fsspec
import gcsfs
from calitp_data_analysis import get_fs
fs = get_fs()
import tempfile

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
#Setting analysis date
analysis_dt = dt.date(2025,6,2)

In [4]:
# #Function to fetch feeds, trips, stoptimes and stops_geo data from warehouse v2
# def get_feeds_trips_stops_data(selected_date):
    
#     trip_cols = ["name", "gtfs_dataset_key", "feed_key", "trip_id", "route_id", "route_type", "route_key", 
#                  "num_stop_times", "direction_id", "trip_first_departure_sec", "trip_first_departure_ts", 
#                  "trip_last_arrival_sec", "trip_start_timezone", "trip_instance_key", "service_hours", "trip_first_departure_datetime_pacific"]
#     stoptimes_cols = ["key", "_gtfs_key", "feed_key", "trip_id", "stop_id", "stop_sequence", "arrival_time", "departure_time",
#        "arrival_time_interval", "departure_time_interval", "arrival_hour", "departure_hour"]
#     stop_cols = ["feed_key", "stop_id", "geometry", "stop_name", "stop_code", "location_type", "stop_desc"]
    
#     feed_data = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=selected_date)
    
    
#     feed_key_list = feed_data['feed_key'].tolist()
    
#     trips_data_list = []
#     stoptimes_data_list = []
#     stop_locations_gdf = gpd.GeoDataFrame()
#     for feed_key in feed_key_list:
#         try:
#             trips = gtfs_utils_v2.get_trips(selected_date=selected_date, operator_feeds=[feed_key])[trip_cols]
#             trips_data_list.append(trips)

#             stoptimes = gtfs_utils_v2.get_stop_times(selected_date=selected_date, operator_feeds=[feed_key], 
#                                                     trip_df=trips, get_df=True)[stoptimes_cols]
#             stoptimes_data_list.append(stoptimes)

#             stops_gdf = gtfs_utils_v2.get_stops(selected_date=selected_date, operator_feeds=[feed_key])[stop_cols]
#             stop_locations_gdf = pd.concat([stop_locations_gdf, stops_gdf], ignore_index=True)
#         except:
#             print('Skipping a row')
    
#     trips_data = pd.concat(trips_data_list, ignore_index=True)
#     stoptimes_data = pd.concat(stoptimes_data_list, ignore_index=True)
    
#     return feed_data, trips_data, stoptimes_data, stop_locations_gdf

In [5]:
# feed_data, trips_data, stoptimes_data, stop_locations_gdf = get_feeds_trips_stops_data(analysis_dt)

In [6]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/ahsc_grant'

Exporting data to GCS for reuse, minimizing repeated warehouse queries.

In [7]:
# trips_data.to_parquet(f"{GCS_FILE_PATH}/trips_06_02_2025.parquet")
# stoptimes_data.to_csv(f"{GCS_FILE_PATH}/stoptimes_06_02_2025.csv")
# feed_data.to_parquet(f"{GCS_FILE_PATH}/feed_06_02_2025.parquet")
# with fsspec.open(f"{GCS_FILE_PATH}/stop_locations_06_02_2025.parquet", 'wb') as f:
#     stop_locations_gdf.to_parquet(f)

Reading stoptimes, trips, feed and stop locations data 

In [8]:
stoptimes_data = pd.read_csv(f"{GCS_FILE_PATH}/stoptimes_06_02_2025.csv")

  stoptimes_data = pd.read_csv(f"{GCS_FILE_PATH}/stoptimes_06_02_2025.csv")


In [9]:
trips_data = pd.read_parquet(f"{GCS_FILE_PATH}/trips_06_02_2025.parquet")

In [10]:
feed_data = pd.read_parquet(f"{GCS_FILE_PATH}/feed_06_02_2025.parquet")

In [11]:
with fs.open(f"{GCS_FILE_PATH}/stop_locations_06_02_2025.parquet", 'rb') as f:
    stop_locations_gdf = gpd.read_parquet(f)

In [12]:
# Creating a new column column by concatenating stop_id and stop_name
stop_locations_gdf["combo_col"] = (
    stop_locations_gdf.stop_id + stop_locations_gdf.stop_name
)

In [13]:
stop_locations_gdf["combo_col"].nunique()

84403

In [14]:
# Retaining California based stops only 
# Loading California county boundaries GeoJSON data from ArcGIS REST service
CA_URL = "https://services1.arcgis.com/jUJYIo9tSA7EHvfZ/arcgis/rest/services/California_County_Boundaries/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
CA_county = gpd.read_file(CA_URL)

In [15]:
# Reprojecting CA_county to match the coordinate reference system of stop_locations_gdf
CA_county = CA_county.to_crs(stop_locations_gdf.crs)

In [16]:
# Spatially joining stop locations with California county boundaries to keep stops within CA
stop_locations_gdf_ca = gpd.sjoin(
    stop_locations_gdf,  
    CA_county,                    
    how='inner',
    predicate='intersects'
)

In [17]:
# Dropping duplicate rows based on geometry and combo_col
stop_locations_gdf_ca2 = stop_locations_gdf_ca.drop_duplicates(
    subset=["geometry", "combo_col"]
).reset_index()

In [18]:
len(stop_locations_gdf_ca2), stop_locations_gdf_ca2.combo_col.nunique()

(82853, 82754)

In [19]:
# Dropping duplicate rows based on combo_col
stop_locations_gdf_ca3 = stop_locations_gdf_ca2.drop_duplicates(
    subset=["combo_col"]
).reset_index(drop=True)

In [20]:
len(stop_locations_gdf_ca3), stop_locations_gdf_ca3.combo_col.nunique()

(82754, 82754)

In [21]:
# Merging stop location info into stoptimes_data based on feed_key and stop_id
stoptimesdata_california = pd.merge(
    stoptimes_data,
    stop_locations_gdf_ca3[
        [
            "feed_key",
            "stop_id",
            "stop_name",
            "location_type",
            "stop_desc",
            "geometry",
            "combo_col",
        ]
    ],
    on=["feed_key", "stop_id"],
    how="inner",
)

In [22]:
stoptimesdata_california.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1087114 entries, 0 to 1087113
Data columns (total 18 columns):
 #   Column                   Non-Null Count    Dtype   
---  ------                   --------------    -----   
 0   Unnamed: 0               1087114 non-null  int64   
 1   key                      1087114 non-null  object  
 2   _gtfs_key                1087114 non-null  object  
 3   feed_key                 1087114 non-null  object  
 4   trip_id                  1087114 non-null  object  
 5   stop_id                  1087114 non-null  object  
 6   stop_sequence            1087114 non-null  int64   
 7   arrival_time             1021260 non-null  object  
 8   departure_time           1021260 non-null  object  
 9   arrival_time_interval    1021260 non-null  object  
 10  departure_time_interval  1021260 non-null  object  
 11  arrival_hour             1021260 non-null  float64 
 12  departure_hour           1021260 non-null  float64 
 13  stop_name                10

In [23]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162083 entries, 0 to 162082
Data columns (total 16 columns):
 #   Column                                 Non-Null Count   Dtype              
---  ------                                 --------------   -----              
 0   name                                   162083 non-null  object             
 1   gtfs_dataset_key                       162083 non-null  object             
 2   feed_key                               162083 non-null  object             
 3   trip_id                                162083 non-null  object             
 4   route_id                               162083 non-null  object             
 5   route_type                             162083 non-null  object             
 6   route_key                              162083 non-null  object             
 7   num_stop_times                         162083 non-null  int64              
 8   direction_id                           94257 non-null   float64           

In [24]:
trips_data_cleaned = trips_data.drop_duplicates() # Dropping duplicates

In [25]:
trips_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162072 entries, 0 to 162082
Data columns (total 16 columns):
 #   Column                                 Non-Null Count   Dtype              
---  ------                                 --------------   -----              
 0   name                                   162072 non-null  object             
 1   gtfs_dataset_key                       162072 non-null  object             
 2   feed_key                               162072 non-null  object             
 3   trip_id                                162072 non-null  object             
 4   route_id                               162072 non-null  object             
 5   route_type                             162072 non-null  object             
 6   route_key                              162072 non-null  object             
 7   num_stop_times                         162072 non-null  int64              
 8   direction_id                           94246 non-null   float64           

In [26]:
#Merging stop data with trip data on trip id and feed key.
stop_trip_merged = pd.merge(stoptimesdata_california,
                            trips_data_cleaned,
                            on = ['trip_id', 'feed_key'],
                            how = 'left')

In [27]:
# Adding peak/off-peak labels to trip instances based on scheduled time buckets for the given analysis date
sched_time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(
    analysis_dt
).pipe(
    gtfs_schedule_wrangling.add_peak_offpeak_column
)[["trip_instance_key", "peak_offpeak"]]

In [28]:
#Merging scheduled time of the data 
stop_trip_merged = pd.merge(
    stop_trip_merged,
    sched_time_of_day,
    on='trip_instance_key',
    how='left'
)

In [29]:
# Counting unique trips per stop and renaming column to 'num_trips'
num_trips_per_stop = (
    stop_trip_merged.groupby(["stop_id", "combo_col"])["trip_instance_key"]
    .nunique()
    .reset_index()
    .rename(columns={"trip_instance_key": "num_trips"})
)

In [30]:
num_trips_per_stop.shape

(40702, 3)

In [31]:
# Counting unique routes per stop and renaming column to 'num_routes'
num_routes_per_stop = (
    stop_trip_merged.groupby(["stop_id", "combo_col"])["route_id"]
    .nunique()
    .reset_index()
    .rename(columns={"route_id": "num_routes"})
)

In [32]:
num_routes_per_stop.shape

(40702, 3)

In [33]:
# Filtering stop-trip data to include only peak period trips
peak_stop_times = stop_trip_merged[stop_trip_merged['peak_offpeak'] == 'peak'].copy()

In [34]:
#Filter for peak + arrival before 9 AM
am_peak_stop_trip = peak_stop_times[peak_stop_times["arrival_hour"] <= 9].copy()

#Assign time_of_day manually
am_peak_stop_trip['time_of_day'] = 'AM Peak'


In [35]:
group_cols = ['feed_key','stop_id', 'route_id', 'direction_id']

In [36]:
# Aggregating trip data into peak and off-peak summaries using the GTFS wrangling function
stop_route_summary = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    df=am_peak_stop_trip,
    group_cols=group_cols,
    long_or_wide='wide'
)

In [37]:
# Calculating AM peak headway in minutes from peak_frequency (trips per hour)
stop_route_summary["am_peak_headway_minutes"] = 60 / stop_route_summary['peak_frequency']

In [38]:
stop_route_summary.head(5)

Unnamed: 0,feed_key,stop_id,route_id,direction_id,all_day_n_trips,peak_n_trips,all_day_frequency,peak_frequency,am_peak_headway_minutes
0,058a3b43698f803c8686f7e72a9efd4c,121914,11796,1.0,2.0,2.0,0.08,0.25,240.0
1,058a3b43698f803c8686f7e72a9efd4c,121915,11792,1.0,2.0,2.0,0.08,0.25,240.0
2,058a3b43698f803c8686f7e72a9efd4c,121916,11792,1.0,2.0,2.0,0.08,0.25,240.0
3,058a3b43698f803c8686f7e72a9efd4c,121917,11792,1.0,2.0,2.0,0.08,0.25,240.0
4,058a3b43698f803c8686f7e72a9efd4c,121919,11795,1.0,3.0,3.0,0.12,0.38,157.89


In [39]:
stop_route_summary.peak_frequency.describe()

count   46284.00
mean        1.67
std        74.18
min         0.12
25%         0.25
50%         0.38
75%         0.75
max     10518.75
Name: peak_frequency, dtype: float64

In [40]:
# Calculating average AM peak headway (minutes) for each stop
headways_per_stop = (
    stop_route_summary.groupby(["stop_id", "feed_key"])["am_peak_headway_minutes"]
    .mean()
    .reset_index()
)

In [41]:
# Merging trip and route counts per stop into stop location data, using combo_col and stop_id as keys
stop_summary = stop_locations_gdf_ca3.merge(
    num_trips_per_stop, on=["combo_col", "stop_id"], how="left"
).merge(num_routes_per_stop, on=["combo_col", "stop_id"], how="left")

In [42]:
# Merging AM peak headway minutes into stop_summary
stop_summary2 = stop_summary.merge(
        headways_per_stop[["feed_key","stop_id", "am_peak_headway_minutes"]],
        on=["feed_key","stop_id"],
        how="left",
        indicator= True
    )

In [43]:
stop_summary2.num_routes.describe()

count   40702.00
mean        1.36
std         0.87
min         1.00
25%         1.00
50%         1.00
75%         1.00
max        18.00
Name: num_routes, dtype: float64

In [44]:
stop_summary2.num_trips.describe()

count   40702.00
mean       42.51
std       860.48
min         1.00
25%        10.00
50%        18.00
75%        32.00
max     54142.00
Name: num_trips, dtype: float64

In [45]:
stop_summary2.am_peak_headway_minutes.describe()

count   34753.00
mean      172.82
std       125.06
min         0.01
25%        88.39
50%       157.89
75%       240.00
max       500.00
Name: am_peak_headway_minutes, dtype: float64

In [46]:
stopsummary_final = stop_summary2[['index', 'feed_key', 'stop_id', 'geometry', 'stop_name', 'stop_code',
                                   'location_type', 'stop_desc', 'COUNTY_NAME', 'num_trips',
                                   'num_routes', 'am_peak_headway_minutes']]

In [47]:
stopsummary_final.head(5)

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,COUNTY_NAME,num_trips,num_routes,am_peak_headway_minutes
0,0,ace4e22d6f2c299c36eba89ccb650b1b,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,POINT (-119.39065 36.55368),Grace and Laughter Apartments,,0.0,Eaton and Saginaw,Tulare,12.0,1.0,157.89
1,1,ace4e22d6f2c299c36eba89ccb650b1b,02a30e39-496f-45d4-ba1c-ac8f3c66b621,POINT (-119.37038 36.54438),El Monte Way and Randle,,0.0,Mercantile Row / Old KMart / Amigos Market,Tulare,36.0,3.0,185.26
2,2,ace4e22d6f2c299c36eba89ccb650b1b,04a2c417-05bf-4f95-bfb6-dd9cec701f11,POINT (-119.39003 36.54091),Rabobank,,0.0,Tulare and L,Tulare,24.0,2.0,198.95
3,3,ace4e22d6f2c299c36eba89ccb650b1b,05d0285f-813a-4ea9-82e0-3b8d1127e8e0,POINT (-119.33951 36.20220),Martin Luther King & O St,,0.0,Land O Lakes,Tulare,18.0,1.0,96.77
4,4,ace4e22d6f2c299c36eba89ccb650b1b,07fe70a4-21dd-4bcf-9adf-ed96f0daebbc,POINT (-119.41276 36.54707),Road 72 and Adeaide Way,,0.0,Dinuba Dollar Tree,Tulare,12.0,1.0,157.89


In [48]:
# Saving geojson file
with tempfile.NamedTemporaryFile(suffix=".geojson") as tmp:
    stopsummary_final.to_file(tmp.name, driver="GeoJSON")


    with fsspec.open(f"{GCS_FILE_PATH}/stop_summary.geojson", 'w') as f_out:
        with open(tmp.name, 'r') as f_in:
            f_out.write(f_in.read())