In [1]:
import os

os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import datetime as dt
import time

import branca
import folium
import fsspec
import gcsfs
import geopandas as gpd
import pandas as pd
from calitp_data_analysis import get_fs
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import gtfs_utils_v2

fs = get_fs()
import tempfile

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Setting analysis date
analysis_dt = dt.date(2025, 6, 2)

In [4]:
# #Function to fetch feeds, trips, stoptimes and stops_geo data from warehouse v2
# def get_feeds_trips_stops_data(selected_date):

#     trip_cols = ["name", "gtfs_dataset_key", "feed_key", "trip_id", "route_id", "route_type", "route_key",
#                  "num_stop_times", "direction_id", "trip_first_departure_sec", "trip_first_departure_ts",
#                  "trip_last_arrival_sec", "trip_start_timezone", "trip_instance_key", "service_hours", "trip_first_departure_datetime_pacific"]
#     stoptimes_cols = ["key", "_gtfs_key", "feed_key", "trip_id", "stop_id", "stop_sequence", "arrival_time", "departure_time",
#        "arrival_time_interval", "departure_time_interval", "arrival_hour", "departure_hour"]
#     stop_cols = ["feed_key", "stop_id", "geometry", "stop_name", "stop_code", "location_type", "stop_desc"]

#     feed_data = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=selected_date)


#     feed_key_list = feed_data['feed_key'].tolist()

#     trips_data_list = []
#     stoptimes_data_list = []
#     stop_locations_gdf = gpd.GeoDataFrame()
#     for feed_key in feed_key_list:
#         try:
#             trips = gtfs_utils_v2.get_trips(selected_date=selected_date, operator_feeds=[feed_key])[trip_cols]
#             trips_data_list.append(trips)

#             stoptimes = gtfs_utils_v2.get_stop_times(selected_date=selected_date, operator_feeds=[feed_key],
#                                                     trip_df=trips, get_df=True)[stoptimes_cols]
#             stoptimes_data_list.append(stoptimes)

#             stops_gdf = gtfs_utils_v2.get_stops(selected_date=selected_date, operator_feeds=[feed_key])[stop_cols]
#             stop_locations_gdf = pd.concat([stop_locations_gdf, stops_gdf], ignore_index=True)
#         except:
#             print('Skipping a row')

#     trips_data = pd.concat(trips_data_list, ignore_index=True)
#     stoptimes_data = pd.concat(stoptimes_data_list, ignore_index=True)

#     return feed_data, trips_data, stoptimes_data, stop_locations_gdf

In [5]:
# feed_data, trips_data, stoptimes_data, stop_locations_gdf = get_feeds_trips_stops_data(analysis_dt)

In [6]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ahsc_grant"

Exporting data to GCS for reuse, minimizing repeated warehouse queries.

In [7]:
# trips_data.to_parquet(f"{GCS_FILE_PATH}/trips_06_02_2025.parquet")
# stoptimes_data.to_csv(f"{GCS_FILE_PATH}/stoptimes_06_02_2025.csv")
# feed_data.to_parquet(f"{GCS_FILE_PATH}/feed_06_02_2025.parquet")
# with fsspec.open(f"{GCS_FILE_PATH}/stop_locations_06_02_2025.parquet", 'wb') as f:
#     stop_locations_gdf.to_parquet(f)

Reading stoptimes, trips, feed and stop locations data 

In [8]:
stoptimes_data = pd.read_csv(f"{GCS_FILE_PATH}/stoptimes_06_02_2025.csv")

  stoptimes_data = pd.read_csv(f"{GCS_FILE_PATH}/stoptimes_06_02_2025.csv")


In [9]:
trips_data = pd.read_parquet(f"{GCS_FILE_PATH}/trips_06_02_2025.parquet")

In [10]:
feed_data = pd.read_parquet(f"{GCS_FILE_PATH}/feed_06_02_2025.parquet")

In [11]:
with fs.open(f"{GCS_FILE_PATH}/stop_locations_06_02_2025.parquet", "rb") as f:
    stop_locations_gdf = gpd.read_parquet(f)

In [12]:
len(stop_locations_gdf)

85773

In [13]:
stop_locations_gdf.sample()

Unnamed: 0,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc
29063,54d646ba547246f544b6ad2ff1045a65,BAM,POINT (-86.11165 42.31452),Bangor Amtrak Station,,,


**I created a new column that combines `stop_id` with `stop_name` to more unique column, since there are many `stop_id` values that are the same.**

In [14]:
stop_locations_gdf.stop_name.nunique()

63449

In [15]:
stop_locations_gdf.stop_id.nunique()

53010

In [16]:
stop_locations_gdf["combo_col"] = (
    stop_locations_gdf.stop_id + stop_locations_gdf.stop_name
)

**There are 85,773 rows in the gdf but 84,403 unique combo_col values...check this out.**

In [17]:
stop_locations_gdf["combo_col"].nunique()

84403

In [18]:
stop_locations_gdf["combo_col"].value_counts().head(20)

3138712Main St EB & Balsam Ave               2
3134076Mesa St EB & 3rd Ave                  2
10050067Yucca Loma Rd EB & Choco Rd          2
3117293Village Dr SB & Grand Triassic Ln     2
10043080Pahute WB & Pawnee                   2
10050105Yucca Loma Rd WB & Choco Rd          2
3119731Danbury Ave WB & I Ave                2
3113883I Ave SB & Mauna Loa St               2
3133798Willow St WB & 11th Ave               2
31387157th Ave SB & Main St                  2
3138717Lime St EB & 7th Ave                  2
3119719Sultana St NB & Main St               2
10042993Nisqualli WB & Hesperia              2
3150570Montara Rd SB & Armory Rd             2
3118559Goshute Ave EB & Pioneer Rd           2
3151362Armory Rd WB & Deseret Ave            2
3138694Cottonwood Ave SB & Pahute Ave        2
3119726Danbury Ave WB & Arrowhead Lake Rd    2
3119728Danbury Ave WB & Peach Ave            2
3138698Cottonwood Ave SB & Pendleton St      2
Name: combo_col, dtype: int64

**I checked a couple of rows with the same `stop_id` and `stop_name`. It appears multiple feed keys are associated with it. I know in our warehouse we have examples of one operator having multiple `feed_key` values. I assume this is what's happening.**

In [19]:
stop_locations_gdf.loc[
    stop_locations_gdf.combo_col == "3151362Armory Rd WB & Deseret Ave"
]

Unnamed: 0,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col
47243,51d0571614cd6cbcd86c98ff837af26f,3151362,POINT (-117.02152 34.88416),Armory Rd WB & Deseret Ave,30071,0.0,06 NB,3151362Armory Rd WB & Deseret Ave
71603,43e19da122e03aece0b453c1f6473f89,3151362,POINT (-117.02152 34.88416),Armory Rd WB & Deseret Ave,30071,0.0,06 NB,3151362Armory Rd WB & Deseret Ave


In [20]:
stop_locations_gdf.loc[
    stop_locations_gdf.combo_col == "3138698Cottonwood Ave SB & Pendleton St"
]

Unnamed: 0,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col
47247,51d0571614cd6cbcd86c98ff837af26f,3138698,POINT (-117.33858 34.44714),Cottonwood Ave SB & Pendleton St,40104,0.0,,3138698Cottonwood Ave SB & Pendleton St
71457,43e19da122e03aece0b453c1f6473f89,3138698,POINT (-117.33858 34.44714),Cottonwood Ave SB & Pendleton St,40104,0.0,,3138698Cottonwood Ave SB & Pendleton St


In [21]:
stop_locations_gdf.loc[stop_locations_gdf.combo_col == "3138712Main St EB & Balsam Ave"]

Unnamed: 0,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col
47049,51d0571614cd6cbcd86c98ff837af26f,3138712,POINT (-117.33053 34.42686),Main St EB & Balsam Ave,40170,0.0,,3138712Main St EB & Balsam Ave
71471,43e19da122e03aece0b453c1f6473f89,3138712,POINT (-117.33053 34.42686),Main St EB & Balsam Ave,40170,0.0,,3138712Main St EB & Balsam Ave


**I can see dropping `geometry` with the `combo_col` brings us closer to the gdf's length matching the total number of `combo_col` unique values.**

In [22]:
len(stop_locations_gdf.drop_duplicates(subset=["geometry", "combo_col"]))

84502

In [23]:
stop_locations_gdf["combo_col"].nunique()

84403

In [24]:
# Retaining California based stops only
# Loading California county boundaries GeoJSON data from ArcGIS REST service
CA_URL = "https://services1.arcgis.com/jUJYIo9tSA7EHvfZ/arcgis/rest/services/California_County_Boundaries/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
CA_county = gpd.read_file(CA_URL)

In [25]:
CA_county = CA_county.to_crs(stop_locations_gdf.crs)

In [26]:
# Spatially joining stop locations with California county boundaries to keep stops within CA
stop_locations_gdf_ca = gpd.sjoin(
    stop_locations_gdf, CA_county, how="inner", predicate="intersects"
)

In [27]:
# Length before spatial join
len(stop_locations_gdf)

85773

In [28]:
# Length after spatial join
len(stop_locations_gdf_ca)

84082

**I am actually dropping duplicated `stop_id` and `stop_name` values here.**

In [33]:
# Length when dropping dups
len(stop_locations_gdf_ca.drop_duplicates(subset=["geometry", "combo_col"]))

82853

In [34]:
stop_locations_gdf_ca2 = stop_locations_gdf_ca.drop_duplicates(
    subset=["geometry", "combo_col"]
).reset_index()

In [35]:
stop_locations_gdf_ca2.loc[
    stop_locations_gdf_ca2.combo_col == "3138712Main St EB & Balsam Ave"
]

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col,index_right,OBJECTID,COUNTY_NAME,COUNTY_ABBREV,COUNTY_NUM,COUNTY_CODE,COUNTY_FIPS,ISLAND,Shape__Area,Shape__Length,GlobalID
47926,47049,51d0571614cd6cbcd86c98ff837af26f,3138712,POINT (-117.33053 34.42686),Main St EB & Balsam Ave,40170,0.0,,3138712Main St EB & Balsam Ave,35,36,San Bernardino,SBD,36,36,71,,77496812443.93,1247676.29,e4df6870-0d0b-40c1-ad3b-a60a72792cfd


In [36]:
stop_locations_gdf_ca2.combo_col.nunique()

82754

In [37]:
stop_locations_gdf_ca2.stop_id.nunique()

51813

**Strangely enough, there are still a bunch of rows that appear to be the same stop. I'm just going to be more aggressive with dropping them.**

In [38]:
# stop_locations_gdf_ca2.combo_col.value_counts().head(40)

In [39]:
stop_locations_gdf_ca2.loc[
    stop_locations_gdf_ca2.combo_col == "40463Kerner Blvd & Larkspur St"
]

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col,index_right,OBJECTID,COUNTY_NAME,COUNTY_ABBREV,COUNTY_NUM,COUNTY_CODE,COUNTY_FIPS,ISLAND,Shape__Area,Shape__Length,GlobalID
4418,859,31821446901db9b7599c638071ed41d9,40463,POINT (-122.50201 37.96278),Kerner Blvd & Larkspur St,40463,0.0,,40463Kerner Blvd & Larkspur St,20,21,Marin,MRN,21,21,41,,2546999709.34,324786.53,f2604fed-7c76-4bcb-a5e0-667d6d3405b5
4471,55541,4affb88e8d9632e2fded862931aa47e8,40463,POINT (-122.50198 37.96286),Kerner Blvd & Larkspur St,40463,0.0,,40463Kerner Blvd & Larkspur St,20,21,Marin,MRN,21,21,41,,2546999709.34,324786.53,f2604fed-7c76-4bcb-a5e0-667d6d3405b5


In [40]:
stop_locations_gdf_ca2.loc[
    stop_locations_gdf_ca2.combo_col == "13024Nobel Dr & La Jolla Village Square Drwy"
]

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col,index_right,OBJECTID,COUNTY_NAME,COUNTY_ABBREV,COUNTY_NUM,COUNTY_CODE,COUNTY_FIPS,ISLAND,Shape__Area,Shape__Length,GlobalID
68924,22279,a0f4d13fbed0632c618fd0f6f158dd54,13024,POINT (-117.23169 32.86777),Nobel Dr & La Jolla Village Square Drwy,13024,0.0,,13024Nobel Dr & La Jolla Village Square Drwy,36,37,San Diego,SDG,37,37,73,,15735991009.72,564113.67,414826ec-689e-4084-bd03-5195df2748bf
70534,24047,1fff52f9349da228c56eef492df5001b,13024,POINT (-117.23169 32.86777),Nobel Dr & La Jolla Village Square Drwy,13024,0.0,,13024Nobel Dr & La Jolla Village Square Drwy,36,37,San Diego,SDG,37,37,73,,15735991009.72,564113.67,414826ec-689e-4084-bd03-5195df2748bf


In [41]:
stop_locations_gdf_ca2.loc[
    stop_locations_gdf_ca2.combo_col == "52911Decoto Rd & Cabrillo Dr"
]

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col,index_right,OBJECTID,COUNTY_NAME,COUNTY_ABBREV,COUNTY_NUM,COUNTY_CODE,COUNTY_FIPS,ISLAND,Shape__Area,Shape__Length,GlobalID
41147,15868,8be5cc73e7979c7f110456d07c464792,52911,POINT (-122.03634 37.56661),Decoto Rd & Cabrillo Dr,52911,0.0,,52911Decoto Rd & Cabrillo Dr,0,1,Alameda,ALA,1,1,1,,3402787058.96,308998.65,e6f92268-d2dd-4cfb-8b79-5b4b2f07c559
43348,62330,35702a19aac0ed4d2a616627483d3850,52911,POINT (-122.03634 37.56661),Decoto Rd & Cabrillo Dr,52911,,,52911Decoto Rd & Cabrillo Dr,0,1,Alameda,ALA,1,1,1,,3402787058.96,308998.65,e6f92268-d2dd-4cfb-8b79-5b4b2f07c559


In [42]:
stop_locations_gdf_ca3 = stop_locations_gdf_ca2.drop_duplicates(
    subset=["combo_col"]
).reset_index(drop=True)

In [43]:
stop_locations_gdf_ca3.loc[
    stop_locations_gdf_ca3.combo_col == "52911Decoto Rd & Cabrillo Dr"
]

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col,index_right,OBJECTID,COUNTY_NAME,COUNTY_ABBREV,COUNTY_NUM,COUNTY_CODE,COUNTY_FIPS,ISLAND,Shape__Area,Shape__Length,GlobalID
41132,15868,8be5cc73e7979c7f110456d07c464792,52911,POINT (-122.03634 37.56661),Decoto Rd & Cabrillo Dr,52911,0.0,,52911Decoto Rd & Cabrillo Dr,0,1,Alameda,ALA,1,1,1,,3402787058.96,308998.65,e6f92268-d2dd-4cfb-8b79-5b4b2f07c559


In [44]:
len(stop_locations_gdf_ca3), stop_locations_gdf_ca3.combo_col.nunique()

(82754, 82754)

In [45]:
stoptimesdata_california = pd.merge(
    stoptimes_data,
    stop_locations_gdf_ca3[
        [
            "feed_key",
            "stop_id",
            "stop_name",
            "location_type",
            "stop_desc",
            "geometry",
            "combo_col",
        ]
    ],
    on=["feed_key", "stop_id"],
    how="inner",
)

In [46]:
trips_data_cleaned = trips_data.drop_duplicates()  # Dropping duplicates

In [47]:
trips_data_cleaned.shape

(162072, 16)

In [50]:
trips_data.shape

(162083, 16)

In [51]:
trips_data_cleaned.sample()

Unnamed: 0,name,gtfs_dataset_key,feed_key,trip_id,route_id,route_type,route_key,num_stop_times,direction_id,trip_first_departure_sec,trip_first_departure_ts,trip_last_arrival_sec,trip_start_timezone,trip_instance_key,service_hours,trip_first_departure_datetime_pacific
131056,Anaheim Resort Schedule,c4726e0acfbcbd26e1dc38b8bd046c03,1d3081c9043c89241e23681420a404d5,746a8c86-d6e0-4621-9970-174cf856f495:8,ff214815-c1ed-4f96-857a-e525f37efa98,3,a7ad58dc82e4adaf8f02dd5a54664046,9,,31800.0,2025-06-02 15:50:00+00:00,33000.0,America/Los_Angeles,51fe4826960c0d84b80e0f245bf70d33,0.33,2025-06-02 08:50:00


In [52]:
stoptimesdata_california.sample()

Unnamed: 0.1,Unnamed: 0,key,_gtfs_key,feed_key,trip_id,stop_id,stop_sequence,arrival_time,departure_time,arrival_time_interval,departure_time_interval,arrival_hour,departure_hour,stop_name,location_type,stop_desc,geometry,combo_col
65600,917568,2ce2e691140128b5bde0f4afd5e2eec3,01a343a738a845508bdaf4d0ffadeef8,8c84e748dadce4a3b1422c1c643926ad,t_1057367_b_25985_tn_0,784078,40,09:45:00,09:45:00,"relativedelta(hours=+9, minutes=+45)","relativedelta(hours=+9, minutes=+45)",9.0,9.0,Woodland College,0.0,,POINT (-122.61704 38.93279),784078Woodland College


In [53]:
# Merging stop data with trip data on trip id and feed key.
stop_trip_merged = pd.merge(
    stoptimesdata_california, trips_data_cleaned, on=["trip_id", "feed_key"], how="left"
)

In [54]:
len(stoptimesdata_california)

1087114

In [55]:
len(trips_data_cleaned)

162072

In [56]:
# Adding peak/off-peak labels to trip instances based on scheduled time buckets for the given analysis date
sched_time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_dt).pipe(
    gtfs_schedule_wrangling.add_peak_offpeak_column
)[["trip_instance_key", "peak_offpeak"]]

In [57]:
sched_time_of_day.head(2)

Unnamed: 0,trip_instance_key,peak_offpeak
0,5abb89287ff7b1add12e46236f7a8178,offpeak
1,d6aef57fcc7b4f410837edeaecfc5d1b,offpeak


In [58]:
# Merging scheduled time of the data
stop_trip_merged = pd.merge(
    stop_trip_merged, sched_time_of_day, on="trip_instance_key", how="left"
)

In [59]:
len(stop_trip_merged)

2312821

In [60]:
# Counting unique trips per stop and renaming column to 'num_trips'
# num_trips_per_stop = stop_trip_merged.groupby('stop_id')['trip_instance_key'].nunique().reset_index().rename(columns={'trip_instance_key':'num_trips'})

In [61]:
num_trips_per_stop = (
    stop_trip_merged.groupby(["stop_id", "combo_col"])["trip_instance_key"]
    .nunique()
    .reset_index()
    .rename(columns={"trip_instance_key": "num_trips"})
)

In [62]:
num_trips_per_stop.shape

(40702, 3)

In [63]:
# Counting unique routes per stop and renaming column to 'num_routes'
num_routes_per_stop = (
    stop_trip_merged.groupby(["stop_id", "combo_col"])["route_id"]
    .nunique()
    .reset_index()
    .rename(columns={"route_id": "num_routes"})
)

In [64]:
num_routes_per_stop.shape

(40702, 3)

In [65]:
num_routes_per_stop.head(2)

Unnamed: 0,stop_id,combo_col,num_routes
0,0,0Skyway & Princeton Wy,2
1,2,0002Del Monte Center / Gate 1,3


In [66]:
# Filtering stop-trip data to include only peak period trips
peak_stop_times = stop_trip_merged[stop_trip_merged["peak_offpeak"] == "peak"].copy()

In [92]:
peak_stop_times.sample()

Unnamed: 0.1,Unnamed: 0,key,_gtfs_key,feed_key,trip_id,stop_id,stop_sequence,arrival_time,departure_time,arrival_time_interval,departure_time_interval,arrival_hour,departure_hour,stop_name,location_type,stop_desc,geometry,combo_col,name,gtfs_dataset_key,route_id,route_type,route_key,num_stop_times,direction_id,trip_first_departure_sec,trip_first_departure_ts,trip_last_arrival_sec,trip_start_timezone,trip_instance_key,service_hours,trip_first_departure_datetime_pacific,peak_offpeak
1276981,1707874,f8b61e5d617a8bafff847d4c31c5b9c6,287be3ae624d67cb5228d32598a71019,6a5a841d0f829e6f8aba4e1f619e7a9e,TL-145,TL-5,50,00:25:19,00:25:29,"relativedelta(minutes=+25, seconds=+19)","relativedelta(minutes=+25, seconds=+29)",0.0,0.0,Terminal 3,0.0,,POINT (-118.40699 33.94494),TL-5Terminal 3,LAX Shuttles Schedule,723210f3a6d61ee3936df401e18a5636,TL-6,3,97c593f5365773269f315ff1ec424c43,15,,54614.0,2025-06-02 22:10:14+00:00,57124.0,America/Los_Angeles,3a252f1439d503952cf290fa3f9faede,0.7,2025-06-02 15:10:14,peak


**I think 9am is considered peak time**

In [67]:
# Filter for peak + arrival before 9 AM
am_peak_stop_trip = peak_stop_times[peak_stop_times["arrival_hour"] <= 9].copy()

# Assign time_of_day manually
am_peak_stop_trip["time_of_day"] = "AM Peak"

In [68]:
group_cols = ["feed_key", "stop_id", "route_id", "direction_id"]

In [69]:
stop_route_summary = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    df=am_peak_stop_trip, group_cols=group_cols, long_or_wide="wide"
)

In [70]:
stop_route_summary.head()

Unnamed: 0,feed_key,stop_id,route_id,direction_id,all_day_n_trips,peak_n_trips,all_day_frequency,peak_frequency
0,058a3b43698f803c8686f7e72a9efd4c,121914,11796,1.0,2.0,2.0,0.08,0.25
1,058a3b43698f803c8686f7e72a9efd4c,121915,11792,1.0,2.0,2.0,0.08,0.25
2,058a3b43698f803c8686f7e72a9efd4c,121916,11792,1.0,2.0,2.0,0.08,0.25
3,058a3b43698f803c8686f7e72a9efd4c,121917,11792,1.0,2.0,2.0,0.08,0.25
4,058a3b43698f803c8686f7e72a9efd4c,121919,11795,1.0,3.0,3.0,0.12,0.38


In [79]:
stop_route_summary["am_peak_headway_minutes"] = 60 / stop_route_summary["peak_frequency"]

In [72]:
stop_route_summary.head(5)

Unnamed: 0,feed_key,stop_id,route_id,direction_id,all_day_n_trips,peak_n_trips,all_day_frequency,peak_frequency,peak_headway
0,058a3b43698f803c8686f7e72a9efd4c,121914,11796,1.0,2.0,2.0,0.08,0.25,240.0
1,058a3b43698f803c8686f7e72a9efd4c,121915,11792,1.0,2.0,2.0,0.08,0.25,240.0
2,058a3b43698f803c8686f7e72a9efd4c,121916,11792,1.0,2.0,2.0,0.08,0.25,240.0
3,058a3b43698f803c8686f7e72a9efd4c,121917,11792,1.0,2.0,2.0,0.08,0.25,240.0
4,058a3b43698f803c8686f7e72a9efd4c,121919,11795,1.0,3.0,3.0,0.12,0.38,157.89


In [93]:
stop_route_summary.peak_frequency.describe()

count   46284.00
mean        1.67
std        74.18
min         0.12
25%         0.25
50%         0.38
75%         0.75
max     10518.75
Name: peak_frequency, dtype: float64

In [80]:
# Calculating average AM peak headway (minutes) for each stop
headways_per_stop = (
    stop_route_summary.groupby(["stop_id", "feed_key"])["am_peak_headway_minutes"]
    .mean()
    .reset_index()
)

In [74]:
headways_per_stop.head()

Unnamed: 0,stop_id,feed_key,peak_headway
0,0,a42c96ce9df6597edaaea081a44f261a,500.0
1,2,bcbaf48b76d91fc8f19b728ef8d257e7,370.0
2,2,cd299184726656597ae2cdb4f4e81e4a,120.0
3,3,bcbaf48b76d91fc8f19b728ef8d257e7,370.0
4,3,cd299184726656597ae2cdb4f4e81e4a,120.0


In [76]:
stop_summary = stop_locations_gdf_ca3.merge(
    num_trips_per_stop, on=["combo_col", "stop_id"], how="left"
).merge(num_routes_per_stop, on=["combo_col", "stop_id"], how="left")

In [77]:
len(stop_summary)

82754

In [84]:
stop_summary2 = stop_summary.merge(
        headways_per_stop[["feed_key","stop_id", "am_peak_headway_minutes"]],
        on=["feed_key","stop_id"],
        how="left",
        indicator= True
    )

In [85]:
stop_summary2._merge.value_counts()

left_only     48001
both          34753
right_only        0
Name: _merge, dtype: int64

In [86]:
stop_summary2.head()

Unnamed: 0,index,feed_key,stop_id,geometry,stop_name,stop_code,location_type,stop_desc,combo_col,index_right,OBJECTID,COUNTY_NAME,COUNTY_ABBREV,COUNTY_NUM,COUNTY_CODE,COUNTY_FIPS,ISLAND,Shape__Area,Shape__Length,GlobalID,num_trips,num_routes,am_peak_headway_minutes,_merge
0,0,ace4e22d6f2c299c36eba89ccb650b1b,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,POINT (-119.39065 36.55368),Grace and Laughter Apartments,,0.0,Eaton and Saginaw,00eb15cb-1430-4964-b8ae-ca6183e1d0efGrace and Laughter Apartments,53,54,Tulare,TUL,54,54,107,,19311031190.7,654530.72,709be848-8aac-4cd3-bf13-8b982ede6775,12.0,1.0,157.89,both
1,1,ace4e22d6f2c299c36eba89ccb650b1b,02a30e39-496f-45d4-ba1c-ac8f3c66b621,POINT (-119.37038 36.54438),El Monte Way and Randle,,0.0,Mercantile Row / Old KMart / Amigos Market,02a30e39-496f-45d4-ba1c-ac8f3c66b621El Monte Way and Randle,53,54,Tulare,TUL,54,54,107,,19311031190.7,654530.72,709be848-8aac-4cd3-bf13-8b982ede6775,36.0,3.0,185.26,both
2,2,ace4e22d6f2c299c36eba89ccb650b1b,04a2c417-05bf-4f95-bfb6-dd9cec701f11,POINT (-119.39003 36.54091),Rabobank,,0.0,Tulare and L,04a2c417-05bf-4f95-bfb6-dd9cec701f11Rabobank,53,54,Tulare,TUL,54,54,107,,19311031190.7,654530.72,709be848-8aac-4cd3-bf13-8b982ede6775,24.0,2.0,198.95,both
3,3,ace4e22d6f2c299c36eba89ccb650b1b,05d0285f-813a-4ea9-82e0-3b8d1127e8e0,POINT (-119.33951 36.20220),Martin Luther King & O St,,0.0,Land O Lakes,05d0285f-813a-4ea9-82e0-3b8d1127e8e0Martin Luther King & O St,53,54,Tulare,TUL,54,54,107,,19311031190.7,654530.72,709be848-8aac-4cd3-bf13-8b982ede6775,18.0,1.0,96.77,both
4,4,ace4e22d6f2c299c36eba89ccb650b1b,07fe70a4-21dd-4bcf-9adf-ed96f0daebbc,POINT (-119.41276 36.54707),Road 72 and Adeaide Way,,0.0,Dinuba Dollar Tree,07fe70a4-21dd-4bcf-9adf-ed96f0daebbcRoad 72 and Adeaide Way,53,54,Tulare,TUL,54,54,107,,19311031190.7,654530.72,709be848-8aac-4cd3-bf13-8b982ede6775,12.0,1.0,157.89,both


In [90]:
stop_summary2.num_routes.describe()

count   40702.00
mean        1.36
std         0.87
min         1.00
25%         1.00
50%         1.00
75%         1.00
max        18.00
Name: num_routes, dtype: float64

In [96]:
stop_summary2.num_trips.describe()

count   40702.00
mean       42.51
std       860.48
min         1.00
25%        10.00
50%        18.00
75%        32.00
max     54142.00
Name: num_trips, dtype: float64

In [91]:
stop_summary2.am_peak_headway_minutes.describe()

count   34753.00
mean      172.82
std       125.06
min         0.01
25%        88.39
50%       157.89
75%       240.00
max       500.00
Name: am_peak_headway_minutes, dtype: float64

In [94]:
stop_summary2.am_peak_headway_minutes.min()

0.00570409982174688

In [87]:
stop_summary2.shape

(82754, 24)

In [89]:
stop_locations_gdf_ca3.combo_col.nunique()

82754

In [88]:
stop_locations_gdf_ca3.shape

(82754, 20)

In [None]:
# # Saving geojson file
# with tempfile.NamedTemporaryFile(suffix=".geojson") as tmp:
#     stop_summary_final.to_file(tmp.name, driver="GeoJSON")


#     with fsspec.open(f"{GCS_FILE_PATH}/stop_summary.geojson", 'w') as f_out:
#         with open(tmp.name, 'r') as f_in:
#             f_out.write(f_in.read())