In [1]:
import pandas as pd
import geopandas as gpd

from functools import cache

from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from calitp_data_analysis.geography_utils import WGS84, CA_NAD83Albers_m
from calitp_data_analysis.sql import query_sql

@cache
def gcs_geopandas():
    return GCSGeoPandas()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
eocs = pd.read_excel('./County_EOCs.xlsx')
eocs.columns = eocs.columns.str.lower().str.replace(' ', '_')

In [3]:
eocs.head(3)

Unnamed: 0,oa,physical_eoc_address,region,lat,lon
0,Alpine,"99 Water St Markleville, CA 96120",Inland,38.69453,-119.77899
1,Amador,"700 Court St., Jackson, CA 95642",Inland,38.351547,-120.76937
2,Butte,"205 Mira Loma Suite 30, Oroville, CA 95965",Inland,39.520803,-121.552203


In [4]:
path = 'calitp-analytics-data/data-analyses/gtfs_schedule/county_eocs.parquet'

In [5]:
# gcs_geopandas().geo_data_frame_to_parquet(eocs, path)

In [6]:
# eocs = gpd.GeoDataFrame(eocs, geometry=gpd.points_from_xy(eocs.lon, eocs.lat), crs=WGS84)

In [7]:
eocs = gcs_geopandas().read_parquet(path).to_crs(CA_NAD83Albers_m)

In [8]:
# https://gis.data.ca.gov/datasets/CDEGIS::california-counties-3/explore

counties = gpd.read_file("https://services3.arcgis.com/fdvHcZVgB2QSRNkL/arcgis/rest/services/California_Counties/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson")
counties = counties.to_crs(CA_NAD83Albers_m)

In [9]:
eocs.head(3)

Unnamed: 0,oa,physical_eoc_address,region,lat,lon,geometry
0,Alpine,"99 Water St Markleville, CA 96120",Inland,38.69453,-119.77899,POINT (19202.170 75408.742)
1,Amador,"700 Court St., Jackson, CA 95642",Inland,38.351547,-120.76937,POINT (-67154.315 37532.982)
2,Butte,"205 Mira Loma Suite 30, Oroville, CA 95965",Inland,39.520803,-121.552203,POINT (-133351.643 168309.474)


In [10]:
counties.head(3)

Unnamed: 0,OBJECTID,Year,CountyName,Shape__Area,Shape__Length,COUNTY_FIPS,DistrictCount,geometry
0,117,2023,Alameda,3079162000.0,435624.781963,1,22,"MULTIPOLYGON (((-173749.936 -59197.380, -17384..."
1,118,2023,Alpine,3156006000.0,275565.411314,3,2,"POLYGON ((36008.436 77542.221, 36052.787 77431..."
2,119,2023,Amador,2562528000.0,359598.555079,5,2,"POLYGON ((-6290.108 76304.882, -6290.029 76294..."


## read stop data

In [11]:
stops = gcs_geopandas().read_parquet('gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_stops.parquet').to_crs(CA_NAD83Albers_m)

In [33]:
stops.agency = stops.agency.str.replace('UCSC and City of Santa Cruz Beach Shuttle', 'Santa Cruz Metropolitan Transit District')
# TODO investigate/fix

In [34]:
stops.routetypes.unique()

array(['3', '0, 3', '2, 3'], dtype=object)

In [35]:
stops = stops.assign(has_bus = stops.routetypes.map(lambda x: True if '3' in x or '11' in x else False))

In [36]:
stops = stops[stops['has_bus']]

In [37]:
joined = gpd.sjoin(counties, stops)

In [38]:
joined.columns

Index(['OBJECTID', 'Year', 'CountyName', 'Shape__Area', 'Shape__Length',
       'COUNTY_FIPS', 'DistrictCount', 'geometry', 'index_right', 'agency',
       'stop_id', 'stop_name', 'n_routes', 'route_ids_served', 'routetypes',
       'n_arrivals', 'n_hours_in_service', 'meters_to_ca_state_highway',
       'base64_url', 'district_name', 'has_bus'],
      dtype='object')

In [39]:
group_cols = ['CountyName', 'agency']

In [40]:
df = joined.groupby(group_cols)['n_arrivals'].agg(['sum', 'count']).reset_index().rename(columns={'sum': 'daily_arrivals', 'count': 'n_stops'})

In [41]:
# df

### stop count and arrivals analysis

* add up daily arrivals (trips x stops), measures which agency runs the most service, likely has more resources
* add up number of stops, indirectly measures footprint and service (between daily arrivals and spatial footprint?)

In [42]:
max_stop_count_agency = df.sort_values('n_stops', ascending=False).groupby('CountyName').head(1).reset_index(drop=True)

In [43]:
max_arrivals_agency = df.sort_values('daily_arrivals', ascending=False).groupby('CountyName').head(1).reset_index(drop=True)

In [44]:
both_max = max_arrivals_agency[['CountyName', 'agency']].merge(max_stop_count_agency[['CountyName', 'agency']], on='CountyName')
both_max = both_max.rename(columns={'agency_x': 'agency_max_arrivals', 'agency_y': 'agency_max_stop_locations'})

In [45]:
# both_max

### service area analysis

* buffer each stop by 1km, sum up area in each County
* agencies with a large footprint may be better positioned to assist in an emergency

In [46]:
joined2 = gpd.sjoin(stops, counties)

In [47]:
joined2.geometry = joined2.buffer(1000)

In [48]:
county_dissolve = joined2[['agency', 'CountyName', 'geometry']].dissolve(['agency', 'CountyName'])

In [49]:
county_dissolve = county_dissolve.assign(area = county_dissolve.geometry.map(lambda x: x.area)).reset_index()

In [50]:
# county_dissolve.explore(column='agency')

In [51]:
max_area_agency = county_dissolve.sort_values('area', ascending=False).groupby('CountyName')[['agency', 'CountyName']].head(1).reset_index(drop=True).rename(columns={'agency': 'agency_max_service_area'})

### compile

In [52]:
export_df = both_max.merge(max_area_agency, on='CountyName')
export_df.assign(all_same_agency = (export_df.agency_max_arrivals == export_df.agency_max_stop_locations) & (export_df.agency_max_stop_locations == export_df.agency_max_service_area)).to_csv('EOC_draft.csv')

### using `fct_monthly_scheduled_stops`?

    * new warehouse table, want to confirm how stop arrival calculation should work
    * would expect bus_arrivals / n_days to be <= daily_stop_arrivals?

In [41]:
bus_stops = query_sql(
'''
SELECT name, month_first_day, day_type, route_type_3 + route_type_11 AS bus_arrivals,
daily_stop_arrivals, stop_id, n_days, pt_geom
FROM `cal-itp-data-infra.mart_gtfs_rollup.fct_monthly_scheduled_stops`
WHERE year = 2025 AND month = 12
AND route_type_3 > 0 OR route_type_11 > 0
LIMIT 1000
'''
)

In [42]:
bus_stops = gpd.GeoDataFrame(bus_stops, geometry=gpd.GeoSeries.from_wkt(bus_stops.pt_geom), crs=WGS84).drop(columns=['pt_geom'])

In [43]:
bus_stops.head(10)

Unnamed: 0,name,month_first_day,day_type,bus_arrivals,daily_stop_arrivals,stop_id,n_days,geometry
0,Eastern Sierra Schedule,2023-11-01,Saturday,310.7,18.0,20377,3,POINT (-118.97121 37.63867)
1,Eastern Sierra Schedule,2023-12-01,Saturday,111.2,408.0,20318,5,POINT (-118.96680 37.63918)
2,Eastern Sierra Schedule,2024-01-01,Saturday,122.3,72.0,20327,4,POINT (-118.96859 37.63739)
3,Eastern Sierra Schedule,2024-01-01,Saturday,349.1,80.8,20357,4,POINT (-118.97536 37.64025)
4,Eastern Sierra Schedule,2024-02-01,Sunday,152.0,156.0,4230550,4,POINT (-118.98591 37.62834)
5,Eastern Sierra Schedule,2024-02-01,Saturday,1216.0,204.0,20305,4,POINT (-118.98755 37.64661)
6,Eastern Sierra Schedule,2024-05-01,Weekday,21.0,17.0,20386,6,POINT (-118.99583 37.63885)
7,Eastern Sierra Schedule,2024-05-01,Saturday,176.0,703.5,4230553,4,POINT (-118.96654 37.63580)
8,Eastern Sierra Schedule,2024-06-01,Sunday,42.0,312.0,20288,5,POINT (-118.97172 37.64818)
9,Flixbus Schedule,2025-12-01,Saturday,14.0,2.0,d3fe1a76-8aa7-43b3-ae72-d29ca1601bd7,4,POINT (-105.51930 39.74173)


In [52]:
type(bus_stops)

geopandas.geodataframe.GeoDataFrame

In [56]:
# bus_stops.drop(columns='month_first_day').explore()