In [1]:
import intake
import pandas as pd
import geopandas as gpd

from calitp_data_analysis.geography_utils import CA_NAD83Albers_m
from shared_utils import rt_utils, rt_dates

import google.auth
credentials, project = google.auth.default()

In [2]:
import sys
sys.path.append('../open_data/')

In [3]:
import open_data_utils

In [4]:
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

# GTFS-based stats for section 1e

## Methodology

### Route Mileage and Number of Routes

First, we spatially overlay GTFS-derived routes with Metropolitan Planning Organization boundaries. If a route crosses an MPO boundary, it will be counted in both regions. However, mileage counts will include only the portion of the route's mileage within that MPO region to avoid double counting.

For this analysis, we've added a deduplication step to only keep unique `route_id` and `shape_id` for each GTFS feed. In some cases, multiple agencies share a GTFS feed. This avoids double counting routes, but in those cases agency metadata from this analysis may not be accurate.

Our methodology aims to count mileage in each direction of a route. This is implemented by keeping the two longest GTFS shapes for each route, which generally correspond to the longest service pattern in each direction. Where multiple routes interline, we count mileage of each route separately.

To count the number of routes, we count the number of unique GTFS `route_id`.

### Number of Stops

In [53]:
catalog = intake.open_catalog("../_shared_utils/shared_utils/shared_data_catalog.yml")

In [54]:
mpos = catalog.metropolitan_planning_orgs.read()[['MPO', 'geometry']].to_crs(CA_NAD83Albers_m).rename(columns={"MPO":"mpo"})

In [55]:
ct_dist = catalog.caltrans_districts.read().to_crs(CA_NAD83Albers_m)

In [56]:
analysis_date = rt_dates.DATES['jun2025']

In [57]:
# stops = catalog.ca_transit_stops.read()
stops = gcsgp.read_parquet('gs://calitp-analytics-data/data-analyses/traffic_ops/export/ca_transit_stops_2025-06-11.parquet')

In [58]:
stops = stops.to_crs(CA_NAD83Albers_m).clip(ct_dist)

In [59]:
# routes = catalog.ca_transit_routes.read()
routes_raw = gcsgp.read_parquet('gs://calitp-analytics-data/data-analyses/traffic_ops/export/ca_transit_routes_2025-06-11.parquet')

In [60]:
routes = (routes_raw
          .drop_duplicates(subset=['route_id', 'shape_id', 'base64_url'])  # deduplicate identical routes where agencies share a feed...
          .to_crs(CA_NAD83Albers_m)
          .clip(ct_dist)
         )

In [78]:
# routes.to_file('routes_2025-06-11.geojson')

In [62]:
# hqta = catalog.hqta_areas.read().to_crs(CA_NAD83Albers_m).clip(ct_dist)

In [63]:
routes['length'] = routes.geometry.map(lambda x: x.length)

In [64]:
routes_top2 = routes.sort_values(['agency', 'route_id', 'length'],
                  ascending=False).groupby(['agency', 'route_id']).head(2)
routes_top2 = routes_top2.assign(miles = routes_top2.length / rt_utils.METERS_PER_MILE)

In [65]:
def overlay_to_routes(mpo_gdf, routes_gdf = routes_top2):
    
    overlaid = mpo_gdf.overlay(routes_gdf, keep_geom_type=False)
    return overlaid

In [66]:
mpo_routes = mpos.groupby('mpo').apply(overlay_to_routes, routes_gdf = routes_top2)
mpo_routes = mpo_routes.reset_index(drop=True)

In [67]:
# routes.query('agency.str.contains("San Fran") & route_id == "1"')

In [80]:
mpo_routes.query('agency.str.contains("San Fran") & route_name == "1"')

Unnamed: 0,mpo,org_id,agency,route_id,route_type,route_name,shape_id,n_trips,base64_url,length,miles,geometry
1264,Metropolitan Transportation Commission,rechaapWbeffO33OX,City and County of San Francisco,1,3,1,102,108,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,9255.124666,5.750882,"LINESTRING (-210761.771 -21895.630, -210745.49..."
1265,Metropolitan Transportation Commission,rechaapWbeffO33OX,City and County of San Francisco,1,3,1,152,105,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,9248.132498,5.746537,"LINESTRING (-219262.706 -23425.359, -219277.08..."


In [79]:
mpo_routes.query('mpo == "Metropolitan Transportation Commission"').to_file('mtc_routes_top2_2025-06-11.geojson')

In [84]:
# mpo_routes.query('mpo == "Metropolitan Transportation Commission" & agency.str.contains("Lake")').explore()

In [83]:
# mpo_routes.query('mpo == "Metropolitan Transportation Commission"').explore()

## routes

* keep longest 2 shapes per route -- imperfect but ok for a general estimate

In [70]:
mpo_routes.columns

Index(['mpo', 'org_id', 'agency', 'route_id', 'route_type', 'route_name',
       'shape_id', 'n_trips', 'base64_url', 'length', 'miles', 'geometry'],
      dtype='object')

In [71]:
countable = mpo_routes.drop_duplicates(subset=['mpo', 'agency', 'route_id']).drop(columns=['miles', 'geometry'])

In [72]:
countable.groupby('mpo').count().reset_index()[['mpo', 'route_id']]

Unnamed: 0,mpo,route_id
0,Association of Monterey Bay Area Governments,64
1,Butte County Association of Governments,23
2,Fresno Council of Governments,35
3,Kern Council of Governments,41
4,Kings County Association of Governments,15
5,Madera County Transportation Commission,15
6,Merced County Association of Governments,34
7,Metropolitan Transportation Commission,585
8,Sacramento Area Council of Governments,217
9,San Diego Association of Governments,159


In [73]:
mpo_route_count = countable.groupby('mpo').count().reset_index()[['mpo', 'route_id']].rename(columns={"route_id":"n_routes"})

In [74]:
mpo_route_mi = mpo_routes.groupby('mpo')[['miles']].sum().reset_index().rename(columns={"miles":"total_route_miles"})

In [75]:
mpo_route_info = mpo_route_count.merge(mpo_route_mi, on='mpo')

In [76]:
routes_top2.route_id.count()

4301

In [77]:
routes_top2.miles.sum()

79738.2298365877

In [35]:
mpo_route_info.round(1).sort_values('total_route_miles', ascending=False)

Unnamed: 0,mpo,n_routes,total_route_miles
14,Southern California Association of Governments,933,37069.5
7,Metropolitan Transportation Commission,585,18039.6
8,Sacramento Area Council of Governments,217,11114.0
9,San Diego Association of Governments,159,5630.0
12,Santa Barbara County Association of Governments,82,5174.9
3,Kern Council of Governments,41,5150.2
10,San Joaquin Council of Governments,69,4584.5
2,Fresno Council of Governments,35,4375.9
6,Merced County Association of Governments,34,4011.5
17,Tulare County Association of Governments,49,3964.9


In [71]:
mpo_route_info.round(1).to_csv('routes_by_mpo.csv')

## stops

In [25]:
def overlay_to_stops(mpo_gdf, stops_gdf = stops):
    
    overlaid = mpo_gdf.overlay(stops_gdf, keep_geom_type=False)
    return overlaid

In [26]:
mpo_stops = mpos.groupby('mpo').apply(overlay_to_stops)

In [27]:
mpo_stops = mpo_stops.reset_index(drop=True)

In [28]:
mpo_stop_count = mpo_stops.groupby(['mpo', 'routetypes'])[['stop_id']].count().reset_index().rename(columns={'stop_id':'stop_count'})

In [29]:
def route_types_to_name(route_types):
    rttype_list = [x.strip() for x in route_types.split(',')]
    name_list = [rt_utils.route_type_names[rttype] for rttype in rttype_list]
    return " and ".join(name_list)

In [30]:
mpo_stop_count = mpo_stop_count.assign(
    route_type_names = mpo_stop_count.routetypes.map(lambda x: route_types_to_name(x)))

In [31]:
mpo_stop_count.head(2)

Unnamed: 0,mpo,routetypes,stop_count,route_type_names
0,Association of Monterey Bay Area Governments,"2, 3",3,Rail and Bus
1,Association of Monterey Bay Area Governments,3,1795,Bus


In [32]:
mpo_stop_count.to_csv('stops_by_mpo.csv')

In [33]:
all_stop_count = stops[['routetypes', 'stop_id']].groupby(['routetypes'])[['stop_id']].count().reset_index().rename(columns={'stop_id':'stop_count'})

In [34]:
all_stop_count = all_stop_count.assign(
    route_type_names = all_stop_count.routetypes.map(lambda x: route_types_to_name(x)))

In [35]:
all_stop_count.to_csv('all_stops.csv')

In [36]:
all_stop_count

Unnamed: 0,routetypes,stop_count,route_type_names
0,0,1482,"Tram, Streetcar, Light rail"
1,"0, 3",286,"Tram, Streetcar, Light rail and Bus"
2,1,119,"Subway, Metro"
3,2,351,Rail
4,"2, 3",113,Rail and Bus
5,3,124416,Bus
6,"3, 5",2,Bus and Cable tram
7,4,49,Ferry
8,5,230,Cable tram


## hqta

* MTS areas, dissolve
* HQTC outside MTS areas, dissolve

In [37]:
mts = hqta[hqta.hqta_type.str.contains('major')]

In [38]:
mts_dissolved = mts.dissolve()

In [39]:
SQ_M_TO_SQ_MI = 1/2.59e6

In [40]:
mts_dissolved.geometry.area.iloc[0] * SQ_M_TO_SQ_MI

1306.6461206834435

In [41]:
hqtc = hqta[~hqta.hqta_type.str.contains('major')]
hqtc_dissolved = hqtc.dissolve()

In [42]:
hqtc_outside_mts = hqtc_dissolved.overlay(mts_dissolved, how='difference')

In [43]:
hqtc_outside_mts.geometry.area.iloc[0] * SQ_M_TO_SQ_MI

419.3658103979564

In [44]:
import shapely

In [45]:
def overlay_to_mts_hqtc(mpo_gdf):
    mpo_mts = mts_dissolved.overlay(mpo_gdf)
    # print(mpo_mts.geometry)
    if not mpo_mts.geometry.empty:
        mpo_gdf['major_transit_stop_sq_mi'] = mpo_mts.geometry.area.iloc[0] * SQ_M_TO_SQ_MI
    mpo_hqtc = hqtc_outside_mts.overlay(mpo_gdf)
    # print(mpo_hqtc.geometry)
    if not mpo_hqtc.geometry.empty:
        mpo_gdf['additional_hq_corridor_sq_mi'] = mpo_hqtc.geometry.area.iloc[0] * SQ_M_TO_SQ_MI
    
    return mpo_gdf

In [46]:
mpo_mts_hqtc = mpos.groupby('mpo').apply(overlay_to_mts_hqtc)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  mpo_mts_hqtc = mpos.groupby('mpo').apply(overlay_to_mts_hqtc)


In [47]:
mpo_mts_hqtc.round(1).drop(columns=['geometry']).to_csv('mts_hq_corr_by_mpo.csv')

## MTC questions

In [48]:
mtc = mpo_routes.query('mpo=="Metropolitan Transportation Commission"')

In [49]:
mtc.groupby('agency').sum().sort_values('miles', ascending=False)

  mtc.groupby('agency').sum().sort_values('miles', ascending=False)


Unnamed: 0_level_0,n_trips,length,miles
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda-Contra Costa Transit District,4730,3485935.0,2166.0652
Amtrak,4,881905.7,2051.210219
Santa Clara Valley Transportation Authority,3081,2698753.0,1676.931505
Greyhound,4,447030.6,1289.211954
City and County of San Francisco,6991,1591804.0,989.103576
San Mateo County Transit District,1431,1311802.0,815.118035
Cloverdale Transit,201,1289374.0,801.181541
"Golden Gate Bridge, Highway and Transportation District",177,1227361.0,762.648936
San Francisco Bay Area Rapid Transit District,942,1151523.0,715.525038
Mendocino Transit Authority,5,399332.0,570.579868


In [59]:
# mtc.explore()

In [51]:
countable_mtc = countable.query('mpo=="Metropolitan Transportation Commission"')

In [52]:
countable_mtc

Unnamed: 0,mpo,org_id,agency,route_id,route_type,route_name,shape_id,n_trips,base64_url
401,Metropolitan Transportation Commission,recH2FdHvrL7nIpHA,Yolo County Transportation District,230,3,230,0da28263-d4c3-4b19-ac9f-c212d25750bd,3,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...
403,Metropolitan Transportation Commission,recH2FdHvrL7nIpHA,Yolo County Transportation District,138,3,138,0a364e0e-4501-49fd-825f-a0edd62c6c34,7,aHR0cHM6Ly93d3cueW9sb2J1cy5jb20vR1RGUy9nb29nbG...
405,Metropolitan Transportation Commission,recIKnsnTdKQ0vsiv,Western Contra Costa Transit Authority,Lynx,3,Lynx,p_5178,11,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...
407,Metropolitan Transportation Commission,recIKnsnTdKQ0vsiv,Western Contra Costa Transit Authority,JX,3,JX,p_898753,13,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...
409,Metropolitan Transportation Commission,recIKnsnTdKQ0vsiv,Western Contra Costa Transit Authority,JPX,3,JPX,p_5197,9,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...
...,...,...,...,...,...,...,...,...,...
1469,Metropolitan Transportation Commission,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,19,3,19,shp-19-03,16,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...
1471,Metropolitan Transportation Commission,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,18,3,18,shp-18-52,69,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...
1473,Metropolitan Transportation Commission,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,14,3,14,shp-14-53,62,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...
1475,Metropolitan Transportation Commission,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,12,3,12,shp-12-52,46,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...


### santa barbara?

In [53]:
sb = mpo_routes.query('mpo.str.contains("Santa Barbara")')

In [60]:
# sb.explore()

In [61]:
# sb.query('agency.str.contains("Ventura")').explore()

In [56]:
sb_ct = countable.query('mpo.str.contains("Santa Barbara")')

In [57]:
sb_ct.groupby('agency').count() #  exclude VCTC duplicates, otherwise OK

Unnamed: 0_level_0,mpo,org_id,route_id,route_type,route_name,shape_id,n_trips,base64_url
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amtrak,2,2,2,2,2,2,2,2
City of Guadalupe,2,2,2,2,2,2,2,2
City of Lompoc,5,5,5,5,5,5,5,5
City of Santa Maria,16,16,16,16,16,16,16,16
City of Solvang,2,2,2,2,2,2,2,2
San Luis Obispo Regional Transit Authority,2,2,2,2,2,2,2,2
Santa Barbara County Association of Governments,5,5,5,5,5,5,5,5
Ventura County Transportation Commission,12,12,12,12,12,12,12,12


In [58]:
sb_ct.query('agency.str.contains("Metro")')

Unnamed: 0,mpo,org_id,agency,route_id,route_type,route_name,shape_id,n_trips,base64_url
