# Caltrans District + Legislative District
* How to add Caltrans and Legislative District to `fct_monthly_operator_summary` and `fct_monthly_routes`

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import google.auth
import pandas as pd
import pandas_gbq
import publish_public_data
from calitp_data_analysis import geography_utils
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT

In [2]:
# Load Google Application Default Credentials; `credentials` is reused for the
# BigQuery query and the GCS parquet read further down.
# NOTE: the `project` returned here is replaced by a hard-coded project id in a later cell.
credentials, project = google.auth.default()

In [3]:
# Notebook display settings: up to 100 columns, floats with two decimals,
# and no truncation of rows or of cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
@cache
def gcs_geopandas():
    """Return a `GCSGeoPandas` helper.

    The function takes no arguments and is wrapped in `functools.cache`, so the
    helper is constructed on the first call and the same object is returned on
    every later call.
    """
    client = GCSGeoPandas()
    return client

## Caltrans District
* Testing `cal-itp-data-infra-staging.tiffany_mart_transit_database.bridge_gtfs_analysis_name_x_ntd` per [Issue 1791](https://github.com/cal-itp/data-analyses/issues/1791)

In [5]:
# Select every column from the staging bridge table under test.
# Plain string literal: the query has no placeholders, so no f-string is needed.
query_sql = """
        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`
    """

In [6]:
# Echo the query text to confirm exactly what will be sent to BigQuery.
query_sql

'\n        SELECT \n            *\n        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`\n    '

In [7]:
# Project id passed to `pandas_gbq.read_gbq` below. This deliberately replaces the
# `project` value that `google.auth.default()` returned earlier in the notebook.
project = "cal-itp-data-infra"

In [8]:
# Run the bridge-table query and load the result into a DataFrame.
df = pandas_gbq.read_gbq(
    query_sql,
    project_id=project,
    dialect="standard",
    credentials=credentials,
)

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|


In [9]:
# List the columns returned by the bridge-table query.
df.columns

Index(['organization_name', 'organization_source_record_id',
       'schedule_source_record_id', 'schedule_gtfs_dataset_name',
       'analysis_name', 'regional_feed_type', 'county_name',
       'caltrans_district', 'caltrans_district_name', 'ntd_id', 'ntd_id_2022',
       'rtpa_name', 'mpo_name'],
      dtype='object')

In [56]:
# Keep only the geography / NTD attributes, then collapse to one row per
# (analysis_name, county_name, caltrans_district). `drop_duplicates` keeps the
# first row it encounters for each key.
keep_cols = [
    "analysis_name",
    "county_name",
    "caltrans_district",
    "caltrans_district_name",
    "ntd_id",
    "ntd_id_2022",
    "rtpa_name",
    "mpo_name",
]
dedup_keys = ["analysis_name", "county_name", "caltrans_district"]
df = df[keep_cols].drop_duplicates(subset=dedup_keys)

In [57]:
# Rows and columns remaining after the subset + de-duplication.
df.shape

(179, 8)

In [58]:
# Row count after de-duplicating on every column, for comparison with `df.shape`.
df.drop_duplicates().shape[0]

179

In [59]:
# Number of distinct analysis names; equal to the row count when each name appears once.
df["analysis_name"].nunique()

179

#### Why are there multiple rows per `analysis_name`?

In [60]:
# First five entries of the per-name row counts.
df["analysis_name"].value_counts().head()

City of Banning                    1
Yuba-Sutter Transit Authority      1
City of Lompoc                     1
Redwood Coast Transit Authority    1
Anaheim Transportation Network     1
Name: analysis_name, dtype: int64

In [61]:
# Spot-check the bridge rows whose analysis name mentions "Obispo".
df[df["analysis_name"].str.contains("Obispo")]

Unnamed: 0,analysis_name,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
26,San Luis Obispo Regional Transit Authority,San Luis Obispo,5,San Luis Obispo / Santa Barbara,90156,90156,San Luis Obispo Council of Governments,San Diego Association of Governments


In [62]:
# Spot-check the bridge rows whose analysis name mentions "Alhambra".
df[df["analysis_name"].str.contains("Alhambra")]

Unnamed: 0,analysis_name,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
90,City of Alhambra,Los Angeles,7,Los Angeles / Ventura,90247,90247,Southern California Association of Governments,Southern California Association of Governments


### fct_monthly_operator_summary

In [63]:
# GCS location of the raw `fct_monthly_operator_summary` parquet export read in the next cell.
monthly_operator_summary_url = "gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/fct_monthly_operator_summary_2025_12.parquet"

In [64]:
# Load the operator summary and drop the three `*_base64_url` columns before inspecting it.
base64_url_cols = ["vp_base64_url", "schedule_base64_url", "tu_base64_url"]
monthly_operator_summary_df = pd.read_parquet(monthly_operator_summary_url).drop(
    columns=base64_url_cols
)

In [65]:
# Show one row, transposed so every column is readable.
# `random_state` is fixed so the same row is displayed on every Restart & Run All.
monthly_operator_summary_df.sample(random_state=0).T

Unnamed: 0,1935
month,8
year,2025
month_first_day,2025-08-01 00:00:00
schedule_name,SunLine Avail Schedule
vp_name,SunLine Vehicle Positions
tu_name,SunLine Trip Updates
day_type,Sunday
analysis_name,SunLine Transit Agency
n_trips,1770
daily_trips,354.00


In [66]:
# Attach the district / NTD attributes from `df` to the operator summary rows.
# `how="outer"` + `indicator=True` keeps unmatched rows from both sides so they can
# be audited through the `_merge` column.
# `validate="m:1"` makes pandas raise a MergeError if `analysis_name` is ever
# duplicated in `df`, instead of silently multiplying summary rows.
monthly_operator_summary_df2 = pd.merge(
    monthly_operator_summary_df,
    df,
    on=["analysis_name"],
    how="outer",
    indicator=True,
    validate="m:1",
)

In [67]:
# How many rows matched on both sides versus only one side of the merge.
monthly_operator_summary_df2["_merge"].value_counts()

both          2512
left_only     1055
right_only      93
Name: _merge, dtype: int64

In [68]:
# Analysis names that exist in the bridge table but have no operator summary rows.
(
    monthly_operator_summary_df2.loc[
        monthly_operator_summary_df2["_merge"] == "right_only", ["analysis_name"]
    ]
    .drop_duplicates()
    .sort_values(by=["analysis_name"])
)

Unnamed: 0,analysis_name
3644,Amador Regional Transit System
3612,Amtrak
3591,Angel Island-Tiburon Ferry Company
3598,Basin Transit
3593,Calaveras Transit Agency
3628,Chemehuevi Indian Tribe
3609,City of Alhambra
3641,City of Artesia
3652,City of Arvin
3648,City of Baldwin Park


#### Mountain Transit is missing an `analysis_name` value.

In [69]:
# One example row for schedules whose name contains "Mountain".
# `na=False` treats a missing `schedule_name` as a non-match, so the boolean mask
# never contains NaN; `random_state` pins the displayed row across re-runs.
monthly_operator_summary_df.loc[
    monthly_operator_summary_df["schedule_name"].str.contains("Mountain", na=False)
].sample(random_state=0)

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
929,1,2025,2025-01-01,Mountain Transit GMV Schedule,Mountain Transit GMV VehiclePositions,Mountain Transit GMV TripUpdate,Saturday,,508,127.0,42217.9,5.0,19.0,124.0,4,3.0,459,114.8,0.91,1.0,0.01,1.7,461,115.3,0.91,1.0,0.04


#### Roseville Transit is also missing an `analysis_name` value.

In [70]:
# One example row for schedules whose name contains "Roseville".
# `na=False` treats a missing `schedule_name` as a non-match, so the boolean mask
# never contains NaN; `random_state` pins the displayed row across re-runs.
monthly_operator_summary_df.loc[
    monthly_operator_summary_df["schedule_name"].str.contains("Roseville", na=False)
].sample(random_state=0)

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
577,9,2025,2025-09-01,Roseville Transit TripShot Schedule,Roseville Transit TripShot VehiclePositions,Roseville Transit TripShot TripUpdates,Saturday,,96,48.0,11227.5,5.0,17.0,135.0,2,3.0,96,48.0,1.0,1.0,0.01,3.0,96,48.0,1.0,1.0,0.08


In [82]:
# df.loc[df.analysis_name.str.contains("Roseville")]

#### Yolobus is also missing an `analysis_name` value, even though the bridge table has "Yolo County Transportation District".

In [71]:
# One example row for schedules whose name contains "Yolo".
# `na=False` treats a missing `schedule_name` as a non-match, so the boolean mask
# never contains NaN; `random_state` pins the displayed row across re-runs.
monthly_operator_summary_df.loc[
    monthly_operator_summary_df["schedule_name"].str.contains("Yolo", na=False)
].sample(random_state=0)

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
249,8,2025,2025-08-01,Yolobus Schedule,Yolobus Vehicle Positions,Yolobus TripUpdates,Saturday,,500,125.0,148993.7,7.0,8.0,223.0,4,3.0,486,121.5,0.97,1.0,0.0,2.5,500,125.0,1.0,1.0,0.02


In [81]:
# Bridge-table rows whose analysis name mentions "Yolo", for comparison with the summary table.
df[df["analysis_name"].str.contains("Yolo")]

Unnamed: 0,analysis_name,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
28,Yolo County Transportation District,Yolo,3,Marysville / Sacramento,90090,90090,Sacramento Area Council of Governments,Sacramento Area Council of Governments


#### Alhambra and Needles have no rows in `fct_monthly_operator_summary` at all (no `schedule_name` matches).

In [72]:
# Operator summary rows whose schedule name mentions "Alhambra".
alhambra_mask = monthly_operator_summary_df["schedule_name"].str.contains("Alhambra")
monthly_operator_summary_df[alhambra_mask]

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours


In [73]:
# Operator summary rows whose schedule name mentions "Needles".
needles_mask = monthly_operator_summary_df["schedule_name"].str.contains("Needles")
monthly_operator_summary_df[needles_mask]

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours


In [74]:
# Count the distinct analysis names that appear only in the bridge table.
monthly_operator_summary_df2.loc[
    monthly_operator_summary_df2["_merge"] == "right_only", "analysis_name"
].nunique()

93

### fct_monthly_routes  

In [75]:
# GCS location of the processed `fct_monthly_routes` parquet read in the next cell.
monthly_routes_url = "gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_routes_2025_12.parquet"

In [76]:
# Read the monthly routes parquet from GCS as a GeoDataFrame, passing the access
# token held by the credentials loaded at the top of the notebook.
# NOTE(review): `credentials.token` is presumably only populated once the
# credentials have been refreshed (e.g. by the earlier BigQuery call) — confirm.
gcs_storage_options = {"token": credentials.token}
monthly_routes_gdf = gpd.read_parquet(
    monthly_routes_url,
    storage_options=gcs_storage_options,
)

In [77]:
# Row / column count before merging, as the baseline for the post-merge row count.
monthly_routes_gdf.shape

(76342, 11)

#### Why do the rows increase so dramatically?

In [78]:
# Attach the district / NTD attributes from `df` to the monthly route rows.
# `how="outer"` + `indicator=True` keeps unmatched rows from both sides so they can
# be audited through the `_merge` column.
# `validate="m:1"` makes pandas raise a MergeError if `analysis_name` is ever
# duplicated in `df`; duplicate keys on the right side are what would make the
# row count grow beyond the unmatched `right_only` rows.
monthly_routes_gdf2 = pd.merge(
    monthly_routes_gdf,
    df,
    on=["analysis_name"],
    how="outer",
    indicator=True,
    validate="m:1",
)

In [79]:
# How many route rows matched on both sides versus only one side of the merge.
monthly_routes_gdf2["_merge"].value_counts()

both          76227
left_only       115
right_only        6
Name: _merge, dtype: int64