# Caltrans District + Legislative District
* How to add Caltrans and Legislative District to `fct_monthly_operator_summary` and `fct_monthly_routes`

In [14]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import google.auth
import pandas as pd
import pandas_gbq
import publish_public_data
from calitp_data_analysis import geography_utils
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT

In [15]:
credentials, project = google.auth.default()

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
@cache
def gcs_geopandas():
    return GCSGeoPandas()

## Caltrans District
* Testing `cal-itp-data-infra-staging.tiffany_mart_transit_database.bridge_gtfs_analysis_name_x_ntd` per [Issue 1791](https://github.com/cal-itp/data-analyses/issues/1791)

In [6]:
query_sql = f"""
        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`
    """

In [7]:
query_sql

'\n        SELECT \n            *\n        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`\n    '

In [11]:
project = "cal-itp-data-infra"

In [17]:
df = pandas_gbq.read_gbq(
        query_sql, 
        project_id = project,
        dialect = "standard",
        credentials = credentials
    )

Downloading: 100%|[32m██████████[0m|


In [32]:
df.columns

Index(['organization_name', 'organization_source_record_id',
       'schedule_source_record_id', 'schedule_gtfs_dataset_name',
       'analysis_name', 'regional_feed_type', 'county_name',
       'caltrans_district', 'caltrans_district_name', 'ntd_id', 'ntd_id_2022',
       'rtpa_name', 'mpo_name'],
      dtype='object')

In [35]:
df = df[['analysis_name', 'county_name',
       'caltrans_district', 'caltrans_district_name', 'ntd_id', 'ntd_id_2022',
       'rtpa_name', 'mpo_name']]

In [19]:
df.shape

(208, 13)

In [36]:
df.sample()

Unnamed: 0,analysis_name,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
78,City of Fairfield,Solano,4,Bay Area / Oakland,90092,90092,Metropolitan Transportation Commission,Metropolitan Transportation Commission


In [20]:
df.organization_name.nunique()

184

In [23]:
df.analysis_name.nunique()

179

In [43]:
df.loc[df.analysis_name.str.contains("Alhambra")]

Unnamed: 0,analysis_name,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
90,City of Alhambra,Los Angeles,7,Los Angeles / Ventura,90247,90247,Southern California Association of Governments,Southern California Association of Governments


In [22]:
df.organization_name.value_counts().head()

City of San Luis Obispo                         4
City and County of San Francisco                3
City of Tracy                                   2
Presidio Trust                                  2
Mission Bay Transportation Management Agency    2
Name: organization_name, dtype: int64

### monthly_operator_summary

In [29]:
monthly_operator_summary_url = "gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/fct_monthly_operator_summary_2025_12.parquet"

In [59]:
monthly_operator_summary_df = pd.read_parquet(monthly_operator_summary_url).drop(columns = ["vp_base64_url", "schedule_base64_url", "tu_base64_url"])

In [62]:
monthly_operator_summary_df.sample().T

Unnamed: 0,1234
month,1
year,2025
month_first_day,2025-01-01 00:00:00
schedule_name,GET Schedule
vp_name,GET VehiclePositions
tu_name,GET TripUpdates
day_type,Sunday
analysis_name,Golden Empire Transit District
n_trips,2200
daily_trips,550.00


In [37]:
monthly_operator_summary_df2 = pd.merge(
    monthly_operator_summary_df,
    df,
    on = ['analysis_name'],
    how = "outer",
    indicator= True
)

In [38]:
monthly_operator_summary_df2._merge.value_counts()

both          3324
left_only     1055
right_only      99
Name: _merge, dtype: int64

In [54]:
monthly_operator_summary_df2.loc[monthly_operator_summary_df2._merge == "right_only"] [['analysis_name']].drop_duplicates().sort_values(by = ["analysis_name"])

Unnamed: 0,analysis_name
4462,Amador Regional Transit System
4428,Amtrak
4405,Angel Island-Tiburon Ferry Company
4412,Basin Transit
4407,Calaveras Transit Agency
4446,Chemehuevi Indian Tribe
4425,City of Alhambra
4459,City of Artesia
4470,City of Arvin
4466,City of Baldwin Park


### Quite a few operators are missing that appear in the NTD crosswalk. Check `fct_monthly_schedule_route_direction_summary`

#### Mountain View is missing an analysis_name value.

In [64]:
monthly_operator_summary_df.loc[monthly_operator_summary_df.schedule_name.str.contains("Mountain")].sample()

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
3127,9,2025,2025-09-01,Mountain View Community Shuttle Schedule,Mountain View Community Shuttle VehiclePositions,Mountain View Community Shuttle TripUpdates,Weekday,,924,44.0,49156.8,2.0,2.0,56.0,21,3.0,924,44.0,1.0,1.0,0.03,3.0,924,44.0,1.0,1.0,0.25


#### Same thing with Roseville

In [70]:
monthly_operator_summary_df.loc[monthly_operator_summary_df.schedule_name.str.contains("Roseville")].sample()

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
2857,10,2025,2025-10-01,Roseville Transit TripShot Schedule,Roseville Transit TripShot VehiclePositions,Roseville Transit TripShot TripUpdates,Weekday,,3819,166.0,669090.1,32.04,52.04,192.04,23,3.0,3344,145.4,0.88,1.0,0.01,3.0,3490,151.7,0.92,1.0,0.07


#### Same thing with YoloBus

In [72]:
monthly_operator_summary_df.loc[monthly_operator_summary_df.schedule_name.str.contains("Yolo")].sample()

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
249,8,2025,2025-08-01,Yolobus Schedule,Yolobus Vehicle Positions,Yolobus TripUpdates,Saturday,,500,125.0,148993.7,7.0,8.0,223.0,4,3.0,486,121.5,0.97,1.0,0.0,2.5,500,125.0,1.0,1.0,0.02


#### Alhambra is just missing

In [61]:
monthly_operator_summary_df.loc[monthly_operator_summary_df.schedule_name.str.contains("Alhambra")]

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours


In [66]:
monthly_operator_summary_df.loc[monthly_operator_summary_df.schedule_name.str.contains("Needles")]

Unnamed: 0,month,year,month_first_day,schedule_name,vp_name,tu_name,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours


In [45]:
monthly_operator_summary_df2.loc[monthly_operator_summary_df2._merge == "right_only"].analysis_name.nunique()

93

### schedule_rt_route_direction_summary

In [73]:
sched_route_url = "gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/schedule_rt_route_direction_summary_2025_12.parquet"

In [74]:
sched_route_df = pd.read_parquet(sched_route_url)

In [76]:
sched_route_df = sched_route_df.pipe(portfolio_utils.standardize_portfolio_organization_names)

In [77]:
sched_route_df.sample()

Unnamed: 0,name,month_first_day,month,year,day_type,route_name,direction_id,route_type,route_color,route_typology,daily_trips_all_day,daily_stop_arrivals_all_day,daily_distinct_stops_all_day,frequency_all_day,daily_service_hours,daily_flex_service_hours,daily_trips_owl,daily_trips_early_am,daily_trips_am_peak,daily_trips_midday,daily_trips_pm_peak,daily_trips_evening,daily_trips_peak,daily_trips_offpeak,frequency_owl,frequency_early_am,frequency_am_peak,frequency_midday,frequency_pm_peak,frequency_evening,frequency_peak,frequency_offpeak,schedule_base64_url,tu_name,vp_name,schedule_name,tu_base64_url,vp_base64_url,tu_num_distinct_updates,daily_tu_num_distinct_updates,daily_tu_num_skipped_stops,daily_tu_num_canceled_stops,daily_tu_num_added_stops,daily_tu_num_scheduled_stops,n_tu_trips,daily_tu_trips,vp_num_distinct_updates,daily_vp_num_distinct_updates,n_vp_trips,daily_vp_trips,n_rt_trips,n_rt_days,analysis_name,source_record_id
10509,Bay Area 511 San Francisco Bay Ferry Schedule,2025-10-01,10,2025,Weekday,OA__OA Oakland & Alameda,1,4,4fab47,ferry,21.0,47.0,47.0,1.05,9.17,,,1.0,5.0,6.0,6.0,3.0,11.0,10.0,,0.33,1.67,1.2,1.2,0.75,1.38,,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TQg==,Bay Area 511 San Francisco Bay Ferry TripUpdates,Bay Area 511 San Francisco Bay Ferry VehiclePositions,Bay Area 511 San Francisco Bay Ferry Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L3RyaXB1cGRhdGVzP2FnZW5jeT1TQg==,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L3ZlaGljbGVwb3NpdGlvbnM_YWdlbmN5PVNC,9292,4646.0,0.0,0.0,0.0,46.0,42,21.0,3762,1881.0,42,21.0,42,2,San Francisco Bay Ferry and Oakland Alameda Water Shuttle Schedule,recMVgDUdjNM24Ram


In [78]:
sched_route_df2 = pd.merge(
    sched_route_df,
    df,
    on = ['analysis_name'],
    how = "outer",
    indicator= True
)

In [79]:
sched_route_df2._merge.value_counts()

both          21913
right_only       97
left_only         0
Name: _merge, dtype: int64

In [80]:
sched_route_df2.loc[sched_route_df2._merge == "right_only"] [['analysis_name']].drop_duplicates().sort_values(by = ["analysis_name"])

Unnamed: 0,analysis_name
21994,Amador Regional Transit System
21961,Amtrak
21938,Angel Island-Tiburon Ferry Company
21945,Basin Transit
21940,Calaveras Transit Agency
21977,Chemehuevi Indian Tribe
21958,City of Alhambra
21990,City of Artesia
22002,City of Arvin
21998,City of Baldwin Park


In [81]:
sched_route_df2.loc[sched_route_df2.schedule_name.str.contains("Mountain")].sample()

ValueError: Cannot mask with non-boolean array containing NA / NaN values