# Observations from running the operator + Caltrans district websites for the first time since using the new warehouse tables.
* Using these BigQuery settings:
* PROD_PROJECT = "cal-itp-data-infra"
* PROD_MART = "mart_gtfs_rollup"
* MONTH_DATE_COL = "month_first_day"

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month, last_year

In [4]:
import google.auth

# Resolve Google Application Default Credentials and the associated project.
# NOTE(review): neither `credentials` nor `project` is referenced in the cells
# visible here — confirm they are still needed.
credentials, project = google.auth.default()

In [5]:
from calitp_data_analysis.sql import get_engine

# Engine returned by calitp_data_analysis.sql.get_engine().
# NOTE(review): `db_engine` is not referenced in the cells visible here —
# confirm it is still needed.
db_engine = get_engine()

In [6]:
from calitp_data_analysis.gcs_pandas import GCSPandas
from functools import cache

In [7]:
@cache
def gcs_pandas():
    """Return the shared ``GCSPandas`` client, building it on first use.

    ``functools.cache`` memoizes this zero-argument call, so every later call
    returns the same instance instead of constructing a new one.
    """
    client = GCSPandas()
    return client

In [8]:
# Notebook display settings: up to 100 columns, two-decimal floats, and no
# truncation of rows or of cell text.
display_options = {
    "display.max_columns": 100,
    "display.float_format": "{:.2f}".format,
    "display.max_rows": None,
    "display.max_colwidth": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [9]:
# Folder of processed digest outputs; the summary parquet URLs below are built
# by appending a table name and the `file_name` suffix to this prefix.
GCS_PATH = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/"

In [10]:
# Processed hourly / day-type summary parquet for this run's `file_name` suffix.
hourly_summary_stem = GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary
operator_hourly_summary_url = f"{GCS_PATH}{hourly_summary_stem}_{file_name}.parquet"

## Hourly Summary
* `Departure Hour` includes values of 24 and above (up to 31).

In [11]:
# Read the processed hourly summary parquet through the cached GCSPandas client.
operator_hourly_summary_df = gcs_pandas().read_parquet(operator_hourly_summary_url)

In [12]:
operator_hourly_summary_df.sample()

Unnamed: 0,Analysis Name,Date,Day Type,Departure Hour,N Trips
73140,Tahoe Transportation District,11-2025,Saturday,12,10


In [13]:
operator_hourly_summary_df["Departure Hour"].unique()

<IntegerArray>
[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Length: 32, dtype: Int64

## Crosswalk

In [14]:
# Build the crosswalk URL from GCS_PATH, which is already
# f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/", so this cell reads from
# the same folder as the other processed digest files without re-spelling the
# prefix by hand. The resulting URL is unchanged.
crosswalk_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet"

crosswalk_df = gcs_pandas().read_parquet(crosswalk_url)

In [15]:
crosswalk_df.shape

(165, 7)

In [16]:
crosswalk_df.sample()

Unnamed: 0,name,analysis_name,county_name,caltrans_district,caltrans_district_int,ntd_id,ntd_id_2022
59,Mountain Transit Schedule,Mountain Area Regional Transit Authority,San Bernardino,08-San Bernardino / Riverside,8,9R02-91012,91012


In [17]:
crosswalk_df.analysis_name.nunique()

143

In [18]:
crosswalk_df.analysis_name.value_counts().head()

analysis_name
City and County of San Francisco                                                                      3
San Luis Obispo Regional Transit Authority                                                            3
Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)    2
City of Tracy                                                                                         2
Eastern Sierra Transit Authority                                                                      2
Name: count, dtype: int64

In [19]:
crosswalk_df.loc[crosswalk_df.analysis_name.str.contains("County of San Francisco")]

Unnamed: 0,name,analysis_name,county_name,caltrans_district,caltrans_district_int,ntd_id,ntd_id_2022
88,Bay Area 511 Muni Schedule,City and County of San Francisco,San Francisco,04-Bay Area / Oakland,4,90015,90015
99,Golden Gate Park Shuttle Schedule,City and County of San Francisco,San Francisco,04-Bay Area / Oakland,4,90015,90015
116,Bay Area 511 Golden Gate Park Shuttle Schedule,City and County of San Francisco,San Francisco,04-Bay Area / Oakland,4,90015,90015


## Schedule Route Summary

In [20]:
# Processed schedule + RT route-direction summary parquet for this run's
# `file_name` suffix.
route_direction_stem = GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction
schedule_rt_route_direction_summary_url = f"{GCS_PATH}{route_direction_stem}_{file_name}.parquet"

In [21]:
# Read the processed route-direction summary parquet through the cached
# GCSPandas client.
schedule_rt_route_direction_summary_df = gcs_pandas().read_parquet(schedule_rt_route_direction_summary_url)

In [22]:
schedule_rt_route_direction_summary_df.sample()

Unnamed: 0,Index,Date,Analysis Name,Route,Direction,Frequency All Day,Frequency Offpeak,Frequency Peak,Daily Service Hours,Daily Trips Peak,Daily Trips Offpeak,Daily Trips All Day,Day Type,Route Type,Route Typology,Daily Service Minutes,Average Scheduled Minutes,Headway All Day,Headway Peak,Headway Offpeak
29552,45093,06/2025,UCSC and City of Santa Cruz Beach Shuttle,3A__3A Capitola Mall/Live Oak,0,0.85,0.56,1.0,8.63,8.0,9.0,17.0,Sunday,3,Bus,517.8,30.46,70.59,60.0,107.14


In [23]:
schedule_rt_route_direction_summary_df["Analysis Name"].nunique()

82

In [24]:
# Project and dataset (mart) passed to bq_utils.download_table in the cells
# below as project_name / dataset_name.
PROD_PROJECT = "cal-itp-data-infra"
PROD_MART = "mart_gtfs_rollup"
# Column passed as date_col; the download queries filter on it between
# start_date and end_date.
MONTH_DATE_COL = "month_first_day"

In [25]:
# Download the warehouse route-direction table directly, keeping rows whose
# MONTH_DATE_COL falls between `last_year` and `analysis_month`, so it can be
# compared with the processed parquet loaded above.
route_direction_query_args = {
    "project_name": PROD_PROJECT,
    "dataset_name": PROD_MART,
    "table_name": GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction,
    "date_col": MONTH_DATE_COL,
    "start_date": last_year,
    "end_date": analysis_month,
}
schedule_rt_route_direction_summary_df_og = bq_utils.download_table(**route_direction_query_args)

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')


In [26]:
schedule_rt_route_direction_summary_df_og.sample()

Unnamed: 0,name,month_first_day,month,year,day_type,route_name,direction_id,route_type,route_color,route_typology,daily_trips_all_day,daily_stop_arrivals_all_day,daily_distinct_stops_all_day,frequency_all_day,daily_service_hours,daily_flex_service_hours,daily_trips_owl,daily_trips_early_am,daily_trips_am_peak,daily_trips_midday,daily_trips_pm_peak,daily_trips_evening,daily_trips_peak,daily_trips_offpeak,frequency_owl,frequency_early_am,frequency_am_peak,frequency_midday,frequency_pm_peak,frequency_evening,frequency_peak,frequency_offpeak,schedule_base64_url,tu_name,vp_name,schedule_name,tu_base64_url,vp_base64_url,tu_num_distinct_updates,daily_tu_num_distinct_updates,daily_tu_num_skipped_stops,daily_tu_num_canceled_stops,daily_tu_num_added_stops,daily_tu_num_scheduled_stops,n_tu_trips,daily_tu_trips,vp_num_distinct_updates,daily_vp_num_distinct_updates,n_vp_trips,daily_vp_trips,n_rt_trips,n_rt_days
110803,Anaheim Resort Schedule,2025-10-01,10,2025,Saturday,bc404235-c139-4efb-90fb-798fbbddc35c__Grove District Line 03 Grove District Line,,3,96307B,bus,19.75,155.0,138.25,1.04,11.37,,1.0,1.0,6.0,0.0,3.75,8.0,9.75,10.0,0.25,0.33,2.0,0.0,0.75,2.0,1.22,0.63,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuemlwP3JlZ2lvbklkPUNBNTU4RERDLUQ3RjItNEI0OC05Q0FDLURFRUExMTM0RjgyMA==,Anaheim Resort TripUpdates,,Anaheim Resort Schedule,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMvcmVhbHRpbWUvdHJpcFVwZGF0ZS9DQTU1OEREQy1EN0YyLTRCNDgtOUNBQy1ERUVBMTEzNEY4MjA=,,3152,788.0,0.0,0.0,0.0,6.75,6,1.5,,,0,0.0,6,4


In [27]:
schedule_rt_route_direction_summary_df_og.columns

Index(['name', 'month_first_day', 'month', 'year', 'day_type', 'route_name',
       'direction_id', 'route_type', 'route_color', 'route_typology',
       'daily_trips_all_day', 'daily_stop_arrivals_all_day',
       'daily_distinct_stops_all_day', 'frequency_all_day',
       'daily_service_hours', 'daily_flex_service_hours', 'daily_trips_owl',
       'daily_trips_early_am', 'daily_trips_am_peak', 'daily_trips_midday',
       'daily_trips_pm_peak', 'daily_trips_evening', 'daily_trips_peak',
       'daily_trips_offpeak', 'frequency_owl', 'frequency_early_am',
       'frequency_am_peak', 'frequency_midday', 'frequency_pm_peak',
       'frequency_evening', 'frequency_peak', 'frequency_offpeak',
       'schedule_base64_url', 'tu_name', 'vp_name', 'schedule_name',
       'tu_base64_url', 'vp_base64_url', 'tu_num_distinct_updates',
       'daily_tu_num_distinct_updates', 'daily_tu_num_skipped_stops',
       'daily_tu_num_canceled_stops', 'daily_tu_num_added_stops',
       'daily_tu_num_schedul

In [28]:
schedule_rt_route_direction_summary_df_og.name.nunique()

140

In [29]:
schedule_rt_route_direction_summary_df_og.vp_name.nunique()

136

In [30]:
schedule_rt_route_direction_summary_df_og.tu_name.nunique()

133

In [31]:
f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"

'gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/fct_monthly_schedule_rt_route_direction_summary_2025_12.parquet'

## Operator Summary
* Missing operators

In [32]:
# Processed operator summary parquet for this run's `file_name` suffix.
operator_summary_stem = GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary
operator_summary_url = f"{GCS_PATH}{operator_summary_stem}_{file_name}.parquet"

In [33]:
# Only weekday rows are loaded; every later use of operator_df is therefore
# weekday-only.
weekday_only = [("Day Type", "==", "Weekday")]
operator_df = gcs_pandas().read_parquet(operator_summary_url, filters=weekday_only)

In [34]:
operator_df.sample()

Unnamed: 0,Date,Analysis Name,Caltrans District,VP Name,TU Name,N Trips,Day Type,Daily Trips,Ttl Service Hours,N Routes,N Days,N Shapes,N Stops,VP Messages Per Minute,N VP Trips,Daily VP Trips,Pct VP Trips,N VP Routes,Pct VP Service Hours,TU Messages Per Minute,N TU Trips,Daily TU Trips,Pct TU Trips,N TU Routes,Pct TU Service Hours,Percent of Trips with Trip Updates,Percent of Trips with Vehicle Positions
256,2025-12-01,Sonoma-Marin Area Rail Transit District,04-Bay Area / Oakland,Bay Area 511 Sonoma-Marin Area Rail Transit VehiclePositions,Bay Area 511 Sonoma-Marin Area Rail Transit TripUpdates,762,Weekday,38.1,1032.7,1.0,20,2.0,28.0,3.0,762,38.1,1.0,1.0,1.23,3.0,762,38.1,1.0,1.0,1.87,100.0,100.0


In [35]:
# Distinct operators and realtime feed names per Caltrans district in the
# processed operator summary.
operator_name_cols = ["Analysis Name", "VP Name", "TU Name"]
operator_df.groupby(["Caltrans District"], observed=True, group_keys=False).agg(
    {col: "nunique" for col in operator_name_cols}
)

Unnamed: 0_level_0,Analysis Name,VP Name,TU Name
Caltrans District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01-Eureka,1,1,1
02-Redding,1,1,1
03-Marysville / Sacramento,7,7,7
04-Bay Area / Oakland,19,20,20
05-San Luis Obispo / Santa Barbara,5,6,6
06-Fresno / Bakersfield,6,6,6
07-Los Angeles / Ventura,19,21,22
08-San Bernardino / Riverside,4,5,5
09-Bishop,1,1,1
10-Stockton,7,7,7


In [36]:
crosswalk_df.sample()

Unnamed: 0,name,analysis_name,county_name,caltrans_district,caltrans_district_int,ntd_id,ntd_id_2022
112,Bell Gardens Schedule,City of Bell Gardens,Los Angeles,07-Los Angeles / Ventura,7,90253,90253


In [37]:
# Distinct analysis names and schedule feed names per Caltrans district in the
# crosswalk, for comparison with the operator summary counts.
crosswalk_name_cols = ["analysis_name", "name"]
crosswalk_df.groupby("caltrans_district").agg(
    {col: "nunique" for col in crosswalk_name_cols}
)

Unnamed: 0_level_0,analysis_name,name
caltrans_district,Unnamed: 1_level_1,Unnamed: 2_level_1
01-Eureka,5,6
02-Redding,7,9
03-Marysville / Sacramento,14,16
04-Bay Area / Oakland,21,26
05-San Luis Obispo / Santa Barbara,9,11
06-Fresno / Bakersfield,11,11
07-Los Angeles / Ventura,45,48
08-San Bernardino / Riverside,11,11
09-Bishop,1,2
10-Stockton,13,18


In [38]:
# Download the warehouse operator summary table directly, keeping rows whose
# MONTH_DATE_COL falls between `last_year` and `analysis_month`. No day-type
# filter is applied here.
operator_summary_query_args = {
    "project_name": PROD_PROJECT,
    "dataset_name": PROD_MART,
    "table_name": GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary,
    "date_col": MONTH_DATE_COL,
    "start_date": last_year,
    "end_date": analysis_month,
}
op_summary_og_df = bq_utils.download_table(**operator_summary_query_args)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_operator_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')


In [39]:
op_summary_og_df.sample()

Unnamed: 0,month,year,month_first_day,schedule_name,schedule_base64_url,vp_name,vp_base64_url,tu_name,tu_base64_url,day_type,analysis_name,n_trips,daily_trips,ttl_service_hours,n_routes,n_shapes,n_stops,n_days,vp_messages_per_minute,n_vp_trips,daily_vp_trips,pct_vp_trips,n_vp_routes,pct_vp_service_hours,tu_messages_per_minute,n_tu_trips,daily_tu_trips,pct_tu_trips,n_tu_routes,pct_tu_service_hours
3578,10,2025,2025-10-01,Santa Cruz Schedule,aHR0cHM6Ly9kZXZlbG9wZXIuc2NtZXRyby5vcmcvZ3Rmcy56aXA=,Santa Cruz Vehicle Positions,aHR0cHM6Ly9ydC5zY21ldHJvLm9yZy9ndGZzcnQvdmVoaWNsZXM=,Santa Cruz Trip Updates,aHR0cHM6Ly9ydC5zY21ldHJvLm9yZy9ndGZzcnQvdHJpcHM=,Weekday,UCSC and City of Santa Cruz Beach Shuttle,17664,768.0,14839.2,24.0,67.0,751.0,23,3.0,17326,753.3,0.98,1.0,1.31,3.0,17664,768.0,1.0,1.0,2.9


### Some analysis names are missing after joining the crosswalk.

In [40]:
op_summary_og_df.analysis_name.nunique()

86

In [42]:
# Distinct analysis names in the warehouse table (a null name, if present,
# is kept as a member of the set).
original_analysis_names = set(op_summary_og_df["analysis_name"].unique())

In [41]:
operator_df["Analysis Name"].nunique()

72

In [44]:
# Distinct analysis names that survive in the processed operator summary.
cleaned_analysis_names = set(operator_df["Analysis Name"].unique())

In [45]:
# Names present in the warehouse table but absent from the processed file.
# NOTE(review): operator_df was read with a Day Type == "Weekday" filter while
# op_summary_og_df was downloaded without a day-type filter, so a name could
# appear here only because it has no weekday rows — confirm before attributing
# every entry to the crosswalk join.
original_analysis_names.difference(cleaned_analysis_names)

{'Capitol Corridor Joint Powers Authority',
 'City of Banning',
 'City of Beaumont',
 'City of Irvine',
 'City of Rosemead',
 'Emeryville Transportation Management Agency',
 'Mendocino Transit Authority',
 'Mountain View Transportation Management Association',
 None,
 'Presidio Trust',
 'San Diego Metropolitan Transit System, Airport, Flagship Cruises',
 'San Francisco Bay Ferry and Oakland Alameda Water Shuttle Schedule',
 'Sonoma County Transit Schedule',
 'University of California, Berkeley',
 'University of California, Los Angeles'}