# Observations from running the operator + Caltrans district websites for the first time since using the new warehouse tables.
* Using these databases
* PROD_PROJECT = "cal-itp-data-infra"
* PROD_MART = "mart_gtfs_rollup"
* MONTH_DATE_COL = "month_first_day"

In [37]:
import geopandas as gpd
import numpy as np
import pandas as pd

In [39]:
from datetime import datetime

In [2]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month, last_year

In [4]:
import google.auth

credentials, project = google.auth.default()

In [5]:
from calitp_data_analysis.sql import get_engine

db_engine = get_engine()

In [6]:
from calitp_data_analysis.gcs_pandas import GCSPandas
from functools import cache

In [7]:
@cache
def gcs_pandas():
    return GCSPandas()

In [8]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [9]:
GCS_PATH = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/"

In [10]:
operator_hourly_summary_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"

## Big Blue Bus # of Routes wrong for Operator + Caltrans Digest
* They are supposed to have 17 routes.
### Operator

In [12]:
analysis_name = 'City of Santa Monica'

In [11]:
schedule_rt_route_direction_summary_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"

In [13]:
schedule_rt_route_direction_summary_df = gcs_pandas().read_parquet(schedule_rt_route_direction_summary_url,
                                                         filters=[[("Analysis Name", "==", analysis_name)]])

In [15]:
schedule_rt_route_direction_summary_df.Date.value_counts()

Date
03/2025    79
01/2026    77
12/2025    77
09/2025    77
11/2025    77
04/2025    77
10/2025    77
05/2025    77
06/2025    75
07/2025    71
08/2025    18
01/2025     4
02/2025     3
Name: count, dtype: int64

In [17]:
schedule_rt_route_direction_summary_df.Route.nunique()

22

In [32]:
analysis_month

'2026-01-01'

In [40]:
dt = datetime.strptime(analysis_month, "%Y-%m-%d")

# Format it as MM/YYYY
analysis_month_for_filtering = dt.strftime("%m/%Y")


In [41]:
analysis_month_for_filtering

'01/2026'

In [42]:
schedule_rt_route_direction_summary_df.loc[schedule_rt_route_direction_summary_df.Date == analysis_month_for_filtering][["Route"]].nunique()

Route    17
dtype: int64

In [21]:
schedule_rt_route_direction_summary_df[["Route", "Date"]].sort_values(by = ["Route", "Date"]).drop_duplicates(subset = ["Route"])

Unnamed: 0,Route,Date
0,14__14 Bundy Drive & Centinela Avenue,01/2026
234,15__15 Barrington Avenue,01/2026
373,16__16 Wilshire Boulevard/Bundy Drive-Marina Del Rey,01/2026
21,17__17 Culver City Station - UCLA,01/2026
29,18__18 UCLA - Marina del Rey,01/2026
50,1__1 Main Street & Santa Monica Boulevard/UCLA,01/2026
83,2__2 Wilshire Boulevard/UCLA,01/2026
89,3807__3 Lincoln Blvd/LAX,01/2025
90,3813__9 Pacific Palisades,01/2025
603,3815__R12 Venice/Westwood Sta/UCLA Rapid,01/2025


In [22]:
og_file = "gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/fct_monthly_schedule_rt_route_direction_summary_2026_01.parquet"

In [23]:
schedule_rt_route_direction_summary_og_df = gcs_pandas().read_parquet(og_file,
                                                         filters=[[("analysis_name", "==", analysis_name)]])

In [26]:
schedule_rt_route_direction_summary_og_df.route_name.nunique()

22

In [25]:
schedule_rt_route_direction_summary_og_df[["route_name", "month_first_day"]].sort_values(by = ["route_name", "month_first_day"]).drop_duplicates(subset = ["route_name"])

Unnamed: 0,route_name,month_first_day
18,14__14 Bundy Drive & Centinela Avenue,2025-03-01
701,15__15 Barrington Avenue,2025-03-01
1025,16__16 Wilshire Boulevard/Bundy Drive-Marina Del Rey,2025-03-01
1051,17__17 Culver City Station - UCLA,2025-03-01
114,18__18 UCLA - Marina del Rey,2025-03-01
152,1__1 Main Street & Santa Monica Boulevard/UCLA,2025-03-01
1193,2__2 Wilshire Boulevard/UCLA,2025-03-01
285,3807__3 Lincoln Blvd/LAX,2025-01-01
286,3813__9 Pacific Palisades,2025-01-01
1526,3815__R12 Venice/Westwood Sta/UCLA Rapid,2025-01-01


### Caltrans District

In [43]:
district = '07-Los Angeles / Ventura'

In [44]:
operator_summary_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"

In [54]:
f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"

'gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_operator_summary_2026_01.parquet'

In [55]:
GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary

'fct_monthly_operator_summary'

In [45]:
operator_df = gcs_pandas().read_parquet(
    operator_summary_url,
    filters=[
        ("Caltrans District", "==", district),
        ("Date", "==", pd.Timestamp(analysis_month)),
        ("Day Type", "==", "Weekday")
    ],
)

In [None]:
operator_df

In [49]:
operator_df.loc[operator_df["Analysis Name"].str.contains("Santa Monica")]

Unnamed: 0,Date,Analysis Name,Caltrans District,VP Name,TU Name,N Trips,Day Type,Daily Trips,Ttl Service Hours,N Routes,N Days,N Shapes,N Stops,VP Messages Per Minute,N VP Trips,Daily VP Trips,Pct VP Trips,Pct VP Service Hours,TU Messages Per Minute,N TU Trips,Daily TU Trips,Pct TU Trips,Pct TU Service Hours
3,2026-01-01,City of Santa Monica,07-Los Angeles / Ventura,Big Blue Bus Clever Vehicle Positions,Big Blue Bus TripUpdates,33038,Weekday,1501.7,26816.2,16.45,22,51.64,889.05,3.0,61756,2807.1,100.0,2.27,2.0,62854,2857.0,100.0,5.13


#### Number of Routes really fluctuates

In [58]:
operator_df2 = gcs_pandas().read_parquet(
    operator_summary_url,
    filters=[
        ("Caltrans District", "==", district),
        ("Day Type", "==", "Weekday")
    ],
)

In [59]:
operator_df2.loc[operator_df2["Analysis Name"].str.contains("Santa Monica")][["N Routes","Day Type","Date"]].sort_values(by = ["Date"])

Unnamed: 0,N Routes,Day Type,Date
1,18.13,Weekday,2025-01-01
2,18.55,Weekday,2025-02-01
0,19.0,Weekday,2025-03-01
31,19.0,Weekday,2025-03-01
37,17.91,Weekday,2025-04-01
28,18.95,Weekday,2025-05-01
35,18.24,Weekday,2025-06-01
36,17.53,Weekday,2025-07-01
30,17.0,Weekday,2025-09-01
34,17.0,Weekday,2025-09-01


## Hourly Summary
* `Departure Hours` is over 24.

In [None]:
operator_hourly_summary_df = gcs_pandas().read_parquet(operator_hourly_summary_url)

In [None]:
operator_hourly_summary_df.sample()

In [None]:
operator_hourly_summary_df["Departure Hour"].unique()

## Crosswalk

In [None]:
crosswalk_url = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet"

crosswalk_df = gcs_pandas().read_parquet(crosswalk_url)

In [None]:
crosswalk_df.shape

In [None]:
crosswalk_df.sample()

In [None]:
crosswalk_df.analysis_name.nunique()

In [None]:
crosswalk_df.analysis_name.value_counts().head()

In [None]:
crosswalk_df.loc[crosswalk_df.analysis_name.str.contains("County of San Francisco")]

## Schedule Route Summary

In [None]:
schedule_rt_route_direction_summary_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"

In [None]:
schedule_rt_route_direction_summary_df = gcs_pandas().read_parquet(schedule_rt_route_direction_summary_url)

In [None]:
schedule_rt_route_direction_summary_df.sample()

In [None]:
schedule_rt_route_direction_summary_df["Analysis Name"].nunique()

In [None]:
PROD_PROJECT = "cal-itp-data-infra"
PROD_MART = "mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

In [None]:
schedule_rt_route_direction_summary_df_og = bq_utils.download_table(
        project_name=PROD_PROJECT,
        dataset_name=PROD_MART,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction,
        date_col=MONTH_DATE_COL,
        start_date=last_year,
        end_date=analysis_month,
    )

In [None]:
schedule_rt_route_direction_summary_df_og.sample()

In [None]:
schedule_rt_route_direction_summary_df_og.columns

In [None]:
schedule_rt_route_direction_summary_df_og.name.nunique()

In [None]:
schedule_rt_route_direction_summary_df_og.vp_name.nunique()

In [None]:
schedule_rt_route_direction_summary_df_og.tu_name.nunique()

In [None]:
f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"

## Operator Summary
* Missing operators

In [None]:
operator_summary_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"

In [None]:
operator_df = gcs_pandas().read_parquet(
    operator_summary_url,
    filters=[
        ("Day Type", "==", "Weekday"),],
)

In [None]:
operator_df.sample()

In [None]:
(operator_df.groupby(['Caltrans District'], 
                      observed=True, group_keys=False)
           .agg({
               "Analysis Name": "nunique",
               "VP Name":"nunique",
               "TU Name":"nunique"
           }))

In [None]:
crosswalk_df.sample()

In [None]:
crosswalk_df.groupby("caltrans_district").agg({"analysis_name":"nunique",
                                              "name":"nunique"})

In [None]:
op_summary_og_df = bq_utils.download_table(
        project_name=PROD_PROJECT,
        dataset_name=PROD_MART,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary,
        date_col=MONTH_DATE_COL,
        start_date=last_year,
        end_date=analysis_month,
    )

In [None]:
op_summary_og_df.sample()

### Some values are missing after joining the crosswalk.

In [None]:
op_summary_og_df.analysis_name.nunique()

In [None]:
original_analysis_names = set(list(op_summary_og_df.analysis_name.unique()))

In [None]:
operator_df["Analysis Name"].nunique()

In [None]:
cleaned_analysis_names = set(list(operator_df["Analysis Name"].unique()))

In [None]:
original_analysis_names-cleaned_analysis_names 