In [1]:
from functools import cache
from pathlib import Path

import pandas as pd
import google.auth

from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis import utils
from calitp_data_analysis.sql import get_engine

from shared_utils import bq_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, last_year, previous_month

import _prep_crosswalk_ntd  # needed for load_crosswalk()

# Initialize credentials and DB engine
credentials, project = google.auth.default()
db_engine = get_engine()


@cache
def gcs_pandas():
    return GCSPandas()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
PROD_PROJECT = "cal-itp-data-infra"
PROD_MART = "mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

In [5]:
df = bq_utils.download_table(
        project_name=PROD_PROJECT,
        dataset_name=PROD_MART,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary,
        date_col=MONTH_DATE_COL,
        start_date=last_year,
        end_date=analysis_month,
    )

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_operator_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')


In [6]:
crosswalk_url = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet"

In [12]:
crosswalk_df = gcs_pandas().read_parquet(crosswalk_url)[["name","analysis_name","caltrans_district"]]

In [11]:
crosswalk_df.sample()

Unnamed: 0,name,analysis_name,county_name,caltrans_district,caltrans_district_int,ntd_id,ntd_id_2022
83,Palos Verdes PTA Schedule,Palos Verdes Peninsula Transit Authority,Los Angeles,07-Los Angeles / Ventura,7,90287,90287


In [29]:
df.columns

Index(['month', 'year', 'month_first_day', 'schedule_name',
       'schedule_base64_url', 'vp_name', 'vp_base64_url', 'tu_name',
       'tu_base64_url', 'day_type', 'n_trips', 'daily_trips',
       'ttl_service_hours', 'n_routes', 'n_shapes', 'n_stops', 'n_days',
       'vp_messages_per_minute', 'n_vp_trips', 'daily_vp_trips',
       'pct_vp_trips', 'pct_vp_service_hours', 'tu_messages_per_minute',
       'n_tu_trips', 'daily_tu_trips', 'pct_tu_trips', 'pct_tu_service_hours'],
      dtype='object')

In [9]:
df.sample().T

Unnamed: 0,3669
month,5
year,2025
month_first_day,2025-05-01 00:00:00
schedule_name,Lake Schedule
schedule_base64_url,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3Rmcy9sYWtldHJhbnNpdC1jYS11cy9sYWtldHJhbnNpdC1jYS11cy56aXA=
vp_name,Lake Vehicle Positions
vp_base64_url,aHR0cHM6Ly9hcGkuZ29zd2lmdC5seS9yZWFsLXRpbWUvbGFrZS1jb3VudHkvZ3Rmcy1ydC12ZWhpY2xlLXBvc2l0aW9ucw==
tu_name,Lake Trip Updates
tu_base64_url,aHR0cHM6Ly9hcGkuZ29zd2lmdC5seS9yZWFsLXRpbWUvbGFrZS1jb3VudHkvZ3Rmcy1ydC10cmlwLXVwZGF0ZXM=
day_type,Saturday


In [13]:
m1 = pd.merge(df, crosswalk_df, left_on = ["schedule_name"], right_on = ["name"], how="inner").drop_duplicates().reset_index()

In [14]:
m1.shape

(2949, 31)

In [18]:
m1.analysis_name.nunique()

83

In [20]:
crosswalk_df.analysis_name.nunique()

143

In [15]:
df.shape

(4750, 27)

In [16]:
m2 = pd.merge(df, crosswalk_df, left_on = ["schedule_name"], right_on = ["name"], how="inner")

In [17]:
m2.shape

(2949, 30)

In [21]:
def load_fct_operator_hourly_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary,
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Merge with crosswalk
    crosswalk_url = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet"

    crosswalk_df = gcs_pandas().read_parquet(crosswalk_url)[["name", "analysis_name",]]

    m1 = pd.merge(df, crosswalk_df, on="name", how="inner").drop_duplicates().reset_index()
    
    gcs_pandas().data_frame_to_parquet(m1, f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet")
   
    return m1

In [22]:
fct_operator_hourly_summary = load_fct_operator_hourly_summary(
        project_name=PROD_PROJECT,
        date_col=MONTH_DATE_COL,
        dataset_name=PROD_MART,
        start_date=last_year,
        end_date=analysis_month,
        file_name=file_name,
    )

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_operator_hourly_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')


In [23]:
fct_operator_hourly_summary.sample()

Unnamed: 0,index,month_first_day,name,day_type,departure_hour,n_trips,service_hours,flex_service_hours,analysis_name
49214,49214,2025-07-01,Plumas Schedule,Weekday,15,9,8.3,0.0,Plumas Transit Systems


In [25]:
fct_operator_hourly_summary.analysis_name.nunique()

141

In [26]:
fct_operator_hourly_summary.analysis_name.unique()

array(['City of Alhambra', 'Amador Regional Transit System',
       'Anaheim Transportation Network',
       'Antelope Valley Transit Authority', 'City of Arcadia',
       'City of Artesia', 'City of Arvin',
       'Butte County Association of Governments', 'City of Baldwin Park',
       'Alameda-Contra Costa Transit District',
       'San Joaquin Regional Rail Commission',
       'San Francisco Bay Area Rapid Transit District',
       'Peninsula Corridor Joint Powers Board',
       'Central Contra Costa Transit Authority', 'City of Fairfield',
       'Golden Gate Bridge, Highway and Transportation District',
       'City and County of San Francisco',
       'Marin County Transit District', 'City of Petaluma',
       'City of Rio Vista', 'San Mateo County Transit District',
       'Santa Clara Valley Transportation Authority',
       'City of Santa Rosa', 'Solano Transportation Authority',
       'Sonoma-Marin Area Rail Transit District',
       'Eastern Contra Costa Transit Authority'

In [28]:
fct_operator_hourly_summary.loc[fct_operator_hourly_summary.analysis_name.str.contains("San Francisco")].departure_hour.value_counts()

departure_hour
16    150
12    150
15    150
13    150
17    150
14    150
9     126
10    126
11    126
8      78
18     78
1      78
19     78
20     78
22     78
21     78
23     78
7      78
0      78
6      66
5      65
4      52
2      39
3      39
Name: count, dtype: Int64

In [24]:
fct_operator_hourly_summary.departure_hour.unique()

<IntegerArray>
[  18,    6,   15,   13,   12,   14,   11,   10,    9,    8,   16,    7,   17,
    5,    4,    2,    1,    0,    3,   23,   22,   20,   19,   21, <NA>]
Length: 25, dtype: Int64