# Download data from warehouse -> add it to our private GCS bucket as a parquet -> clean it -> add it to the public GCS bucket

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

In [2]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [3]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import (
    GTFS_DATA_DICT,
    analysis_month,
    file_name,
    last_year,
    previous_month,
)

In [4]:
from calitp_data_analysis import geography_utils, utils

In [5]:
import google.auth

credentials, project = google.auth.default()

In [6]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [7]:
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [8]:
@cache
def gcs_geopandas():
    return GCSGeoPandas()

In [9]:
analysis_month

'2025-11-05'

In [10]:
file_name

'2025_11'

In [11]:
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = "cal-itp-data-infra-staging"
STG_MART = "tiffany_mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

In [12]:
TRANSIT_MART = "mart_transit_database"

In [13]:
PROD_MART = "mart_gtfs_rollup"

## 1) Crosswalk

In [14]:
def load_crosswalk(project_name:str,
                  dataset_name:str,)->pd.DataFrame:
    df = bq_utils.download_table(
    project_name= "cal-itp-data-infra",
    dataset_name= "mart_transit_database",
    table_name = "bridge_gtfs_analysis_name_x_ntd",
    date_col = None,
  )
    df2 = (df
        .dropna(subset = ["ntd_id","ntd_id_2022"])
        .drop_duplicates(subset = ["analysis_name","organization_name"])
        .reset_index()
         )
    return df2

In [15]:
crosswalk_df = load_crosswalk(project_name = PROD_PROJECT,
                             dataset_name = TRANSIT_MART)

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [16]:
crosswalk_df.sample()

Unnamed: 0,index,organization_name,organization_source_record_id,schedule_source_record_id,schedule_gtfs_dataset_name,analysis_name,regional_feed_type,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
35,77,North County Transit District,recRBcrX4ZvTyvSnm,recltDJn10nOSilgD,North County Schedule,North County Transit District,,San Diego,11,San Diego,90030,90030,San Diego Association of Governments,San Diego Association of Governments


In [17]:
crosswalk_df.analysis_name.value_counts().head()

Humboldt Transit Authority                                                                            2
San Luis Obispo Regional Transit Authority                                                            2
Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)    2
Peninsula Corridor Joint Powers Board                                                                 1
Riverside Transit Agency                                                                              1
Name: analysis_name, dtype: int64

In [18]:
crosswalk_df.loc[crosswalk_df.analysis_name == "Humboldt Transit Authority"].T

Unnamed: 0,19,137
index,39,483
organization_name,Humboldt Transit Authority,City of Arcata
organization_source_record_id,recaa3naoNR4a5RsJ,recaaoqEDvwhcmIVT
schedule_source_record_id,recrGgXZxqm3dOPH5,recGTiyx7VcxcUkRu
schedule_gtfs_dataset_name,Humboldt Flex,Humboldt Schedule
analysis_name,Humboldt Transit Authority,Humboldt Transit Authority
regional_feed_type,,
county_name,Humboldt,Humboldt
caltrans_district,1,1
caltrans_district_name,Eureka,Eureka


## 2) schedule_rt_route_direction_summary
* Add Caltrans District to organize 

In [19]:
def load_schedule_rt_route_direction_summary(
    project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_monthly_schedule_rt_route_direction_summary",
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
    )

    
    df.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"
    )

    return df

In [37]:
schedule_rt_route_direction_summary = load_schedule_rt_route_direction_summary(
    project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= last_year,
    end_date=analysis_month,
    file_name = file_name
)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2024-11-13') AND month_first_day <= DATE('2025-11-05')


In [21]:
portfolio_utils.standardize_portfolio_organization_names??

[0;31mSignature:[0m
[0mportfolio_utils[0m[0;34m.[0m[0mstandardize_portfolio_organization_names[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mstandardize_portfolio_organization_names[0m[0;34m([0m[0mdf[0m[0;34m:[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m)[0m [0;34m->[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mportfolio_name_df[0m [0;34m=[0m [0mload_portfolio_names[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0;31m# Map the preferred organization name using schedule_gtfs_dataset_name.[0m[0;34m[0m
[0;34m[0m    [0mm1[0m [0;34m=[0m [0mpd[

In [22]:
def prep_schedule_rt_route_direction_summary(
    file_name:str
) ->pd.DataFrame:

    df = pd.read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet")
    
    # Prepare data for portfolio
    df2 = df[
        [
            "month_first_day",
            "name",
            "route_name",
            "direction_id",
            "frequency_all_day",
            "frequency_offpeak",
            "frequency_peak",
            "daily_service_hours",
            "daily_trips_peak",
            "daily_trips_offpeak",
            "daily_trips_all_day",
            "day_type",
            "route_type",
            "route_typology",
        ]
    ]
    # Temporarily add analysis_name
    df2 = df2.pipe(portfolio_utils.standardize_portfolio_organization_names)
    df2 = df2.drop_duplicates()

    # Clean columns
    df2.route_typology = df2.route_typology.str.title()
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2["Month First Day"] = pd.to_datetime(df2["Month First Day"]).dt.strftime("%B %Y")
    df2 = df2.rename(
        columns={
            "Direction Id": "Direction",
            "Month First Day": "Date",
            "Route Name": "Route",
        }
    )

    # Add some new columns
    df2["Daily Service Minutes"] = df2["Daily Service Hours"] * 60
    df2["Average Scheduled Minutes"] = (
        df2["Daily Service Minutes"] / df2["Daily Trips All Day"]
    )
    df2["Average Scheduled Minutes"].describe()
    df2["Headway All Day"] = 60 / df2["Frequency All Day"]
    df2["Headway Peak"] = 60 / df2["Frequency Peak"]
    df2["Headway Offpeak"] = 60 / df2["Frequency Offpeak"]

    # Save
    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"
    )
    
    return df2

In [23]:
schedule_rt_route_direction_summary_df = prep_schedule_rt_route_direction_summary(
    file_name = file_name,
)

In [24]:
f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"

'gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_schedule_rt_route_direction_summary_2025_11.parquet'

## 3) fct_monthly_operator_summary
* Add Caltrans District + Legislative District

In [25]:
def load_operator_summary(
    project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_monthly_schedule_rt_route_direction_summary",
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
    )

    
    df.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"
    )

    return df

In [39]:
monthly_operator_summary_df = load_operator_summary(project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= last_year,
    end_date=analysis_month,
    file_name = file_name,)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2024-11-13') AND month_first_day <= DATE('2025-11-05')


In [27]:
monthly_operator_summary_df.columns

Index(['name', 'month_first_day', 'month', 'year', 'day_type', 'route_name',
       'direction_id', 'route_type', 'route_color', 'route_typology',
       'daily_trips_all_day', 'daily_stop_arrivals_all_day',
       'daily_distinct_stops_all_day', 'frequency_all_day',
       'daily_service_hours', 'daily_flex_service_hours', 'daily_trips_owl',
       'daily_trips_early_am', 'daily_trips_am_peak', 'daily_trips_midday',
       'daily_trips_pm_peak', 'daily_trips_evening', 'daily_trips_peak',
       'daily_trips_offpeak', 'frequency_owl', 'frequency_early_am',
       'frequency_am_peak', 'frequency_midday', 'frequency_pm_peak',
       'frequency_evening', 'frequency_peak', 'frequency_offpeak',
       'schedule_base64_url', 'tu_name', 'vp_name', 'schedule_name',
       'tu_base64_url', 'vp_base64_url', 'tu_num_distinct_updates',
       'daily_tu_num_distinct_updates', 'daily_tu_num_skipped_stops',
       'daily_tu_num_canceled_stops', 'daily_tu_num_added_stops',
       'daily_tu_num_schedul

In [28]:
monthly_operator_summary_df.sample()

Unnamed: 0,name,month_first_day,month,year,day_type,route_name,direction_id,route_type,route_color,route_typology,daily_trips_all_day,daily_stop_arrivals_all_day,daily_distinct_stops_all_day,frequency_all_day,daily_service_hours,daily_flex_service_hours,daily_trips_owl,daily_trips_early_am,daily_trips_am_peak,daily_trips_midday,daily_trips_pm_peak,daily_trips_evening,daily_trips_peak,daily_trips_offpeak,frequency_owl,frequency_early_am,frequency_am_peak,frequency_midday,frequency_pm_peak,frequency_evening,frequency_peak,frequency_offpeak,schedule_base64_url,tu_name,vp_name,schedule_name,tu_base64_url,vp_base64_url,tu_num_distinct_updates,daily_tu_num_distinct_updates,daily_tu_num_skipped_stops,daily_tu_num_canceled_stops,daily_tu_num_added_stops,daily_tu_num_scheduled_stops,n_tu_trips,daily_tu_trips,vp_num_distinct_updates,daily_vp_num_distinct_updates,n_vp_trips,daily_vp_trips,n_rt_trips,n_rt_days
3013,Bay Area 511 Regional Schedule,2025-10-01,10,2025,Saturday,SC:31__31 Evergreen Valley Coll - Eastridge,0,3,29588c,bus,12.0,192.0,192.0,0.92,2.95,,0.0,0.0,2.0,7.0,3.0,0.0,5.0,7.0,0.0,0.0,0.67,1.4,0.6,0.0,0.63,0.44,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1SRw==,,Bay Area 511 Regional VehiclePositions,Bay Area 511 Regional Schedule,,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L3ZlaGljbGVwb3NpdGlvbnM_YWdlbmN5PVJH,,,,,,,0,0.0,552,552.0,12,12.0,12,1


In [29]:
def prep_operator_summary(file_name:str) -> pd.DataFrame:

    df = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"
    )
    
    # Prepare data for portfolio
    df2 = df[
        [
            "month_first_day",
            "analysis_name",
            "vp_name",
            "tu_name",
            "n_trips",
            "day_type",
            "daily_trips",
            "ttl_service_hours",
            "n_routes",
            "n_days",
            "n_shapes",
            "n_stops",
            "vp_messages_per_minute",
            "n_vp_trips",
            "daily_vp_trips",
            "pct_vp_trips",
            "n_vp_routes",
            "pct_vp_service_hours",
            "tu_messages_per_minute",
            "n_tu_trips",
            "daily_tu_trips",
            "pct_tu_trips",
            "n_tu_routes",
            "pct_tu_service_hours",
        ]
    ]
    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(
        columns={
            "Month First Day": "Date",
        }
    )
    df2.columns = df2.columns.str.replace("Vp", "VP").str.replace("Tu", "TU")

    # Create a couple of new columns
    df2["Percent of Trips with Trip Updates"] = (
        df2["N TU Trips"] / df2["N Trips"]
    ) * 100

    df2["Percent of Trips with Vehicle Positions"] = (
        df2["N VP Trips"] / df2["N Trips"]
    ) * 100

    """
    df2["Percent of Trips with Vehicle Positions"] = df2[
        "Percent of Trips with Vehicle Positions"
    ].clip(upper=100.0)
    df2["Percent of Trips with Trip Updates"] = df2[
        "Percent of Trips with Trip Updates"
    ].clip(upper=100.0)
    """
    # Save
    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"
    )
    return df2

In [30]:
monthly_operator_summary_clean = prep_operator_summary(
    file_name = file_name,)

ERROR! Session/line number was not unique in database. History logging moved to new session 2


In [31]:
monthly_operator_summary_clean.head(1)

Unnamed: 0,Date,Analysis Name,VP Name,TU Name,N Trips,Day Type,Daily Trips,Ttl Service Hours,N Routes,N Days,N Shapes,N Stops,VP Messages Per Minute,N VP Trips,Daily VP Trips,Pct VP Trips,N VP Routes,Pct VP Service Hours,TU Messages Per Minute,N TU Trips,Daily TU Trips,Pct TU Trips,N TU Routes,Pct TU Service Hours,Percent of Trips with Trip Updates,Percent of Trips with Vehicle Positions
0,2025-11-01,,County Connection Swiftly Vehicle Position,County Connection Swiftly Trip Update,891,Saturday,297.0,270975.2,11.0,3,19.67,559.0,3.0,872,290.7,0.98,1.0,0.0,3.0,876,292.0,0.98,1.0,0.0,98.32,97.87


## 4) fct_monthly_routes 
* Add Caltrans District + Legislative District

In [32]:
def load_fct_monthly_routes(
    project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str
) -> pd.DataFrame:
    gdf = bq_utils.download_table(
         project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_monthly_routes",
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
        geom_col="pt_array",
        geom_type="line",
    )

   
    utils.geoparquet_gcs_export(gdf = gdf,
                           gcs_file_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/",
                           file_name = f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}")
    return gdf

In [33]:
monthly_routes_gdf = load_operator_summary(project_name=STG_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= previous_month,
    end_date=analysis_month,
    file_name = file_name,)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra-staging`.`mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2025-10-15') AND month_first_day <= DATE('2025-11-05')


In [38]:
# len(monthly_routes_gdf)

3523

In [42]:
def prep_fct_monthly_routes(project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str) -> pd.DataFrame:

    gdf = load_operator_summary(project_name=STG_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= previous_month,
    end_date=analysis_month,
    file_name = file_name,)
    
    
    # Add analysis name
    # Prepare data for portfolio
    gdf = gdf.pipe(portfolio_utils.standardize_portfolio_organization_names)

    # Keep only the most recent route geography
    gdf2 = gdf.sort_values(
        by=["month_first_day", "analysis_name", "route_name"],
        ascending=[False, True, True],
    ).drop_duplicates(subset=["analysis_name", "route_name"])

    gdf2 = gdf2.drop(
        columns=[
            "shape_id",
            "shape_array_key",
            "n_trips",
            "direction_id",
        ]
    )

    # Convert the geometry to line
    # m1 = geo_utils.convert_to_gdf(m1, "pt_array", "line")

    # Clean
    gdf2.columns = gdf2.columns.str.replace("_", " ").str.title()

    utils.geoparquet_gcs_export(gdf = gdf2,
                           gcs_file_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/",
                           file_name = f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}")

    return gdf2

In [43]:
monthly_routes_gdf = prep_fct_monthly_routes(project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= previous_month,
    end_date=analysis_month,
    file_name = file_name,)

OAuthError: ('Error code invalid_grant: Refresh token has expired', '{"error":"invalid_grant","error_description":"Refresh token has expired"}')

## 5) fct_operator_hourly_summary

In [None]:
def load_fct_operator_hourly_summary(project_name: str, 
                           date_col:str, 
                           dataset_name: str, 
                           start_date: str, 
                           end_date: str, 
                           download_date: str) -> pd.DataFrame:
     df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_operator_hourly_summary",
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
    )

    df.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"
    )
    return df

In [None]:
fct_operator_hourly_summary = download_fct_operator_hourly_summary(

    project_name=STG_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= previous_month,
    end_date=analysis_month,
)

In [None]:
def clean_fct_operator_hourly_summary(
                           file_name:str) -> pd.DataFrame:

    df = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"
    )
    # Prepare data
    df2 = (
        df.groupby(["analysis_name", "month_first_day", "day_type", "departure_hour"])
        .agg({"n_trips": "sum"})
        .reset_index()
    )

    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(columns={"Month First Day": "Date"})

    df2["Date"] = df2["Date"].dt.strftime("%m-%Y")

    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"
    )
    return df2

In [None]:
clean_fct_operator_hourly_summary_df = clean_fct_operator_hourly_summary(
    file_name = file_name,)

## Publish everything to GCS

In [None]:
GCS = GTFS_DATA_DICT["digest_tables_ah"].dir

In [None]:
GTFS_DATA_DICT["digest_tables_ah"].schedule_rt_route_direction_summary

In [None]:
file_names = [
    GTFS_DATA_DICT["digest_tables_ah"][f]
    for f in [
        "schedule_rt_route_direction_summary",
        "fct_monthly_operator_summary",
        "fct_operator_hourly_summary",
    ]
]

In [None]:
file_names = [f"{key}_{date_str}.parquet" for key in file_names]

In [None]:
file_names = [f"{GCS}{key}" for key in file_names]

In [None]:
for f in file_names:
    publish_utils.write_to_public_gcs(f, f"gtfs_digest/{Path(f).name}", PUBLIC_GCS)

In [None]:
for f in file_names:
    publish_public_data.export_parquet_as_csv_or_geojson(f, filetype="df")