# Download data from warehouse -> add it to our private GCS bucket as a parquet -> clean it -> add it to the public GCS bucket

In [17]:
from datetime import datetime
from functools import cache
from pathlib import Path

import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from calitp_data_analysis.gcs_pandas import GCSPandas

In [18]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [19]:
import publish_public_data
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import (
    GTFS_DATA_DICT,
    analysis_month,
    file_name,
    last_year,
    previous_month,
)

In [20]:
from calitp_data_analysis import geography_utils, utils

In [21]:
import google.auth
import pandas_gbq

credentials, project = google.auth.default()

In [22]:
from calitp_data_analysis.sql import get_engine

# from calitp_data_analysis.tables import tbls

db_engine = get_engine()

In [23]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [24]:
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [25]:
@cache
def gcs_geopandas():
    return GCSGeoPandas()

In [26]:
@cache
def gcs_pandas():
    return GCSPandas()

In [27]:
analysis_month

'2025-12-01'

In [28]:
file_name

'2025_12'

In [29]:
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = "cal-itp-data-infra-staging"
STG_MART = "tiffany_mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

In [30]:
TRANSIT_MART = "mart_transit_database"

In [31]:
PROD_MART = "mart_gtfs_rollup"

## 1) Crosswalk

In [32]:
def load_crosswalk() -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name="cal-itp-data-infra",
        dataset_name="mart_transit_database",
        table_name="bridge_gtfs_analysis_name_x_ntd",
        date_col=None,
    )
    df2 = (
        df.dropna(subset=["ntd_id", "ntd_id_2022"])
        .drop_duplicates(
            subset=["analysis_name", "organization_name", "schedule_gtfs_dataset_name"]
        )
        .reset_index()
    )

    df2 = df2.rename(columns={"schedule_gtfs_dataset_name": "name"})

    df2["caltrans_district_int"] = df2.caltrans_district
    df2.caltrans_district = df2.caltrans_district.apply(lambda x: '{0:0>2}'.format(x)) 
    
    df2["caltrans_district"] = (
        df2.caltrans_district.astype(str) + "-" + df2.caltrans_district_name
    )

    df2 = df2[
        [
            "name",
            "analysis_name",
            "county_name",
            "caltrans_district",
            "caltrans_district_int",
            "ntd_id",
            "ntd_id_2022",
        ]
    ]

    gcs_pandas().data_frame_to_parquet(df2, f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.crosswalk}_{file_name}.parquet")
    
    return df2

In [33]:
crosswalk_df = load_crosswalk()

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


### YML

In [34]:
# yml_df = crosswalk_df[["caltrans_district", "caltrans_district_name", "analysis_name"]]

KeyError: "['caltrans_district_name'] not in index"

## 2) NTD Data

In [35]:
ntd_query_sql = f"""
        SELECT 
        number_of_state_counties,
        primary_uza_name,
        density,
        number_of_counties_with_service,
        state_admin_funds_expended,
        service_area_sq_miles,
        population,
        service_area_pop,
        subrecipient_type,
        primary_uza_code,
        reporter_type,
        organization_type,
        agency_name,
        voms_pt,
        voms_do,
        ntd_id,
        year
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """


mobility_query_sql = f"""
            SELECT
            agency_name,
            counties_served,
            hq_county,
            is_public_entity,
            is_publicly_operating,
            funding_sources,
            on_demand_vehicles_at_max_service,
            vehicles_at_max_service
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """


def load_mobility(query: str) -> pd.DataFrame:
    with db_engine.connect() as connection:
        df = pd.read_sql(query, connection)
    df2 = df.sort_values(
        by=["on_demand_vehicles_at_max_service", "vehicles_at_max_service"],
        ascending=[False, False],
    )
    df3 = df2.groupby("agency_name").first().reset_index()
    return df3


def load_ntd(query: str) -> pd.DataFrame:
    with db_engine.connect() as connection:
        df = pd.read_sql(query, connection)
    df2 = df.sort_values(by=df.columns.tolist(), na_position="last")
    df3 = df2.groupby("agency_name").first().reset_index()
    return df3

In [39]:
def merge_ntd_mobility(ntd_query: str, mobility_query: str) -> pd.DataFrame:
    """
    Merge NTD (dim_annual_ntd_agency_information) with
    mobility providers (dim_mobility_mart_providers)
    and dedupe and keep 1 row per agency.
    """
    ntd = load_ntd(ntd_query)
    mobility = load_mobility(mobility_query)
    crosswalk = load_crosswalk()[["analysis_name", "ntd_id_2022"]]

    m1 = pd.merge(mobility, ntd, how="inner", on="agency_name")

    m1 = m1.drop_duplicates(subset="agency_name").reset_index(drop=True)

    # Wherever possible, allow nullable integers. These columns are integers, but can be
    # missing if we don't find corresponding NTD info
    integrify_cols = [
        "number_of_state_counties",
        "number_of_counties_with_service",
        "service_area_sq_miles",
        "service_area_pop",
        "on_demand_vehicles_at_max_service",
        "vehicles_at_max_service",
        "voms_pt",
        "voms_do",
        "year",
    ]
    m1[integrify_cols] = m1[integrify_cols].astype("Int64")

    # Merge with crosswalk to get analysis_name
    m1 = pd.merge(
        m1, crosswalk, left_on=["ntd_id"], right_on=["ntd_id_2022"], how="inner"
    )

    # m1.columns = m1.columns.str.replace("_", " ").str.title()
    gcs_pandas().data_frame_to_parquet(m1,  f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.ntd_profile}_{file_name}.parquet")

    return m1

In [40]:
ntd_data_df = merge_ntd_mobility(
    ntd_query=ntd_query_sql, mobility_query=mobility_query_sql
)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [41]:
len(ntd_data_df)

158

## 3) schedule_rt_route_direction_summary
* Add Caltrans District to organize 

In [42]:
def load_schedule_rt_route_direction_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction,
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Merge with crosswalk
    crosswalk_df = load_crosswalk()[["name", "analysis_name"]]
    m1 = pd.merge(df, crosswalk_df, on=["name"], how="inner")

    gcs_pandas().data_frame_to_parquet(m1,  f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet")
    
    return m1

In [43]:
schedule_rt_route_direction_summary = load_schedule_rt_route_direction_summary(
    project_name=PROD_PROJECT,
    date_col=MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date=last_year,
    end_date=analysis_month,
    file_name=file_name,
)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [44]:
schedule_rt_route_direction_summary.sample()

Unnamed: 0,name,month_first_day,month,year,day_type,route_name,direction_id,route_type,route_color,route_typology,daily_trips_all_day,daily_stop_arrivals_all_day,daily_distinct_stops_all_day,frequency_all_day,daily_service_hours,daily_flex_service_hours,daily_trips_owl,daily_trips_early_am,daily_trips_am_peak,daily_trips_midday,daily_trips_pm_peak,daily_trips_evening,daily_trips_peak,daily_trips_offpeak,frequency_owl,frequency_early_am,frequency_am_peak,frequency_midday,frequency_pm_peak,frequency_evening,frequency_peak,frequency_offpeak,schedule_base64_url,tu_name,vp_name,schedule_name,tu_base64_url,vp_base64_url,tu_num_distinct_updates,daily_tu_num_distinct_updates,daily_tu_num_skipped_stops,daily_tu_num_canceled_stops,daily_tu_num_added_stops,daily_tu_num_scheduled_stops,n_tu_trips,daily_tu_trips,vp_num_distinct_updates,daily_vp_num_distinct_updates,n_vp_trips,daily_vp_trips,n_rt_trips,n_rt_days,analysis_name
53881,Bay Area 511 Petaluma Schedule,2025-06-01,6,2025,Weekday,10__10 Petaluma Blvd,0,3,B53B2C,bus,12.0,84.0,84.0,0.92,1.8,,0.0,0.0,3.0,5.0,4.0,0.0,7.0,5.0,0.0,0.0,1.0,1.0,0.8,0.0,0.88,0.31,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1QRQ==,Bay Area 511 Petaluma TripUpdates,,Bay Area 511 Petaluma Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L3RyaXB1cGRhdGVzP2FnZW5jeT1QRQ==,,1344,149.33,0.0,0.0,0.0,16.33,21,2.33,,,0,0.0,21,9,City of Petaluma


In [45]:
schedule_rt_route_direction_summary.loc[
    schedule_rt_route_direction_summary.name.str.contains("511 Muni")
].shape

(9063, 53)

In [46]:
crosswalk_df = load_crosswalk()[["name", "analysis_name"]]

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [47]:
crosswalk_df.loc[crosswalk_df.analysis_name.str.contains("San Francisco")]

Unnamed: 0,name,analysis_name
88,Bay Area 511 Muni Schedule,City and County of San Francisco
99,Golden Gate Park Shuttle Schedule,City and County of San Francisco
116,Bay Area 511 Golden Gate Park Shuttle Schedule,City and County of San Francisco
129,Bay Area 511 BART Schedule,San Francisco Bay Area Rapid Transit District


In [48]:
def prep_schedule_rt_route_direction_summary(file_name: str) -> pd.DataFrame:

    df = gcs_pandas().read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet")

    # Prepare data for portfolio
    df2 = df[
        [
            "month_first_day",
            "analysis_name",
            "route_name",
            "direction_id",
            "frequency_all_day",
            "frequency_offpeak",
            "frequency_peak",
            "daily_service_hours",
            "daily_trips_peak",
            "daily_trips_offpeak",
            "daily_trips_all_day",
            "day_type",
            "route_type",
            "route_typology",
        ]
    ]

    # Drop duplicates
    df2 = df2.drop_duplicates().reset_index()

    # Clean columns
    df2.route_typology = df2.route_typology.str.title()
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2["Month First Day"] = pd.to_datetime(df2["Month First Day"]).dt.strftime("%m/%Y")
    df2 = df2.rename(
        columns={
            "Direction Id": "Direction",
            "Month First Day": "Date",
            "Route Name": "Route",
        }
    )

    # Add some new columns
    df2["Daily Service Minutes"] = df2["Daily Service Hours"] * 60
    df2["Average Scheduled Minutes"] = (
        df2["Daily Service Minutes"] / df2["Daily Trips All Day"]
    )
    df2["Average Scheduled Minutes"].describe()
    df2["Headway All Day"] = 60 / df2["Frequency All Day"]
    df2["Headway Peak"] = 60 / df2["Frequency Peak"]
    df2["Headway Offpeak"] = 60 / df2["Frequency Offpeak"]

    # Save
    gcs_pandas().data_frame_to_parquet(df2, f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet")
    
    return df2

In [49]:
schedule_rt_route_direction_summary_df = prep_schedule_rt_route_direction_summary(
    file_name=file_name,
)

In [50]:
schedule_rt_route_direction_summary_df.sample()

Unnamed: 0,Index,Date,Analysis Name,Route,Direction,Frequency All Day,Frequency Offpeak,Frequency Peak,Daily Service Hours,Daily Trips Peak,Daily Trips Offpeak,Daily Trips All Day,Day Type,Route Type,Route Typology,Daily Service Minutes,Average Scheduled Minutes,Headway All Day,Headway Peak,Headway Offpeak
47950,75178,04/2025,Monterey-Salinas Transit,042__42 Salinas - Alisal,0,1.3,0.88,1.5,10.92,12.0,14.0,26.0,Weekday,3,Bus,655.2,25.2,46.15,40.0,68.18


In [51]:
sf = schedule_rt_route_direction_summary_df.loc[
    schedule_rt_route_direction_summary_df["Analysis Name"].str.contains(
        "City and County of San Francisco"
    )
]

In [52]:
sf.shape

(4837, 20)

## 4) fct_monthly_operator_summary
* Add Caltrans District + Legislative District
* Missing many D3 operators: Roseville, EL Dorado, Glenn, Placer, Sac, Thaoe, Truckee, Yuba Sutter

In [53]:
def load_operator_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary,
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Merge with crosswalk
    crosswalk_df = load_crosswalk()[["analysis_name", "caltrans_district"]]

    m1 = pd.merge(df, crosswalk_df, on=["analysis_name"], how="inner")

    m1 = m1.drop_duplicates().reset_index()

    gcs_pandas().data_frame_to_parquet(m1,  f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet")

    return m1

In [54]:
monthly_operator_summary_df = load_operator_summary(
    project_name=PROD_PROJECT,
    date_col=MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date=last_year,
    end_date=analysis_month,
    file_name=file_name,
)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_operator_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [55]:
len(monthly_operator_summary_df.loc[monthly_operator_summary_df.analysis_name.str.contains("San Francisco")])

39

In [56]:
monthly_operator_summary_df = monthly_operator_summary_df.fillna("NA")

In [57]:
def find_ops_monthly_summary(operator:str):
    
    display(monthly_operator_summary_df.loc[monthly_operator_summary_df.schedule_name.str.contains(operator)].shape)
    display(monthly_operator_summary_df.loc[monthly_operator_summary_df.vp_name.str.contains(operator)].shape)
    
    display(monthly_operator_summary_df.loc[monthly_operator_summary_df.tu_name.str.contains(operator)].shape)
    display(monthly_operator_summary_df.loc[monthly_operator_summary_df.analysis_name.str.contains(operator)].shape)

In [58]:
# crosswalk_df.loc[crosswalk_df.caltrans_district.str.contains("11")]

AttributeError: 'DataFrame' object has no attribute 'caltrans_district'

In [59]:
find_ops_monthly_summary("Imperial")

(0, 32)

(0, 32)

(0, 32)

(0, 32)

In [60]:
find_ops_monthly_summary("North County")

(0, 32)

(0, 32)

(0, 32)

(0, 32)

In [61]:
find_ops_monthly_summary("Yuma County")

(0, 32)

(0, 32)

(0, 32)

(0, 32)

In [62]:
crosswalk_df = load_crosswalk()[["analysis_name", "caltrans_district"]]

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [63]:
crosswalk_df.loc[crosswalk_df.caltrans_district.str.contains("12")]

Unnamed: 0,analysis_name,caltrans_district
16,City of Laguna Beach,12-Orange County
28,Orange County Transportation Authority,12-Orange County
54,Anaheim Transportation Network,12-Orange County
158,City of Laguna Beach,12-Orange County


In [64]:
find_ops_monthly_summary("Laguna")

(0, 32)

(0, 32)

(0, 32)

(0, 32)

In [65]:
find_ops_monthly_summary("Anaheim Transportation Network")

(0, 32)

(0, 32)

(0, 32)

(78, 32)

In [66]:
find_ops_monthly_summary("Orange County Transportation Authority")

(0, 32)

(0, 32)

(0, 32)

(39, 32)

In [67]:
crosswalk_df.loc[crosswalk_df.caltrans_district.str.contains("3")]

Unnamed: 0,analysis_name,caltrans_district
27,Yolo County Transportation District,03-Marysville / Sacramento
34,Glenn County,03-Marysville / Sacramento
40,Nevada County,03-Marysville / Sacramento
41,Placer County,03-Marysville / Sacramento
43,Tahoe Transportation District,03-Marysville / Sacramento
49,Yuba-Sutter Transit Authority,03-Marysville / Sacramento
68,"University of California, Davis",03-Marysville / Sacramento
85,Placer County,03-Marysville / Sacramento
86,Town of Truckee,03-Marysville / Sacramento
93,Sacramento County,03-Marysville / Sacramento


In [68]:
find_ops_monthly_summary("Roseville")

(8, 32)

(8, 32)

(8, 32)

(8, 32)

In [69]:
monthly_operator_summary_df[["schedule_name"]].drop_duplicates().sort_values(by = ["schedule_name"])

Unnamed: 0,schedule_name
373,Anaheim Resort Schedule
386,Anaheim Resort Schedule v2
52,Antelope Valley Transit Authority Schedule
681,Arcadia Schedule
694,B-Line Schedule
137,Bay Area 511 AC Transit Schedule
1632,Bay Area 511 ACE Schedule
163,Bay Area 511 Caltrain Schedule
150,Bay Area 511 County Connection Schedule
1658,Bay Area 511 Dumbarton Express Schedule


In [70]:
crosswalk_df = load_crosswalk()[["analysis_name", "caltrans_district"]]

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [71]:
crosswalk_df.loc[crosswalk_df.caltrans_district.str.contains("Marysville")][["analysis_name"]].drop_duplicates()

Unnamed: 0,analysis_name
27,Yolo County Transportation District
34,Glenn County
40,Nevada County
41,Placer County
43,Tahoe Transportation District
49,Yuba-Sutter Transit Authority
68,"University of California, Davis"
86,Town of Truckee
93,Sacramento County
108,Sacramento Regional Transit District


In [72]:
monthly_operator_summary_df.loc[monthly_operator_summary_df.caltrans_district.str.contains("Marysville")][["analysis_name"]].drop_duplicates()

Unnamed: 0,analysis_name
51,"University of California, Davis"
399,Yolo County Transportation District
414,City of Roseville
457,Nevada County
598,City of Elk Grove
607,Sacramento Regional Transit District
694,Butte County Association of Governments


In [73]:
len(monthly_operator_summary_df.drop_duplicates())

2362

In [76]:
def prep_operator_summary(file_name: str) -> pd.DataFrame:

    df = gcs_pandas().read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet")

    # Prepare data for portfolio
    df2 = df[
        [
            "month_first_day",
            "analysis_name",
            "caltrans_district",
            "vp_name",
            "tu_name",
            "n_trips",
            "day_type",
            "daily_trips",
            "ttl_service_hours",
            "n_routes",
            "n_days",
            "n_shapes",
            "n_stops",
            "vp_messages_per_minute",
            "n_vp_trips",
            "daily_vp_trips",
            "pct_vp_trips",
            "n_vp_routes",
            "pct_vp_service_hours",
            "tu_messages_per_minute",
            "n_tu_trips",
            "daily_tu_trips",
            "pct_tu_trips",
            "n_tu_routes",
            "pct_tu_service_hours",
        ]
    ]
    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(
        columns={
            "Month First Day": "Date",
        }
    )
    df2.columns = df2.columns.str.replace("Vp", "VP").str.replace("Tu", "TU")

    # Create a couple of new columns
    df2["Percent of Trips with Trip Updates"] = (
        df2["N TU Trips"] / df2["N Trips"]
    ) * 100

    df2["Percent of Trips with Vehicle Positions"] = (
        df2["N VP Trips"] / df2["N Trips"]
    ) * 100

    """
    df2["Percent of Trips with Vehicle Positions"] = df2[
        "Percent of Trips with Vehicle Positions"
    ].clip(upper=100.0)
    df2["Percent of Trips with Trip Updates"] = df2[
        "Percent of Trips with Trip Updates"
    ].clip(upper=100.0)
    """
    # Save
    gcs_pandas().data_frame_to_parquet(df2,  f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet")
    
    return df2

In [77]:
monthly_operator_summary_clean = prep_operator_summary(
    file_name=file_name,
)

In [78]:
monthly_operator_summary_clean.head(1)

Unnamed: 0,Date,Analysis Name,Caltrans District,VP Name,TU Name,N Trips,Day Type,Daily Trips,Ttl Service Hours,N Routes,N Days,N Shapes,N Stops,VP Messages Per Minute,N VP Trips,Daily VP Trips,Pct VP Trips,N VP Routes,Pct VP Service Hours,TU Messages Per Minute,N TU Trips,Daily TU Trips,Pct TU Trips,N TU Routes,Pct TU Service Hours,Percent of Trips with Trip Updates,Percent of Trips with Vehicle Positions
0,2025-10-01,San Luis Obispo Regional Transit Authority,05-San Luis Obispo / Santa Barbara,SLO Vehicle Positions,SLO Trip Updates,220,Saturday,55.0,168.3,4.0,4,4.0,97.0,3.0,203,50.8,0.93,1.0,0.92,3.0,203,50.8,0.93,1.0,0.92,92.27,92.27


In [79]:
len(monthly_operator_summary_clean)

2362

## 5) fct_monthly_routes 
* Add Caltrans District + Legislative District

In [80]:
def load_fct_monthly_routes(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    gdf = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.route_map,
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
        geom_col="pt_array",
        geom_type="line",
    )

    # Merge with crosswalk
    crosswalk_df = load_crosswalk()[["name", "analysis_name", "caltrans_district"]]
    m1 = pd.merge(gdf, crosswalk_df, on=["name"], how="inner")

    utils.geoparquet_gcs_export(
        gdf=m1,
        gcs_file_path=f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/",
        file_name=f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}",
    )
    return m1

In [81]:
monthly_routes_gdf = load_fct_monthly_routes(
    project_name=PROD_PROJECT,
    date_col=MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date=previous_month,
    end_date=analysis_month,
    file_name=file_name,
)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_routes` WHERE month_first_day >= DATE('2025-11-01') AND month_first_day <= DATE('2025-12-01')
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [82]:

def prep_fct_monthly_routes(file_name: str) -> pd.DataFrame:
    """
    Prepare monthly route data by reading from GCS, cleaning, and exporting.
    """
    gdf = gpd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/"
        f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet",
        storage_options={"token": credentials.token},
    )

    # Keep only the most recent route geography
    gdf2 = (
        gdf.sort_values(
            by=["month_first_day", "analysis_name", "route_name"],
            ascending=[False, True, True],
        )
        .drop_duplicates(subset=["analysis_name", "route_name"])
    )

    # Drop unnecessary columns
    gdf2 = gdf2.drop(columns=["shape_id", "shape_array_key", "n_trips", "direction_id"])

    # Convert to miles
    gdf2["route_length_miles"] = (
        gdf2.geometry.to_crs(geography_utils.CA_NAD83Albers_ft).length / 5_280
    )

    # Clean column names
    gdf2.columns = gdf2.columns.str.replace("_", " ").str.title()

    # Export to GCS
    utils.geoparquet_gcs_export(
        gdf=gdf2,
        gcs_file_path=f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/",
        file_name=f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}",
    )



In [83]:
monthly_routes_gdf = prep_fct_monthly_routes(
    project_name=PROD_PROJECT,
    date_col=MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date=previous_month,
    end_date=analysis_month,
    file_name=file_name,
)

In [84]:
monthly_routes_gdf["Caltrans District"].unique()

array(['04-Bay Area / Oakland', '10-Stockton', '12-Orange County',
       '07-Los Angeles / Ventura', '08-San Bernardino / Riverside',
       '03-Marysville / Sacramento', '06-Fresno / Bakersfield',
       '05-San Luis Obispo / Santa Barbara', '09-Bishop', '01-Eureka',
       '11-San Diego', '02-Redding'], dtype=object)

In [85]:
monthly_routes_gdf.columns

Index(['Name', 'Year', 'Month', 'Month First Day', 'Route Name', 'Route Type',
       'Geometry', 'Analysis Name', 'Caltrans District', 'Route Length Miles'],
      dtype='object')

In [86]:
gdf = gpd.read_parquet(
    f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet",
    storage_options={"token": credentials.token},
)

In [87]:
gdf.columns

Index(['Name', 'Year', 'Month', 'Month First Day', 'Route Name', 'Route Type',
       'Geometry', 'Analysis Name', 'Caltrans District', 'Route Length Miles'],
      dtype='object')

In [91]:
# gdf.loc[gdf["Analysis Name"].str.contains("San Francisco")].drop(columns=["Geometry"])

In [89]:
# gdf_og.loc[gdf_og.route_mile > 192][["name","route_name","geometry"]].explore()

## 5) fct_operator_hourly_summary

In [92]:
def load_fct_operator_hourly_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary,
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    gcs_pandas().data_frame_to_parquet(df, f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet")
   
    return df

In [93]:
fct_operator_hourly_summary = load_fct_operator_hourly_summary(
    project_name=PROD_PROJECT,
    date_col=MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date=last_year,
    end_date=analysis_month,
    file_name=file_name,
)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_operator_hourly_summary` WHERE month_first_day >= DATE('2024-12-01') AND month_first_day <= DATE('2025-12-01')


In [94]:
def prep_fct_operator_hourly_summary(file_name: str) -> pd.DataFrame:

    df = gcs_pandas().read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet")
    
    # Prepare data
    df2 = (
        df.groupby(["analysis_name", "month_first_day", "day_type", "departure_hour"])
        .agg({"n_trips": "sum"})
        .reset_index()
    )

    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(columns={"Month First Day": "Date"})

    df2["Date"] = df2["Date"].dt.strftime("%m-%Y")

    gcs_pandas().data_frame_to_parquet(df2, f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet")
    
    return df2

In [95]:
clean_fct_operator_hourly_summary_df = prep_fct_operator_hourly_summary(
    file_name=file_name,
)

## Publish everything to GCS

### `gtfs_digest/publish_public_data.py`

In [144]:
from typing import Literal

In [130]:
GCS = GTFS_DATA_DICT["gtfs_digest_rollup"].dir
    

In [115]:
GTFS_DATA_DICT["gtfs_digest_rollup"].

{'dir': '${gcs_paths.DIGEST_GCS}', 'schedule_rt_route_direction': 'fct_monthly_schedule_rt_route_direction_summary', 'route_map': 'fct_monthly_routes', 'operator_summary': 'fct_monthly_operator_summary', 'hourly_day_type_summary': 'fct_operator_hourly_summary', 'ntd_profile': 'ntd_profile', 'crosswalk': 'crosswalk'}

In [140]:
df_file_keys = [
        "schedule_rt_route_direction",
        "operator_summary",
        "hourly_day_type_summary",
    ]

In [141]:
gdf_file_keys = [
        "route_map"]

In [159]:
def grab_filepaths(
    table_section: Literal["gtfs_digest_rollup"], 
    file_keys: list,
    file_name: str) -> list:
    """
    https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
    
    table_section corresponds to "schedule_tables", "digest_tables", 
    "speeds_tables", etc
    """
    GCS = GTFS_DATA_DICT[table_section].dir
    
    file_paths = [GTFS_DATA_DICT[table_section][f] for f in file_keys]
    
    return [f"{GCS}processed/{f}_{file_name}.parquet" for f in file_paths]

In [160]:
df_file_paths = grab_filepaths("gtfs_digest_rollup", df_file_keys, file_name)

In [161]:
df_file_paths

['gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_schedule_rt_route_direction_summary_2025_12.parquet',
 'gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_operator_summary_2025_12.parquet',
 'gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_operator_hourly_summary_2025_12.parquet']

In [158]:
f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"

'gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_operator_hourly_summary_2025_12.parquet'

In [162]:
gdf_file_paths = grab_filepaths("gtfs_digest_rollup", gdf_file_keys, file_name)

In [163]:
gdf_file_paths

['gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_routes_2025_12.parquet']

In [164]:
for f in df_file_paths + gdf_file_paths:
        publish_utils.write_to_public_gcs(
            f,
            f"gtfs_digest/{Path(f).name}",
            PUBLIC_GCS
        )

Uploaded gtfs_digest/fct_monthly_schedule_rt_route_direction_summary_2025_12.parquet
Uploaded gtfs_digest/fct_monthly_operator_summary_2025_12.parquet
Uploaded gtfs_digest/fct_operator_hourly_summary_2025_12.parquet
Uploaded gtfs_digest/fct_monthly_routes_2025_12.parquet


In [167]:
def export_parquet_as_csv_or_geojson(
    filename: str,
    filetype: Literal["df", "gdf"],
):
    """
    For parquets, we want to export as csv.
    For geoparquets, we want to export as geojson.
    """
    if filetype=="df":
        df = gcs_pandas().read_parquet(filename)
        df.to_csv(
            f"{PUBLIC_GCS}gtfs_digest/"
            f"{Path(filename).stem}.csv", index=False
        )
        
        
    elif filetype=="gdf":
        df = gpd.read_parquet(filename, storage_options={"token": credentials.token},)
        utils.geojson_gcs_export(
            df,
            f"{PUBLIC_GCS}gtfs_digest/",
            Path(filename).stem,
            geojson_type = "geojson"
        )
        

In [169]:
for f in df_file_paths:
   export_parquet_as_csv_or_geojson(f, filetype="df")

In [170]:
for f in gdf_file_paths:
    publish_public_data.export_parquet_as_csv_or_geojson(f, filetype="gdf")