# Download data from warehouse -> add it to our private GCS bucket as a parquet -> clean it -> add it to the public GCS bucket

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

In [2]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [3]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month

In [4]:
from calitp_data_analysis import geography_utils, utils

In [5]:
import google.auth
# Resolve Google Application Default Credentials for this session; `project` is
# the project ID associated with them. The only visible use of `credentials` is
# the commented-out `gpd.read_parquet(...)` cell further down.
credentials, project = google.auth.default()

In [6]:
# Notebook display settings: show up to 100 columns, never truncate rows or cell
# contents, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [7]:
# Public bucket location from the shared data catalog; passed to
# `publish_utils.write_to_public_gcs` in the publish step at the end.
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [8]:
@cache
def gcs_geopandas():
    """Return a GCSGeoPandas client; `@cache` makes every call share one instance."""
    client = GCSGeoPandas()
    return client

In [9]:
# Show the analysis date; it is passed as `end_date` to every download below.
analysis_month

'2025-11-05'

In [10]:
# Show the date suffix appended to the raw and processed parquet file names.
file_name

'2025_11'

## 1) schedule_rt_route_direction_summary
* TODO: add Caltrans District so the summary can be organized by district

In [11]:
# Warehouse projects: production and staging. Only STG_PROJECT is used by the
# download calls in this notebook.
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = "cal-itp-data-infra-staging"
# Staging dataset that holds the monthly rollup tables downloaded below.
STG_MART = "tiffany_mart_gtfs_rollup"
# Date column each download is filtered on (start_date <= col <= end_date).
MONTH_DATE_COL = "month_first_day"

In [13]:
# Show the catalog entry used as the stem of the raw/processed route-direction
# parquet file names.
GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction

'fct_monthly_schedule_rt_route_direction_summary'

In [14]:
def prep_schedule_rt_route_direction_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    """
    Download the monthly schedule/RT route-direction summary, archive the raw
    table, and build the cleaned version used by the portfolio.

    Args:
        project_name: warehouse project that hosts the table.
        date_col: date column the download is filtered on.
        dataset_name: dataset that contains the table.
        start_date: lower bound (inclusive) of the date filter.
        end_date: upper bound (inclusive) of the date filter.
        file_name: date suffix appended to both parquet file names.

    Returns:
        De-duplicated DataFrame with title-cased column names, a "Date" column
        formatted like "November 2025", and derived service-minute and headway
        columns. The same frame is written to the `processed/` folder.
    """
    digest_gcs = GTFS_DATA_DICT.gcs_paths.DIGEST_GCS
    table_key = GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction

    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_monthly_schedule_rt_route_direction_summary",
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Archive the untouched download before any cleaning.
    df.to_parquet(f"{digest_gcs}raw/{table_key}_{file_name}.parquet")

    # Prepare data for portfolio
    keep_cols = [
        "month_first_day",
        "name",
        "route_name",
        "direction_id",
        "frequency_all_day",
        "frequency_offpeak",
        "frequency_peak",
        "daily_service_hours",
        "daily_trips_peak",
        "daily_trips_offpeak",
        "daily_trips_all_day",
        "day_type",
        "route_type",
        "route_typology",
    ]
    # NOTE(review): presumably this helper attaches `analysis_name` (the
    # original comment called it a temporary step) -- confirm against
    # portfolio_utils.
    df2 = df[keep_cols].pipe(portfolio_utils.standardize_portfolio_organization_names)

    # Row counts before and after de-duplication, printed as a sanity check.
    print(len(df2))
    df2 = df2.drop_duplicates()
    print(len(df2))

    # Clean columns
    df2["route_typology"] = df2["route_typology"].str.title()
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2["Month First Day"] = pd.to_datetime(df2["Month First Day"]).dt.strftime("%B %Y")
    df2 = df2.rename(
        columns={
            "Direction Id": "Direction",
            "Month First Day": "Date",
            "Route Name": "Route",
        }
    )

    # Add some new columns
    df2["Daily Service Minutes"] = df2["Daily Service Hours"] * 60
    df2["Average Scheduled Minutes"] = (
        df2["Daily Service Minutes"] / df2["Daily Trips All Day"]
    )
    # Headway = 60 / frequency (assumes frequency is trips per hour -- TODO confirm).
    # NOTE(review): a frequency of 0 produces inf here; nothing downstream in
    # this function filters it out.
    df2["Headway All Day"] = 60 / df2["Frequency All Day"]
    df2["Headway Peak"] = 60 / df2["Frequency Peak"]
    df2["Headway Offpeak"] = 60 / df2["Frequency Offpeak"]

    # Save
    df2.to_parquet(f"{digest_gcs}processed/{table_key}_{file_name}.parquet")

    return df2

In [15]:
# Build the route-direction summary from the staging mart for the
# previous_month -> analysis_month window.
schedule_rt_route_direction_summary_df = prep_schedule_rt_route_direction_summary(
    project_name=STG_PROJECT,
    dataset_name=STG_MART,
    date_col=MONTH_DATE_COL,
    start_date=previous_month,
    end_date=analysis_month,
    file_name=file_name,
)

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra-staging`.`tiffany_mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2025-10-15') AND month_first_day <= DATE('2025-11-05')
18045
10956


In [16]:
# Show the full GCS path of the processed parquet written by
# prep_schedule_rt_route_direction_summary.
f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"

'gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_schedule_rt_route_direction_summary_2025_11.parquet'

## 2) fct_monthly_operator_summary
* Add Caltrans District + Legislative District

In [None]:
def clean_operator_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    """
    Download the monthly operator summary, archive the raw table, and build the
    cleaned version used by the portfolio.

    Args:
        project_name: warehouse project that hosts the table.
        date_col: date column the download is filtered on.
        dataset_name: dataset that contains the table.
        start_date: lower bound of the date filter.
        end_date: upper bound of the date filter.
        file_name: date suffix appended to both parquet file names.

    Returns:
        DataFrame with title-cased column names ("Vp"/"Tu" upper-cased to
        "VP"/"TU"), the month column renamed to "Date", and two derived
        percent-of-trips columns. The same frame is written to the
        `processed/` folder.
    """
    digest_gcs = GTFS_DATA_DICT.gcs_paths.DIGEST_GCS
    table_key = GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary

    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_monthly_operator_summary",
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Archive the untouched download before any cleaning.
    df.to_parquet(f"{digest_gcs}raw/{table_key}_{file_name}.parquet")

    # Prepare data for portfolio
    keep_cols = [
        "month_first_day",
        "analysis_name",
        "vp_name",
        "tu_name",
        "n_trips",
        "day_type",
        "daily_trips",
        "ttl_service_hours",
        "n_routes",
        "n_days",
        "n_shapes",
        "n_stops",
        "vp_messages_per_minute",
        "n_vp_trips",
        "daily_vp_trips",
        "pct_vp_trips",
        "n_vp_routes",
        "pct_vp_service_hours",
        "tu_messages_per_minute",
        "n_tu_trips",
        "daily_tu_trips",
        "pct_tu_trips",
        "n_tu_routes",
        "pct_tu_service_hours",
    ]
    # Work on an independent copy so relabelling never touches the raw frame.
    df2 = df[keep_cols].copy()
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2 = df2.rename(columns={"Month First Day": "Date"})
    # Title-casing turns the acronyms into "Vp"/"Tu"; restore the upper case.
    df2.columns = df2.columns.str.replace("Vp", "VP").str.replace("Tu", "TU")

    # Create a couple of new columns
    df2["Percent of Trips with Trip Updates"] = (
        df2["N TU Trips"] / df2["N Trips"]
    ) * 100

    df2["Percent of Trips with Vehicle Positions"] = (
        df2["N VP Trips"] / df2["N Trips"]
    ) * 100

    # NOTE(review): nothing here caps the two percentages at 100. A cap was
    # previously prototyped (and left disabled) as
    #   df2[col] = df2[col].clip(upper=100.0)
    # for both columns; re-enable it if values above 100 should not be shown.

    # Save
    df2.to_parquet(f"{digest_gcs}processed/{table_key}_{file_name}.parquet")
    return df2

In [None]:
# Build the cleaned operator summary for the same month window.
monthly_operator_summary_clean = clean_operator_summary(
    project_name=STG_PROJECT,
    dataset_name=STG_MART,
    date_col=MONTH_DATE_COL,
    start_date=previous_month,
    end_date=analysis_month,
    file_name=file_name,
)

In [None]:
# Peek at the first cleaned row to check the column names.
monthly_operator_summary_clean.head(1)

## 3) fct_monthly_routes 
* Add Caltrans District + Legislative District

In [None]:
def clean_fct_monthly_routes(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    """
    Download the monthly route geometries, archive the raw table, and keep the
    most recent geometry for each operator/route.

    Args:
        project_name: warehouse project that hosts the table.
        date_col: date column the download is filtered on.
        dataset_name: dataset that contains the table.
        start_date: lower bound of the date filter.
        end_date: upper bound of the date filter.
        file_name: date suffix appended to both exported file names.

    Returns:
        One row per (analysis_name, route_name) with title-cased column names.
        The same frame is exported to the `processed/` folder.
    """
    digest_gcs = GTFS_DATA_DICT.gcs_paths.DIGEST_GCS
    export_name = f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}"

    gdf = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_monthly_routes",
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
        geom_col="pt_array",
        geom_type="line",
    )

    # Archive the raw download. This must be the local `gdf`: the notebook-level
    # `fct_monthly_routes` is only assigned from this function's return value,
    # so using it here raised NameError on a first run and archived stale
    # processed data on re-runs.
    utils.geoparquet_gcs_export(
        gdf=gdf,
        gcs_file_path=f"{digest_gcs}raw/",
        file_name=export_name,
    )

    # Prepare data for portfolio
    # NOTE(review): presumably this helper attaches `analysis_name`, which the
    # sort/de-duplication below relies on -- confirm against portfolio_utils.
    gdf = gdf.pipe(portfolio_utils.standardize_portfolio_organization_names)

    # Keep only the most recent route geography: newest month sorts first and
    # drop_duplicates keeps the first row per (analysis_name, route_name).
    gdf2 = gdf.sort_values(
        by=["month_first_day", "analysis_name", "route_name"],
        ascending=[False, True, True],
    ).drop_duplicates(subset=["analysis_name", "route_name"])

    gdf2 = gdf2.drop(
        columns=[
            "shape_id",
            "shape_array_key",
            "n_trips",
            "direction_id",
        ]
    )

    # Add route_color and route_typology (route_type is currently left out).
    # Only the needed columns are read, so the geometry is not loaded again.
    # NOTE(review): this reads back the raw file exported above; if `gdf2`
    # already carries route_color/route_typology the merge will produce
    # _x/_y suffixed columns -- confirm the intended source file.
    route_info_df = pd.read_parquet(
        f"{digest_gcs}raw/{export_name}.parquet",
        columns=[
            "name",
            "month_first_day",
            "route_name",
            "route_color",
            "route_typology",
        ],
    )

    m1 = (
        pd.merge(
            gdf2,
            route_info_df,
            on=[
                "name",
                "month_first_day",
                "route_name",
            ],
            how="left",
        )
        .drop(columns=["month_first_day"])
        .drop_duplicates(subset=["analysis_name", "route_name"])
    )

    # Clean
    # NOTE(review): this also relabels the geometry column; verify the export
    # below still recognises it as the active geometry.
    m1.columns = m1.columns.str.replace("_", " ").str.title()

    utils.geoparquet_gcs_export(
        gdf=m1,
        gcs_file_path=f"{digest_gcs}processed/",
        file_name=export_name,
    )

    return m1

In [None]:
# df = gpd.read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/fct_monthly_routes_2025_11.parquet", storage_options={"token": credentials.token})

In [None]:
# df_test = df.sample(10)

In [None]:
# df_test.columns

In [None]:
# df_test[["route_name","geometry"]].explore("route_name")

In [None]:
# Build the cleaned route geometries for the same month window.
fct_monthly_routes = clean_fct_monthly_routes(
    project_name=STG_PROJECT,
    dataset_name=STG_MART,
    date_col=MONTH_DATE_COL,
    start_date=previous_month,
    end_date=analysis_month,
    file_name=file_name,
)

In [None]:
# Row count of the cleaned routes table (one row per analysis_name/route_name).
len(fct_monthly_routes)

## 4) fct_operator_hourly_summary

In [None]:
def download_fct_operator_hourly_summary(download_date: str) -> pd.DataFrame:
    """
    Download the staging `fct_operator_hourly_summary` table in full and save
    it to the private `raw/` folder.

    Args:
        download_date: suffix appended to the saved parquet file name.

    Returns:
        The downloaded table, unchanged.
    """
    hourly_df = _sql_query.download_with_pandas_gbq(
        filename="tiffany_mart_gtfs_rollup.fct_operator_hourly_summary",
        project="cal-itp-data-infra-staging",
    )

    # Save to our private GCS
    raw_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/fct_operator_hourly_summary_{download_date}.parquet"
    hourly_df.to_parquet(raw_path)
    return hourly_df

In [None]:
# fct_operator_hourly_summary = download_fct_operator_hourly_summary()

In [None]:
def clean_fct_operator_hourly_summary(
    project_name: str,
    date_col: str,
    dataset_name: str,
    start_date: str,
    end_date: str,
    file_name: str,
) -> pd.DataFrame:
    """
    Download the operator hourly summary, archive the raw table, and total the
    trips per operator, month, day type and departure hour.

    Args:
        project_name: warehouse project that hosts the table.
        date_col: date column the download is filtered on.
        dataset_name: dataset that contains the table.
        start_date: lower bound of the date filter.
        end_date: upper bound of the date filter.
        file_name: date suffix appended to both parquet file names.

    Returns:
        DataFrame with columns "Analysis Name", "Date" (formatted "MM-YYYY"),
        "Day Type", "Departure Hour" and "N Trips". The same frame is written
        to the `processed/` folder.
    """
    digest_gcs = GTFS_DATA_DICT.gcs_paths.DIGEST_GCS
    table_key = GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary

    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name="fct_operator_hourly_summary",
        date_col=date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Archive the untouched download before any cleaning.
    df.to_parquet(f"{digest_gcs}raw/{table_key}_{file_name}.parquet")

    # Prepare data
    df2 = (
        df.groupby(["analysis_name", "month_first_day", "day_type", "departure_hour"])
        .agg({"n_trips": "sum"})
        .reset_index()
    )

    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(columns={"Month First Day": "Date"})

    # Convert explicitly before using `.dt`, as the route-direction cleaner
    # does for the same column: `.dt` raises if the dates arrive as plain
    # objects, and the conversion is a no-op for datetime columns.
    df2["Date"] = pd.to_datetime(df2["Date"]).dt.strftime("%m-%Y")

    df2.to_parquet(f"{digest_gcs}processed/{table_key}_{file_name}.parquet")
    return df2

In [None]:
# Build the hourly trip totals for the same month window.
clean_fct_operator_hourly_summary_df = clean_fct_operator_hourly_summary(
    project_name=STG_PROJECT,
    dataset_name=STG_MART,
    date_col=MONTH_DATE_COL,
    start_date=previous_month,
    end_date=analysis_month,
    file_name=file_name,
)

## Publish everything to GCS

In [None]:
# Directory prefix from the catalog; prepended to each file name published below.
GCS = GTFS_DATA_DICT["digest_tables_ah"].dir

In [None]:
# Show the catalog entry that becomes the stem of the first published file.
GTFS_DATA_DICT["digest_tables_ah"].schedule_rt_route_direction_summary

In [None]:
# File-name stems of the three tabular digest tables to publish, looked up in
# the catalog.
file_names = [
    GTFS_DATA_DICT["digest_tables_ah"]["schedule_rt_route_direction_summary"],
    GTFS_DATA_DICT["digest_tables_ah"]["fct_monthly_operator_summary"],
    GTFS_DATA_DICT["digest_tables_ah"]["fct_operator_hourly_summary"],
]

In [None]:
# Append the date suffix and extension. `date_str` was never defined in this
# notebook; `file_name` (imported from update_vars) is the suffix the processed
# parquets above were written with.
# NOTE(review): confirm `file_name` is the intended suffix for the published files.
file_names = [f"{key}_{file_name}.parquet" for key in file_names]

In [None]:
# Prefix every file name with the digest directory to get its full path.
file_names = [f"{GCS}{relative_name}" for relative_name in file_names]

In [None]:
# Copy each parquet to the public bucket under gtfs_digest/, keeping its file name.
for source_path in file_names:
    publish_utils.write_to_public_gcs(
        source_path,
        f"gtfs_digest/{Path(source_path).name}",
        PUBLIC_GCS,
    )

In [None]:
# Export each table through publish_public_data, using the tabular ("df") file type.
for source_path in file_names:
    publish_public_data.export_parquet_as_csv_or_geojson(source_path, filetype="df")