# Download data from the warehouse -> save it to our private GCS bucket as a parquet file -> clean it -> publish it to the public GCS bucket

In [1]:
from pathlib import Path
from typing import Literal

import _sql_query
import pandas as pd
import publish_public_data
from shared_utils import gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Public GCS bucket path, read from the shared GTFS_DATA_DICT configuration.
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [4]:
# Display the resolved bucket path as a sanity check before publishing.
PUBLIC_GCS

'gs://calitp-publish-data-analysis/'

## 1) schedule_rt_route_direction_summary

In [5]:
def download_schedule_rt_route_direction_summary() -> pd.DataFrame:
    """
    Pull the monthly schedule/RT route-direction summary from the warehouse
    and cache it as a parquet file in our private GCS bucket.

    Returns:
        The downloaded table, exactly as returned by
        `_sql_query.download_with_pandas_gbq`.
    """
    warehouse_table = (
        "tiffany_mart_gtfs_rollup.fct_monthly_schedule_rt_route_direction_summary"
    )
    summary = _sql_query.download_with_pandas_gbq(
        project="cal-itp-data-infra-staging",
        filename=warehouse_table,
    )

    # Keep a private copy so the prep step can re-read it without re-querying.
    private_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}schedule_rt_route_direction_summary.parquet"
    summary.to_parquet(private_path)

    return summary

In [6]:
def prep_schedule_rt_route_direction_summary() -> pd.DataFrame:
    """
    Load the cached route-direction summary and reshape it for publishing.

    Steps:
      1. Read the parquet previously saved under DIGEST_GCS.
      2. Keep only the reporting columns.
      3. Run `portfolio_utils.standardize_portfolio_organization_names`
         and drop exact duplicate rows.
      4. Title-case the column names, format the month as e.g.
         "January 2025", and rename a few columns.
      5. Add derived service-minute and headway columns.

    Returns:
        The cleaned DataFrame with human-readable column names.
    """
    df = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}schedule_rt_route_direction_summary.parquet"
    )
    df2 = df[
        [
            "month_first_day",
            "name",
            "route_name",
            "direction_id",
            "frequency_all_day",
            "frequency_offpeak",
            "frequency_peak",
            "daily_service_hours",
            "daily_trips_peak",
            "daily_trips_offpeak",
            "daily_trips_all_day",
            "day_type",
        ]
    ]
    # Temporarily add analysis_name
    df2 = df2.pipe(portfolio_utils.standardize_portfolio_organization_names)
    df2 = df2.drop_duplicates()

    # Clean columns: "month_first_day" -> "Month First Day", etc.
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2["Month First Day"] = pd.to_datetime(df2["Month First Day"]).dt.strftime("%B %Y")
    df2 = df2.rename(
        columns={
            "Direction Id": "Direction",
            "Month First Day": "Date",
            "Route Name": "Route",
        }
    )

    # Add some new columns
    df2["Daily Service Minutes"] = df2["Daily Service Hours"] * 60
    df2["Average Scheduled Minutes"] = (
        df2["Daily Service Minutes"] / df2["Daily Trips All Day"]
    )

    # Headway = 60 / frequency. Presumably frequency is trips per hour, so the
    # result is minutes between trips -- TODO confirm units upstream.
    # NOTE(review): for float columns a frequency of 0 gives inf here rather
    # than raising; confirm that is acceptable in the published table.
    df2["Headway All Day"] = 60 / df2["Frequency All Day"]
    df2["Headway Peak"] = 60 / df2["Frequency Peak"]
    df2["Headway Offpeak"] = 60 / df2["Frequency Offpeak"]

    return df2

In [7]:
# Build the cleaned route-direction table from the parquet cached in private GCS.
schedule_rt_route_direction_summary = prep_schedule_rt_route_direction_summary()

## 2) fct_monthly_operator_summary

In [8]:
def download_operator_summary() -> pd.DataFrame:
    """
    Pull the monthly operator summary from the warehouse and cache it as a
    parquet file in our private GCS bucket.

    Returns:
        The downloaded table, exactly as returned by
        `_sql_query.download_with_pandas_gbq`.
    """
    warehouse_table = "tiffany_mart_gtfs_rollup.fct_monthly_operator_summary"
    operator_summary = _sql_query.download_with_pandas_gbq(
        project="cal-itp-data-infra-staging",
        filename=warehouse_table,
    )

    # Keep a private copy so the cleaning step can re-read it without re-querying.
    private_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}fct_monthly_operator_summary.parquet"
    )
    operator_summary.to_parquet(private_path)

    return operator_summary

In [9]:
# Query the warehouse and refresh the private parquet copy of the operator summary.
monthly_operator_summary = download_operator_summary()


        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_gtfs_rollup`.`fct_monthly_operator_summary`
        WHERE month_first_day >=  DATE('2025-01-01')
    


  import pkg_resources  # noqa


Downloading: 100%|██████████|
download time: 0:00:01.998545


In [10]:
def clean_operator_summary() -> pd.DataFrame:
    """
    Load the cached monthly operator summary and tidy it for publishing.

    Keeps the reporting columns, converts the snake_case names to title case
    (with "Vp"/"Tu" upper-cased and the month column renamed to "Date"), and
    adds two percentage columns for trips with trip updates and with vehicle
    positions, each capped at 100.

    Returns:
        The cleaned DataFrame.
    """
    source_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}fct_monthly_operator_summary.parquet"
    )
    raw = pd.read_parquet(source_path)

    keep_cols = [
        "month_first_day",
        "analysis_name",
        "vp_name",
        "tu_name",
        "n_trips",
        "day_type",
        "daily_trips",
        "ttl_service_hours",
        "n_routes",
        "n_days",
        "n_shapes",
        "n_stops",
        "vp_messages_per_minute",
        "n_vp_trips",
        "daily_vp_trips",
        "pct_vp_trips",
        "n_vp_routes",
        "pct_vp_service_hours",
        "tu_messages_per_minute",
        "n_tu_trips",
        "daily_tu_trips",
        "pct_tu_trips",
        "n_tu_routes",
        "pct_tu_service_hours",
    ]

    def _display_name(column: str) -> str:
        # snake_case -> Title Case, then the two special cases.
        label = column.replace("_", " ").title()
        if label == "Month First Day":
            return "Date"
        return label.replace("Vp", "VP").replace("Tu", "TU")

    cleaned = raw[keep_cols].rename(columns=_display_name)

    # Create a couple of new columns: share of trips seen in each RT feed,
    # capped at 100 in case the RT trip count exceeds the scheduled count.
    percent_columns = (
        ("Percent of Trips with Trip Updates", "N TU Trips"),
        ("Percent of Trips with Vehicle Positions", "N VP Trips"),
    )
    for percent_col, rt_trip_col in percent_columns:
        share = (cleaned[rt_trip_col] / cleaned["N Trips"]) * 100
        cleaned[percent_col] = share.clip(upper=100.0)

    return cleaned

In [11]:
# Build the cleaned operator summary from the parquet cached in private GCS.
monthly_operator_summary_clean = clean_operator_summary()

## 3) fct_monthly_routes (TODO: still need to figure out how to convert the geometry)

In [12]:
def download_fct_monthly_routes() -> pd.DataFrame:
    """
    Pull the monthly routes table from the warehouse and cache it as a
    parquet file in our private GCS bucket.

    Returns:
        The downloaded table, exactly as returned by
        `_sql_query.download_with_pandas_gbq`.
    """
    warehouse_table = "tiffany_mart_gtfs_rollup.fct_monthly_routes"
    monthly_routes = _sql_query.download_with_pandas_gbq(
        project="cal-itp-data-infra-staging",
        filename=warehouse_table,
    )

    # Keep a private copy for later processing.
    private_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}fct_monthly_routes.parquet"
    monthly_routes.to_parquet(private_path)

    return monthly_routes

In [13]:
# fct_monthly_routes = download_fct_monthly_routes()

## 4) fct_operator_hourly_summary

In [14]:
def download_fct_operator_hourly_summary() -> pd.DataFrame:
    """
    Pull the operator hourly summary from the warehouse and cache it as a
    parquet file in our private GCS bucket.

    Returns:
        The downloaded table, exactly as returned by
        `_sql_query.download_with_pandas_gbq`.
    """
    warehouse_table = "tiffany_mart_gtfs_rollup.fct_operator_hourly_summary"
    hourly_summary = _sql_query.download_with_pandas_gbq(
        project="cal-itp-data-infra-staging",
        filename=warehouse_table,
    )

    # Keep a private copy so the cleaning step can re-read it without re-querying.
    private_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}fct_operator_hourly_summary.parquet"
    )
    hourly_summary.to_parquet(private_path)

    return hourly_summary

In [15]:
# Query the warehouse and refresh the private parquet copy of the hourly summary.
fct_operator_hourly_summary = download_fct_operator_hourly_summary()


        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_gtfs_rollup`.`fct_operator_hourly_summary`
        WHERE month_first_day >=  DATE('2025-01-01')
    
Downloading: 100%|██████████|
download time: 0:00:02.010985


In [16]:
def clean_fct_operator_hourly_summary() -> pd.DataFrame:
    """
    Load the cached operator hourly summary and total the trips per
    operator, month, day type and departure hour.

    Column names are converted from snake_case to title case, and the month
    column is renamed to "Date".

    Returns:
        One row per (analysis_name, month_first_day, day_type, departure_hour)
        group with the summed trip count.
    """
    hourly = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}fct_operator_hourly_summary.parquet"
    )

    group_keys = ["analysis_name", "month_first_day", "day_type", "departure_hour"]
    trip_totals = hourly.groupby(group_keys).agg({"n_trips": "sum"}).reset_index()

    def _display_name(column: str) -> str:
        # snake_case -> Title Case, with the month column shortened to "Date".
        label = column.replace("_", " ").title()
        return "Date" if label == "Month First Day" else label

    return trip_totals.rename(columns=_display_name)

In [17]:
# Build the aggregated hourly trip counts from the parquet cached in private GCS.
clean_fct_operator_hourly_summary_df = clean_fct_operator_hourly_summary()

## Publish everything to GCS

In [18]:
# Names of the digest tables to publish; passed to
# publish_public_data.grab_filepaths to look up their file paths.
digest_df_keys = [
    "schedule_rt_route_direction_summary",
    "fct_monthly_operator_summary",
    "fct_operator_hourly_summary",
]

In [19]:
# Look up the file paths for the selected tables in the "digest_tables_ah" group.
# NOTE(review): the cleaned DataFrames built in the cells above are only held in
# memory and never written out in this notebook -- confirm that the files at
# these paths are the versions intended for the public bucket.
df_filepaths = publish_public_data.grab_filepaths("digest_tables_ah", digest_df_keys)

In [20]:
# Publish each file under gtfs_digest/ in the public bucket, keeping its
# original file name.
for source_path in df_filepaths:
    public_name = f"gtfs_digest/{Path(source_path).name}"
    publish_utils.write_to_public_gcs(source_path, public_name, PUBLIC_GCS)

Uploaded gtfs_digest/schedule_rt_route_direction_summary.parquet
Uploaded gtfs_digest/fct_monthly_operator_summary.parquet
Uploaded gtfs_digest/fct_operator_hourly_summary.parquet
