# Download data from warehouse -> add it to our private GCS bucket as a parquet -> clean it -> add it to the public GCS bucket

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

In [2]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [3]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import (
    GTFS_DATA_DICT,
    analysis_month,
    file_name,
    last_year,
    previous_month,
)

In [4]:
from calitp_data_analysis import geography_utils, utils

In [5]:
import google.auth
import pandas_gbq

credentials, project = google.auth.default()

In [6]:
from calitp_data_analysis.sql import get_engine
from calitp_data_analysis.tables import tbls
db_engine = get_engine()


In [7]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [8]:
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [9]:
@cache
def gcs_geopandas():
    return GCSGeoPandas()

In [10]:
analysis_month

'2025-11-05'

In [11]:
file_name

'2025_11'

In [12]:
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = "cal-itp-data-infra-staging"
STG_MART = "tiffany_mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

In [13]:
TRANSIT_MART = "mart_transit_database"

In [14]:
PROD_MART = "mart_gtfs_rollup"

## 1) Crosswalk

In [15]:
def load_crosswalk()->pd.DataFrame:
    df = bq_utils.download_table(
    project_name= "cal-itp-data-infra",
    dataset_name= "mart_transit_database",
    table_name = "bridge_gtfs_analysis_name_x_ntd",
    date_col = None,
  )
    df2 = (df
        .dropna(subset = ["ntd_id","ntd_id_2022"])
        .drop_duplicates(subset = ["analysis_name","organization_name"])
        .reset_index()
         )

    df2 = df2.rename(columns = {"schedule_gtfs_dataset_name":"name"})

    df2 = df2[['name',
       'analysis_name',
        'county_name',
       'caltrans_district', 
        'caltrans_district_name', 
        'ntd_id', 
        'ntd_id_2022',]]
    return df2

In [16]:
crosswalk_df = load_crosswalk()

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


### What to do if there are more than 2 values for an operator?

In [17]:
crosswalk_df.analysis_name.value_counts().head()

Humboldt Transit Authority                                                                            2
Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)    2
San Luis Obispo Regional Transit Authority                                                            2
City of El Monte                                                                                      1
Golden Empire Transit District                                                                        1
Name: analysis_name, dtype: int64

In [18]:
crosswalk_df.loc[crosswalk_df.analysis_name == "Humboldt Transit Authority"].T

Unnamed: 0,51,118
name,Humboldt Schedule,Humboldt Flex
analysis_name,Humboldt Transit Authority,Humboldt Transit Authority
county_name,Humboldt,Humboldt
caltrans_district,1,1
caltrans_district_name,Eureka,Eureka
ntd_id,9R02-91018,9R02-91036
ntd_id_2022,91018,91036


In [21]:
crosswalk_df[["name","analysis_name"]].sort_values(by = ["analysis_name"])

Unnamed: 0,name,analysis_name
98,Bay Area 511 AC Transit Schedule,Alameda-Contra Costa Transit District
73,Amador Schedule,Amador Regional Transit System
24,Anaheim Resort Schedule,Anaheim Transportation Network
74,Antelope Valley Transit Authority Schedule,Antelope Valley Transit Authority
132,Morongo Basin Schedule,Basin Transit
49,B-Line Schedule,Butte County Association of Governments
8,Calaveras Schedule,Calaveras Transit Agency
87,Bay Area 511 County Connection Schedule,Central Contra Costa Transit Authority
91,Havasu Landing Ferry Schedule,Chemehuevi Indian Tribe
4,Bay Area 511 Muni Schedule,City and County of San Francisco


## 2) NTD Data

In [None]:
ntd_query_sql = f"""
        SELECT 
        number_of_state_counties,
        primary_uza_name,
        density,
        number_of_counties_with_service,
        state_admin_funds_expended,
        service_area_sq_miles,
        population,
        service_area_pop,
        subrecipient_type,
        primary_uza_code,
        reporter_type,
        organization_type,
        agency_name,
        voms_pt,
        voms_do,
        ntd_id,
        year
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """


mobility_query_sql = f"""
            SELECT
            agency_name,
            counties_served,
            hq_county,
            is_public_entity,
            is_publicly_operating,
            funding_sources,
            on_demand_vehicles_at_max_service,
            vehicles_at_max_service
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """


def load_mobility(query:str)->pd.DataFrame:
    with db_engine.connect() as connection:
        df = pd.read_sql(query, connection)
    df2 = df.sort_values(
        by=["on_demand_vehicles_at_max_service","vehicles_at_max_service"], 
        ascending = [False, False]
    )
    df3 = df2.groupby('agency_name').first().reset_index()
    return df3

    
def load_ntd(query:str)->pd.DataFrame:
    with db_engine.connect() as connection:
        df = pd.read_sql(query, connection)
    df2 = df.sort_values(by=df.columns.tolist(), na_position="last")
    df3 = df2.groupby("agency_name").first().reset_index()
    return df3

In [None]:
def merge_ntd_mobility(ntd_query:str,
                      mobility_query:str)->pd.DataFrame:
    """
    Merge NTD (dim_annual_ntd_agency_information) with 
    mobility providers (dim_mobility_mart_providers)
    and dedupe and keep 1 row per agency.
    """
    ntd = load_ntd(ntd_query)
    mobility = load_mobility(mobility_query)
    crosswalk = load_crosswalk()[["analysis_name","ntd_id_2022"]]
    
    m1 = pd.merge(
        mobility,
        ntd,
        how="inner",
        on="agency_name"
    )

    m1 = m1.drop_duplicates(
        subset="agency_name"
    ).reset_index(
        drop=True
    )
    
    # Wherever possible, allow nullable integers. These columns are integers, but can be
    # missing if we don't find corresponding NTD info
    integrify_cols = [
        "number_of_state_counties", "number_of_counties_with_service", 
        "service_area_sq_miles", "service_area_pop",
        "on_demand_vehicles_at_max_service", "vehicles_at_max_service",
        "voms_pt", "voms_do", "year",
    ]
    m1[integrify_cols] = m1[integrify_cols].astype("Int64")

    # Merge with crosswalk to get analysis_name
    m1 = pd.merge(m1, crosswalk,  
                  left_on = ["ntd_id"],
                  right_on = ["ntd_id_2022"], 
                  how = "inner")

    # m1.columns = m1.columns.str.replace("_", " ").str.title()
    
    m1.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.ntd_profile}_{file_name}.parquet"
    )

    return m1

    

In [None]:
ntd_data_df = merge_ntd_mobility(ntd_query = ntd_query_sql,
                                mobility_query = mobility_query_sql)

In [None]:
len(ntd_data_df)

## 3) schedule_rt_route_direction_summary
* Add Caltrans District to organize 

In [33]:
def load_schedule_rt_route_direction_summary(
    project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction,
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Merge with crosswalk
    crosswalk_df = load_crosswalk()[["name","analysis_name"]]
    m1 = pd.merge(df, crosswalk_df, on = ["name"], how =  "inner")
    
    m1.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"
    )

    return m1

In [34]:
schedule_rt_route_direction_summary = load_schedule_rt_route_direction_summary(
    project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= last_year,
    end_date=analysis_month,
    file_name = file_name
)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary` WHERE month_first_day >= DATE('2024-11-13') AND month_first_day <= DATE('2025-11-05')
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [39]:
def prep_schedule_rt_route_direction_summary(
    file_name:str
) ->pd.DataFrame:

    df = pd.read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet")
    
    # Prepare data for portfolio
    df2 = df[
        [
            "month_first_day",
            "analysis_name",
            "route_name",
            "direction_id",
            "frequency_all_day",
            "frequency_offpeak",
            "frequency_peak",
            "daily_service_hours",
            "daily_trips_peak",
            "daily_trips_offpeak",
            "daily_trips_all_day",
            "day_type",
            "route_type",
            "route_typology",
        ]
    ]

    # Drop duplicates
    df2 = df2.drop_duplicates().reset_index()
    
    # Clean columns
    df2.route_typology = df2.route_typology.str.title()
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2["Month First Day"] = pd.to_datetime(df2["Month First Day"]).dt.strftime("%B %Y")
    df2 = df2.rename(
        columns={
            "Direction Id": "Direction",
            "Month First Day": "Date",
            "Route Name": "Route",
        }
    )

    # Add some new columns
    df2["Daily Service Minutes"] = df2["Daily Service Hours"] * 60
    df2["Average Scheduled Minutes"] = (
        df2["Daily Service Minutes"] / df2["Daily Trips All Day"]
    )
    df2["Average Scheduled Minutes"].describe()
    df2["Headway All Day"] = 60 / df2["Frequency All Day"]
    df2["Headway Peak"] = 60 / df2["Frequency Peak"]
    df2["Headway Offpeak"] = 60 / df2["Frequency Offpeak"]

    # Save
    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.schedule_rt_route_direction}_{file_name}.parquet"
    )
    
    return df2

In [40]:
schedule_rt_route_direction_summary_df = prep_schedule_rt_route_direction_summary(
    file_name = file_name,
)

In [42]:
schedule_rt_route_direction_summary_df.columns

Index(['Index', 'Date', 'Analysis Name', 'Route', 'Direction',
       'Frequency All Day', 'Frequency Offpeak', 'Frequency Peak',
       'Daily Service Hours', 'Daily Trips Peak', 'Daily Trips Offpeak',
       'Daily Trips All Day', 'Day Type', 'Route Type', 'Route Typology',
       'Daily Service Minutes', 'Average Scheduled Minutes', 'Headway All Day',
       'Headway Peak', 'Headway Offpeak'],
      dtype='object')

In [43]:
sf = schedule_rt_route_direction_summary_df.loc[schedule_rt_route_direction_summary_df["Analysis Name"].str.contains("Francisco")]

In [44]:
sf.sample()

Unnamed: 0,Index,Date,Analysis Name,Route,Direction,Frequency All Day,Frequency Offpeak,Frequency Peak,Daily Service Hours,Daily Trips Peak,Daily Trips Offpeak,Daily Trips All Day,Day Type,Route Type,Route Typology,Daily Service Minutes,Average Scheduled Minutes,Headway All Day,Headway Peak,Headway Offpeak
37902,64315,October 2025,City and County of San Francisco,7__7 HAIGHT-NORIEGA,1,3.25,2.63,4.5,70.23,36.0,42.0,78.0,Saturday,3,Bus,4213.8,54.02,18.46,13.33,22.81


## 4) fct_monthly_operator_summary
* Add Caltrans District + Legislative District

In [None]:
def load_operator_summary(
    project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str
) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary,
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
    )

    # Merge with crosswalk
    crosswalk_df = load_crosswalk()[["analysis_name", 
                                     "caltrans_district_name",
                                     "caltrans_district"]]
    
    m1 = pd.merge(df, crosswalk_df, on = ["analysis_name"], how =  "inner")

    m1 = m1.drop_duplicates().reset_index()
    
    m1.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"
    )

    return m1

In [None]:
monthly_operator_summary_df = load_operator_summary(project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= last_year,
    end_date=analysis_month,
    file_name = file_name,)

In [None]:
len(monthly_operator_summary_df)

In [None]:
len(monthly_operator_summary_df.drop_duplicates())

In [None]:
def prep_operator_summary(file_name:str) -> pd.DataFrame:

    df = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"
    )
    
    # Prepare data for portfolio
    df2 = df[
        [
            "month_first_day",
            "analysis_name",
            "vp_name",
            "tu_name",
            "n_trips",
            "day_type",
            "daily_trips",
            "ttl_service_hours",
            "n_routes",
            "n_days",
            "n_shapes",
            "n_stops",
            "vp_messages_per_minute",
            "n_vp_trips",
            "daily_vp_trips",
            "pct_vp_trips",
            "n_vp_routes",
            "pct_vp_service_hours",
            "tu_messages_per_minute",
            "n_tu_trips",
            "daily_tu_trips",
            "pct_tu_trips",
            "n_tu_routes",
            "pct_tu_service_hours",
        ]
    ]
    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(
        columns={
            "Month First Day": "Date",
        }
    )
    df2.columns = df2.columns.str.replace("Vp", "VP").str.replace("Tu", "TU")

    # Create a couple of new columns
    df2["Percent of Trips with Trip Updates"] = (
        df2["N TU Trips"] / df2["N Trips"]
    ) * 100

    df2["Percent of Trips with Vehicle Positions"] = (
        df2["N VP Trips"] / df2["N Trips"]
    ) * 100

    """
    df2["Percent of Trips with Vehicle Positions"] = df2[
        "Percent of Trips with Vehicle Positions"
    ].clip(upper=100.0)
    df2["Percent of Trips with Trip Updates"] = df2[
        "Percent of Trips with Trip Updates"
    ].clip(upper=100.0)
    """
    # Save
    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.operator_summary}_{file_name}.parquet"
    )
    return df2

In [None]:
monthly_operator_summary_clean = prep_operator_summary(
    file_name = file_name,)

In [None]:
monthly_operator_summary_clean.head(1)

In [None]:
len(monthly_operator_summary_clean)

## 5) fct_monthly_routes 
* Add Caltrans District + Legislative District

In [56]:
def load_fct_monthly_routes(
    project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str
) -> pd.DataFrame:
    gdf = bq_utils.download_table(
         project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.route_map,
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
        geom_col="pt_array",
        geom_type="line",
    )

    # Merge with crosswalk
    crosswalk_df = load_crosswalk()[["name","analysis_name"]]
    m1 = pd.merge(gdf, crosswalk_df, on = ["name"], how =  "inner")
    
    utils.geoparquet_gcs_export(gdf = m1,
                           gcs_file_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/",
                           file_name = f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}")
    return m1

In [52]:
monthly_routes_gdf = load_fct_monthly_routes(project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= previous_month,
    end_date=analysis_month,
    file_name = file_name,)

Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_gtfs_rollup`.`fct_monthly_routes` WHERE month_first_day >= DATE('2025-10-15') AND month_first_day <= DATE('2025-11-05')
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [53]:
sf_gdf = monthly_routes_gdf.loc[monthly_routes_gdf.name == 'Bay Area 511 Muni Schedule']

In [54]:
len(sf_gdf)

135

In [55]:
sf_gdf.columns

Index(['name', 'year', 'month', 'month_first_day', 'route_name',
       'direction_id', 'route_type', 'shape_id', 'shape_array_key', 'n_trips',
       'geometry'],
      dtype='object')

In [57]:
crosswalk_df = load_crosswalk()[["name","analysis_name"]]
m1 = pd.merge(sf_gdf, crosswalk_df, on = ["name"], how =  "inner")

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


In [58]:
m1.shape

(135, 12)

In [59]:
gdf = gpd.read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet", storage_options={"token": credentials.token})

In [60]:
gdf.loc[gdf.name == 'Bay Area 511 Muni Schedule'].shape

(135, 12)

In [62]:
m2 = m1.sort_values(
        by=["month_first_day", "analysis_name", "route_name"],
        ascending=[False, True, True],
    ).drop_duplicates(subset=["analysis_name", "route_name"])

In [63]:
m2.shape

(68, 12)

In [64]:
m2 = m2.drop(
        columns=[
            "shape_id",
            "shape_array_key",
            "n_trips",
            "direction_id",
        ]
    )

    # Convert to miles
m2["route_length_miles"] = (m2.geometry.to_crs(
            geography_utils.CA_NAD83Albers_ft
        ).length) / 5_280
    

In [65]:
m2.shape

(68, 9)

In [69]:
# m2.drop(columns = ["geometry"])

In [67]:
def prep_fct_monthly_routes(project_name: str, 
    date_col:str, 
    dataset_name: str, 
    start_date: str, 
    end_date: str, 
    file_name:str) -> pd.DataFrame:

    gdf = gpd.read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet",
                                        storage_options={"token": credentials.token})

    # Keep only the most recent route geography
    gdf2 = gdf.sort_values(
        by=["month_first_day", "analysis_name", "route_name"],
        ascending=[False, True, True],
    ).drop_duplicates(subset=["analysis_name", "route_name"])

    gdf2 = gdf2.drop(
        columns=[
            "shape_id",
            "shape_array_key",
            "n_trips",
            "direction_id",
        ]
    )

    # Convert to miles
    gdf2["route_length_miles"] = (gdf2.geometry.to_crs(
            geography_utils.CA_NAD83Albers_ft
        ).length) / 5_280
    
    # Convert the geometry to line
    # m1 = geo_utils.convert_to_gdf(m1, "pt_array", "line")

    # Clean
    gdf2.columns = gdf2.columns.str.replace("_", " ").str.title()

    utils.geoparquet_gcs_export(gdf = gdf2,
                           gcs_file_path = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/",
                           file_name = f"{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}")

    return gdf2

In [68]:
monthly_routes_gdf = prep_fct_monthly_routes(project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= previous_month,
    end_date=analysis_month,
    file_name = file_name,)

In [70]:
gdf = gpd.read_parquet(f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet",
                                        storage_options={"token": credentials.token})

In [72]:
gdf.columns

Index(['Name', 'Year', 'Month', 'Month First Day', 'Route Name', 'Route Type',
       'Geometry', 'Analysis Name', 'Route Length Miles'],
      dtype='object')

In [77]:
gdf.loc[gdf["Analysis Name"].str.contains("San Francisco")].drop(columns = ["Geometry"])

Unnamed: 0,Name,Year,Month,Month First Day,Route Name,Route Type,Analysis Name,Route Length Miles
1608,Bay Area 511 Muni Schedule,2025,11,2025-11-01,12__12 FOLSOM-PACIFIC,3,City and County of San Francisco,6.53
1654,Bay Area 511 Muni Schedule,2025,11,2025-11-01,14R__14R MISSION RAPID,3,City and County of San Francisco,8.69
1629,Bay Area 511 Muni Schedule,2025,11,2025-11-01,14__14 MISSION,3,City and County of San Francisco,7.8
1632,Bay Area 511 Muni Schedule,2025,11,2025-11-01,15__15 BAYVIEW HUNTERS POINT EXPRESS,3,City and County of San Francisco,7.23
1647,Bay Area 511 Muni Schedule,2025,11,2025-11-01,18__18 46TH AVENUE,3,City and County of San Francisco,7.3
1636,Bay Area 511 Muni Schedule,2025,11,2025-11-01,19__19 POLK,3,City and County of San Francisco,7.42
1694,Bay Area 511 Muni Schedule,2025,11,2025-11-01,1X__1X CALIFORNIA EXPRESS,3,City and County of San Francisco,5.75
1589,Bay Area 511 Muni Schedule,2025,11,2025-11-01,1__1 CALIFORNIA,3,City and County of San Francisco,5.75
1598,Bay Area 511 Muni Schedule,2025,11,2025-11-01,22__22 FILLMORE,3,City and County of San Francisco,5.51
1686,Bay Area 511 Muni Schedule,2025,11,2025-11-01,23__23 MONTEREY,3,City and County of San Francisco,9.21


In [None]:
# gdf_og.loc[gdf_og.route_mile > 192][["name","route_name","geometry"]].explore()

## 5) fct_operator_hourly_summary

In [None]:
def load_fct_operator_hourly_summary(project_name: str, 
                           date_col:str, 
                           dataset_name: str, 
                           start_date: str, 
                           end_date: str, 
                           file_name: str) -> pd.DataFrame:
    df = bq_utils.download_table(
        project_name=project_name,
        dataset_name=dataset_name,
        table_name=GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary,
        date_col= date_col,
        start_date=start_date,
        end_date=end_date,
    )
    df.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"
    )
    return df

In [None]:
fct_operator_hourly_summary = load_fct_operator_hourly_summary(

    project_name=PROD_PROJECT,
    date_col = MONTH_DATE_COL,
    dataset_name=PROD_MART,
    start_date= last_year,
    end_date=analysis_month,
    file_name = file_name
)

In [None]:
def clean_fct_operator_hourly_summary(
                           file_name:str) -> pd.DataFrame:

    df = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"
    )
    # Prepare data
    df2 = (
        df.groupby(["analysis_name", "month_first_day", "day_type", "departure_hour"])
        .agg({"n_trips": "sum"})
        .reset_index()
    )

    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(columns={"Month First Day": "Date"})

    df2["Date"] = df2["Date"].dt.strftime("%m-%Y")

    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/{GTFS_DATA_DICT.gtfs_digest_rollup.hourly_day_type_summary}_{file_name}.parquet"
    )
    return df2

In [None]:
clean_fct_operator_hourly_summary_df = clean_fct_operator_hourly_summary(
    file_name = file_name,)

## Publish everything to GCS

In [None]:
GCS = GTFS_DATA_DICT["gtfs_digest_rollup"].dir

In [None]:
GCS

In [None]:
GTFS_DATA_DICT["gtfs_digest_rollup"].schedule_rt_route_direction

In [None]:
file_names = [
    GTFS_DATA_DICT["gtfs_digest_rollup"][f]
    for f in [
        "schedule_rt_route_direction",
        "route_map",
        "operator_summary",
        "hourly_day_type_summary"
    ]
]

In [None]:
file_names = [f"processed/{key}_{file_name}.parquet" for key in file_names]

In [None]:
file_names

In [None]:
[f"{GCS}{key}" for key in file_names]

In [None]:
#for f in file_names:
#    publish_utils.write_to_public_gcs(f, f"gtfs_digest/{Path(f).name}", PUBLIC_GCS)

In [None]:
#for f in file_names:
 #   publish_public_data.export_parquet_as_csv_or_geojson(f, filetype="df")