# _01_ntd_ridership_utils refactor
06/24/2025

Finally decided to centralize the functions needed to run the monthly/annual ridership reports

Initially, each report contained its own copy of these functions, each modified to work with its respective dataset.

Now, all common functions will be refactored to work with both monthly/annual datasets. 

In [12]:
# all imports from _01_utils
import gcsfs
import geopandas as gpd
import os
import pandas as pd
import shutil

from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query, select, distinct
from calitp_data_analysis.sql import to_snakecase
#from segment_speed_utils.project_vars import PUBLIC_GCS

import sys
sys.path.append("./monthly_ridership_report")
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS, YEAR, MONTH


# Shared GCS filesystem handle; used by save_rtpa_outputs to upload the zipped reports.
fs = gcsfs.GCSFileSystem()

# ArcGIS FeatureServer query for the RTPAs layer: all fields (outFields=*),
# no row filter (where=1=1), returned as GeoJSON.
RTPA_URL = ("https://services3.arcgis.com/bWPjFyq029ChCGur/arcgis/rest/services/"
       "RTPAs/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
      )


## Update add_change_columns

In [5]:
def add_change_columns(
    df: pd.DataFrame,
    sort_cols: list,
    group_cols: list,
    change_col: str
) -> pd.DataFrame:
    """
    Add previous-period UPT, absolute change and percent change to a long-format
    ridership table (monthly or annual).

    Args:
        df: long-format ridership data with a `upt` column plus every column
            named in `sort_cols` and `group_cols`.
        sort_cols: columns that order the rows before shifting, so that within
            each group the preceding row is the period to compare against.
        group_cols: columns identifying one series; the shift never crosses
            from one group into another.
        change_col: name of the column that receives the previous-period UPT
            (e.g. "previous_y_m_upt" for monthly, "previous_y_upt" for annual).

    Returns:
        A new DataFrame with three added columns: `change_col`, `change_1yr`
        (`upt` minus `change_col`) and `pct_change_1yr` (from get_percent_change).
        The first row of every group has no previous period, so all three are NaN.
    """
    # Only the monthly dataset carries period_year / period_month; cast them when
    # present so they sort numerically, and skip the cast for annual data.
    period_cols = [col for col in ("period_year", "period_month") if col in df.columns]
    if period_cols:
        df = df.assign(**{col: df[col].astype(int) for col in period_cols})

    # GroupBy.shift keeps the original row index, so the result aligns back onto
    # df even though it was computed on a sorted copy.
    previous_upt = (
        df.sort_values(sort_cols)
        .groupby(group_cols)["upt"]
        .shift(1)
    )

    # Write to the caller-supplied column name; a hard-coded name here would not
    # match the `df[change_col]` lookups below for the annual dataset.
    df = df.assign(**{change_col: previous_upt})

    df["change_1yr"] = df["upt"] - df[change_col]

    df = get_percent_change(
        df,
        change_col=change_col
    )

    return df

In [None]:
## moved to function
# for produce_ntd_monthly_ridership_by_rtpa
# in monthly report script

# Sorting by period_month before period_year places the same calendar month of
# consecutive years on adjacent rows, so a shift(1) compares year over year.
monthly_sort_cols =  [
    "ntd_id",
    "mode", 
    "tos",
    "period_month", 
    "period_year"
] # month first, then year: years end up sorted within each month

# NOTE(review): period_month is not part of the group keys, so within one
# ntd_id/mode/tos group the first year of month N follows the last year of
# month N-1 and shift(1) pulls that value in. Confirm whether period_month
# should be added here.
monthly_group_cols = [
    "ntd_id",
    "mode", 
    "tos"
              ]

# name of the column that receives the previous-period UPT
monthly_change_col ="previous_y_m_upt"

df = add_change_columns(
    df,
    sort_cols = monthly_sort_cols,
    group_cols = monthly_group_cols,
    change_col = monthly_change_col
)

In [None]:
## moved to function
# for produce_annual_ntd_ridership_data_by_rtpa
# in annual report script

# Annual data has one row per year, so sorting by year inside each
# ntd_id/mode/service group is enough for shift(1) to find the prior year.
annual_sort_cols =  [
    "ntd_id",
    "year",
    "mode", 
    "service",
] # rows of one ntd_id/mode/service series end up in ascending year order

annual_group_cols = [
    "ntd_id",
    "mode", 
    "service"
              ]

# name of the column that receives the previous-year UPT
annual_change_col ="previous_y_upt"

df = add_change_columns(
    df,
    sort_cols = annual_sort_cols,
    group_cols = annual_group_cols,
    change_col = annual_change_col
)

## Update get_percent_change

In [6]:
def get_percent_change(
    df: pd.DataFrame,
    change_col: str
) -> pd.DataFrame:
    """
    Add `pct_change_1yr`: the change in UPT relative to the previous period.

    Works with the long-format ridership data (monthly or annual).

    Args:
        df: table with a `upt` column and the column named by `change_col`.
        change_col: column holding the previous-period UPT (the baseline).

    Returns:
        The same DataFrame object, with `pct_change_1yr` added in place as a
        decimal fraction rounded to 4 places (0.25 means +25%). Rows without a
        baseline are NaN; a zero baseline follows pandas division semantics
        (inf, or NaN for 0/0).
    """
    baseline = df[change_col]

    # Percent change is measured against the earlier value: going from 100 to
    # 200 is +100%. Dividing by the current `upt` would report that as 50%.
    df["pct_change_1yr"] = (
        (df["upt"] - baseline)
        .divide(baseline)
        .round(4)
    )

    return df

In [None]:
## added to add_change_columns function
# Scratch copy of the call made inside add_change_columns; `change_col` is that
# function's parameter and is not defined at notebook level.
df = get_percent_change(
    df,
    change_col=change_col
)

## update sum_by_group

In [7]:
def sum_by_group(
    df: pd.DataFrame,
    group_cols: list,
    group_col2: list,
    agg_cols: dict,
    change_col: str = None,
) -> pd.DataFrame:
    """
    Aggregate long-format ridership data and recompute the percent change.

    Since the data is long to begin with, this replaces the old sum_by_group,
    make_long and assemble_long_df functions.

    Args:
        df: long-format ridership data.
        group_cols: category columns to group by (e.g. ["mode", "rtpa_name"]).
        group_col2: period columns to group by in addition to `group_cols`
            (e.g. ["year"]).
        agg_cols: mapping of column name to aggregation, passed to `.agg`.
        change_col: column holding the previous-period UPT, forwarded to
            get_percent_change. When omitted it is taken from the single key
            of `agg_cols` that starts with "previous_".

    Returns:
        One row per group with the aggregated columns plus `pct_change_1yr`
        expressed as a percentage (the decimal fraction multiplied by 100).

    Raises:
        ValueError: if `change_col` is omitted and `agg_cols` does not contain
            exactly one "previous_*" column.
    """
    if change_col is None:
        previous_cols = [col for col in agg_cols if str(col).startswith("previous_")]
        if len(previous_cols) != 1:
            raise ValueError(
                "Could not infer change_col from agg_cols; pass change_col explicitly"
            )
        change_col = previous_cols[0]

    grouped_df = (
        df.groupby(group_cols + group_col2)
        .agg(agg_cols)
        .reset_index()
    )

    # Row-level percentages cannot be summed, so recompute the percent change
    # from the aggregated totals. get_percent_change requires change_col.
    grouped_df = get_percent_change(grouped_df, change_col=change_col)

    # decimal fraction -> percentage
    grouped_df["pct_change_1yr"] = grouped_df["pct_change_1yr"] * 100

    return grouped_df

In [None]:
# for monthly report notebook
# period columns appended to the category group columns in sum_by_group
monthly_group_col_2 = [
    'period_year',
    'period_month',
    'period_year_month']

# columns summed for every group
monthly_agg_col = {
        "upt":"sum",
        "previous_y_m_upt":"sum",
        "change_1yr":"sum"
    }

# Template call, not runnable as written: `,,,` stands for the aggregation
# name (agency / mode / tos / reporter_type).
by_,,,_long = sum_by_group(
    df,
    group_cols= ,,,_cols,
    group_col2= monthly_group_col_2,# look into combining with the base group_cols
    agg_cols = monthly_agg_col
)

In [None]:
# for annual report notebook 
annual_group_col_2 = ["year"]

monthly_agg_col= {
                "upt": "sum",
                "previous_y_upt": "sum",
                "change_1yr": "sum",
            }

# used in save_rtpa_outputs
by_,,,_long = sum_by_group(
    df,
    group_cols= ,,,_cols,
    group_col2= annual_group_col_2,# look into combingin with base grou_cols
    agg_cols = monthly_agg_col
)

## update save_rtpa_outputs

In [8]:
def save_rtpa_outputs(
    df: pd.DataFrame, 
    year: int, 
    col_dict: dict,
    month: str,
    cover_sheet_path: str,
    cover_sheet_index_col: str,
    output_file_name: str,
    monthly_upload_to_public: bool = False,
    annual_upload_to_public: bool = False,
    group_col2: list = None,
    agg_cols: dict = None,
):
    """
    Export one Excel workbook per RTPA, zip the folder and upload it to GCS.

    Each workbook gets a README sheet (the cover sheet), the RTPA's rows, and
    four aggregations (by agency, mode, type of service and reporter type).

    Args:
        df: ridership data with an `rtpa_name` column, a `_merge` column, and
            the columns used by the aggregations (`ntd_id`, `agency`, `mode`,
            `tos`, `reporter_type`).
        year, month: used to build the local folder `./{year}_{month}` and the
            uploaded file names.
        col_dict: final column renames, applied after column names have been
            converted from snake_case to Title Case.
        cover_sheet_path: Excel file used as the README sheet.
        cover_sheet_index_col: index column of the cover sheet.
        output_file_name: base name (no extension) of the local zip file.
        monthly_upload_to_public: also upload to the public monthly folder.
        annual_upload_to_public: also upload to the public annual folder.
        group_col2: period columns passed to sum_by_group. Defaults to
            whichever of period_year / period_month / period_year_month / year
            exist in `df`.
        agg_cols: aggregation mapping passed to sum_by_group. Defaults to
            summing `upt`, `change_1yr` and any `previous_*` column in `df`.

    Returns:
        None.

    NOTE(review): the aggregation column lists use `agency` and `tos`; the
    annual dataset appears to name these `source_agency` and `service`, so it
    may need renaming before it is passed in -- confirm against the caller.
    """
    if monthly_upload_to_public or annual_upload_to_public:
        # PUBLIC_GCS is not imported at module level (that import is commented
        # out), so resolve it here, before any work is done, and only when a
        # public upload was actually requested.
        from segment_speed_utils.project_vars import PUBLIC_GCS

    # sum_by_group needs the period columns and the aggregation mapping;
    # derive them from the columns present when the caller does not pass them.
    if group_col2 is None:
        period_candidates = ("period_year", "period_month", "period_year_month", "year")
        group_col2 = [col for col in period_candidates if col in df.columns]
    if agg_cols is None:
        previous_cols = [col for col in df.columns if str(col).startswith("previous_")]
        agg_cols = {col: "sum" for col in ["upt", *previous_cols, "change_1yr"]}

    output_dir = f"./{year}_{month}"
    os.makedirs(output_dir, exist_ok=True)

    # The cover sheet is the same for every RTPA, so read it once.
    cover_sheet = pd.read_excel(cover_sheet_path, index_col = cover_sheet_index_col)

    #column lists for aggregations
    agency_cols = ["ntd_id", "agency", "rtpa_name"]
    mode_cols = ["mode", "rtpa_name"]
    tos_cols = ["tos", "rtpa_name"]
    reporter_type = ["reporter_type", "rtpa_name"] # may break the monthly report

    print("creating individual RTPA excel files")
    
    for rtpa_name in df["rtpa_name"].unique():
        
        print(f"creating excel file for: {rtpa_name}")
        
        # Filename should be snakecase
        rtpa_snakecase = rtpa_name.replace(' ', '_').lower()
        workbook_path = f"{output_dir}/{rtpa_snakecase}.xlsx"
        
        # The README cover sheet creates the workbook; the data sheets are
        # appended to it below.
        cover_sheet.to_excel(workbook_path, sheet_name = "README")

        rtpa_rows = df[df["rtpa_name"] == rtpa_name]

        rtpa_data = (
            rtpa_rows
            .sort_values("ntd_id")
            # the merge indicator is not part of the published data
            .drop(columns = "_merge")
            #cleaning column names
            .rename(columns=lambda x: x.replace("_"," ").title().strip())
            #rename columns
            .rename(columns=col_dict)
        )

        # Creating aggregations
        by_agency_long = sum_by_group(rtpa_rows, agency_cols, group_col2, agg_cols)
        by_mode_long = sum_by_group(rtpa_rows, mode_cols, group_col2, agg_cols)
        by_tos_long = sum_by_group(rtpa_rows, tos_cols, group_col2, agg_cols)
        by_reporter_type_long = sum_by_group(rtpa_rows, reporter_type, group_col2, agg_cols)
        
        #writing pages to excel file
        with pd.ExcelWriter(workbook_path, mode ="a") as writer:
            rtpa_data.to_excel(writer, sheet_name = "RTPA Ridership Data", index=False)
            by_agency_long.to_excel(writer, sheet_name = "Aggregated by Agency", index=False)
            by_mode_long.to_excel(writer, sheet_name = "Aggregated by Mode", index=False)
            by_tos_long.to_excel(writer, sheet_name = "Aggregated by TOS", index=False)
            by_reporter_type_long.to_excel(writer, sheet_name="Aggregate by Reporter Type", index=False)
    
    
    print("zipping all excel files")
    
    shutil.make_archive(f"./{output_file_name}", "zip", f"{year}_{month}")
    
    print("Zipped folder")
    
    print("Upload to private GCS")
    fs.upload(
        f"./{output_file_name}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )
    
    if monthly_upload_to_public:
        fs.upload(
            f"./{output_file_name}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )
        print("Uploaded to public GCS - monthly report")
        
    if annual_upload_to_public:
        fs.upload(
            f"./{output_file_name}.zip",
            f"{PUBLIC_GCS}ntd_annual_ridership/{year}_{month}_annual_report_data.zip"
        )
    
        print("Uploaded to public GCS - annual report")
    
    print("complete")
    
    return

In [None]:
# for monthly report script, variables for save_rtpa_output

# Keys are column names after save_rtpa_outputs has converted them from
# snake_case to Title Case; values are the final headers written to Excel.
col_dict ={
    'Uace Cd': "UACE Code",
    'Dt': "Date",
    'Ntd Id': "NTD ID",
    'Tos': "Type of Service",
    'Legacy Ntd Id': "Legacy NTD ID",
    'Upt': "UPT",
    'Vrm': "VRM",
    'Vrh': "VRH",
    'Voms': "VOMS",
    'Rtpa': "RTPA",
    'Previous Y M Upt': "Previous Year/Month UPT",
    'Change 1Yr': "Change in 1 Year UPT",
    'Pct Change 1Yr': "Percent Change in 1 Year UPT",
    'Tos Full': "Type of Service Full Name"
}

monthly_cover_sheet_path = "./cover_sheet_template.xlsx"
monthly_index_col = "**NTD Monthly Ridership by RTPA**"
# NOTE(review): this notebook imports YEAR and MONTH (upper case) from
# update_vars; lowercase `year` / `month` are not defined here -- confirm they
# exist in the script this line is moved to.
monthly_data_file_name = f"{year}_{month}_monthly_report_data"

In [None]:
# for annual report script, variables for save_rtpa_output

# NOTE(review): these keys are snake_case, but save_rtpa_outputs applies
# col_dict after converting column names to Title Case ("Source Agency"), so
# they may never match there -- confirm where this rename is meant to happen.
# This also reuses the name `col_dict` from the monthly cell.
col_dict= {
    "source_agency":"agency",
    "service":"tos"
}

annual_cover_sheet_path = "./annual_report_cover_sheet_template.xlsx"
annual_index_col = "**NTD Annual Ridership by RTPA**"
# NOTE(review): lowercase `year` / `month` are not defined in this notebook
# (only YEAR and MONTH are imported) -- confirm in the target script.
annual_data_file_name = f"{year}_{month}_annual_report_data"

## update remove_local_outputs

In [9]:
def remove_local_outputs(
    year: int, 
    month: str
):
    """
    Delete the local report folder and any zipped report data for a period.

    Removes the `{year}_{month}/` folder, then deletes the annual and/or
    monthly report zip file for the same period if present in the working
    directory.

    Args:
        year: report year used in the folder and file names.
        month: report month used in the folder and file names.

    Raises:
        FileNotFoundError: if the `{year}_{month}/` folder does not exist.
    """
    print("removing data folder")
    shutil.rmtree(f"{year}_{month}/")

    zip_files = {
        "annual": f"{year}_{month}_annual_report_data.zip",
        "monthly": f"{year}_{month}_monthly_report_data.zip",
    }

    # Check each zip independently so that both are cleaned up when both exist.
    removed_any = False
    for report_type, zip_path in zip_files.items():
        if os.path.exists(zip_path):
            os.remove(zip_path)
            print(f"removing {report_type} data zip file")
            removed_any = True

    if not removed_any:
        print("Could not find report data to delete")
    
    

In [None]:
# for monthly ridership script


In [None]:
# for annual ridership script


## update produce_ntd_monthly_ridership_by_rtpa

In [10]:
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    Build the monthly NTD ridership table for California agencies, by RTPA.

    Works with the warehouse `dim_monthly_ridership_with_adjustments` long data
    format. Steps:
      1. pull monthly UPT rows for period years 2018-2025 from the warehouse,
      2. keep rows with an agency name whose UZA name contains ", CA",
      3. inner-merge the agencies' last report year (2018 or later),
      4. left-merge the NTD ID to RTPA crosswalk and fail on unmatched rows,
      5. add previous-year UPT, change and percent change per month,
      6. add full Mode and Type of Service names.

    Args:
        year: report year. Currently only referenced by the commented-out
            parquet export.
        month: report month. Currently only referenced by the commented-out
            parquet export.

    Returns:
        Long-format DataFrame with one row per agency/mode/tos/period, including
        `rtpa_name`, `previous_y_m_upt`, `change_1yr`, `pct_change_1yr`,
        `Mode_full` and `TOS_full`.

    Raises:
        ValueError: if any row fails to match the RTPA crosswalk.

    Note:
        `ntd_id_to_rtpa_crosswalk` must be in scope; it is not defined in this
        notebook.
    """

    full_upt = (
        tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
        >> filter(
            _.period_year.isin(
                ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
            )
        )
        >>select(
            _.ntd_id,
            _.agency,
            _.reporter_type,
            _.period_year_month,
            _.period_year,
            _.period_month,
            _.mode,
            _.tos,
            _.mode_type_of_service_status,
            _.primary_uza_name,
            _.upt
            
        )
        >> collect()
    ).rename(
        columns={
            "mode_type_of_service_status": "Status",
            "primary_uza_name": "uza_name",
        }
    )

    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    ca = full_upt[
        (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
    ].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    min_year = 2018

    # get agencies with last report year and data after > 2018.
    last_report_year = (
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            _.year >= min_year,  # see if this changes anything
            _.last_report_year >= min_year,
            _.primary_uza_name.str.contains(", CA")
            | _.primary_uza_name.str.contains("CA-NV")
            | _.primary_uza_name.str.contains("California Non-UZA"),
        )
        >> distinct(
            "source_agency",
            "last_report_year",
            "ntd_id",
        )
        >> collect()
    )

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")

    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df._merge.value_counts())

    # check for unmerged rows
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")

    # Month before year: within each series the same calendar month of
    # consecutive years sits on adjacent rows.
    monthly_sort_cols = [
        "ntd_id",
        "mode",
        "tos",
        "period_month",
        "period_year",
    ]

    # period_month is part of the group so the shift only looks back to the same
    # month of the previous year. Without it, the first year of each month would
    # pick up the last year of the preceding month as its "previous" value.
    monthly_group_cols = [
        "ntd_id",
        "mode",
        "tos",
        "period_month",
    ]

    monthly_change_col = "previous_y_m_upt"

    df = add_change_columns(
        df,
        sort_cols = monthly_sort_cols,
        group_cols = monthly_group_cols,
        change_col = monthly_change_col
    )

    # human-readable names for the mode and type-of-service codes
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

## update produce_annual_ntd_ridership_data_by_rtpa

In [11]:
def produce_annual_ntd_ridership_data_by_rtpa(min_year: str, split_scag: bool) -> pd.DataFrame:
    """
    Build the annual NTD ridership table for California agencies, by RTPA.

    Ingests time series ridership data from
    `mart_ntd_funding_and_expenses.fct_service..._by_mode_upt`, keeps CA
    agencies whose data year and last report year are at least `min_year`,
    sums UPT per agency/mode/service/year, merges in the NTD ID to RTPA
    crosswalk, and adds the year-over-year change in UPT.

    Args:
        min_year: earliest data year and last report year to keep.
        split_scag: passed to ntd_id_to_rtpa_crosswalk; enables splitting SCAG
            into the individual CTCs.

    Returns:
        DataFrame with one row per agency/mode/service/year, including
        `rtpa_name`, `previous_y_upt`, `change_1yr`, `pct_change_1yr`,
        `mode_full` and `service_full`.

    Raises:
        ValueError: if any row fails to match the RTPA crosswalk.

    Note:
        `ntd_id_to_rtpa_crosswalk` must be in scope; it is not defined in this
        notebook.
    """
    # The refactored add_change_columns defined alongside this function is used
    # on purpose; the per-report copy in annual_ridership_module is not imported.

    print("ingest annual ridership data from warehouse")
    
    ntd_service =(
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            _.year >= min_year,
            _.last_report_year >= min_year,
            _.primary_uza_name.str.contains(", CA") | 
            _.primary_uza_name.str.contains("CA-NV") |
            _.primary_uza_name.str.contains("California Non-UZA") 
        )
        >> select(
            'source_agency',
            'agency_status',
            'legacy_ntd_id',
            'last_report_year',
            'mode',
            'ntd_id',
            'reporter_type',
            'reporting_module',
            'service',
            'uace_code',
            'primary_uza_name',
            'uza_population',
            'year',
            'upt',
        )
        >> collect())
    
    # one UPT total per agency / mode / service / year
    ntd_service = (
        ntd_service.groupby(
            [
                "source_agency",
                "agency_status",
                #"city",
                #"state",
                "ntd_id",
                "primary_uza_name",
                "reporter_type",
                "mode",
                "service",
                "last_report_year",
                "year",
            ]
        )
        .agg({"upt": "sum"})
        .sort_values(by="ntd_id")
        .reset_index()
    )
    
    print("create crosswalk from ntd_id_to_rtpa_crosswalk function")
    
    # Creating crosswalk using function, enable splitting scag to individual CTC
    ntd_to_rtpa_crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=split_scag)
    
    
    print("merge ntd data to crosswalk")
    # merge service data to crosswalk
    ntd_data_by_rtpa = ntd_service.merge(
        ntd_to_rtpa_crosswalk,
        how="left",
        left_on=[
            "ntd_id",
            # "agency", "reporter_type", "city" # sometime agency name, reporter type and city name change or are inconsistent, causing possible fanout
        ],
        right_on="ntd_id_2022",
        indicator=True,
    )
    
    # list of ntd_id with LA County Dept of Public Works name
    lacdpw_list = [
        "90269",
        "90270",
        "90272",
        "90273",
        "90274",
        "90275",
        "90276",
        "90277",
        "90278",
        "90279",
    ]
    
    # replace LA County Public Works agencies with their own RTPA
    # and mark them as matched so they pass the unmerged-row check below
    ntd_data_by_rtpa.loc[
        ntd_data_by_rtpa["ntd_id"].isin(lacdpw_list), ["rtpa_name", "_merge"]
    ] = ["Los Angeles County Department of Public Works", "both"]
    
    print(ntd_data_by_rtpa._merge.value_counts())
        
    if len(ntd_data_by_rtpa[ntd_data_by_rtpa._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    print("add `change_column` to data")
    
    # Within each ntd_id/mode/service series the rows end up in ascending year
    # order, so the shift picks up the previous year.
    annual_sort_cols = [
        "ntd_id",
        "year",
        "mode",
        "service",
    ]

    annual_group_cols = [
        "ntd_id",
        "mode",
        "service",
    ]

    annual_change_col = "previous_y_upt"

    # Operate on, and keep, the merged table: this is the frame that is returned.
    ntd_data_by_rtpa = add_change_columns(
        ntd_data_by_rtpa,
        sort_cols = annual_sort_cols,
        group_cols = annual_group_cols,
        change_col = annual_change_col
    )
    
    print("map mode and tos desc.")
    ntd_data_by_rtpa = ntd_data_by_rtpa.assign(
        mode_full = ntd_data_by_rtpa["mode"].map(NTD_MODES),
        service_full = ntd_data_by_rtpa["service"].map(NTD_TOS)
    )
    print("complete")
    return ntd_data_by_rtpa

# TEST

In [14]:
from _01_ntd_ridership_utils import ntd_id_to_rtpa_crosswalk

ModuleNotFoundError: No module named 'segment_speed_utils'

In [None]:
# monthly_ridership_by_rtpa script

In [13]:
# Smoke test: build the monthly table for the YEAR / MONTH imported from
# update_vars and list the resulting columns.
df = produce_ntd_monthly_ridership_by_rtpa(YEAR, MONTH)
print(df.columns)

NameError: name 'ntd_id_to_rtpa_crosswalk' is not defined

In [None]:
# annual_ridership_module script