In [1]:
import sys

sys.path.append("../bus_service_increase")
sys.path.append("../starter_kit")  # to test out style_df function
sys.path.append("../")

In [2]:
%%capture
import warnings

warnings.filterwarnings("ignore")
from calitp_data_analysis.sql import query_sql, to_snakecase

import altair as alt
import calitp_data_analysis.magics
import pandas as pd
from shared_utils.portfolio_utils import label_visualization
# from _starterkit_utils import style_df
from calitp_data_analysis import calitp_color_palette as cp
from explore_monthly_ridership_by_rtpa import sum_by_group
from IPython.display import HTML, Markdown, display
from update_vars import YEAR, MONTH, PUBLIC_FILENAME, YEAR, NTD_MODES, NTD_TOS
from _01_ntd_ridership_utils import ntd_id_to_rtpa_crosswalk, add_change_columns, get_percent_change, sum_by_group

# for testing


# from monthly_ridership_by_rtpa import get_percent_change
# from shared_utils.rt_dates import MONTH_DICT

# Temp file path for testing
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)

In [3]:
# Lift pandas' display limits: with both options set to None, every row and
# column of a displayed frame is rendered instead of a truncated preview.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [4]:
# Parameters cell for local runs: `rtpa` selects the RTPA the report is built for.
# Swap in one of the commented alternatives to test a region with fewer reporters.
rtpa = "Metropolitan Transportation Commission"  # lots of reporters
# rtpa = "Butte County Association of Governments" # 2 reporters
# rtpa = "San Joaquin Council of Governments" # 4 reporters

In [5]:
%%capture_parameters
rtpa

{"rtpa": "Metropolitan Transportation Commission"}


---

## 09/24/2025 Update new table in function

In [89]:
# initial function with current data table
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    Build the California monthly NTD ridership table with RTPA names attached.

    Reads the long-format warehouse table
    `mart_ntd.dim_monthly_ridership_with_adjustments`, keeps rows whose UZA
    name contains ", CA", inner-joins the agencies returned by the
    last-report-year query, left-joins the NTD ID -> RTPA crosswalk, adds the
    change columns via `add_change_columns`, and maps the `mode` / `tos` codes
    through `NTD_MODES` / `NTD_TOS` into `Mode_full` / `TOS_full`.

    Args:
        year: Report year. Not read by the active code; only the
            commented-out parquet export below references it.
        month: Report month. Same caveat as `year`.

    Returns:
        The merged long-format DataFrame, including the crosswalk columns,
        the merge indicator (`_merge`), the change columns, `Mode_full` and
        `TOS_full`.

    Raises:
        ValueError: if any row has no match in the crosswalk.
    """
    # REMOVED 2025 FOR TESTING PURPOSES
    monthly_query ="""
    SELECT 
      ntd_id,
      agency,
      reporter_type,
      period_year_month,
      period_year,
      period_month,
      mode,
      tos,
      mode_type_of_service_status AS Status,
      primary_uza_name as uza_name,
      upt
    FROM
      `cal-itp-data-infra.mart_ntd.dim_monthly_ridership_with_adjustments`
    WHERE
      period_year IN ("2018", "2019", "2020", "2021", "2022", "2023", "2024") 
      AND agency IS NOT NULL
    """
    full_upt = query_sql(monthly_query, as_df=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    # na=False: rows with a missing UZA name are excluded explicitly instead of
    # depending on how NaN gets coerced when the two masks are combined.
    in_ca_uza = full_upt["uza_name"].str.contains(", CA", na=False)
    ca = full_upt[in_ca_uza & full_upt["agency"].notna()].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    # get agencies with last report year and data from 2018 onward.
    # (The 2018 cut-off lives in the SQL text; the unused `min_year` local that
    # duplicated it has been removed.)
    last_report_query = """
    SELECT DISTINCT
      source_agency,
      last_report_year,
      ntd_id,
    FROM
      `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    WHERE
      year >= 2018
      AND last_report_year >= 2018
      AND (
        primary_uza_name LIKE "%, CA%"
        OR primary_uza_name LIKE "%CA-NV%"
        OR primary_uza_name LIKE "%California Non-UZA%"
      )
    """

    last_report_year = query_sql(last_report_query, as_df=True)

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")
    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df["_merge"].value_counts())

    # check for unmerged rows
    if (df["_merge"] == "left_only").any():
        raise ValueError("There are unmerged rows to crosswalk")

    # Sorting by month before year groups the same calendar month of
    # consecutive years next to each other within each ntd_id / mode / tos.
    monthly_sort_cols = [
        "ntd_id",
        "mode",
        "tos",
        "period_month",
        "period_year",
    ]

    monthly_group_cols = [
        "ntd_id",
        "mode",
        "tos",
    ]

    monthly_change_col = "previous_y_m_upt"

    df = add_change_columns(
        df,
        sort_cols=monthly_sort_cols,
        group_cols=monthly_group_cols,
        change_col=monthly_change_col,
    )

    df = df.assign(
        Mode_full=df["mode"].map(NTD_MODES),
        TOS_full=df["tos"].map(NTD_TOS),
    )

    return df

In [90]:
# new data table
def produce_ntd_monthly_ridership_by_rtpa_v2(year: int, month: int) -> pd.DataFrame:
    """
    Build the California monthly NTD ridership table from the new warehouse table.

    Same pipeline as `produce_ntd_monthly_ridership_by_rtpa`, but the monthly
    query reads
    `mart_ntd_ridership.fct_complete_monthly_ridership_with_adjustments_and_estimates`
    (which exposes `uza_name` directly). Rows whose UZA name contains ", CA"
    are kept, inner-joined to the agencies returned by the last-report-year
    query, left-joined to the NTD ID -> RTPA crosswalk, given change columns
    via `add_change_columns`, and labelled with `Mode_full` / `TOS_full` from
    `NTD_MODES` / `NTD_TOS`.

    Args:
        year: Report year. Not read by the active code; only the
            commented-out parquet export below references it.
        month: Report month. Same caveat as `year`.

    Returns:
        The merged long-format DataFrame, including the crosswalk columns,
        the merge indicator (`_merge`), the change columns, `Mode_full` and
        `TOS_full`.

    Raises:
        ValueError: if any row has no match in the crosswalk.
    """
    # REMOVED 2025 FOR TESTING PURPOSES
    monthly_query ="""
    SELECT 
      ntd_id,
      agency,
      reporter_type,
      period_year_month,
      period_year,
      period_month,
      mode,
      tos,
      mode_type_of_service_status AS Status,
      uza_name,
      upt
    FROM
      `cal-itp-data-infra.mart_ntd_ridership.fct_complete_monthly_ridership_with_adjustments_and_estimates`
    WHERE
      period_year IN ("2018", "2019", "2020", "2021", "2022", "2023", "2024")
      AND agency IS NOT NULL
    """
    full_upt = query_sql(monthly_query, as_df=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    # na=False: rows with a missing UZA name are excluded explicitly instead of
    # depending on how NaN gets coerced when the two masks are combined.
    in_ca_uza = full_upt["uza_name"].str.contains(", CA", na=False)
    ca = full_upt[in_ca_uza & full_upt["agency"].notna()].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    # get agencies with last report year and data from 2018 onward.
    # (The 2018 cut-off lives in the SQL text; the unused `min_year` local that
    # duplicated it has been removed.)
    last_report_query = """
    SELECT DISTINCT
      source_agency,
      last_report_year,
      ntd_id,
    FROM
      `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    WHERE
      year >= 2018
      AND last_report_year >= 2018
      AND (
        primary_uza_name LIKE "%, CA%"
        OR primary_uza_name LIKE "%CA-NV%"
        OR primary_uza_name LIKE "%California Non-UZA%"
      )
    """

    last_report_year = query_sql(last_report_query, as_df=True)

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")
    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df["_merge"].value_counts())

    # check for unmerged rows
    if (df["_merge"] == "left_only").any():
        raise ValueError("There are unmerged rows to crosswalk")

    # Sorting by month before year groups the same calendar month of
    # consecutive years next to each other within each ntd_id / mode / tos.
    monthly_sort_cols = [
        "ntd_id",
        "mode",
        "tos",
        "period_month",
        "period_year",
    ]

    monthly_group_cols = [
        "ntd_id",
        "mode",
        "tos",
    ]

    monthly_change_col = "previous_y_m_upt"

    df = add_change_columns(
        df,
        sort_cols=monthly_sort_cols,
        group_cols=monthly_group_cols,
        change_col=monthly_change_col,
    )

    df = df.assign(
        Mode_full=df["mode"].map(NTD_MODES),
        TOS_full=df["tos"].map(NTD_TOS),
    )

    return df

In [None]:
# Run the function that reads the current warehouse table and list the
# columns it returns.
df_v1 = produce_ntd_monthly_ridership_by_rtpa(YEAR, MONTH)
print(df_v1.columns) # good to go!

In [None]:
# Run the variant that reads the new warehouse table and list the columns it
# returns, for comparison with df_v1.
df_v2 = produce_ntd_monthly_ridership_by_rtpa_v2(YEAR, MONTH)
print(df_v2.columns) # good to go!

In [None]:
# Are the two outputs identical, and how many distinct NTD IDs does each hold?
display(
    df_v1.equals(df_v2),
    df_v1["ntd_id"].nunique(),
    df_v2["ntd_id"].nunique(),
)
# Why do they differ?

In [82]:
# Total UPT per (ntd_id, agency) in each output, then an outer merge with an
# indicator column to show which pairs exist on one side only.
agg_keys = ["ntd_id", "agency"]

df_1_agg = df_v1.groupby(agg_keys)["upt"].sum().reset_index()
df_2_agg = df_v2.groupby(agg_keys)["upt"].sum().reset_index()

check_merge = pd.merge(df_1_agg, df_2_agg, on=agg_keys, how="outer", indicator=True)


In [87]:
# First rows of each per-agency total, for a side-by-side look at the UPT sums.
display(
    df_1_agg.head(),
    df_2_agg.head()
)

Unnamed: 0,ntd_id,agency,upt
0,90003,San Francisco Bay Area Rapid Transit District,498083640.0
1,90004,Golden Empire Transit District,32057995.0
2,90006,Santa Cruz Metropolitan Transit District,26924483.0
3,90007,City of Modesto,6833170.0
4,90008,City of Santa Monica,67250838.0


Unnamed: 0,ntd_id,agency,upt
0,90003,San Francisco Bay Area Rapid Transit District,508474182.0
1,90004,Golden Empire Transit District,32471592.0
2,90006,Santa Cruz Metropolitan Transit District,27622287.0
3,90007,City of Modesto,6833170.0
4,90008,City of Santa Monica,67990466.0


In [83]:
# Count (ntd_id, agency) pairs found in both tables vs. only one of them.
check_merge["_merge"].value_counts()

both          84
left_only     20
right_only     7
Name: _merge, dtype: int64

In [84]:
# Spot-check one agency that failed to pair up.
# NOTE(review): in the recorded output Modesto appears once as left_only and
# once as right_only with the same ntd_id and equal UPT, so the `agency`
# strings presumably differ in a non-visible way (e.g. whitespace) — confirm.
check_merge[check_merge["agency"].str.contains("Modesto")]

Unnamed: 0,ntd_id,agency,upt_x,upt_y,_merge
3,90007,City of Modesto,6833170.0,,left_only
104,90007,City of Modesto,,6833170.0,right_only


In [85]:
# Same spot-check, selected by NTD ID instead of by agency name.
check_merge[check_merge["ntd_id"]=="90007"]

Unnamed: 0,ntd_id,agency,upt_x,upt_y,_merge
3,90007,City of Modesto,6833170.0,,left_only
104,90007,City of Modesto,,6833170.0,right_only


In [88]:
# Whole comparison ordered by NTD ID, so rows for the same ID sit together.
check_merge.sort_values(by="ntd_id")

Unnamed: 0,ntd_id,agency,upt_x,upt_y,_merge
0,90003,San Francisco Bay Area Rapid Transit District,498083600.0,508474200.0,both
1,90004,Golden Empire Transit District,32058000.0,32471590.0,both
2,90006,Santa Cruz Metropolitan Transit District,26924480.0,27622290.0,both
104,90007,City of Modesto,,6833170.0,right_only
3,90007,City of Modesto,6833170.0,,left_only
4,90008,City of Santa Monica,67250840.0,67990470.0,both
5,90009,San Mateo County Transit District,68128230.0,68960190.0,both
6,90010,City of Torrance,19075450.0,19285150.0,both
7,90012,San Joaquin Regional Transit District,19145430.0,19329050.0,both
8,90013,Santa Clara Valley Transportation Authority,190424300.0,195010200.0,both


## 09/18/2025 remove siuba from report

In [None]:
# DEPRECATED OLD FUNCTION
# def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
#     """
#     This function works with the warehouse `dim_monthly_ntd_ridership_with_adjustments` long data format.
#     Import NTD data from warehouse, filter to CA,
#     merge in crosswalk, checks for unmerged rows, then creates new columns for full Mode and TOS name.

#     """

#     full_upt = (
#         tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
#         >> filter(
#             _.period_year.isin(
#                 ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
#             )
#         )
#         >>select(
#             _.ntd_id,
#             _.agency,
#             _.reporter_type,
#             _.period_year_month,
#             _.period_year,
#             _.period_month,
#             _.mode,
#             _.tos,
#             _.mode_type_of_service_status,
#             _.primary_uza_name,
#             _.upt
            
#         )
#         >> collect()
#     ).rename(
#         columns={
#             "mode_type_of_service_status": "Status",
#             "primary_uza_name": "uza_name",
#         }
#     )

#     full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

#     full_upt.to_parquet(
#         f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
#     )

#     ca = full_upt[
#         (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
#     ].reset_index(drop=True)

#     # use new crosswalk function
#     crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

#     min_year = 2018

#     # get agencies with last report year and data after > 2018.
#     last_report_year = (
#         tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
#         >> filter(
#             _.year >= min_year,  # see if this changes anything
#             _.last_report_year >= min_year,
#             _.primary_uza_name.str.contains(", CA")
#             | _.primary_uza_name.str.contains("CA-NV")
#             | _.primary_uza_name.str.contains("California Non-UZA"),
#         )
#         >> distinct(
#             "source_agency",
#             "last_report_year",
#             "ntd_id",
#         )
#         >> collect()
#     )

#     # merge last report year to CA UPT data
#     df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")

In [9]:
# full_upt = (
#         tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
#         >> filter(
#             _.period_year.isin(
#                 ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
#             )
#         )
#         >>select(
#             _.ntd_id,
#             _.agency,
#             _.reporter_type,
#             _.period_year_month,
#             _.period_year,
#             _.period_month,
#             _.mode,
#             _.tos,
#             _.mode_type_of_service_status,
#             _.primary_uza_name,
#             _.upt
            
#         )
#         >> collect()
#     ).rename(
#         columns={
#             "mode_type_of_service_status": "Status",
#             "primary_uza_name": "uza_name",
#         }
#     )

# full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

# full_upt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209340 entries, 0 to 209339
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ntd_id             209340 non-null  object 
 1   agency             209340 non-null  object 
 2   reporter_type      209340 non-null  object 
 3   period_year_month  209340 non-null  object 
 4   period_year        209340 non-null  object 
 5   period_month       209340 non-null  object 
 6   mode               209340 non-null  object 
 7   tos                209340 non-null  object 
 8   Status             209340 non-null  object 
 9   uza_name           208800 non-null  object 
 10  upt                116688 non-null  float64
dtypes: float64(1), object(10)
memory usage: 17.6+ MB


In [13]:
# min_year = 2018
# last_report_year = (
#         tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
#         >> filter(
#             _.year >= min_year,  # see if this changes anything
#             _.last_report_year >= min_year,
#             _.primary_uza_name.str.contains(", CA")
#             | _.primary_uza_name.str.contains("CA-NV")
#             | _.primary_uza_name.str.contains("California Non-UZA"),
#         )
#         >> distinct(
#             "source_agency",
#             "last_report_year",
#             "ntd_id",
#         )
#         >> collect()
#     )

# last_report_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source_agency     227 non-null    object
 1   last_report_year  227 non-null    int64 
 2   ntd_id            227 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.4+ KB


In [None]:
# Merge last report year onto the CA UPT data, then attach the crosswalk.
# Only the two crosswalk columns needed are merged: merging on too many
# columns can create problems because csvs and dtypes aren't stable /
# consistent for NTD ID, Legacy NTD ID, and UZA.
df = ca.merge(
    last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner"
).merge(
    crosswalk[["ntd_id_2022", "rtpa_name"]],
    left_on="ntd_id",
    right_on="ntd_id_2022",
    how="left",
    indicator=True,
)

In [17]:
# Confirm that the two versions of each pull are exactly equal.
# NOTE(review): `full_upt`, `full_upt_test` and `last_report_year_test` are not
# assigned at top level in any cell visible in this notebook — presumably left
# over from cells that were since edited or removed; confirm before re-running.
display(
    full_upt.equals(full_upt_test),
    last_report_year.equals(last_report_year_test)
) # TRUE TRUE!!
# good to go

True

True

In [37]:
# UPDATED without siuba
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    Assemble California monthly NTD ridership with RTPA names (SQL version, no siuba).

    Queries 2018-2025 rows from
    `mart_ntd.dim_monthly_ridership_with_adjustments`, keeps rows whose UZA
    name contains ", CA", inner-joins the agencies returned by the
    last-report-year query, left-joins the NTD ID -> RTPA crosswalk and raises
    ValueError when any row is left unmatched, then adds the change columns
    and the `Mode_full` / `TOS_full` labels.

    `year` and `month` are accepted for the (currently disabled) parquet
    export of the raw pull and are not otherwise used.
    """
    monthly_query ="""
    SELECT 
      ntd_id,
      agency,
      reporter_type,
      period_year_month,
      period_year,
      period_month,
      mode,
      tos,
      mode_type_of_service_status AS Status,
      primary_uza_name as uza_name,
      upt
    FROM
      `cal-itp-data-infra.mart_ntd.dim_monthly_ridership_with_adjustments`
    WHERE
      period_year IN ("2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025")
      AND agency IS NOT NULL
    """
    monthly_upt = query_sql(monthly_query, as_df=True)

    # Export of the raw pull is disabled:
    # monthly_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    # California rows only (UZA name contains ", CA") that also carry an agency.
    in_ca_uza = monthly_upt["uza_name"].str.contains(", CA")
    has_agency = monthly_upt.agency.notna()
    ca_upt = monthly_upt[(in_ca_uza) & (has_agency)].reset_index(drop=True)

    # use new crosswalk function
    rtpa_xwalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    # Agencies whose last report year, and data year, are 2018 or later.
    last_report_query = """
    SELECT DISTINCT
      source_agency,
      last_report_year,
      ntd_id,
    FROM
      `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    WHERE
      year >= 2018
      AND last_report_year >= 2018
      AND (
        primary_uza_name LIKE "%, CA%"
        OR primary_uza_name LIKE "%CA-NV%"
        OR primary_uza_name LIKE "%California Non-UZA%"
      )
    """
    recent_reporters = query_sql(last_report_query, as_df=True)

    # Step 1: keep only CA rows belonging to a recent reporter.
    with_last_report = ca_upt.merge(
        recent_reporters, left_on="ntd_id", right_on="ntd_id", how="inner"
    )

    # Step 2: attach the RTPA name. Only two crosswalk columns are merged,
    # because merging on too many columns can create problems: csvs and dtypes
    # aren't stable / consistent for NTD ID, Legacy NTD ID, and UZA.
    with_rtpa = with_last_report.merge(
        rtpa_xwalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    merge_flags = with_rtpa["_merge"]
    print(merge_flags.value_counts())

    # Every row must have found its RTPA.
    if (merge_flags == "left_only").sum() > 0:
        raise ValueError("There are unmerged rows to crosswalk")

    # Month is sorted before year so the same month of consecutive years is
    # adjacent within each ntd_id / mode / tos group.
    with_change = add_change_columns(
        with_rtpa,
        sort_cols=["ntd_id", "mode", "tos", "period_month", "period_year"],
        group_cols=["ntd_id", "mode", "tos"],
        change_col="previous_y_m_upt",
    )

    return with_change.assign(
        Mode_full=with_change["mode"].map(NTD_MODES),
        TOS_full=with_change["tos"].map(NTD_TOS),
    )

## Update `produce_ntd_monthly_ridership_by_rtpa` function

In [None]:
from calitp_data_analysis.tables import tbls
from siuba import _, collect, distinct, filter, select

In [None]:
# from annual_ridership_report.annual_ridership_module import ntd_id_to_rtpa_crosswalk

In [None]:
from _01_ntd_ridership_utils import ntd_id_to_rtpa_crosswalk, add_change_columns

In [None]:
# Crosswalk built with split_scag=False, kept for comparison with the split version.
xwalk_no_split = ntd_id_to_rtpa_crosswalk(split_scag=False)

In [None]:
# display(
#     test.head(),
#     test.info(),
#     test["rtpa_name"].value_counts()
# )

In [None]:
# Crosswalk built with split_scag=True, the setting the ridership function uses.
xwalk_w_split = ntd_id_to_rtpa_crosswalk(split_scag=True)

In [None]:
# Compare the RTPA names as sets. An element-wise `==` between the two
# `unique()` arrays depends on their order and requires equal lengths, so it
# cannot answer "do both crosswalks contain the same names?".
names_no_split = set(xwalk_no_split["rtpa_name"].dropna())
names_w_split = set(xwalk_w_split["rtpa_name"].dropna())

display(
    names_no_split == names_w_split,  # True when both hold the same names
    sorted(names_no_split ^ names_w_split),  # names present in only one crosswalk
)

In [None]:
# Columns available on the split crosswalk.
xwalk_w_split.columns.tolist()

In [None]:
# updated
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    siuba version: build the California monthly NTD ridership table with RTPA names.

    Reads the long-format warehouse table
    `mart_ntd.dim_monthly_ridership_with_adjustments` through `tbls`, keeps
    rows with an agency whose UZA name contains ", CA", inner-joins the
    agencies returned by the last-report-year query, left-joins the
    NTD ID -> RTPA crosswalk, then adds the change columns and the full
    Mode / TOS names.

    Needs `tbls` and the siuba verbs (`_`, `filter`, `select`, `distinct`,
    `collect`) to be imported before it is called.

    Args:
        year: Report year. Only referenced by the commented-out parquet export.
        month: Report month. Only referenced by the commented-out parquet export.

    Returns:
        The merged DataFrame with `rtpa_name`, `_merge`, the columns added by
        `add_change_columns`, `Mode_full` and `TOS_full`.

    Raises:
        ValueError: if any row has no match in the crosswalk.
    """

    # Pull 2018-2025 monthly rows, keep the columns the report uses, and rename
    # two of them to the names the rest of the function expects.
    full_upt = (
        tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
        >> filter(
            _.period_year.isin(
                ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
            )
        )
        >>select(
            _.ntd_id,
            _.agency,
            _.reporter_type,
            _.period_year_month,
            _.period_year,
            _.period_month,
            _.mode,
            _.tos,
            _.mode_type_of_service_status,
            _.primary_uza_name,
            _.upt
            
        )
        >> collect()
    ).rename(
        columns={
            "mode_type_of_service_status": "Status",
            "primary_uza_name": "uza_name",
        }
    )

    # Drop rows without an agency name.
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    # California rows only; the agency check repeats the filter applied above.
    ca = full_upt[
        (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
    ].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    min_year = 2018

    # get agencies whose data year and last report year are both >= min_year.
    last_report_year = (
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            _.year >= min_year,  # see if this changes anything
            _.last_report_year >= min_year,
            _.primary_uza_name.str.contains(", CA")
            | _.primary_uza_name.str.contains("CA-NV")
            | _.primary_uza_name.str.contains("California Non-UZA"),
        )
        >> distinct(
            "source_agency",
            #'agency_status',
            #'legacy_ntd_id',
            "last_report_year",
            #'mode',
            "ntd_id",
            #'reporter_type',
            #'reporting_module',
            #'service',
            #'uace_code',
            #'primary_uza_name',
            #'uza_population',
            #'year',
            #'upt',
        )
        >> collect()
    )

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")

    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    # Show how many rows matched the crosswalk (`both`) vs. not (`left_only`).
    print(df._merge.value_counts())

    # check for unmerged rows
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    # NOTE(review): the other versions of this function in the notebook pass
    # sort_cols / group_cols / change_col explicitly; this call passes only the
    # frame — confirm `add_change_columns` still accepts that.
    df = add_change_columns(df)
    
    # Map the mode / type-of-service codes to their full names.
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

In [None]:
# Run the updated function for the configured report year and month.
data = produce_ntd_monthly_ridership_by_rtpa(YEAR, MONTH)

In [None]:
# `DataFrame.info()` prints its summary and returns None, so call it on its own
# rather than handing its return value to display(), which rendered a stray
# "None" ahead of the real outputs.
data.info()

display(
    data["rtpa_name"].value_counts(),
    data.describe(),
)

In [None]:
# Load the saved CA monthly ridership file for the configured year and month.
df_check = pd.read_parquet(f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet")

In [None]:
# Sorted list of the RTPA names present in the saved file.
df_check["rtpa_name"].sort_values().unique()
# Where is SLOCOG?
# City of SLO was assigned to SANDAG; its RTPA was corrected in Airtable on
# 6/20/2025, so this should be fixed in the next report update.

In [None]:
# Current organizations that have both an NTD ID and an RTPA name.
# `_._is_current == True` is a siuba column expression passed to `filter`,
# which is why it is written as an explicit comparison.
ntd_rtpa_orgs = (
        tbls.mart_transit_database.dim_organizations()
        >> filter(
            _._is_current == True,
            _.ntd_id_2022.notna(),
            _.rtpa_name.notna(),
        )
        >> select(
            _.name, 
            _.ntd_id_2022, 
            _.rtpa_name, 
            _.mpo_name, 
            _.key
        )
        >> collect()
    )

# Current organization -> headquarters-county rows; joined to the
# organizations above (on `key` = `organization_key`) to get agency counties.
bridge_counties = (
        tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
        >> filter(
            _._is_current == True
        )
        >> select(
            _.county_geography_name, 
            _.organization_key
        )
        >> collect()
    )

In [None]:
# Organizations whose name mentions San Luis (for the SLOCOG check).
ntd_rtpa_orgs[ntd_rtpa_orgs["name"].str.contains("San Luis")]

In [None]:
# Number of bridge rows per headquarters county.
bridge_counties["county_geography_name"].value_counts()

In [None]:
# Attach each organization's headquarters county; organizations without a
# county row are kept (left join).
ntd_to_rtpa_crosswalk = pd.merge(
    ntd_rtpa_orgs,
    bridge_counties,
    how="left",
    left_on="key",
    right_on="organization_key",
)

In [None]:
# Crosswalk rows whose RTPA name mentions San Luis.
ntd_to_rtpa_crosswalk[ntd_to_rtpa_crosswalk["rtpa_name"].str.contains("San Luis")]

---

# {rtpa}
## Monthly Ridership Trends

**Download data from our public [folder](https://console.cloud.google.com/storage/browser/calitp-publish-data-analysis)** by navigating to `ntd_monthly_ridership` and selecting a file.

Transit operators/agencies that are **Urban full reporters and submit monthly ridership data to NTD from 2018 to present** are included in this report. Reporters that were previously Urban full reporters but currently are not may still appear, which can result in Reporters showing zero or partial ridership data in the report.

If a Reporter is not a monthly reporter, or has not reported data since 2018, they will not appear in the report.

Examples: 
- Reporter A is an urban full reporter from 2019-2022, then became a reduced reporter for 2023. Reporter A's ridership data will be displayed for 2019-2022 only.
- Reporter B is an urban full reporter from 2000-2017, then became a reduced reporter for 2018. Reporter B will be named in the report, but will not display ridership data.
- Reporter C was a reduced reporter from 2015-2020, then became an urban full reporter and began submitting monthly ridership data to NTD for 2021. Reporter C's ridership data will be displayed for 2021-present.

In [None]:
# Public bucket that holds the published monthly ridership files.
URL = "https://console.cloud.google.com/storage/browser/calitp-publish-data-analysis"

# Link shown at the top of the report, labelled with the latest published file.
download_link = f"""
        <a href={URL}>
        Download the latest month of data: {PUBLIC_FILENAME}</a>
        """

display(HTML(download_link))

In [None]:
MIN_YEAR = 2018

# Read only the selected RTPA's rows with period_year at or after MIN_YEAR;
# the filters are handed to the parquet reader rather than applied afterwards.
ridership_path = f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet"
row_filters = [[("RTPA", "==", rtpa), ("period_year", ">=", MIN_YEAR)]]

df = pd.read_parquet(ridership_path, filters=row_filters)

# Drop the coded mode / tos columns and the execution timestamp, then expose
# the full-name columns under the short names used in the rest of the report.
# (updated columns names to match new df)
df = df.drop(columns=["mode", "tos", "execution_ts"]).rename(
    columns={"Mode_full": "Mode", "TOS_full": "TOS"}
)

In [None]:
# Column names, dtypes and non-null counts of the filtered report data.
df.info()

In [None]:
# REMOVE

# find columns that are recent enough to plot
# MIN_YEAR = 2018

# might not need the rest of these
# not_id_cols = [c for c in df.columns if "/" in c]

# recent_years = [
#    c for c in not_id_cols if int(c.split("/")[1]) >= MIN_YEAR and
#    "pct" not in c
# ]

# upt_cols = [
#    c for c in recent_years if "change" not in c
# ]

# change_cols = [c for c in recent_years if "change" in c]

In [None]:
# REMOVE

# do i need this anymore?
# what does the data look like initially? filters/groups the DF by the `group_cols` list, keeps the initial wide data

# give this a try with current data NOPE!
# def OLD_sum_by_group(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
#    """
#    Since df is wide, use pivot_table() to sum up all
#    the columns that show UPT.
#    """
#    grouped_df = df.pivot_table(
#        index = group_cols,
#        values = "upt",
#        aggfunc="sum"
#    ).reset_index().reindex(columns = group_cols + recent_years)

#    return grouped_df


# DONT THINK I NEED THIS ANYMORE!!!
# def make_long(df: pd.DataFrame, group_cols: list, value_cols: list):
#    df_long = df[group_cols + value_cols].melt(
#        id_vars = group_cols,
#        value_vars = value_cols,
#    )

#    df_long = df_long.assign(
#        variable = df_long.variable.str.replace("change_1yr_", "")
#    )

#    return df_long

In [None]:
## MOVED FUNCTION TO SCRIPT, SO IT CAN BE USED BY save_rtpa_outputs

# function is read in with imports

# UPDATED AND TESTED, this works! gives the same results as the old old sum_by_group, make_long and assemble_long_df functions

# def sum_by_group(
#    df: pd.DataFrame,
#    group_cols: list) -> pd.DataFrame:
#    """
#    since data is now long to begin with, this replaces old sum_by_group, make_long and assemble_long_df functions.
#    """
#    from TEST_monthly_ridership_by_rtpa import get_percent_change
#    grouped_df = df.groupby(group_cols+
#                             ['period_year',
#                             'period_month',
#                             'period_year_month']
#                           ).agg({
#        "upt":"sum",
#        "previous_y_m_upt":"sum",
#        "change_1yr":"sum"
#    }
#    ).reset_index()

# get %change back
#    grouped_df = get_percent_change(grouped_df)

# decimal to whole number
#    grouped_df["pct_change_1yr"] = grouped_df["pct_change_1yr"]*100

#    return grouped_df

In [None]:
# REMOVE

# takes the sum_by_group dfs and makes them long
# unpivots the sum_by_group dfs, each row is a different year/month

# DONT THINK I NEED THIS ANYMORE!!!
# def assemble_long_df(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
#    """
#    Need df to be long to make chart.
#    Let's put raw UPT and change side-by-side.
#    """
#    df_raw = make_long(df, group_cols, upt_cols).rename(
#        columns = {"value": "upt"})
#    df_change = make_long(df, group_cols, change_cols).rename(
#        columns = {"value": "change_1yr"})

#    final = pd.merge(
#        df_raw,
#        df_change,
#        on = group_cols + ["variable"],
#        how = "left"
#    )

#    final = final.assign(
#        year = final.variable.str.split("/", expand=True)[1],
#        month = final.variable.str.split("/", expand=True)[0].str.zfill(2)
#    )

#    final = final.assign(
#        year_month = final.year + "-" + final.month
#    )

#    return final

In [None]:
# KEEPING AS IS FOR NOW, BUT MAY NEED TO REWORK


def remove_zero_upt_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop rows that carry no ridership information.

    A row is removed when either
      * `upt` and `change_1yr` are both exactly 0 (0 UPT - 0 UPT = 0 change), or
      * `change_1yr` is missing.

    A genuine net-zero change (e.g. 100 UPT - 100 UPT = 0 change) has a
    non-zero `upt` and is kept.

    Args:
        df: Long table with `upt` and `change_1yr` columns, such as
            `by_agency_long`. It is not modified.

    Returns:
        A new DataFrame with the same columns as `df`. Each surviving row is
        labelled with its row position in `df`.
    """
    # A boolean mask replaces the earlier self-merge anti-join: that approach
    # joined on every column (floats included), multiplied duplicated rows and
    # did far more work than the filter needs.
    is_zero_row = (df["upt"] == 0) & (df["change_1yr"] == 0)
    keep = ~is_zero_row & df["change_1yr"].notna()

    # Reset first so the result is labelled by row position, which is what the
    # merge-based version produced for its usual (uniquely-keyed) input.
    return df.reset_index(drop=True)[keep.to_numpy()]

In [None]:
def group_by_agency(df):
    """
    Sum `upt` per agency and compute each agency's share of the overall total.

    Takes a long ridership frame (e.g. `by_agency_long`) with `agency` and
    `upt` columns and returns one row per agency with:
    - `total_upt`: summed UPT, cast to int64
    - `pct_of_total_upt`: percent of the summed UPT across all agencies,
      rounded to 2 decimals

    Rows are ordered from largest to smallest `total_upt`; the index is not
    reset after sorting. Used for the totals table, bar chart and pie chart.
    """
    per_agency = df.groupby("agency").agg(total_upt=("upt", "sum")).reset_index()

    # share is computed from the summed values before they are cast to int
    overall_upt = per_agency["total_upt"].sum()
    share_pct = (per_agency["total_upt"] / overall_upt) * 100

    return (
        per_agency.assign(pct_of_total_upt=share_pct.round(decimals=2))
        .astype({"total_upt": "int64"})
        .sort_values(by="total_upt", ascending=False)
    )

In [None]:
# Aggregate the ridership table by agency, by mode and by type of service,
# then derive the summary values used in the "Totals" section below.

# grouping columns handed to sum_by_group for each breakdown
agency_cols = ["ntd_id", "agency", "RTPA"]
mode_cols = ["Mode", "RTPA"]
tos_cols = ["TOS", "RTPA"]

# sum_by_group is imported near the top of the notebook
by_agency_long = sum_by_group(df, agency_cols)
by_mode_long = sum_by_group(df, mode_cols)
by_tos_long = sum_by_group(df, tos_cols)

# agency rows with upt == 0 and change_1yr == 0, or with a missing change_1yr, are removed
by_agency_long_no_zero_upt = remove_zero_upt_rows(by_agency_long)

# headline numbers shown in the Totals markdown cell

total_upt = by_agency_long["upt"].sum()
agency_count = by_agency_long["agency"].nunique()


# per-agency UPT totals and percent-of-total, sorted largest first
agency_agg_yr = group_by_agency(by_agency_long)

In [None]:
# REMOVE

# I STILL NEED FILTERED df BY AGENCY, MODE AND TOS

# by_agency_long = assemble_long_df(by_agency, agency_cols)
# by_mode_long = assemble_long_df(by_mode, mode_cols)
# by_tos_long = assemble_long_df(by_tos, tos_cols)

# by_agency_long_no_zero_upt = remove_zero_upt_rows(by_agency_long)

In [None]:
# Display names for column names; looked up through `labeling` for chart
# titles and y-axis titles.
LABELING_DICT = {
    "upt": "Unlinked Passenger Trips",
    "change_1yr": "Change in Unlinked Passenger Trips from Prior Year",
    "TOS": "Type of Service",
    "year_month": "Date",
}


def labeling(word: str) -> str:
    """
    Return the display label for `word` by delegating to
    `label_visualization` with `LABELING_DICT`.

    NOTE(review): how names that are missing from LABELING_DICT (e.g.
    "agency", "Mode", "pct_change_1yr") are rendered depends on
    `label_visualization` in shared_utils.portfolio_utils -- confirm there.
    """
    return label_visualization(word, LABELING_DICT)


# Chart dimensions passed to Altair `.properties(width=..., height=...)`.
WIDTH = 300
HEIGHT = 150

In [None]:
def make_line_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
) -> alt.Chart:
    """
    Faceted line chart of `y_col` over `period_year_month`.

    One panel is drawn per distinct `color_col` value (2 panels per row), each
    with its own y scale. Only rows where `y_col` is greater than 0 are
    plotted. `df` must also contain an `RTPA` column, which is shown in the
    tooltip.
    """
    # keep strictly positive values; the comparison is already False for
    # missing values, so the dropna is a second guard
    plot_df = df[df[y_col] > 0].dropna(subset=y_col)

    # Only periods containing one of these markers get an x-axis tick.
    # Presumably January and June when period_year_month looks like
    # "YYYY-MM" -- TODO confirm the format.
    tick_markers = ["-01", "-06"]
    x_ticks = []
    for period in plot_df.period_year_month.unique():
        if any(marker in period for marker in tick_markers):
            x_ticks.append(period)

    palette = cp.CALITP_CATEGORY_BRIGHT_COLORS + cp.CALITP_CATEGORY_BOLD_COLORS

    single_panel = (
        alt.Chart(plot_df)
        .mark_line()
        .encode(
            x=alt.X("period_year_month:O", axis=alt.Axis(values=x_ticks), title="Date"),
            y=alt.Y(y_col, title=labeling(y_col)),
            color=alt.Color(color_col, title="", scale=alt.Scale(range=palette)),
            tooltip=["period_year_month", y_col, color_col, "RTPA"],
        )
        .properties(width=WIDTH, height=HEIGHT)
    )

    # WIDTH/HEIGHT above apply to each panel; the title below applies to the
    # whole faceted chart
    faceted = single_panel.facet(color_col, columns=2, title="").resolve_scale(
        y="independent"
    )
    titled = faceted.properties(title=f"{labeling(y_col)} by {labeling(color_col)}")

    return titled.interactive()

In [None]:
def make_bar_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
) -> alt.Chart:
    """
    Faceted bar chart of a change column over `period_year_month`.

    One panel is drawn per distinct `color_col` value (2 panels per row) with
    a shared x scale and independent y scales. `y_col` must be "change_1yr" or
    "pct_change_1yr"; any other value raises KeyError because no short axis
    title is defined for it. `df` must also contain an `RTPA` column, which is
    shown in the tooltip.
    """
    # Notes carried over from the original implementation:
    # - for the change column, everything prior to 2023 is missing; rows with
    #   a missing y_col are NOT dropped here
    # - open idea: flag y_col as > 0, < 0, == 0 or missing, count how many
    #   agencies fall in each category, and present that as a table

    # Only periods containing one of these markers get an x-axis tick.
    # Presumably Jan/Mar/Jun/Sep when period_year_month looks like
    # "YYYY-MM" -- TODO confirm the format.
    tick_markers = ["-01", "-03", "-06", "-09"]
    x_ticks = []
    for period in df.period_year_month.unique():
        if any(marker in period for marker in tick_markers):
            x_ticks.append(period)

    # short y-axis titles; a direct lookup keeps the KeyError for other columns
    short_titles = {
        "change_1yr": "Change",
        "pct_change_1yr": "Change",
    }
    y_title = short_titles[y_col]

    palette = cp.CALITP_CATEGORY_BRIGHT_COLORS + cp.CALITP_CATEGORY_BOLD_COLORS

    single_panel = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X("period_year_month:O", axis=alt.Axis(values=x_ticks), title="Date"),
            y=alt.Y(y_col, title=y_title),
            color=alt.Color(color_col, title="", scale=alt.Scale(range=palette)),
            tooltip=["period_year_month", y_col, color_col, "RTPA"],
        )
        .properties(width=WIDTH, height=HEIGHT)
    )

    faceted = single_panel.facet(color_col, columns=2, title="").resolve_scale(
        x="shared", y="independent"
    )
    titled = faceted.properties(title=f"{labeling(y_col)} by {labeling(color_col)}")

    return titled.interactive()

In [None]:
### initial pie code
def make_pie_chart(df, col, color_col):
    """
    Pie chart with one wedge per row of `df`.

    `col` sets the wedge angle (theta) and `color_col` the wedge color. The
    tooltip is fixed to `agency`, `total_upt` and `pct_of_total_upt`, so `df`
    needs those columns (e.g. the output of `group_by_agency`).

    NOTE(review): MIN_YEAR is read from the notebook globals and is not among
    the names imported at the top of the notebook -- confirm it is defined
    before this function is called.
    """
    chart_title = f"Total Unlinked Passenger Trips per agency in RTPA since {MIN_YEAR}"
    hover_fields = ["agency", "total_upt", "pct_of_total_upt"]

    wedges = alt.Chart(df).mark_arc(radius=150)
    encoded = wedges.encode(theta=col, color=color_col, tooltip=hover_fields)

    return encoded.properties(title=chart_title)

In [None]:
## moved to prod notebook 1/15/2025
# simple bar chart for total agencies and UPT
def total_upt_chart(df: pd.DataFrame, x_col: str, y_col: str, tool_tip: list):
    """
    Bar chart of `y_col` per `x_col`, with bars sorted by descending y value.

    Bars are colored by `x_col` and `tool_tip` lists the columns shown on
    hover.

    NOTE(review): MIN_YEAR is read from the notebook globals and is not among
    the names imported at the top of the notebook -- confirm it is defined
    before this function is called.
    """
    palette = cp.CALITP_CATEGORY_BRIGHT_COLORS + cp.CALITP_CATEGORY_BOLD_COLORS
    bar_color = alt.Color(x_col, title="", scale=alt.Scale(range=palette))

    bars = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(x_col).sort("-y"),
            y=alt.Y(y_col),
            tooltip=tool_tip,
            color=bar_color,
        )
    )

    sized = bars.properties(
        title=f"Total Unlinked Passenger Trips per agency in RTPA since {MIN_YEAR}",
        width=WIDTH,
        height=HEIGHT,
    )

    return sized.resolve_scale(y="independent").interactive()

### Totals

In [None]:
# Headline summary for the selected RTPA, rendered as Markdown.
# `rtpa` comes from the parameters cell; `agency_count` and `total_upt` are
# computed from by_agency_long in the aggregation cell.
# NOTE(review): MIN_YEAR is not among the names imported at the top of the
# notebook -- confirm it is defined before this cell runs.
Markdown(
    f"""
Within {rtpa}:
- Number of Agencies/Operators: <b>{agency_count}</b>.
- Total Unlinked Passenger Trips since {MIN_YEAR}: <b>{total_upt:,}</b>.
- Individual agency/operator ridership breakdown:
"""
)

In [None]:
# Per-agency totals table; the index is reset for display because
# group_by_agency sorts the rows without renumbering them.
display(agency_agg_yr.reset_index(drop=True))

In [None]:
# Same per-agency table serialized to an HTML string; printed as raw text
# rather than rendered.
df_html = agency_agg_yr.reset_index(drop=True).to_html()

print(df_html)

In [None]:
# Same per-agency table serialized to a Markdown-formatted string and printed
# as raw text.
df_markdown = agency_agg_yr.reset_index(drop=True).to_markdown()

print(df_markdown)

In [None]:
# columns shown when hovering over a bar
tooltip_list = ["agency", "total_upt", "pct_of_total_upt"]

# total UPT per agency, bars sorted by descending total
total_upt_chart(agency_agg_yr, x_col="agency", y_col="total_upt", tool_tip=tooltip_list)

In [None]:
# pie chart of total UPT, one wedge per agency
make_pie_chart(agency_agg_yr, col="total_upt", color_col="agency")

### Transit Agency

In [None]:
# UPT over time, one panel per agency (only periods with upt > 0 are plotted)
make_line_chart(by_agency_long, y_col="upt", color_col="agency")

The charts below show the change in Unlinked Passenger Trips from the prior year. For example, the change for July 2023 compares July 2023's reported values against July 2022's reported values.

In [None]:
# Test: bar chart of pct_change_1yr per agency, limited to period_year >= 2018.
# Uses the agency frame with zero-UPT rows already removed.
make_bar_chart(
    by_agency_long_no_zero_upt[by_agency_long_no_zero_upt["period_year"] >= 2018],
    y_col="pct_change_1yr",
    color_col="agency",
)

In [None]:
# change_1yr per agency, limited to period_year >= 2024
make_bar_chart(
    by_agency_long_no_zero_upt[by_agency_long_no_zero_upt["period_year"] >= 2024],
    y_col="change_1yr",
    color_col="agency",
)

In [None]:
# change_1yr per agency with no period_year filter applied
make_bar_chart(by_agency_long_no_zero_upt, y_col="change_1yr", color_col="agency")

### Transit Mode

In [None]:
# UPT over time, one panel per transit mode (only periods with upt > 0 are plotted)
make_line_chart(by_mode_long, y_col="upt", color_col="Mode")

In [None]:
# change_1yr per mode, limited to period_year >= 2024.
# Unlike the agency charts, by_mode_long is not passed through remove_zero_upt_rows.
make_bar_chart(
    by_mode_long[by_mode_long["period_year"] >= 2024],
    y_col="change_1yr",
    color_col="Mode",
)

### Type of Service

In [None]:
# UPT over time, one panel per type of service (only periods with upt > 0 are plotted)
make_line_chart(by_tos_long, y_col="upt", color_col="TOS")

In [None]:
# change_1yr per type of service, limited to period_year >= 2024.
# Unlike the agency charts, by_tos_long is not passed through remove_zero_upt_rows.
make_bar_chart(
    by_tos_long[by_tos_long["period_year"] >= 2024], y_col="change_1yr", color_col="TOS"
)