In [1]:
import sys

sys.path.append("../bus_service_increase")
sys.path.append("../starter_kit")  # to test out style_df function
sys.path.append("../")

In [86]:
%%capture
import warnings

warnings.filterwarnings("ignore")
from calitp_data_analysis.sql import query_sql, to_snakecase

import altair as alt
import calitp_data_analysis.magics
import pandas as pd
from shared_utils.portfolio_utils import label_visualization
# from _starterkit_utils import style_df
from calitp_data_analysis import calitp_color_palette as cp
from explore_monthly_ridership_by_rtpa import sum_by_group
from IPython.display import HTML, Markdown, display
from update_vars import YEAR, MONTH, PUBLIC_FILENAME, YEAR, NTD_MODES, NTD_TOS
from _01_ntd_ridership_utils import ntd_id_to_rtpa_crosswalk, add_change_columns, get_percent_change, sum_by_group

# for testing


# from monthly_ridership_by_rtpa import get_percent_change
# from shared_utils.rt_dates import MONTH_DICT

# Temp file path for testing
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)

In [3]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [4]:
# parameters cell for local
rtpa = "Metropolitan Transportation Commission"  # lots of reporters
# rtpa = "Butte County Association of Governments" # 2 reporters
# rtpa = "San Joaquin Council of Governments" # 4 reporters

In [5]:
%%capture_parameters
rtpa

{"rtpa": "Metropolitan Transportation Commission"}


---

## 09/24/2025 Update new table in function

In [6]:
# initiald function with current data table
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    This function works with the warehouse `dim_monthly_ntd_ridership_with_adjustments` long data format.
    Import NTD data from warehouse, filter to CA,
    merge in crosswalk, checks for unmerged rows, then creates new columns for full Mode and TOS name.

    """
    # REMOVED 2025 FOR TESTING PURPOSES
    monthly_query ="""
    SELECT 
      ntd_id,
      agency,
      reporter_type,
      period_year_month,
      period_year,
      period_month,
      mode,
      tos,
      mode_type_of_service_status AS Status,
      primary_uza_name as uza_name,
      upt
    FROM
      `cal-itp-data-infra.mart_ntd.dim_monthly_ridership_with_adjustments`
    WHERE
      period_year IN ("2018", "2019", "2020", "2021", "2022", "2023", "2024") 
      AND agency IS NOT NULL
    """
    full_upt = query_sql(monthly_query, as_df=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    ca = full_upt[
        (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
    ].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    min_year = 2018

    # get agencies with last report year and data after > 2018.
    last_report_query = """
    SELECT DISTINCT
      source_agency,
      last_report_year,
      ntd_id,
    FROM
      `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    WHERE
      year >= 2018
      AND last_report_year >= 2018
      AND (
        primary_uza_name LIKE "%, CA%"
        OR primary_uza_name LIKE "%CA-NV%"
        OR primary_uza_name LIKE "%California Non-UZA%"
      )
    """

    last_report_year = query_sql(last_report_query, as_df=True)

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")
    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df._merge.value_counts())

    # check for unmerged rows
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    monthly_sort_cols =  [
    "ntd_id",
    "mode", 
    "tos",
    "period_month", 
    "period_year"
] # got the order correct with ["period_month", "period_year"]! sorted years with grouped months

    monthly_group_cols = [
        "ntd_id",
        "mode", 
        "tos"
                  ]

    monthly_change_col ="previous_y_m_upt"

    df = add_change_columns(
        df,
        sort_cols = monthly_sort_cols,
        group_cols = monthly_group_cols,
        change_col = monthly_change_col
    )

    
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

In [7]:
# new data table
def produce_ntd_monthly_ridership_by_rtpa_v2(year: int, month: int) -> pd.DataFrame:
    """
    This function works with the warehouse `dim_monthly_ntd_ridership_with_adjustments` long data format.
    Import NTD data from warehouse, filter to CA,
    merge in crosswalk, checks for unmerged rows, then creates new columns for full Mode and TOS name.

    """
    # REMOVED 2025 FOR TESTING PURPOSES
    monthly_query ="""
    SELECT 
      ntd_id,
      agency,
      reporter_type,
      period_year_month,
      period_year,
      period_month,
      mode,
      tos,
      mode_type_of_service_status AS Status,
      uza_name,
      upt
    FROM
      `cal-itp-data-infra.mart_ntd_ridership.fct_complete_monthly_ridership_with_adjustments_and_estimates`
    WHERE
      period_year IN ("2018", "2019", "2020", "2021", "2022", "2023", "2024")
      AND agency IS NOT NULL
    """
    full_upt = query_sql(monthly_query, as_df=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    ca = full_upt[
        (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
    ].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    min_year = 2018

    # get agencies with last report year and data after > 2018.
    last_report_query = """
    SELECT DISTINCT
      source_agency,
      last_report_year,
      ntd_id,
    FROM
      `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    WHERE
      year >= 2018
      AND last_report_year >= 2018
      AND (
        primary_uza_name LIKE "%, CA%"
        OR primary_uza_name LIKE "%CA-NV%"
        OR primary_uza_name LIKE "%California Non-UZA%"
      )
    """

    last_report_year = query_sql(last_report_query, as_df=True)

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")
    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df._merge.value_counts())

    # check for unmerged rows
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    monthly_sort_cols =  [
    "ntd_id",
    "mode", 
    "tos",
    "period_month", 
    "period_year"
] # got the order correct with ["period_month", "period_year"]! sorted years with grouped months

    monthly_group_cols = [
        "ntd_id",
        "mode", 
        "tos"
                  ]

    monthly_change_col ="previous_y_m_upt"

    df = add_change_columns(
        df,
        sort_cols = monthly_sort_cols,
        group_cols = monthly_group_cols,
        change_col = monthly_change_col
    )

    
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

In [8]:
df_v1 = produce_ntd_monthly_ridership_by_rtpa(YEAR, MONTH)
print(df_v1.columns) # good to go!

both          27720
left_only         0
right_only        0
Name: _merge, dtype: int64
Index(['ntd_id', 'agency', 'reporter_type', 'period_year_month', 'period_year',
       'period_month', 'mode', 'tos', 'Status', 'uza_name', 'upt',
       'source_agency', 'last_report_year', 'ntd_id_2022', 'rtpa_name',
       '_merge', 'previous_y_m_upt', 'change_1yr', 'pct_change_1yr',
       'Mode_full', 'TOS_full'],
      dtype='object')


In [9]:
df_v2 = produce_ntd_monthly_ridership_by_rtpa_v2(YEAR, MONTH)
print(df_v2.columns) # good to go!

both          17845
left_only         0
right_only        0
Name: _merge, dtype: int64
Index(['ntd_id', 'agency', 'reporter_type', 'period_year_month', 'period_year',
       'period_month', 'mode', 'tos', 'Status', 'uza_name', 'upt',
       'source_agency', 'last_report_year', 'ntd_id_2022', 'rtpa_name',
       '_merge', 'previous_y_m_upt', 'change_1yr', 'pct_change_1yr',
       'Mode_full', 'TOS_full'],
      dtype='object')


In [10]:
display(
    df_v1.equals(df_v2),
    df_v1["ntd_id"].nunique(),
    df_v2["ntd_id"].nunique(),
)
# Why do they differ?

False

104

91

In [11]:
df_1_agg = df_v1.groupby(["ntd_id","agency"]).agg({"upt":"sum"}).reset_index()
df_2_agg = df_v2.groupby(["ntd_id","agency"]).agg({"upt":"sum"}).reset_index()

check_merge = df_1_agg.merge(
    df_2_agg, 
    on=["ntd_id","agency"], 
    how="outer", 
    indicator=True,
)


In [12]:
display(
    df_1_agg.head(),
    df_2_agg.head()
)

Unnamed: 0,ntd_id,agency,upt
0,90003,San Francisco Bay Area Rapid Transit District,473709062.0
1,90004,Golden Empire Transit District,29783332.0
2,90006,Santa Cruz Metropolitan Transit District,24556631.0
3,90007,City of Modesto,6833170.0
4,90008,City of Santa Monica,62236252.0


Unnamed: 0,ntd_id,agency,upt
0,90003,San Francisco Bay Area Rapid Transit District,473709062.0
1,90004,Golden Empire Transit District,29783332.0
2,90006,Santa Cruz Metropolitan Transit District,24556631.0
3,90007,City of Modesto,6833170.0
4,90008,City of Santa Monica,62236252.0


In [13]:
check_merge["_merge"].value_counts()

both          84
left_only     20
right_only     7
Name: _merge, dtype: int64

In [14]:
check_merge[check_merge["agency"].str.contains("Modesto")]

Unnamed: 0,ntd_id,agency,upt_x,upt_y,_merge
3,90007,City of Modesto,6833170.0,,left_only
104,90007,City of Modesto,,6833170.0,right_only


In [15]:
check_merge[check_merge["ntd_id"]=="90007"]

Unnamed: 0,ntd_id,agency,upt_x,upt_y,_merge
3,90007,City of Modesto,6833170.0,,left_only
104,90007,City of Modesto,,6833170.0,right_only


In [18]:
check_merge[check_merge["_merge"]!="both"].sort_values(by="ntd_id")

Unnamed: 0,ntd_id,agency,upt_x,upt_y,_merge
3,90007,City of Modesto,6833170.0,,left_only
104,90007,City of Modesto,,6833170.0,right_only
105,90022,City of Norwalk,,7726865.0,right_only
15,90022,City of Norwalk,7700156.0,,left_only
17,90024,City of La Mirada,207699.0,,left_only
106,90024,City of La Mirada,,207699.0,right_only
29,90044,City of Arcadia,0.0,,left_only
30,90050,City of Simi Valley,0.0,,left_only
31,90052,City of Corona,0.0,,left_only
39,90089,County of Sonoma,4955790.0,,left_only


## Warehouse research
- RE: 90007 Modesto
  - Compared identical year, month, tos, mode. The dim_monthly table includes NULL UPT rows, whereas the fct_monthly tables does not have NULL rows. Other than the NULL rows, the reamining UPT values match
  
- RE: 90296 Claremont
  - same variables as previous. dim_monthly tables ONLY contain NULL rows for this agency. fct_monthly does not contain this agency

- Filtered dataframe for `left_only` & `right_only`
  - All agencies were accounted for
  - agencies with 0 in dim_montly had NaN in fct_monthly
  - agencies with data in dim_monthly had equivienelt data in_fct_monthly, just didnt match for some reason. 

- Solution
  - Demo'd observations during round table meeting. OK'd to remove 0/NULL values and to make note of it on the report.


## if i took out `zero/NULL/NaN` rows from dim_monthly, will it be the same as fct_monthly?

In [54]:
set(df_v1[df_v1["upt"]!=0].dropna()["ntd_id"].unique()) == set(df_v2["ntd_id"].unique()) # TRUE!!!!!!!!!!

True

In [53]:
dim_monthly_set = set(df_v1[df_v1["upt"]!=0].dropna()["ntd_id"].unique())
fct_monthly_set = set(df_v2["ntd_id"].unique())

diff_dim_fct = dim_monthly_set - fct_monthly_set

diff_fct_dim = fct_monthly_set - dim_monthly_set

print(diff_dim_fct, diff_dim_fct) 

set() set()


## how many agencies do i lose if i drop zero values rows?

In [64]:
df_v1[
    (df_v1["upt"]==0) 
    | (df_v1["upt"].isna())
]["ntd_id"].nunique()

73

In [None]:
df_v1[
    (df_v1["upt"]==0) 
    | (df_v1["upt"].isna())
]

In [65]:
df_v1[
    (df_v1["upt"] > 0) 
    & (df_v1["upt"].notna())
]["ntd_id"].nunique()

91

In [61]:
print(
    f"""Total Number of unique ntd_id from dim_monthly test table: {df_v1["ntd_id"].nunique()}.
    Number of unique ntd_id with zero-value/NaN/NULL metrics: {df_v1[df_v1["upt"]==0 | df_v1["upt"].isna()]["ntd_id"].nunique()}.
    Number of remaining uniqe ntd_id:{df_v1[df_v1["upt"]!=0].dropna()["ntd_id"].nunique()}
    """
)

Number of unique ntd_id from dim_monthly test table: 104.
    Number of unique ntd_id with zero-value/NaN/NULL metrics: 15.
    Number of remaining uniqe ntd_id:91
    


## Results
Confirm that the unique non-zero/null ntd_id values in dim_monthly are the same in fct_monthly. confident to move forward in 

## 09/18/2025 remove siuba from report

In [None]:
# DEPRECATED OLD FUNCTION
# def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
#     """
#     This function works with the warehouse `dim_monthly_ntd_ridership_with_adjustments` long data format.
#     Import NTD data from warehouse, filter to CA,
#     merge in crosswalk, checks for unmerged rows, then creates new columns for full Mode and TOS name.

#     """

#     full_upt = (
#         tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
#         >> filter(
#             _.period_year.isin(
#                 ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
#             )
#         )
#         >>select(
#             _.ntd_id,
#             _.agency,
#             _.reporter_type,
#             _.period_year_month,
#             _.period_year,
#             _.period_month,
#             _.mode,
#             _.tos,
#             _.mode_type_of_service_status,
#             _.primary_uza_name,
#             _.upt
            
#         )
#         >> collect()
#     ).rename(
#         columns={
#             "mode_type_of_service_status": "Status",
#             "primary_uza_name": "uza_name",
#         }
#     )

#     full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

#     full_upt.to_parquet(
#         f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
#     )

#     ca = full_upt[
#         (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
#     ].reset_index(drop=True)

#     # use new crosswalk function
#     crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

#     min_year = 2018

#     # get agencies with last report year and data after > 2018.
#     last_report_year = (
#         tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
#         >> filter(
#             _.year >= min_year,  # see if this changes anything
#             _.last_report_year >= min_year,
#             _.primary_uza_name.str.contains(", CA")
#             | _.primary_uza_name.str.contains("CA-NV")
#             | _.primary_uza_name.str.contains("California Non-UZA"),
#         )
#         >> distinct(
#             "source_agency",
#             "last_report_year",
#             "ntd_id",
#         )
#         >> collect()
#     )

#     # merge last report year to CA UPT data
#     df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")

In [None]:
# full_upt = (
#         tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
#         >> filter(
#             _.period_year.isin(
#                 ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
#             )
#         )
#         >>select(
#             _.ntd_id,
#             _.agency,
#             _.reporter_type,
#             _.period_year_month,
#             _.period_year,
#             _.period_month,
#             _.mode,
#             _.tos,
#             _.mode_type_of_service_status,
#             _.primary_uza_name,
#             _.upt
            
#         )
#         >> collect()
#     ).rename(
#         columns={
#             "mode_type_of_service_status": "Status",
#             "primary_uza_name": "uza_name",
#         }
#     )

# full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

# full_upt.info()

In [None]:
# min_year = 2018
# last_report_year = (
#         tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
#         >> filter(
#             _.year >= min_year,  # see if this changes anything
#             _.last_report_year >= min_year,
#             _.primary_uza_name.str.contains(", CA")
#             | _.primary_uza_name.str.contains("CA-NV")
#             | _.primary_uza_name.str.contains("California Non-UZA"),
#         )
#         >> distinct(
#             "source_agency",
#             "last_report_year",
#             "ntd_id",
#         )
#         >> collect()
#     )

# last_report_year.info()

In [None]:
# merge last report year to CA UPT data
df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")
    # merge crosswalk to CA last report year
df = pd.merge(
    df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
    crosswalk[["ntd_id_2022", "rtpa_name"]],
    left_on="ntd_id",
    right_on="ntd_id_2022",
    how="left",
    indicator=True,
    )

In [None]:
display(
    full_upt.equals(full_upt_test),
    last_report_year.equals(last_report_year_test)
) # TRUE TRUE!!
# good to go

In [None]:
# UPDATED without siuba
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    This function works with the warehouse `dim_monthly_ntd_ridership_with_adjustments` long data format.
    Import NTD data from warehouse, filter to CA,
    merge in crosswalk, checks for unmerged rows, then creates new columns for full Mode and TOS name.

    """
    monthly_query ="""
    SELECT 
      ntd_id,
      agency,
      reporter_type,
      period_year_month,
      period_year,
      period_month,
      mode,
      tos,
      mode_type_of_service_status AS Status,
      primary_uza_name as uza_name,
      upt
    FROM
      `cal-itp-data-infra.mart_ntd.dim_monthly_ridership_with_adjustments`
    WHERE
      period_year IN ("2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025")
      AND agency IS NOT NULL
    """
    full_upt = query_sql(monthly_query, as_df=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    ca = full_upt[
        (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
    ].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    min_year = 2018

    # get agencies with last report year and data after > 2018.
    last_report_query = """
    SELECT DISTINCT
      source_agency,
      last_report_year,
      ntd_id,
    FROM
      `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    WHERE
      year >= 2018
      AND last_report_year >= 2018
      AND (
        primary_uza_name LIKE "%, CA%"
        OR primary_uza_name LIKE "%CA-NV%"
        OR primary_uza_name LIKE "%California Non-UZA%"
      )
    """

    last_report_year = query_sql(last_report_query, as_df=True)

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")
    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df._merge.value_counts())

    # check for unmerged rows
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    monthly_sort_cols =  [
    "ntd_id",
    "mode", 
    "tos",
    "period_month", 
    "period_year"
] # got the order correct with ["period_month", "period_year"]! sorted years with grouped months

    monthly_group_cols = [
        "ntd_id",
        "mode", 
        "tos"
                  ]

    monthly_change_col ="previous_y_m_upt"

    df = add_change_columns(
        df,
        sort_cols = monthly_sort_cols,
        group_cols = monthly_group_cols,
        change_col = monthly_change_col
    )

    
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

## Update `produce_ntd_monthly_ridership_by_rtpa` function

In [None]:
from calitp_data_analysis.tables import tbls
from siuba import _, collect, distinct, filter, select

In [None]:
# from annual_ridership_report.annual_ridership_module import ntd_id_to_rtpa_crosswalk

In [None]:
from _01_ntd_ridership_utils import ntd_id_to_rtpa_crosswalk, add_change_columns

In [None]:
xwalk_no_split = ntd_id_to_rtpa_crosswalk(split_scag=False)

In [None]:
# display(
#     test.head(),
#     test.info(),
#     test["rtpa_name"].value_counts()
# )

In [None]:
xwalk_w_split = ntd_id_to_rtpa_crosswalk(split_scag=True)

In [None]:
xwalk_no_split["rtpa_name"].unique() == xwalk_w_split["rtpa_name"].unique()

In [None]:
xwalk_w_split.columns.tolist()

In [None]:
# updated
def produce_ntd_monthly_ridership_by_rtpa(year: int, month: int) -> pd.DataFrame:
    """
    This function works with the warehouse `dim_monthly_ntd_ridership_with_adjustments` long data format.
    Import NTD data from warehouse, filter to CA,
    merge in crosswalk, checks for unmerged rows, then creates new columns for full Mode and TOS name.

    """

    full_upt = (
        tbls.mart_ntd.dim_monthly_ridership_with_adjustments()
        >> filter(
            _.period_year.isin(
                ["2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025"]
            )
        )
        >>select(
            _.ntd_id,
            _.agency,
            _.reporter_type,
            _.period_year_month,
            _.period_year,
            _.period_month,
            _.mode,
            _.tos,
            _.mode_type_of_service_status,
            _.primary_uza_name,
            _.upt
            
        )
        >> collect()
    ).rename(
        columns={
            "mode_type_of_service_status": "Status",
            "primary_uza_name": "uza_name",
        }
    )

    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

    # full_upt.to_parquet(
    #     f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    # )

    ca = full_upt[
        (full_upt["uza_name"].str.contains(", CA")) & (full_upt.agency.notna())
    ].reset_index(drop=True)

    # use new crosswalk function
    crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

    min_year = 2018

    # get agencies with last report year and data after > 2018.
    last_report_year = (
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            _.year >= min_year,  # see if this changes anything
            _.last_report_year >= min_year,
            _.primary_uza_name.str.contains(", CA")
            | _.primary_uza_name.str.contains("CA-NV")
            | _.primary_uza_name.str.contains("California Non-UZA"),
        )
        >> distinct(
            "source_agency",
            #'agency_status',
            #'legacy_ntd_id',
            "last_report_year",
            #'mode',
            "ntd_id",
            #'reporter_type',
            #'reporting_module',
            #'service',
            #'uace_code',
            #'primary_uza_name',
            #'uza_population',
            #'year',
            #'upt',
        )
        >> collect()
    )

    # merge last report year to CA UPT data
    df = pd.merge(ca, last_report_year, left_on="ntd_id", right_on="ntd_id", how="inner")

    # merge crosswalk to CA last report year
    df = pd.merge(
        df,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id_2022", "rtpa_name"]],
        left_on="ntd_id",
        right_on="ntd_id_2022",
        how="left",
        indicator=True,
    )

    print(df._merge.value_counts())

    # check for unmerged rows
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    df = add_change_columns(df)
    
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

In [None]:
data = produce_ntd_monthly_ridership_by_rtpa(YEAR, MONTH)

In [None]:
display(
    data.info(),
    data["rtpa_name"].value_counts(),
    data.describe()
)

In [None]:
df_check = pd.read_parquet(
        f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet"
    )

In [None]:
df_check["rtpa_name"].sort_values().unique()
# wher is SLOCOG??!?!
# City of SLO was in SANDAG, adjusted city of slo RTPA in airtable 6/20/2025. should be fixed in the next report update

In [None]:
ntd_rtpa_orgs = (
        tbls.mart_transit_database.dim_organizations()
        >> filter(
            _._is_current == True,
            _.ntd_id_2022.notna(),
            _.rtpa_name.notna(),
        )
        >> select(
            _.name, 
            _.ntd_id_2022, 
            _.rtpa_name, 
            _.mpo_name, 
            _.key
        )
        >> collect()
    )

    # join bridge org county geo to get agency counties
bridge_counties = (
        tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
        >> filter(
            _._is_current == True
        )
        >> select(
            _.county_geography_name, 
            _.organization_key
        )
        >> collect()
    )

In [None]:
ntd_rtpa_orgs[ntd_rtpa_orgs["name"].str.contains("San Luis")]

In [None]:
bridge_counties["county_geography_name"].value_counts()

In [None]:
ntd_to_rtpa_crosswalk = ntd_rtpa_orgs.merge(
        bridge_counties, 
        left_on="key", 
        right_on="organization_key", 
        how="left"
    )

In [None]:
ntd_to_rtpa_crosswalk[ntd_to_rtpa_crosswalk["rtpa_name"].str.contains("San Luis")]

---

# {rtpa}
## Monthly Ridership Trends

**Download data from our public [folder](https://console.cloud.google.com/storage/browser/calitp-publish-data-analysis)** by navigating to `ntd_monthly_ridership` and selecting a file.

Transit operators/agencies that are **Urban full reporters, that submit monthly ridership data to NTD from 2018 to present**, are included in this report. Reporters that were previously Urban full reporters, but are currently not, may appear. This may result in Reporters showing zero or partial ridership data in the report. 

If a Reporter is not a monthly reporter, or has not reported data since 2018, they will not appear in the report.

Examples: 
- Reporter A is an urban full reporter from 2019-2022, then became a reduced reporter for 2023. Reporter A's ridership data will be displayed for 2019-2022 only.
- Reporter B is an urban full reporter from 2000-2017, then became a reduced reporter for 2018. Reporter B will be named in the report, but will not display ridership data.
- Reporter C was a reduced reporter form 2015-2020, then became an urban full reporter and began submitting monthly ridership data to NTD for 2021. Reporter C's ridership data will be displayed for 2021-present.

In [76]:
URL = "https://console.cloud.google.com/storage/" "browser/calitp-publish-data-analysis"

display(
    HTML(
        f"""
        <a href={URL}>
        Download the latest month of data: {PUBLIC_FILENAME}</a>
        """
    )
)

In [81]:
MIN_YEAR = 2018

# updated to filter for period_year greater than 2018
df = produce_ntd_monthly_ridership_by_rtpa_v2(YEAR, MONTH).drop(
    columns=["mode", "tos"]
).rename(
    columns={"Mode_full": "Mode", "TOS_full": "TOS"}
)


both          17845
left_only         0
right_only        0
Name: _merge, dtype: int64


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17845 entries, 0 to 17844
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   ntd_id             17845 non-null  object  
 1   agency             17845 non-null  object  
 2   reporter_type      17845 non-null  object  
 3   period_year_month  17845 non-null  object  
 4   period_year        17845 non-null  int64   
 5   period_month       17845 non-null  int64   
 6   Status             17845 non-null  object  
 7   uza_name           17845 non-null  object  
 8   upt                17845 non-null  float64 
 9   source_agency      17845 non-null  object  
 10  last_report_year   17845 non-null  int64   
 11  ntd_id_2022        17845 non-null  object  
 12  rtpa_name          17845 non-null  object  
 13  _merge             17845 non-null  category
 14  previous_y_m_upt   17591 non-null  float64 
 15  change_1yr         17591 non-null  float64 
 16  pct_

In [95]:
# KEEPING AS IS FOR NOW, BUT MAY NEED TO REWORK


def remove_zero_upt_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    takes the by_agency_long df, filters for rows with zero UPT AND zero change_1yr.
    then removes these rows from the by_agency_long df.
    resulting df should only contain rows with change in UPT not due to (0 UPT - 0 UPT) and net zero change in UPT
    zero UPT and zero change_1yr occurs when (0 upt - 0upt = 0change), dont need to show this
    but a net zero change (100upt - 100upt = 0 change), should be shown
    """

    # df of rows with zero UPT for 2 years
    zero_UPT_2years = df[(df["upt"] == 0) & (df["change_1yr"] == 0)]

    merge = df.merge(zero_UPT_2years, how="left", indicator=True)
    no_zero_UPT_rows = (
        merge[merge["_merge"] == "left_only"]
        .drop(columns=["_merge"])
        .dropna(subset="change_1yr")
    )

    return no_zero_UPT_rows

In [83]:
def group_by_agency(df):
    """
    Take in the 'by_ageny_long' df and aggregatese by rtpa, and calculates upt % of total.
    To be used in pie chart
    """
    initial_agg = df.groupby("agency").agg(total_upt=("upt", "sum")).reset_index()

    # % total columns
    initial_agg["pct_of_total_upt"] = (
        initial_agg["total_upt"] / initial_agg["total_upt"].sum()
    ) * 100

    # cleaning data types and rounding
    initial_agg["total_upt"] = initial_agg["total_upt"].astype("int64")
    initial_agg["pct_of_total_upt"] = initial_agg["pct_of_total_upt"].round(decimals=2)
    cleaned_agg = initial_agg.sort_values(by="total_upt", ascending=False)

    return cleaned_agg

In [88]:
from _01_ntd_ridership_utils import sum_by_group

In [96]:
agency_cols = ["ntd_id", "agency", "rtpa_name"]
mode_cols = ["Mode", "rtpa_name"]
tos_cols = ["TOS", "rtpa_name"]

# by_agency_long = _01_ntd_ridership_utils.sum_by_group(df, agency_cols)
# by_mode_long = _01_ntd_ridership_utils.sum_by_group(df, mode_cols)
# by_tos_long = _01_ntd_ridership_utils.sum_by_group(df, tos_cols)

monthly_group_col_2 = [
            'period_year',
            'period_month',
            'period_year_month']

monthly_agg_col = {
            "upt":"sum",
            "previous_y_m_upt":"sum",
            "change_1yr":"sum"
            }
monthly_change_col ="previous_y_m_upt"

by_agency_long = sum_by_group(
                df = df,
                group_cols= agency_cols,
                group_col2= monthly_group_col_2,# look into combingin with base grou_cols
                agg_cols = monthly_agg_col,
                change_col= monthly_change_col
            )

by_mode_long = sum_by_group(
                df = df,
                group_cols= mode_cols,
                group_col2= monthly_group_col_2,# look into combingin with base grou_cols
                agg_cols = monthly_agg_col,
                change_col= monthly_change_col
            )
    
by_tos_long = sum_by_group(
                df = df,
                group_cols= tos_cols,
                group_col2= monthly_group_col_2,# look into combingin with base grou_cols
                agg_cols = monthly_agg_col,
                change_col= monthly_change_col
            )

# remove zero UPT rows 
by_agency_long_no_zero_upt = remove_zero_upt_rows(by_agency_long)

# agg by agency, for pie chart
agency_agg_yr = group_by_agency(by_agency_long)

# total UPT check 
total_upt = by_agency_long["upt"].sum()
agency_count = by_agency_long["agency"].nunique()

In [None]:
# REMOVE

# I STILL NEED FILTERED df BY AGENCY, MODE AND TOS

# by_agency_long = assemble_long_df(by_agency, agency_cols)
# by_mode_long = assemble_long_df(by_mode, mode_cols)
# by_tos_long = assemble_long_df(by_tos, tos_cols)

# by_agency_long_no_zero_upt = remove_zero_upt_rows(by_agency_long)

## TEST Charts

In [69]:
LABELING_DICT = {
    "upt": "Unlinked Passenger Trips",
    "change_1yr": "Change in Unlinked Passenger Trips from Prior Year",
    "TOS": "Type of Service",
    "year_month": "Date",
}


def labeling(word: str) -> str:
    return label_visualization(word, LABELING_DICT)


WIDTH = 300
HEIGHT = 150

In [104]:
def make_line_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
) -> alt.Chart:
    df = df[df[y_col] > 0].dropna(subset=y_col)

    x_label = [
        i
        for i in df.period_year_month.unique()
        if any(substring in i for substring in ["-01", "-06"])
    ]
    chart = (
        (
            alt.Chart(df)
            .mark_line()
            .encode(
                x=alt.X(
                    "period_year_month:O", axis=alt.Axis(values=x_label), title="Date"
                ),
                y=alt.Y(y_col, title=labeling(y_col)),
                color=alt.Color(
                    color_col,
                    title="",
                    scale=alt.Scale(
                        range=cp.CALITP_CATEGORY_BRIGHT_COLORS
                        + cp.CALITP_CATEGORY_BOLD_COLORS
                    ),
                ),
                tooltip=["period_year_month", y_col, color_col, "rtpa_name"],
            )
            .properties(width=WIDTH, height=HEIGHT)
            .facet(color_col, columns=2, title="")
            .resolve_scale(y="independent")
        )
        .properties(title=f"{labeling(y_col)} by {labeling(color_col)}")
        .interactive()
    )

    return chart

In [105]:
def make_bar_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
) -> alt.Chart:

    def short_label(word):
        shorten_dict = {
            "change_1yr": "Change",
            "pct_change_1yr": "Change",
        }
        return shorten_dict[word]

    # For change column, we are missing everything prior to 2023
    # df = df.dropna(subset = y_col)

    # need flag for y_col >,<, 0, missing?
    # count function to how many agencies fall in those categories, then look at those agencies
    # present table

    x_label = [
        i
        for i in df.period_year_month.unique()
        if any(substring in i for substring in ["-01", "-03", "-06", "-09"])
    ]

    chart = (
        (
            alt.Chart(df)
            .mark_bar()
            .encode(
                x=alt.X(
                    "period_year_month:O", axis=alt.Axis(values=x_label), title="Date"
                ),
                y=alt.Y(y_col, title=short_label(y_col)),
                color=alt.Color(
                    color_col,
                    title="",
                    scale=alt.Scale(
                        range=cp.CALITP_CATEGORY_BRIGHT_COLORS
                        + cp.CALITP_CATEGORY_BOLD_COLORS
                    ),
                ),
                tooltip=["period_year_month", y_col, color_col, "rtpa_name"],
            )
            .properties(width=WIDTH, height=HEIGHT)
            .facet(color_col, columns=2, title="")
            .resolve_scale(x="shared", y="independent")
        )
        .properties(title=f"{labeling(y_col)} by {labeling(color_col)}")
        .interactive()
    )

    return chart

In [72]:
### initial pie code
def make_pie_chart(df, col, color_col):

    pie = (
        alt.Chart(df)
        .mark_arc(radius=150)
        .encode(
            theta=col,
            color=color_col,
            tooltip=["agency", "total_upt", "pct_of_total_upt"],
        )
        .properties(
            title=f"Total Unlinked Passenger Trips per agency in RTPA since {MIN_YEAR}",
        )
    )

    return pie

In [73]:
## moved to prod notebook 1/15/2025
# simple bar chart for total agencies and UPT
def total_upt_chart(df: pd.DataFrame, x_col: str, y_col: str, tool_tip: list):
    bar_chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(x_col).sort("-y"),
            y=alt.Y(y_col),
            tooltip=tool_tip,
            color=alt.Color(
                x_col,
                title="",
                scale=alt.Scale(
                    range=cp.CALITP_CATEGORY_BRIGHT_COLORS
                    + cp.CALITP_CATEGORY_BOLD_COLORS
                ),
            ),
        )
        .properties(
            title=f"Total Unlinked Passenger Trips per agency in RTPA since {MIN_YEAR}",
            width=WIDTH,
            height=HEIGHT,
        )
        .resolve_scale(y="independent")
        .interactive()
    )

    return bar_chart

### Totals

In [97]:
Markdown(
    f"""
Within {rtpa}:
- Number of Agencies/Operators: <b>{agency_count}</b>.
- Total Unlinked Passenger Trips since {MIN_YEAR}: <b>{total_upt:,}</b>.
- Individual agency/operator ridership breakdown:
"""
)


Within Metropolitan Transportation Commission:
- Number of Agencies/Operators: <b>91</b>.
- Total Unlinked Passenger Trips since 2018: <b>6,282,761,066.0</b>.
- Individual agency/operator ridership breakdown:


In [None]:
df_html = agency_agg_yr.reset_index(drop=True).to_html()

print(df_html)

In [None]:
df_markdown = agency_agg_yr.reset_index(drop=True).to_markdown()

print(df_markdown)

In [99]:
tooltip_list = ["agency", "total_upt", "pct_of_total_upt"]

total_upt_chart(agency_agg_yr, x_col="agency", y_col="total_upt", tool_tip=tooltip_list)

In [100]:
make_pie_chart(agency_agg_yr, col="total_upt", color_col="agency")

### Transit Agency

In [102]:
by_agency_long.head()

Unnamed: 0,ntd_id,agency,rtpa_name,period_year,period_month,period_year_month,upt,previous_y_m_upt,change_1yr,pct_change_1yr
0,90003,San Francisco Bay Area Rapid Transit District,Metropolitan Transportation Commission,2018,1,2018-01,10363070.0,0.0,0.0,inf
1,90003,San Francisco Bay Area Rapid Transit District,Metropolitan Transportation Commission,2018,2,2018-02,9826945.0,4162679.0,5664266.0,136.07
2,90003,San Francisco Bay Area Rapid Transit District,Metropolitan Transportation Commission,2018,3,2018-03,10958137.0,4241099.0,6717038.0,158.38
3,90003,San Francisco Bay Area Rapid Transit District,Metropolitan Transportation Commission,2018,4,2018-04,10623437.0,4514672.0,6108765.0,135.31
4,90003,San Francisco Bay Area Rapid Transit District,Metropolitan Transportation Commission,2018,5,2018-05,11198585.0,4677267.0,6521318.0,139.43


In [106]:
make_line_chart(by_agency_long, y_col="upt", color_col="agency")

Change in Unlinked Passenger Trips from the prior year. For example, July 2023's change would be the change in July 2023's reported values against July 2022's reported values.

In [107]:
# TEST OF BAR CHART WITH % CHANGE 1 YEAR
make_bar_chart(
    by_agency_long_no_zero_upt[by_agency_long_no_zero_upt["period_year"] >= 2018],
    y_col="pct_change_1yr",
    color_col="agency",
)

In [None]:
make_bar_chart(
    by_agency_long_no_zero_upt[by_agency_long_no_zero_upt["period_year"] >= 2024],
    y_col="change_1yr",
    color_col="agency",
)

In [None]:
make_bar_chart(by_agency_long_no_zero_upt, y_col="change_1yr", color_col="agency")

### Transit Mode

In [None]:
make_line_chart(by_mode_long, y_col="upt", color_col="Mode")

In [None]:
make_bar_chart(
    by_mode_long[by_mode_long["period_year"] >= 2024],
    y_col="change_1yr",
    color_col="Mode",
)

### Type of Service

In [None]:
make_line_chart(by_tos_long, y_col="upt", color_col="TOS")

In [None]:
make_bar_chart(
    by_tos_long[by_tos_long["period_year"] >= 2024], y_col="change_1yr", color_col="TOS"
)