In [None]:
import os
import shutil
import sys

import altair as alt
import annual_ridership_module
import gcsfs
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls

# from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, select, show_query

sys.path.append("../")  # up one level
from update_vars import GCS_FILE_PATH, MONTH, NTD_MODES, NTD_TOS, YEAR

# NOTE(review): this rebinds the GCS_FILE_PATH imported from update_vars just above;
# every later cell in this notebook reads the value assigned here.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Show every row and column when a DataFrame is displayed (no truncation).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

## 6/4/2025 Switch over from `ntd_id_to_rtpa_xwalk` to `dim_organizations.rtpa_name`
- dim_organizations now includes an rtpa_name column. Orgs without an rtpa_name were filled in accordingly.
- No need to use the old xwalk, so adjust code to integrate rtpa data from dim_orgs

## Work plan
1. edit the `produce_ntd_ridership_data_by_rtpa` function to:
    - replace old xwalk with `dim_orgs`
    - COMPLETE ~~adjust unmerged Los Angeles County rows to its own rtpa `Los Angeles County Department of Public Works`~~
    - COMPLETE ~~update SCAG agencies to their county CTC~~
2. **COMPLETE** ~~update the `deploy_portfolio_yaml` file to:~~
    - ~~get RTPA data from `dim_orgs`~~
    - ~~append `Los Angeles County Department of Public Works` to the list of unique RTPA names~~
3. Update total upt bar chart
    - make horizontal bars
    - add data labels to end of bars
    - move legend to bottom
    
    

### updated functions

In [None]:
# MIGRATED TO MODULE 6/18/2025
# to be used within the big function

def ntd_id_to_rtpa_crosswalk(split_scag: bool) -> pd.DataFrame:
    """
    Build the ntd_id to RTPA crosswalk from the warehouse.

    Reads the current rows of `dim_organizations` that have both an
    `ntd_id_2022` and an `rtpa_name`, then left-merges the headquarters-county
    bridge table so every organization also carries `county_geography_name`.

    Args:
        split_scag: when truthy, organizations whose county is one of the five
            counties in `socal_county_dict` get that county's transportation
            commission/authority as `rtpa_name`. When falsy, `rtpa_name` is
            left exactly as it came from `dim_organizations`.

    Returns:
        DataFrame with the columns name, ntd_id_2022, rtpa_name, mpo_name, key,
        county_geography_name and organization_key.
    """
    # County name -> county transportation commission/authority used as the
    # RTPA name when the SoCal counties are split out.
    socal_county_dict = {
        "Ventura": "Ventura County Transportation Commission",
        "Los Angeles": "Los Angeles County Metropolitan Transportation Authority",
        "San Bernardino": "San Bernardino County Transportation Authority",
        "Riverside": "Riverside County Transportation Commission",
        "Orange": "Orange County Transportation Authority",
    }

    # Get agencies and RTPA name.
    # `== True` below is a siuba column expression sent to the warehouse,
    # not a Python truth test.
    ntd_rtpa_orgs = (
        tbls.mart_transit_database.dim_organizations()
        >> filter(
            _._is_current == True,
            _.ntd_id_2022.notna(),
            _.rtpa_name.notna(),
        )
        >> select(
            _.name,
            _.ntd_id_2022,
            _.rtpa_name,
            _.mpo_name,
            _.key,
        )
        >> collect()
    )

    # Bridge table: organization key -> headquarters county name.
    bridge_counties = (
        tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
        >> filter(_._is_current == True)
        >> select(
            _.county_geography_name,
            _.organization_key,
        )
        >> collect()
    )

    # Left merge keeps every organization, with or without a county match.
    ntd_to_rtpa_crosswalk = ntd_rtpa_orgs.merge(
        bridge_counties,
        left_on="key",
        right_on="organization_key",
        how="left",
    )

    # Replace the RTPA name for rows in a SoCal county with the county's CTC.
    if split_scag:
        in_socal_county = ntd_to_rtpa_crosswalk["county_geography_name"].isin(
            socal_county_dict.keys()
        )
        # Map only the masked rows so the assignment never relies on the NaN
        # values that mapping every other county would produce.
        ntd_to_rtpa_crosswalk.loc[in_socal_county, "rtpa_name"] = (
            ntd_to_rtpa_crosswalk.loc[in_socal_county, "county_geography_name"].map(
                socal_county_dict
            )
        )

    return ntd_to_rtpa_crosswalk

In [None]:
## update function
min_year = 2018

def produce_annual_ntd_ridership_data_by_rtpa(min_year: str, split_scag: bool) -> pd.DataFrame:
    """
    Build the annual NTD ridership dataset with an RTPA attached to every row.

    Steps:
    1. Query `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt`,
       keeping rows whose `year` and `last_report_year` are >= `min_year` and
       whose `primary_uza_name` contains ", CA", "CA-NV" or "California Non-UZA".
    2. Sum `upt` per agency / ntd_id / UZA / reporter type / mode / service / year.
    3. Left-merge `ntd_id_to_rtpa_crosswalk(split_scag)` on
       `ntd_id` == `ntd_id_2022`.
    4. Give the LA County Department of Public Works ntd_ids their own RTPA
       name and mark those rows as merged.
    5. Add the change columns with `add_change_columns` and map the mode and
       type-of-service codes into `mode_full` / `service_full`.

    Args:
        min_year: earliest `year` and `last_report_year` to keep.
            NOTE(review): annotated `str`, but this notebook passes the int
            2018 - confirm the dtype of `year` in the warehouse table.
        split_scag: forwarded unchanged to `ntd_id_to_rtpa_crosswalk`.

    Returns:
        The merged DataFrame, including the `_merge` indicator column, the
        columns added by `add_change_columns`, `mode_full` and `service_full`.

    Raises:
        ValueError: if any ridership row is still unmatched to the crosswalk
            after the LA County Public Works override.
    """
    from annual_ridership_module import add_change_columns

    print("ingest annual ridership data from warehouse")

    ntd_service = (
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            _.year >= min_year,
            _.last_report_year >= min_year,
            _.primary_uza_name.str.contains(", CA")
            | _.primary_uza_name.str.contains("CA-NV")
            | _.primary_uza_name.str.contains("California Non-UZA"),
        )
        >> select(
            "source_agency",
            "agency_status",
            "legacy_ntd_id",
            "last_report_year",
            "mode",
            "ntd_id",
            "reporter_type",
            "reporting_module",
            "service",
            "uace_code",
            "primary_uza_name",
            "uza_population",
            "year",
            "upt",
        )
        >> collect()
    )

    # city and state are intentionally not part of the grouping keys.
    ntd_service = (
        ntd_service.groupby(
            [
                "source_agency",
                "agency_status",
                "ntd_id",
                "primary_uza_name",
                "reporter_type",
                "mode",
                "service",
                "last_report_year",
                "year",
            ]
        )
        .agg({"upt": "sum"})
        .sort_values(by="ntd_id")
        .reset_index()
    )

    print("create crosswalk from ntd_id_to_rtpa_crosswalk function")
    ntd_to_rtpa_crosswalk = ntd_id_to_rtpa_crosswalk(split_scag=split_scag)

    print("merge ntd data to crosswalk")
    # Merge on ntd_id only: agency name, reporter type and city can change or be
    # inconsistent between sources, which could cause fanout.
    ntd_data_by_rtpa = ntd_service.merge(
        ntd_to_rtpa_crosswalk,
        how="left",
        left_on=["ntd_id"],
        right_on="ntd_id_2022",
        indicator=True,
    )

    # ntd_ids that get the LA County Dept of Public Works RTPA name
    lacdpw_list = [
        "90269",
        "90270",
        "90272",
        "90273",
        "90274",
        "90275",
        "90276",
        "90277",
        "90278",
        "90279",
    ]

    # Override the RTPA name for these ids and flag them as merged so they do
    # not trip the unmerged-rows check below.
    ntd_data_by_rtpa.loc[
        ntd_data_by_rtpa["ntd_id"].isin(lacdpw_list), ["rtpa_name", "_merge"]
    ] = ["Los Angeles County Department of Public Works", "both"]

    print(ntd_data_by_rtpa._merge.value_counts())

    if (ntd_data_by_rtpa["_merge"] == "left_only").any():
        raise ValueError("There are unmerged rows to crosswalk")

    print("add `change_column` to data")
    # Use the name imported at the top of this function instead of reaching
    # back through the module-level `annual_ridership_module` global.
    ntd_data_by_rtpa = add_change_columns(ntd_data_by_rtpa)

    print("map mode and tos desc.")
    ntd_data_by_rtpa = ntd_data_by_rtpa.assign(
        mode_full=ntd_data_by_rtpa["mode"].map(NTD_MODES),
        service_full=ntd_data_by_rtpa["service"].map(NTD_TOS),
    )

    return ntd_data_by_rtpa

In [None]:
annual_repport_data_no_split = produce_annual_ntd_ridership_data_by_rtpa(min_year= min_year, split_scag=False)

In [None]:
annual_repport_data_w_split = produce_annual_ntd_ridership_data_by_rtpa(min_year= min_year, split_scag=True)

In [None]:
annual_repport_data_no_split

In [None]:
annual_repport_data_w_split.info()

In [None]:
# ntd_id_to_rtpa_crosswalk only accepts `split_scag`; keep the crosswalk in its own
# name so the report data above is not overwritten.
test_xwalk = ntd_id_to_rtpa_crosswalk(split_scag=True)

In [None]:
annual_repport_data_w_split.head()

In [None]:
# DataFrame.info() prints its report and returns None, so call it on its own
# instead of passing it to display() (which would render a stray "None").
test_xwalk.info()
display(
    test_xwalk["rtpa_name"].sort_values().value_counts(),
    test_xwalk.head(),
)
    

In [1]:
# MIGRATED TO .PY FILE 06/18/2025
# updated script to get RTPAs from dim_orgs
"""
Creates site .yml with chapters for each RTPA in the ntd/rtpa crosswalk, places it in the portfolio/sites directory

Yaml structure is not nested by district, it is just all RTPAs
in the navigation panel.
"""

import sys

sys.path.append("../")  # up one level

from pathlib import Path

import pandas as pd
from shared_utils import portfolio_utils
from update_vars import GCS_FILE_PATH
from annual_ridership_module import ntd_id_to_rtpa_crosswalk

PORTFOLIO_SITE_YAML = Path("../../portfolio/sites/ntd_annual_ridership_report.yml")

In [40]:
# read in rtpa data from dim_orgs
# Builds `df`: a one-column frame of the unique RTPA names in the crosswalk
# (with SCAG split into county CTCs) plus the LA County Public Works name,
# sorted by name.
if __name__ == "__main__":
    df = ntd_id_to_rtpa_crosswalk(split_scag=True)["rtpa_name"].drop_duplicates().to_frame()
    # single-row frame holding the extra name appended to the crosswalk names
    ladpw= pd.DataFrame({"rtpa_name":["Los Angeles County Department of Public Works"]})
    df = pd.concat([df, ladpw], ignore_index=True).sort_values(by="rtpa_name")

    # the yaml write is left disabled in this notebook
    # portfolio_utils.create_portfolio_yaml_chapters_no_sections(
    #     PORTFOLIO_SITE_YAML, chapter_name="rtpa", chapter_values=list(df.rtpa_name)
    # )

In [41]:
df

Unnamed: 0,rtpa_name
1,Alpine County Local Transportation Commission
29,Amador County Transportation Commission
9,Butte County Association of Governments
3,Calaveras Council of Governments
32,Colusa County Transportation Commission
40,Council of San Benito County Governments
19,Del Norte Local Transportation Commission
33,El Dorado County Transportation Commission
31,Fresno Council of Governments
34,Glenn County Transportation Commission


---

In [None]:
## update function
def produce_annual_ntd_ridership_data_by_rtpa(min_year: str):
    """
    Scratch stub: the body only imports `add_change_columns` and returns None.

    The intended steps (ingest ridership data, filter for CA agencies, merge in the
    ntd_id to RTPA crosswalk, aggregate by agency, mode and TOS, calculate change
    in UPT) are run as separate top-level cells below instead of inside this function.

    NOTE(review): running this cell rebinds the name of the full
    `produce_annual_ntd_ridership_data_by_rtpa(min_year, split_scag)` function
    defined earlier in this notebook; calls that pass `split_scag` will raise
    TypeError afterwards.
    """
    from annual_ridership_module import add_change_columns

In [None]:
min_year = 2018

In [None]:
    # Scratch copy of the warehouse query; the cell is kept indented
    # (presumably pasted from a function body - TODO confirm).
    # Reads `min_year` from the notebook namespace.
    print("ingest annual ridership data from warehouse")
    
    ntd_service =(
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            # state / city filters are disabled here
            #_.state.str.contains("CA") | 
            #_.state.str.contains("NV"), # to get lake Tahoe Transportation back
            _.year >= min_year,
                  #_.city != None,
            _.last_report_year >= min_year,
            # keep rows whose primary UZA name matches any of these substrings
            _.primary_uza_name.str.contains(", CA") | 
            _.primary_uza_name.str.contains("CA-NV") |
            _.primary_uza_name.str.contains("California Non-UZA") 
            #_.primary_uza_name.str.contains("El Paso, TX--NM") # something about Paso 
        )
        >> select(
            'source_agency',
            'agency_status',
            #'city',
            'legacy_ntd_id',
            'last_report_year',
            'mode',
            'ntd_id',
            'reporter_type',
            'reporting_module',
            'service',
            #'state',
            'uace_code',
            'primary_uza_name',
            'uza_population',
            'year',
            'upt',
        )
        >> collect())

In [None]:
# check
display(
    ntd_service.head(),
    len(ntd_service),
    ntd_service["year"].unique(),
    ntd_service["last_report_year"].unique(),
)

In [None]:
# Sum `upt` per agency / ntd_id / UZA / reporter type / mode / service / year.
# NOTE: overwrites `ntd_service` with the aggregated frame, so the raw query
# result from the earlier cell is no longer available after this runs.
ntd_service = (
    ntd_service.groupby(
        [
            "source_agency",
            "agency_status",
            #"city",
            #"state",
            "ntd_id",
            "primary_uza_name",
            "reporter_type",
            "mode",
            "service",
            "last_report_year",
            "year",
        ]
    )
    .agg({"upt": "sum"})
    .sort_values(by="ntd_id")
    .reset_index()
)

In [None]:
# check
display(
    ntd_service.head(),
    ntd_service["year"].unique(),
    ntd_service["last_report_year"].unique(),
)

In [None]:
ntd_service[ntd_service["ntd_id"]=="90003"].sort_values(by=["year","mode","service"])

In [None]:
# County name -> county transportation commission/authority. Used below to replace
# `rtpa_name` for crosswalk rows whose `county_geography_name` is one of these keys.
socal_county_dict = {
    "Ventura": "Ventura County Transportation Commission",
    "Los Angeles": "Los Angeles County Metropolitan Transportation Authority",
    "San Bernardino": "San Bernardino County Transportation Authority",
    "Riverside": "Riverside County Transportation Commission",
    "Orange": "Orange County Transportation Authority",
}

In [None]:
# Previous approach (parquet crosswalk), kept for reference:
#     print("read in new `ntd_id_to_rtpa_all_reporter_types` crosswalk")

#     ntd_to_rtpa_crosswalk = pd.read_parquet(f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet")

print("read in RTPA data from dim_organizations")
# current organizations that have both an ntd_id_2022 and an rtpa_name
ntd_to_rtpa_crosswalk = (
    tbls.mart_transit_database.dim_organizations()
    >> filter(
        _._is_current == True,
        _.ntd_id_2022.notna(),
        _.rtpa_name.notna(),
    )
    >> select(_.name, _.ntd_id_2022, _.rtpa_name, _.mpo_name, _.key)
    >> collect()
)

# need to join bridge org county geo to get org counties
bridge_counties = (
    tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
    >> filter(_._is_current == True)
    >> select(_.county_geography_name, _.organization_key)
    >> collect()
)

# left merge: every organization is kept, county columns are NaN when no bridge row matches
ntd_to_rtpa_crosswalk = ntd_to_rtpa_crosswalk.merge(
    bridge_counties, left_on="key", right_on="organization_key", how="left"
)

In [None]:
ntd_to_rtpa_crosswalk["county_geography_name"].sort_values().unique()

In [None]:
ntd_to_rtpa_crosswalk[
    ntd_to_rtpa_crosswalk["county_geography_name"].isin(socal_county_dict.keys())
]["rtpa_name"].unique() # currently does not include Orange, San Bern, Riverside CTC

In [None]:
# filter for counties in the socal dict, then use map
# ntd_to_rtpa_crosswalk[
#     ntd_to_rtpa_crosswalk["county_geography_name"].isin(list(socal_county_dict.keys()))
# ]["rtpa_name"] = ntd_to_rtpa_crosswalk[
#     ntd_to_rtpa_crosswalk["county_geography_name"].isin(list(socal_county_dict.keys()))
# ][
#     "county_geography_name"
# ].map(
#     socal_county_dict
# )

In [None]:
# replace the rtpa_name of SoCal-county rows with the county CTC from the dictionary
# (the right-hand side is aligned on index, so only the masked rows are written)
ntd_to_rtpa_crosswalk.loc[
    ntd_to_rtpa_crosswalk["county_geography_name"].isin(socal_county_dict.keys()),
    "rtpa_name",
] = ntd_to_rtpa_crosswalk["county_geography_name"].map(socal_county_dict)

In [None]:
# check
display(
    ntd_to_rtpa_crosswalk.head(),
    ntd_to_rtpa_crosswalk[
        ntd_to_rtpa_crosswalk["county_geography_name"].isin(socal_county_dict.keys())
    ]["rtpa_name"].unique(),
    # ntd_to_rtpa_crosswalk[
    #     ntd_to_rtpa_crosswalk["county_geography_name"].isin(socal_county_dict.keys())
    # ],
)

In [None]:
lacdpw_list = [
    "90269",
    "90270",
    "90272",
    "90273",
    "90274",
    "90275",
    "90276",
    "90277",
    "90278",
    "90279",
]

In [None]:
ntd_to_rtpa_crosswalk[ntd_to_rtpa_crosswalk["ntd_id_2022"].isin(lacdpw_list)] # crosswalk does not include LADPW agencies

In [None]:
print("merge ntd data to crosswalk")

# Left merge keeps every ridership row; `indicator=True` adds the `_merge` column
# that the following cells use to find rows with no crosswalk match.
ntd_data_by_rtpa = ntd_service.merge(
    ntd_to_rtpa_crosswalk,
    how="left",
    left_on=[
        "ntd_id",
        # "agency", "reporter_type", "city" # sometime agency name, reporter type and city name change or are inconsistent, causing possible fanout
    ],
    right_on="ntd_id_2022",
    indicator=True,
)  # .rename(
# (rename from the parquet-crosswalk version, disabled here)
# columns={
#     "actual_vehicles_passenger_car_revenue_hours":"vrh",
#     "actual_vehicles_passenger_car_revenue_miles":"vrm",
#     "unlinked_passenger_trips_upt":"upt",
#     'agency_name_x':"agency_name",
#     'agency_status_x':"agency_status",
#     'city_x':"city",
#     'state_x':"state",
#     'reporter_type_x':"reporter_type",
#     "agency_name_y":"xwalk_agency_name",
#     'reporter_type_y':"xwalk_reporter_type",
#     'agency_status_y':"xwalk_agency_status",
#     'city_y':"xwalk_city",
#     'state_y':"xwalk_state",
# }
# )

In [None]:
# check
display(
    ntd_data_by_rtpa["_merge"].value_counts(),
    ntd_data_by_rtpa.head()
)


In [None]:
# True when the unmerged ntd_ids are exactly the LA County Public Works ntd_ids.
# Compared as sets so the result does not depend on row order.
set(ntd_data_by_rtpa.loc[ntd_data_by_rtpa["_merge"] == "left_only", "ntd_id"]) == set(lacdpw_list)


In [None]:
# adding "Los Angeles County Department of Public Works" RTPA Name
# Two options were considered: match on the agency name containing
# "Los Angeles County Department of Public Works", or match on ntd_id.
# The ntd_id list below is the approach used.
lacdpw_list = [
    "90269",
    "90270",
    "90272",
    "90273",
    "90274",
    "90275",
    "90276",
    "90277",
    "90278",
    "90279",
]

# update LACDPW rows for rtpa and merge indicator
# (setting `_merge` to "both" keeps these rows from counting as unmerged)
ntd_data_by_rtpa.loc[
    ntd_data_by_rtpa["ntd_id"].isin(lacdpw_list), ["rtpa_name", "_merge"]
] = ["Los Angeles County Department of Public Works", "both"]

In [None]:
# check
display(
    #ntd_data_by_rtpa[ntd_data_by_rtpa._merge == "left_only"],
    ntd_data_by_rtpa[ntd_data_by_rtpa["ntd_id"].isin(lacdpw_list)],
)

In [None]:
    # Scratch copy of the final steps; the cell is kept indented
    # (presumably pasted from a function body - TODO confirm).
    print(ntd_data_by_rtpa._merge.value_counts())
        
    # stop if any ridership row still has no crosswalk match
    if len(ntd_data_by_rtpa[ntd_data_by_rtpa._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    print("add `change_column` to data")
    ntd_data_by_rtpa = annual_ridership_module.add_change_columns(ntd_data_by_rtpa)
    
    print("map mode and tos desc.")
    # map the mode / type-of-service codes through NTD_MODES / NTD_TOS
    ntd_data_by_rtpa = ntd_data_by_rtpa.assign(
        mode_full = ntd_data_by_rtpa["mode"].map(NTD_MODES),
        service_full = ntd_data_by_rtpa["service"].map(NTD_TOS)
    )
    
    # return ntd_data_by_rtpa

## Double checking initial annual report data
- ~~why am i only seeing 2022-2023 data in the warehouse as of 6/5/2025?~~
- fixed it. excluded city, state and agency_name from my query

In [None]:
initial_data = pd.read_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

initial_data["year"].value_counts()

In [None]:
ntd_data_by_rtpa["year"].value_counts()



In [None]:
def group_by_agency(df):
    """
    Sum `upt` per `agency_name` and compute each agency's share of the total.

    Takes the 'by_agency_long' df. Returns one row per agency with `total_upt`
    (int64) and `pct_of_total_upt` (percent, rounded to 2 decimals), sorted by
    `total_upt` descending. To be used in the pie chart.
    """
    totals = df.groupby("agency_name", as_index=False).agg(total_upt=("upt", "sum"))

    # percent share is computed from the summed values before the integer cast
    share_pct = totals["total_upt"] / totals["total_upt"].sum() * 100

    return totals.assign(
        total_upt=totals["total_upt"].astype("int64"),
        pct_of_total_upt=share_pct.round(decimals=2),
    ).sort_values(by="total_upt", ascending=False)

In [None]:
from calitp_data_analysis import calitp_color_palette as cp

In [None]:
# chart size in pixels, read by the chart functions defined below
WIDTH = 300
HEIGHT = 150

# RTPA names of different sizes to test the charts with
sacog = "Sacramento Area Council of Governments"  # med
scag = "Southern California Association of Governments"  # XL
mtc = "Metropolitan Transportation Commission"  # large
sjcog = "San Joaquin Council of Governments"  # small

# aggregate one RTPA (currently SCAG) by agency, then compute each agency's share
agency_cols = ["ntd_id", "agency_name", "RTPA"]
by_agency_long = annual_ridership_module.sum_by_group(
    initial_data[initial_data["RTPA"] == scag], agency_cols
)
agency_agg_yr = group_by_agency(by_agency_long)

In [None]:
## Update total_upt_chart function
def total_upt_chart(df: pd.DataFrame, x_col: str, y_col: str, tool_tip: list):
    """
    Vertical bar chart of `y_col` per `x_col`, bars sorted by descending y.

    Bars are colored by `x_col` using the Cal-ITP bright + bold category
    palettes, sized with the notebook-level WIDTH / HEIGHT, and the chart is
    interactive. `tool_tip` is the list of columns shown on hover.
    """
    palette = cp.CALITP_CATEGORY_BRIGHT_COLORS + cp.CALITP_CATEGORY_BOLD_COLORS
    bar_color = alt.Color(x_col, title="", scale=alt.Scale(range=palette))

    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X(x_col).sort("-y"),
        y=alt.Y(y_col),
        tooltip=tool_tip,
        color=bar_color,
    )

    sized = bars.properties(
        title="Total Annual Unlinked Passenger Trips per Reporter in RTPA since 2018",
        width=WIDTH,
        height=HEIGHT,
    )

    return sized.resolve_scale(y="independent").interactive()

In [None]:
def total_upt_chart_v2(df: pd.DataFrame, cat: str, con: str, tool_tip: list):
    """
    Horizontal bar chart of `con` (continuous) per `cat` (category) with data labels.

    Bars are sorted by descending x, colored by `cat`, and the legend sits at
    the bottom in 3 columns. A text layer prints the `con` value just past the
    end of each bar. `tool_tip` is the list of columns shown on hover.
    """
    palette = cp.CALITP_CATEGORY_BRIGHT_COLORS + cp.CALITP_CATEGORY_BOLD_COLORS
    bar_color = alt.Color(cat, title="", scale=alt.Scale(range=palette)).legend(
        orient="bottom", columns=3
    )

    # Bar layer
    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X(con),
        y=alt.Y(cat).sort("-x"),
        tooltip=tool_tip,
        color=bar_color,
    )

    # Label layer: left-aligned text nudged 3px right of each bar's end.
    # Its own y axis is hidden; the bar layer draws the visible one.
    labels = (
        alt.Chart(df)
        .mark_text(align="left", dx=3, dy=0, color="black", fontSize=11)
        .encode(
            x=alt.X(con),
            y=alt.Y(cat, sort="-x", axis=None),
            text=alt.Text(con),  # label shows the value of `con`
        )
    )

    layered = bars + labels
    sized = layered.properties(
        title="Total Annual Unlinked Passenger Trips per Reporter in RTPA since 2018",
        width=WIDTH,
        height=HEIGHT,
    )

    return sized.resolve_scale(y="independent").interactive()

In [None]:
tooltip_list = ["agency_name", "total_upt", "pct_of_total_upt"]

total_upt_chart_v2(
    agency_agg_yr, cat="agency_name", con="total_upt", tool_tip=tooltip_list
)

## update deploy_portfolio_yaml

In [None]:
# read in rtpa data from dim_orgs
# if name == main section
# Result `df` is a Series of unique, sorted RTPA names with a fresh 0..n-1 index.
# Unlike the crosswalk query, organizations without an ntd_id_2022 are kept here.
df = (
    (
        tbls.mart_transit_database.dim_organizations()
        >> filter(
            _._is_current == True,
            # _.ntd_id_2022.notna(),
            _.rtpa_name.notna(),
        )
        >> select(_.name, _.ntd_id_2022, _.rtpa_name, _.mpo_name)
        >> collect()
    )["rtpa_name"]
    .sort_values()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
df.info()

In [None]:
# add new row to the rtpa df
# guarded so that re-running this cell does not append the name a second time
lacdpw_name = "Los Angeles County Department of Public Works"
if lacdpw_name not in df.values:
    df.loc[len(df)] = lacdpw_name

In [None]:
df  # GTG

# Work plan

## COMPLETE ~~Need List of all annual reporters~~
- Full, Reduced, Rural reporters all submit an annual Report to NTD
- used `dim_annual_service_agencies`

## COMPLETE ~~Need to assign rural and reduced annual reporters to RTPAs~~ 
- Full reporters are already mapped to RTPAs via the monthly report
- but reduced and rural reporters need to be mapped to RTPAs
- sjoined a map of CA census designated places to RTPA map, then merged with `dim_annual_service_agencies` to attach RTPA info

## COMPLETE ~~Export processed data to parquet~~
- to be used for aggregation

## COMPLETE ~~Make new ntd_id to RTPA crosswalk~~
- to be used for future analyses and merges

## COMPLETE ~~Import formulas from monthly report for aggregation~~
- modified `produce_annual_ntd_ridership_data_by_rtpa`

## COMPLETE ~~Test using data from new warehouse table `fct_service_data_and_operating_expenses_time_series_by_mode_upt`~~
- https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.fct_service_data_and_operating_expenses_time_series_by_mode_upt

## How to export data to the public GCS
- Also need to explain where the data comes from. The statutory requirements state to get the data from NTD? But we ingest the NTD data: the data originates from the NTD API, and we clean/transform it.
- rip the `save rtpa outputs` code from the monthly report.
---

## read in `fct_service_data_and_operating_expenses_time_series_by_mode_upt`

In [None]:
# new warehouse table
# includes upt, mode, tos and report year starting from 2018
# contains historical,inactive agencies

# Earlier version of the query: filters on state and city as well as UZA name,
# and compares `year` against the string "2018".
get_ntd_time_series = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
    >> filter(
        _.state.str.contains("CA")
        | _.state.str.contains("NV"),  # to get lake Tahoe Transportation back
        _.year >= "2018",
        _.city != None,
        _.primary_uza_name.str.contains(", CA")
        | _.primary_uza_name.str.contains("CA-NV")
        | _.primary_uza_name.str.contains("California Non-UZA")
        | _.primary_uza_name.str.contains("El Paso, TX--NM"),  # something about Paso
    )
    >> select(
        "agency_name",
        "agency_status",
        "city",
        "legacy_ntd_id",
        "mode",
        "ntd_id",
        "reporter_type",
        "reporting_module",
        "service",
        "state",
        "uace_code",
        "primary_uza_name",
        "uza_population",
        "year",
        "upt",
    )
    >> collect()
)

# what does group by look like
# sums `upt` per agency / city / state / ntd_id / UZA / reporter type / mode / service / year
ntd_time_series = (
    get_ntd_time_series.groupby(
        [
            "agency_name",
            "agency_status",
            "city",
            "state",
            "ntd_id",
            "primary_uza_name",
            "reporter_type",
            "mode",
            "service",
            "year",
        ]
    )
    .agg({"upt": "sum"})
    .sort_values(by="ntd_id")
    .reset_index()
)

# DataFrame.info() prints its report and returns None, so call it on its own
# instead of passing it to display() (which would render stray "None" outputs).
get_ntd_time_series.info()
ntd_time_series.info()
display(
    ntd_time_series["state"].value_counts(),
    ntd_time_series["primary_uza_name"].value_counts(),
)

In [None]:
print(NTD_MODES, NTD_TOS)

# Add mode and tos desc columns by mapping the codes through NTD_MODES / NTD_TOS
ntd_time_series = ntd_time_series.assign(
    mode_full=ntd_time_series["mode"].map(NTD_MODES),
    service_full=ntd_time_series["service"].map(NTD_TOS),
)
# show the resulting column names
ntd_time_series.columns

In [None]:
ntd_time_series[~ntd_time_series["primary_uza_name"].str.contains(", CA")][
    "primary_uza_name"
].value_counts()
# who has uza in texas new mex?!

In [None]:
ntd_time_series[
    ntd_time_series["primary_uza_name"].str.contains("El Paso, TX--NM")
]  # just 1 agency, PAso Robles Transit Services

In [None]:
# how many cities in ntd_time_serires are there?
# wonder if all the cities will be in the crosswalk
ntd_time_series["city"].nunique()  # 183. should be enough

## See `explore_rtpa_ntd_crosswalk` notebook to see how the new rtpa to ntd id crosswalk was made

## read in crosswalk from gcs

In [None]:
test_data = pd.read_parquet(f"{GCS_FILE_PATH}annual_report_data_2022-2023.parquet")
rtpa_ntd_crosswalk = pd.read_parquet(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"
)

display(test_data.head(), rtpa_ntd_crosswalk.head())

## test merge of `ntd_time_series` with `rtpa ntd crosswalk`

In [None]:
# left merge keeps every time-series row; `_merge` flags rows without a crosswalk match
ntd_time_series_rtpa = ntd_time_series.merge(
    rtpa_ntd_crosswalk, on="ntd_id", how="left", indicator=True
)
# DataFrame.info() prints and returns None, so it is called outside display()
ntd_time_series_rtpa.info()
display(ntd_time_series_rtpa["_merge"].value_counts())

In [None]:
# gtg
ntd_time_series_rtpa.head()

## Testing the `add_change_column` from module
- use on `ntd_time_series_rtpa` to add new column
1. previous UPT from prev year/ntd_id/mode/tos
2. change upt % from prev year/ntd_id/mode/tos


In [None]:
sort_cols2 = [
    "ntd_id",
    "year",
    "service",
    "mode",
]  # got the order correct with ["period_month", "period_year"]! sorted years with grouped months
group_cols2 = ["ntd_id", "mode", "service"]

# df[["period_year","period_month"]] = df[["period_year","period_month"]].astype(int)
# ntd_time_series_rtpa.sort_values(sort_cols2).groupby(group_cols2)["upt"].apply(lambda x: x.shift(1))

In [None]:
ntd_time_series_rtpa.sort_values(sort_cols2).head()

In [None]:
ntd_time_series_rtpa.sort_values(sort_cols2).groupby(group_cols2)["upt"].apply(
    lambda x: x.shift(1)
).head()

In [None]:
# Previous-year UPT within each ntd_id / mode / service group.
# GroupBy.shift keeps the original row index, so `assign` aligns row by row.
# `.apply(lambda x: x.shift(1))` can return a group-keyed index on newer pandas,
# which would not line up with the frame's index.
test = ntd_time_series_rtpa.assign(
    previous_y_upt=(
        ntd_time_series_rtpa.sort_values(sort_cols2)
        .groupby(group_cols2)["upt"]
        .shift(1)
    )
)

In [None]:
test[test["ntd_id"] == "90253"].sort_values(by=["year", "mode", "service"])

In [None]:
ntd_rtpa_change_cols = annual_ridership_module.add_change_columns(ntd_time_series_rtpa)

In [None]:
ntd_rtpa_change_cols[ntd_rtpa_change_cols["ntd_id"] == "90253"].sort_values(
    by=["year", "service", "mode"]
)

## Full Test `produce_annual_ntd_ridership_data_by_rtpa` function
- need to update to new NTD data table.


In [None]:
check_id = ["90253", "90227", "90259", "90286"]

display(
    ntd_time_series_rtpa[ntd_time_series_rtpa["ntd_id"].isin(check_id)][
        "city_x"
    ].unique(),
    rtpa_ntd_crosswalk[rtpa_ntd_crosswalk["ntd_id"].isin(check_id)]["city"].unique(),
)

# cities match, no need for the dictionary update anymore in the function

In [None]:
def produce_annual_ntd_ridership_data_by_rtpa():
    """
    Assemble annual NTD ridership rows and attach RTPA info from the parquet crosswalk.

    Queries `fct_service_data_and_operating_expenses_time_series_by_mode_upt`
    (state contains CA or NV, year >= "2018", city not null, UZA name matching
    the CA patterns or "El Paso, TX--NM"), sums `upt` per agency / mode /
    service / year, left-merges the `ntd_id_rtpa_crosswalk_all_reporter_types`
    parquet on `ntd_id`, then adds the change columns and the mode and
    type-of-service descriptions.

    Raises ValueError when any ridership row has no crosswalk match.
    """
    from annual_ridership_module import add_change_columns

    # columns pulled from the warehouse table
    keep_cols = (
        "agency_name",
        "agency_status",
        "city",
        "legacy_ntd_id",
        "mode",
        "ntd_id",
        "reporter_type",
        "reporting_module",
        "service",
        "state",
        "uace_code",
        "primary_uza_name",
        "uza_population",
        "year",
        "upt",
    )

    # keys that `upt` is summed over
    group_cols = [
        "agency_name",
        "agency_status",
        "city",
        "state",
        "ntd_id",
        "primary_uza_name",
        "reporter_type",
        "mode",
        "service",
        "year",
    ]

    # post-merge renames: `_x` columns keep the plain name, `_y` columns get an `xwalk_` prefix
    tidy_names = {
        "actual_vehicles_passenger_car_revenue_hours": "vrh",
        "actual_vehicles_passenger_car_revenue_miles": "vrm",
        "unlinked_passenger_trips_upt": "upt",
        "agency_name_x": "agency_name",
        "agency_status_x": "agency_status",
        "city_x": "city",
        "state_x": "state",
        "reporter_type_x": "reporter_type",
        "agency_name_y": "xwalk_agency_name",
        "reporter_type_y": "xwalk_reporter_type",
        "agency_status_y": "xwalk_agency_status",
        "city_y": "xwalk_city",
        "state_y": "xwalk_state",
    }

    print("ingest annual ridership data from warehouse")

    warehouse_rows = (
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            # NV is included to get Lake Tahoe Transportation back
            _.state.str.contains("CA") | _.state.str.contains("NV"),
            _.year >= "2018",
            _.city != None,
            # the El Paso pattern is there for Paso Robles
            _.primary_uza_name.str.contains(", CA")
            | _.primary_uza_name.str.contains("CA-NV")
            | _.primary_uza_name.str.contains("California Non-UZA")
            | _.primary_uza_name.str.contains("El Paso, TX--NM"),
        )
        >> select(*keep_cols)
        >> collect()
    )

    upt_totals = (
        warehouse_rows.groupby(group_cols)
        .agg({"upt": "sum"})
        .sort_values(by="ntd_id")
        .reset_index()
    )

    print("read in new `ntd_id_to_rtpa_all_reporter_types` crosswalk")

    crosswalk = pd.read_parquet(
        f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"
    )

    print("merge ntd data to crosswalk")

    # Merge on ntd_id only: agency name, reporter type and city can change or be
    # inconsistent, causing possible fanout.
    merged = upt_totals.merge(
        crosswalk, how="left", on="ntd_id", indicator=True
    ).rename(columns=tidy_names)

    print(merged._merge.value_counts())

    if (merged._merge == "left_only").any():
        raise ValueError("There are unmerged rows to crosswalk")

    print("add `change_column` to data")
    with_change = add_change_columns(merged)

    print("map mode and tos desc.")
    return with_change.assign(
        mode_full=with_change["mode"].map(NTD_MODES),
        service_full=with_change["service"].map(NTD_TOS),
    )


# will add step to export data to gcs in final script

In [None]:
annual_ridership_report_data = produce_annual_ntd_ridership_data_by_rtpa()

In [None]:
# Stay silent when both frames have the same shape; otherwise print both
# shapes and both column indexes for comparison.
if annual_ridership_report_data.shape != ntd_time_series_rtpa.shape:
    print(
        f"""
{annual_ridership_report_data.shape} vs {ntd_time_series_rtpa.shape}.
{annual_ridership_report_data.columns}
{ntd_time_series_rtpa.columns}
"""
    )

# gtg, ntd_time_series_rtpa doesnt apply the change 1 year calculations, but the lengths are the same

In [None]:
annual_ridership_report_data.columns

In [None]:
annual_ridership_report_data["year"].unique()

In [None]:
annual_ridership_report_data[
    annual_ridership_report_data["agency_name"].str.contains("Sac")
].head()

### export report data to GCS, as parquet,
- to be read in by portfolio

In [None]:
# annual_ridership_report_data.to_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

## read in data from GCS to make sure things work 

In [None]:
report_data_check = pd.read_parquet(
    f"{GCS_FILE_PATH}annual_ridership_report_data.parquet"
)

In [None]:
report_data_check.info()

In [None]:
report_data_check["mode_full"].value_counts()

In [None]:
report_data_check[report_data_check["mode_full"].isna()]["mode"].value_counts()

## Test of `save_rtpa_outs` function 

In [None]:
import gcsfs
from segment_speed_utils.project_vars import PUBLIC_GCS

# GCS filesystem client; `save_rtpa_outputs` below calls `fs.upload` on it to push the zip to GCS.
fs = gcsfs.GCSFileSystem()

In [None]:
def save_rtpa_outputs(
    df: pd.DataFrame, year: int, month: str, upload_to_public: bool = False
) -> None:
    """
    Write one Excel workbook per RTPA, zip the workbooks, and upload the zip to GCS.

    For every unique value of `df["RTPA"]`, a workbook is written to
    `./{year}_{month}/<rtpa_snakecase>.xlsx` with these sheets, in order:
    `README` (the cover sheet template), `RTPA Ridership Data` (that RTPA's rows),
    and four aggregation sheets produced by `annual_ridership_module.sum_by_group`
    (by agency, mode, type of service, and reporter type).
    The folder is then zipped to `./{year}_{month}_annual_report_data.zip` and
    uploaded under `GCS_FILE_PATH`.

    Parameters
    ----------
    df : pd.DataFrame
        Report data. Must contain `RTPA`, `ntd_id`, `agency_name`, `mode`, `service`,
        `reporter_type`, plus the `_merge` and `xwalk_*` columns that are dropped
        before export.
    year : int
        Used in the output folder and zip names.
    month : str
        Used in the output folder and zip names.
    upload_to_public : bool, default False
        When True, the zip is also uploaded under `{PUBLIC_GCS}ntd_annual_ridership/`.

    Notes
    -----
    Relies on module-level `fs`, `GCS_FILE_PATH` and `PUBLIC_GCS`, and on
    `./annual_report_cover_sheet_template.xlsx` existing in the working directory.
    """
    out_dir = f"./{year}_{month}"
    zip_base = f"./{year}_{month}_annual_report_data"
    zip_path = f"{zip_base}.zip"

    # Make sure the output folder exists so the function also works on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # The README cover sheet is identical for every RTPA, so read it from disk once.
    cover_sheet = pd.read_excel(
        "./annual_report_cover_sheet_template.xlsx",
        index_col="**NTD Annual Ridership by RTPA**",
    )

    # Merge/crosswalk bookkeeping columns that should not appear in the exported data.
    xwalk_cols = [
        "_merge",
        "xwalk_agency_name",
        "xwalk_reporter_type",
        "xwalk_agency_status",
        "xwalk_city",
        "xwalk_state",
    ]

    # (sheet name, grouping columns) for each aggregation sheet, in write order.
    aggregations = [
        ("Aggregated by Agency", ["ntd_id", "agency_name", "RTPA"]),
        ("Aggregated by Mode", ["mode", "RTPA"]),
        ("Aggregated by TOS", ["service", "RTPA"]),
        ("Aggregate by Reporter Type", ["reporter_type", "RTPA"]),
    ]

    print("creating individual RTPA excel files")

    for rtpa in df["RTPA"].unique():

        print(f"creating excel file for: {rtpa}")

        # Filename should be snakecase; replacing "/" fixes names such as
        # 'Lake County/City Area Planning Council'.
        rtpa_snakecase = rtpa.replace(" ", "_").replace("/", "_").lower()
        workbook_path = f"{out_dir}/{rtpa_snakecase}.xlsx"

        # Filter to this RTPA once and reuse the subset for every sheet.
        # NOTE(review): assumes `sum_by_group` does not modify its input frame -- confirm.
        rtpa_rows = df[df["RTPA"] == rtpa]

        # Column headers are title-cased automatically (e.g. `ntd_id` -> `Ntd Id`);
        # an explicit rename map was sketched previously but never implemented.
        rtpa_data = (
            rtpa_rows.sort_values("ntd_id")
            .drop(columns=xwalk_cols)
            .rename(columns=lambda x: x.replace("_", " ").title().strip())
        )

        # Build every aggregation before opening the workbook, so a failure here
        # does not leave a half-written file behind. Each call gets its own list copy.
        aggregated_sheets = [
            (
                sheet_name,
                annual_ridership_module.sum_by_group(rtpa_rows, list(group_cols)),
            )
            for sheet_name, group_cols in aggregations
        ]

        # Write all sheets in a single pass rather than writing the README,
        # closing the file, and reopening it in append mode.
        with pd.ExcelWriter(workbook_path) as writer:
            cover_sheet.to_excel(writer, sheet_name="README")
            rtpa_data.to_excel(writer, sheet_name="RTPA Ridership Data", index=False)
            for sheet_name, aggregated in aggregated_sheets:
                aggregated.to_excel(writer, sheet_name=sheet_name, index=False)

    print("zipping all excel files")

    shutil.make_archive(zip_base, "zip", f"{year}_{month}")

    print("Zipped folder")

    print("Upload to private GCS")
    fs.upload(
        zip_path,
        f"{GCS_FILE_PATH}{year}_{month}_annual_report_data.zip",
    )

    if upload_to_public:
        fs.upload(
            zip_path,
            f"{PUBLIC_GCS}ntd_annual_ridership/{year}_{month}_annual_report_data.zip",
        )

        print("Uploaded to public GCS")

In [None]:
def remove_local_outputs(year: int, month: str) -> None:
    """
    Delete the local export artifacts for a given year/month.

    Removes the `{year}_{month}/` folder (and everything in it) and the
    `{year}_{month}_annual_report_data.zip` archive from the working directory.
    Paths that do not exist are skipped, so the cleanup can be re-run safely.
    """
    output_dir = f"{year}_{month}/"
    zip_file = f"{year}_{month}_annual_report_data.zip"

    # Only remove what is actually there; other errors (e.g. permissions) still raise.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

    if os.path.isfile(zip_file):
        os.remove(zip_file)

In [None]:
## add these steps to `if __name__ == "__main__":` in the final script

# exist_ok=True so re-running this cell does not fail when the folder already exists.
# Files left over from an earlier run stay in the folder and will be zipped again.
os.makedirs(f"./{YEAR}_{MONTH}/", exist_ok=True)

df = pd.read_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

save_rtpa_outputs(df, YEAR, MONTH, upload_to_public=False)

In [None]:
# Clean up: delete the local `{YEAR}_{MONTH}/` folder and the zip archive after the upload.
remove_local_outputs(YEAR, MONTH)