## Update add_change_columns

In [None]:
def add_change_columns(
    df: pd.DataFrame,
    sort_cols,
    group_cols,
    change_col
) -> pd.DataFrame:
    """
    Add previous-period UPT, absolute change and percent change columns.

    Works with long-format ridership data such as the warehouse
    `dim_monthly_ntd_ridership_with_adjustments` table.

    The frame is sorted by `sort_cols` and grouped by `group_cols`; the `upt`
    of the preceding row in each group is written to a column named
    `change_col`. "Previous" therefore means "previous row of the same group in
    `sort_cols` order", so the two lists must be chosen such that the preceding
    row really is the intended comparison period.

    Args:
        df: long-format ridership data containing an `upt` column plus every
            column named in `sort_cols` and `group_cols`.
        sort_cols: columns used to order the rows before shifting.
        group_cols: columns identifying one series; the shift never crosses
            a group boundary.
        change_col: name of the column that receives the previous-period UPT
            (e.g. "previous_y_m_upt" or "previous_y_upt").

    Returns:
        A DataFrame with three added columns: `change_col`, `change_1yr`
        (`upt` minus `change_col`) and `pct_change_1yr` (added by
        `get_percent_change`).

    Note:
        When present, `period_year` / `period_month` are cast to int on the
        frame that was passed in, as before.
    """
    # Only cast the period columns that exist: data keyed by a plain `year`
    # column has no `period_year` / `period_month`.
    period_cols = [
        col for col in ("period_year", "period_month") if col in df.columns
    ]
    if period_cols:
        df[period_cols] = df[period_cols].astype(int)

    # GroupBy.shift keeps the original row index, so `assign` can align the
    # result back onto the unsorted frame. `apply(lambda x: x.shift(1))` may
    # instead return a group-keyed MultiIndex, which breaks that alignment.
    previous_upt = (
        df.sort_values(sort_cols)
        .groupby(group_cols)["upt"]
        .shift(1)
    )

    # The column name comes from `change_col` so the same function serves
    # both the monthly and the annual reports.
    df = df.assign(**{change_col: previous_upt})

    df["change_1yr"] = df["upt"] - df[change_col]

    df = get_percent_change(df, change_col=change_col)

    return df

In [None]:
# for produce_ntd_monthly_ridership_by_rtpa
# in monthly report script

# Sorting by month first and year second puts the same calendar month of
# consecutive years on adjacent rows.
monthly_sort_cols = [
    "ntd_id",
    "mode",
    "tos",
    "period_month",
    "period_year",
]

# `period_month` is part of the group key so the one-row shift never crosses a
# month boundary. Without it, the earliest year of month N would be compared
# against the latest year of month N-1.
monthly_group_cols = [
    "ntd_id",
    "mode",
    "tos",
    "period_month",
]

monthly_change_col = "previous_y_m_upt"

df = add_change_columns(
    df,
    sort_cols=monthly_sort_cols,
    group_cols=monthly_group_cols,
    change_col=monthly_change_col,
)

In [None]:
# for produce_annual_ntd_ridership_data_by_rtpa
# in annual report script

# Rows are ordered by year within each ntd_id. Mode and service are constant
# inside a group, so every group's rows end up in year order.
annual_sort_cols = ["ntd_id", "year", "mode", "service"]

# One series per agency / mode / service combination.
annual_group_cols = ["ntd_id", "mode", "service"]

# Column that receives the previous row's UPT.
annual_change_col = "previous_y_upt"

df = add_change_columns(
    df, annual_sort_cols, annual_group_cols, annual_change_col
)

## Update get_percent_change

In [None]:
def get_percent_change(
    df: pd.DataFrame,
    change_col: str = "previous_y_m_upt"
) -> pd.DataFrame:
    """
    Add a `pct_change_1yr` column: the change in UPT relative to the
    previous-period UPT.

    Updated to work with the warehouse
    `dim_monthly_ntd_ridership_with_adjustments` long data format.

    Args:
        df: frame containing `upt` and the column named by `change_col`.
        change_col: column holding the previous-period UPT. Defaults to the
            monthly column name so one-argument calls keep working.

    Returns:
        The same frame (modified in place) with `pct_change_1yr` added as a
        fraction rounded to 4 decimals, e.g. 0.25 for a 25% increase. The value
        is NaN where the previous-period UPT is missing or zero.
    """
    previous_upt = df[change_col]

    # Percent change is measured against the baseline (previous) value, not
    # the current one. A zero baseline is masked to NaN so the result is NaN
    # instead of +/-inf.
    baseline = previous_upt.where(previous_upt != 0)

    df["pct_change_1yr"] = (
        (df["upt"] - previous_upt)
        .divide(baseline)
        .round(4)
    )

    return df

In [None]:
# Monthly report: percent change is computed against the previous year/month
# UPT column.
monthly_change_col = "previous_y_m_upt"

df = get_percent_change(df, monthly_change_col)

In [None]:
# Annual report: percent change is computed against the previous-year UPT
# column.
annual_change_col = "previous_y_upt"

df = get_percent_change(df, annual_change_col)

## Update sum_by_group

In [None]:
def sum_by_group(
    df: pd.DataFrame,
    group_cols: list,
    group_col2: list,
    agg_cols: dict,
    change_col: str = None,
) -> pd.DataFrame:
    """
    Aggregate long-format ridership data and recompute the percent change.

    Since data is now long to begin with, this replaces the old sum_by_group,
    make_long and assemble_long_df functions.

    Args:
        df: long-format ridership data.
        group_cols: base grouping columns (e.g. agency, mode or tos columns).
        group_col2: additional grouping columns, typically the time-period
            columns; appended to `group_cols`.
        agg_cols: mapping of column name -> aggregation, passed to `.agg()`.
        change_col: name of the previous-period UPT column used for the
            percent change. When omitted it is inferred as the single key of
            `agg_cols` that starts with "previous_".

    Returns:
        One row per group with the aggregated columns plus `pct_change_1yr`
        expressed in percent (fraction * 100).

    Raises:
        ValueError: if `change_col` is omitted and cannot be inferred
            unambiguously from `agg_cols`.
    """
    if change_col is None:
        candidates = [
            col for col in agg_cols if str(col).startswith("previous_")
        ]
        if len(candidates) != 1:
            raise ValueError(
                "change_col was not given and could not be inferred from "
                f"agg_cols keys: {list(agg_cols)}"
            )
        change_col = candidates[0]

    grouped_df = (
        df.groupby(list(group_cols) + list(group_col2))
        .agg(agg_cols)
        .reset_index()
    )

    # Percentages cannot be summed, so recompute from the aggregated totals.
    grouped_df = get_percent_change(grouped_df, change_col=change_col)

    # Decimal fraction -> percent. The fraction is already rounded to 4
    # decimals, so round(2) only strips floating-point noise.
    grouped_df["pct_change_1yr"] = (grouped_df["pct_change_1yr"] * 100).round(2)

    return grouped_df

In [None]:
# for monthly report
# Time-period columns appended to the base grouping columns in sum_by_group.
monthly_group_col_2 = [
    'period_year',
    'period_month',
    'period_year_month']

# Column -> aggregation mapping handed to `.agg()` inside sum_by_group.
monthly_agg_col = {
        "upt":"sum",
        "previous_y_m_upt":"sum",
        "change_1yr":"sum"
    }

# NOTE: `,,,` below is a template placeholder, not valid Python; this call is a
# sketch of the intended usage and will not run as written.
# NOTE(review): presumably `,,,` stands for each base grouping used in
# save_rtpa_outputs (agency / mode / tos / reporter_type) - confirm.
by_,,,_long = sum_by_group(
    df,
    group_cols= ,,,_cols,
    group_col2= monthly_group_col_2,# look into combining with base group_cols
    agg_cols = monthly_agg_col
)

In [None]:
# for annual report
annual_group_col_2 = ["year"]

monthly_agg_col= {
                "upt": "sum",
                "previous_y_upt": "sum",
                "change_1yr": "sum",
            }

# used in save_rtpa_outputs
by_,,,_long = sum_by_group(
    df,
    group_cols= ,,,_cols,
    group_col2= annual_group_col_2,# look into combingin with base grou_cols
    agg_cols = monthly_agg_col
)

## Update save_rtpa_outputs

In [None]:
def save_rtpa_outputs(
    df: pd.DataFrame, 
    year: int, 
    col_dict: dict,
    month: str,
    cover_sheet_path: str,
    cover_sheet_index_col: str,
    output_file_name: str,
    monthly_upload_to_public: bool = False,
    annual_upload_to_public: bool = False,
    group_col2: list = None,
    agg_cols: dict = None,
):
    """
    Export an excel for each RTPA, adds a READ ME tab, then writes into a folder.
    Zip that folder. 
    Upload zipped file to GCS.

    Args:
        df: ridership data with at least `rtpa_name`, `ntd_id`, `_merge` and
            the columns used by the aggregations (`agency`, `mode`, `tos`,
            optionally `reporter_type`).
        year: used with `month` to name the local output folder
            (`./{year}_{month}/`) and the uploaded zip.
        col_dict: final column renames for the raw-data sheet. It is applied
            after the column names have been title-cased, so its keys must be
            the title-cased names (e.g. "Ntd Id").
        month: see `year`.
        cover_sheet_path: path to the README cover-sheet Excel template.
        cover_sheet_index_col: column of the template used as its index.
        output_file_name: base name (without ".zip") of the local archive.
        monthly_upload_to_public: also upload to the public monthly folder.
        annual_upload_to_public: also upload to the public annual folder.
        group_col2: time-period grouping columns forwarded to `sum_by_group`.
            Defaults to the monthly period columns.
        agg_cols: column -> aggregation mapping forwarded to `sum_by_group`.
            Defaults to the monthly mapping.

    Returns:
        None. Files are written locally and uploaded as a side effect.
    """
    # Local import so this function does not depend on a module-level `os`.
    import os

    # Defaults reproduce the monthly report's grouping and aggregation.
    if group_col2 is None:
        group_col2 = ["period_year", "period_month", "period_year_month"]
    if agg_cols is None:
        agg_cols = {
            "upt": "sum",
            "previous_y_m_upt": "sum",
            "change_1yr": "sum",
        }

    output_dir = f"./{year}_{month}"
    # to_excel does not create missing parent folders.
    os.makedirs(output_dir, exist_ok=True)

    # The cover sheet is identical for every RTPA, so read it once.
    cover_sheet = pd.read_excel(cover_sheet_path, index_col = cover_sheet_index_col)

    # Sheet name -> base grouping columns for each aggregated tab.
    aggregations = {
        "Aggregated by Agency": ["ntd_id", "agency", "rtpa_name"],
        "Aggregated by Mode": ["mode", "rtpa_name"],
        "Aggregated by TOS": ["tos", "rtpa_name"],
    }
    # Not every input carries reporter_type; skip that tab rather than fail.
    if "reporter_type" in df.columns:
        aggregations["Aggregate by Reporter Type"] = ["reporter_type", "rtpa_name"]

    print("creating individual RTPA excel files")

    # dropna: a missing rtpa_name is a float NaN and has no `.replace`.
    for i in df["rtpa_name"].dropna().unique():

        print(f"creating excel file for: {i}")

        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()
        excel_path = f"{output_dir}/{rtpa_snakecase}.xlsx"

        # Writing the README cover sheet creates the workbook.
        cover_sheet.to_excel(excel_path, sheet_name = "README")

        rtpa_rows = df[df["rtpa_name"] == i]

        rtpa_data = (
            rtpa_rows
            .sort_values("ntd_id")
            # merge-indicator column is not part of the published data
            .drop(columns = "_merge")
            # cleaning column names
            .rename(columns=lambda x: x.replace("_"," ").title().strip())
            # final display names
            .rename(columns=col_dict)
        )

        # Aggregations run on the un-renamed rows so the group columns match.
        summaries = {
            sheet_name: sum_by_group(rtpa_rows, base_cols, group_col2, agg_cols)
            for sheet_name, base_cols in aggregations.items()
        }

        # Append the data tabs to the workbook that holds the README.
        with pd.ExcelWriter(excel_path, mode ="a") as writer:
            rtpa_data.to_excel(writer, sheet_name = "RTPA Ridership Data", index=False)
            for sheet_name, summary in summaries.items():
                summary.to_excel(writer, sheet_name = sheet_name, index=False)

    print("zipping all excel files")

    shutil.make_archive(f"./{output_file_name}", "zip", f"{year}_{month}")

    print("Zipped folder")

    print("Upload to private GCS")
    fs.upload(
        f"./{output_file_name}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )

    if monthly_upload_to_public:
        fs.upload(
            f"./{output_file_name}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )
        print("Uploaded to public GCS - monthly report")

    if annual_upload_to_public:
        fs.upload(
            f"./{output_file_name}.zip",
            f"{PUBLIC_GCS}ntd_annual_ridership/{year}_{month}_annual_report_data.zip"
        )

        print("Uploaded to public GCS - annual report")

    print("complete")

In [None]:
# for monthly report

# Display names for the raw-data sheet. Keys are the title-cased column names
# produced in save_rtpa_outputs; values are the labels written to Excel.
col_dict = {
    "Uace Cd": "UACE Code",
    "Dt": "Date",
    "Ntd Id": "NTD ID",
    "Tos": "Type of Service",
    "Legacy Ntd Id": "Legacy NTD ID",
    "Upt": "UPT",
    "Vrm": "VRM",
    "Vrh": "VRH",
    "Voms": "VOMS",
    "Rtpa": "RTPA",
    "Previous Y M Upt": "Previous Year/Month UPT",
    "Change 1Yr": "Change in 1 Year UPT",
    "Pct Change 1Yr": "Percent Change in 1 Year UPT",
    "Tos Full": "Type of Service Full Name",
}

# Cover-sheet template, its index column, and the local archive base name.
monthly_cover_sheet_path = "./cover_sheet_template.xlsx"
monthly_index_col = "**NTD Monthly Ridership by RTPA**"
monthly_data_file_name = f"{year}_{month}_monthly_report_data"

In [None]:
# for annual report

# Column renames for the annual data.
# NOTE(review): save_rtpa_outputs applies col_dict after title-casing the
# column names, so these snake_case keys would not match there - confirm where
# this mapping is meant to be applied.
col_dict = {
    "source_agency": "agency",
    "service": "tos",
}

# Cover-sheet template, its index column, and the local archive base name.
annual_cover_sheet_path = "./annual_report_cover_sheet_template.xlsx"
annual_index_col = "**NTD Annual Ridership by RTPA**"
annual_data_file_name = f"{year}_{month}_annual_report_data"

In [None]:
## Update remove_local_outputs

In [None]:
def remove_local_outputs(
    year: int, 
    month: str,
    output_file_name: str = None,
):
    """
    Delete the local per-RTPA output folder and its zip archive.

    Args:
        year: with `month`, identifies the `{year}_{month}/` folder to remove.
        month: see `year`.
        output_file_name: base name (without ".zip") of the archive to remove.
            Defaults to `{year}_{month}_annual_report_data`, the name that was
            previously hard-coded; pass the monthly name to clean up after the
            monthly report.

    Raises:
        FileNotFoundError: if the folder or the zip file does not exist.
    """
    if output_file_name is None:
        output_file_name = f"{year}_{month}_annual_report_data"

    shutil.rmtree(f"{year}_{month}/")
    os.remove(f"{output_file_name}.zip")

In [None]:
# for monthly report

In [None]:
# for annual report