In [4]:
import gcsfs
import geopandas as gpd
import os
import pandas as pd
import shutil

from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query
from calitp_data_analysis.sql import to_snakecase
from segment_speed_utils.project_vars import PUBLIC_GCS
#from shared_utils.rt_dates import MONTH_DICT
from update_vars import NTD_MODES, NTD_TOS, YEAR, MONTH

In [12]:
from monthly_ridership_by_rtpa import produce_ntd_monthly_ridership_by_rtpa, sum_by_group

In [7]:
df = produce_ntd_monthly_ridership_by_rtpa(YEAR, MONTH)

both          95900
left_only         0
right_only        0
Name: _merge, dtype: int64


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  previous_y_m_upt = (df.sort_values(sort_cols2)


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95900 entries, 0 to 95899
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   key                95900 non-null  object             
 1   ntd_id             95900 non-null  object             
 2   legacy_ntd_id      90420 non-null  object             
 3   agency             95900 non-null  object             
 4   reporter_type      95900 non-null  object             
 5   period_year_month  95900 non-null  object             
 6   period_year        95900 non-null  int64              
 7   period_month       95900 non-null  int64              
 8   uza_name           95900 non-null  object             
 9   primary_uza_code   95900 non-null  object             
 10  _3_mode            95900 non-null  object             
 11  mode               95900 non-null  object             
 12  mode_name          95900 non-null  object     

In [46]:
def save_rtpa_outputs(
    df: pd.DataFrame, 
    year: int, 
    month: str,
    upload_to_public: bool = False
):
    """
    Export an excel for each RTPA, adds a READ ME tab, then writes into a folder.
    Zip that folder. 
    Upload zipped file to GCS.
    """
    col_dict ={
    'Uace Cd': "UACE Code",
    'Dt': "Date",
    'Ntd Id': "NTD ID",
    'Tos': "Type of Service",
    'Legacy Ntd Id': "Legacy NTD ID",
    'Upt': "UPT",
    'Vrm': "VRM",
    'Vrh': "VRH",
    'Voms': "VOMS",
    'Rtpa': "RTPA",
    'Previous Y M Upt': "Previous Year/Month UPT",
    'Change 1Yr': "Change in 1 Year UPT",
    'Pct Change 1Yr': "Percent Change in 1 Year UPT",
    'Tos Full': "Type of Service Full Name"
}
    print("creating individual RTPA excel files")
    
    for i in df["RTPA"].unique():
        
        print(f"creating excel file for: {i}")
        
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()
        
        #insertng readme cover sheet, 
        cover_sheet = pd.read_excel("./cover_sheet_template.xlsx", index_col = "**NTD Monthly Ridership by RTPA**")
        cover_sheet.to_excel(
            f"./{year}_{month}/{rtpa_snakecase}.xlsx", sheet_name = "README")

        rtpa_data =( df[df["RTPA"] == i]
         .sort_values("ntd_id")
         #got error from excel not recognizing timezone, made list to include dropping "execution_ts" column
         .drop(columns = ["_merge","execution_ts"])
         #cleaning column names
         .rename(columns=lambda x: x.replace("_"," ").title().strip())
         #rename columns
         .rename(columns=col_dict)
                   )
        #column lists for aggregations
        agency_cols = ["ntd_id", "agency", "RTPA"]
        mode_cols = ["mode", "RTPA"]
        tos_cols = ["tos", "RTPA"]

        # Creating aggregations
        by_agency_long = sum_by_group((df[df["RTPA"] == i]), agency_cols)                                 
        by_mode_long = sum_by_group((df[df["RTPA"] == i]), mode_cols)
        by_tos_long = sum_by_group((df[df["RTPA"] == i]), tos_cols)
        
        #writing pages to excel fil
        with pd.ExcelWriter(f"./{year}_{month}/{rtpa_snakecase}.xlsx", mode ="a") as writer:
            rtpa_data.to_excel(writer, sheet_name = "RTPA Ridership Data", index=False)
            by_agency_long.to_excel(writer, sheet_name = "Aggregated by Agency", index=False)
            by_mode_long.to_excel(writer, sheet_name = "Aggregated by Mode", index=False)
            by_tos_long.to_excel(writer, sheet_name = "Aggregated by TOS", index=False)
    
    print("zipping all excel files")
    
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    
    print("Zipped folder")
    
    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )
    
    if upload_to_public:
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )
    
    print("Uploaded to GCS")
    
    return

In [47]:
os.makedirs(f"./{YEAR}_{MONTH}/")
save_rtpa_outputs(df, YEAR, MONTH, upload_to_public = False)

creating individual RTPA excel files
creating excel file for: San Joaquin Council of Governments
creating excel file for: Orange County Transportation Authority
creating excel file for: Transportation Agency for Monterey County
creating excel file for: Riverside County Transportation Commission
creating excel file for: Metropolitan Transportation Commission
creating excel file for: San Diego Association of Governments
creating excel file for: San Bernardino County Transportation Authority
creating excel file for: Los Angeles County Metropolitan Transportation Authority
creating excel file for: Placer County Transportation Planning Agency
creating excel file for: Kings County Association of Governments
creating excel file for: Stanislaus Council of Governments
creating excel file for: Kern Council of Governments
creating excel file for: Santa Cruz County Transportation Commission
creating excel file for: Sacramento Area Council of Governments
creating excel file for: Santa Barbara Count

NameError: name 'fs' is not defined