In [1]:
import pandas as pd
import shutil
import os
import gcsfs
from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query
from shared_utils.rt_dates import MONTH_DICT
from update_vars import NTD_MODES, NTD_TOS, YEAR, MONTH

#temp file path for testing
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/csuyat_folder/"

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Why don't `El Dorado County Transportation Commission` and `Placer County Transportation Planning Agency` appear in the RTPA list?

In [3]:
# got this from the deploy_portfolio_yaml.py

# whats in it?
oct_ridership = pd.read_parquet(
        f"gs://calitp-analytics-data/data-analyses/ntd/ca_monthly_ridership_2023_October.parquet",
        columns = ["RTPA"]
    ).drop_duplicates()

check_list  = ["Placer", "El Dorado"]

In [4]:
oct_ridership.columns

Index(['RTPA'], dtype='object')

In [5]:
display(
    oct_ridership.shape,
    oct_ridership[oct_ridership["RTPA"].str.contains("El Dorado")],
    oct_ridership[oct_ridership["RTPA"].str.contains("Placer")],
    oct_ridership.sort_values(by="RTPA")
)

(24, 1)

Unnamed: 0,RTPA


Unnamed: 0,RTPA


Unnamed: 0,RTPA
157,Alpine County Transportation Commission
184,Butte County Association of Governments
86,Fresno County Council of Governments
288,Imperial County Transportation Commission
4,Kern Council of Governments
270,Kings County Association of Governments
14,Los Angeles County Metropolitan Transportation...
244,Merced County Association of Governments
0,Metropolitan Transportation Commission
111,Orange County Transportation Authority


There are 24 in this oct list.

there are 24 pages on the ntd ridership page

how many RTPA are on the "crosswalk"? where is this site pulling its RTPA list?

In [6]:
crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    


In [7]:
# `nunique()` already returns the count of distinct values as an int,
# so it is displayed directly (wrapping it in `len()` raises a TypeError).
display(
    crosswalk["RTPA"].nunique(),
    crosswalk["RTPA_open_data"].nunique()
)

24

19

In [8]:
crosswalk[
    crosswalk["Agency"].str.contains("Placer") | 
    crosswalk["Agency"].str.contains("El Dor") |
    crosswalk["Agency"].str.contains("Roseville")
]


Unnamed: 0,ntd_id,Legacy NTD ID,Agency,UZA Name,RTPA_open_data,RTPA
60,90168,9168,City of Roseville,"Sacramento, CA",Sacramento Area Council of Governments,Sacramento Area Council of Governments
61,90196,9196,County of Placer,"Sacramento, CA",Sacramento Area Council of Governments,Sacramento Area Council of Governments
63,90229,9229,El Dorado County Transit Authority,"Sacramento, CA",Sacramento Area Council of Governments,Sacramento Area Council of Governments


## Explained
The agencies do exist in the crosswalk, but are under SACOG instead of their own RTPA

In [9]:
# what are the differences between RTPA and RTPA_open_data?
rtpa = crosswalk["RTPA"].unique()
rtpa_list = pd.DataFrame(rtpa, columns=["rtpa_name"])

rtpa_open_data = crosswalk["RTPA_open_data"].unique()
rtpa_open_data_list = pd.DataFrame(rtpa_open_data, columns=["rtpa_name"])

In [10]:
diff_merge = pd.merge(rtpa_list, rtpa_open_data_list,on="rtpa_name", how="outer", indicator= True)

In [11]:
display(
    diff_merge["_merge"].value_counts(),
    # what are the RTPA name that only exist in the open data list
    diff_merge[diff_merge["_merge"]!="both"]
)

both          15
left_only      9
right_only     4
Name: _merge, dtype: int64

Unnamed: 0,rtpa_name,_merge
4,Imperial County Transportation Commission,left_only
5,Fresno County Council of Governments,left_only
7,Riverside County Transportation Commission,left_only
11,Los Angeles County Metropolitan Transportation...,left_only
12,Orange County Transportation Authority,left_only
15,Alpine County Transportation Commission,left_only
16,Ventura County Transportation Commission,left_only
19,San Bernardino County Transportation Authority,left_only
22,Santa Cruz County Transportation Commission,left_only
24,Southern California Association of Governments,right_only


### Explained
- SCAG is pretty big and was split up into its different MPOs
- `Fresno Council of Governments` is `Fresno County Council of Governments` on the RTPA list and reports site
- `Alpine County Local Transportation Commission` is `Alpine County Transportation Commission`
- `Santa Cruz County Regional Transportation Commission` is `Santa Cruz County Transportation Commission`

---

### WHAT ARE THE TRANSIT AGENCIES IN THESE RTPA?!
- look into the ntd report, the RTPAs website and possibly the RTPA allocation packages for clues to what agencies they have
- El Dorado County Transportation Commission
    1. El Dorado County Transit Authority (small systems reporter)
- Placer County Transportation Planning Agency
    1. City of Roseville (small systems reporter)
    2. County of Placer (full reporter)
<br>

### Do the operators in these RTPAs exist in the NTD ridership data at all?
- YES they do! 
- some are currently small system reporters and dont have any current monthly data

<br>

In [12]:
# query the warehouse
full_upt = (
    tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() 
    >> filter(#_.uza_name.str.contains(" ,CA"),
              #_.period_year_month.str.contains("2024-")
             )
    >> collect()
).rename(columns = {"mode_type_of_service_status": "Status"}
       )

# double check to see what month this table goes too.
full_upt[full_upt["period_year_month"].str.contains("2024-")]["period_year_month"].sort_values().unique()

array(['2024-01', '2024-02', '2024-03', '2024-04', '2024-05', '2024-06',
       '2024-07'], dtype=object)

In [13]:
display(full_upt.columns, full_upt.head(2)
       )

Index(['uza_name', 'uace_cd', '_dt', 'ts', 'ntd_id', 'year', 'reporter_type',
       'agency', 'Status', 'mode', 'mode_full_name', 'service_type', '_3_mode',
       'tos', 'legacy_ntd_id', 'period_year_month', 'period_year',
       'period_month', 'upt', 'vrm', 'vrh', 'voms'],
      dtype='object')

Unnamed: 0,uza_name,uace_cd,_dt,ts,ntd_id,year,reporter_type,agency,Status,mode,mode_full_name,service_type,_3_mode,tos,legacy_ntd_id,period_year_month,period_year,period_month,upt,vrm,vrh,voms
0,"Miami--Fort Lauderdale, FL",56602,2024-09-06,2024-09-06 20:49:05.015238+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Automated Guideway,Fixed Route,Rail,DO,4034,2018-08,2018,8,,,,
1,"Miami--Fort Lauderdale, FL",56602,2024-09-06,2024-09-06 20:49:05.015238+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Automated Guideway,Fixed Route,Rail,DO,4034,2015-11,2015,11,,,,


In [14]:
full_upt[full_upt["agency"].isin([
    "County of Placer",
    "City of Roseville",
    "El Dorado County Transit Authority"
]
)]["agency"].value_counts()

County of Placer                      2168
El Dorado County Transit Authority     813
City of Roseville                      542
Name: agency, dtype: int64

In [15]:
full_upt[full_upt["uza_name"].str.contains(", CA") & full_upt["agency"].str.contains("Tahoe")]["agency"].unique()

array(['Tahoe Transportation District'], dtype=object)

---

### what is the grain of this dataset?
metric values (upt/vrm/vrh/crm/voms) of a specific **tos**, of a specific **mode**, for a specific **agency** in a specific **year/month**

# Test reading in `dim_monthly_ntd_ridership_with_adjustments` from warehouse

In [None]:
monthly_ridership = (
    tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
)

In [None]:
# reading in old monthly ridership parquet for comparison
old_march_2024_ridership = pd.read_parquet("gs://calitp-analytics-data/data-analyses/ntd/ntd_monthly_ridership_2024_March.parquet")

In [None]:
# what is in this table?
# all states
display(
    monthly_ridership.shape,
    # monthly_ridership.info,
    monthly_ridership.head(3),
)

In [None]:
print(monthly_ridership.dtypes)

In [None]:
# are there NaN values in uza name? YES
monthly_ridership["uza_name"].isna().sum()

In [None]:
# fill the NaN values with something
# Assign the result back instead of using `inplace=True` on a column selection:
# the in-place form acts on an intermediate object and may leave the frame unchanged
# under pandas Copy-on-Write.
monthly_ridership["uza_name"] = monthly_ridership["uza_name"].fillna("none")

In [None]:
# double check for na
monthly_ridership["uza_name"].isna().sum()

In [None]:
# can i now filter by uza CA, becaue there are zero NaN values? YES
uza_ca = monthly_ridership[monthly_ridership["uza_name"].str.contains(", CA")]
display(
    uza_ca.shape,
    uza_ca["uza_name"].value_counts()
)

In [None]:
# where are the UPT values? the UPT page for this table used to be very wide
# look at a specific agency by ntd id, LA Metro == "90154"
# the dtype of ntd_id is object, so compare against a string
la_metro = monthly_ridership[monthly_ridership["ntd_id"] == "90154"]
display(la_metro.shape, uza_ca.shape)

In [None]:
# filter la_metro data to only see current and previous year (2024 and 2023)

la_2324 = la_metro[la_metro["period_year"].isin(["2023", "2024"])]

In [None]:
# does this 23/24 data match whats in the NTD report? YES
# confirm that data is now long instead of wide
la_2324[la_2324["period_year_month"] == "2024-03"][
    ["period_year_month", "mode", "tos", "upt"]
].sort_values(by="period_year_month", ascending=True)

# What to modify in the `produce_ntd_monthly_ridership_by_rtpa` function
step-by-step

In [None]:
# 1. reading in the data & renaming column
# intead of reading in the data via an excel doc by URL, intead will query the warehouse directly

#GOOD TO GO
full_upt = (
    tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
).rename(columns = {"mode_type_of_service_status": "Status"}
       )

In [None]:
#checking for recent data
full_upt[full_upt["period_year_month"].str.contains("2024-")]["period_year_month"].sort_values().unique()

In [None]:
#1.1 setting `ntd_id` to str

# NTD_ID COL IS ALREADY OBJECT/STR DATA TYPE
# full_upt["ntd_id"] = full_upt["ntd_id"].astype(str)

In [None]:
#2 filter df to get rid of na rows
# GOOD TO GO!
full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

In [None]:
#3 new DF of just CA and not-blank agency names
# filter UZA name col for CA and not blank agency names.

#GOOD TO GO!
ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)

In [None]:
#4 read in ntd_id-to-RTPA crosswalk
# same as before, no changes

#GOOD TO GO!
crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
crosswalk.columns

In [None]:
#5 merge CA df and crosswalk

# GOOD TO GO
df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
print(df._merge.value_counts())

In [None]:
#6 flag to see if there are any unmerged rows 

# GOOD TO GO
if len(df[df._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")

In [None]:
#7 reverses the MONTH_DICT 

# Add new columns    
reversed_months = {v:k for k, v in MONTH_DICT.items()}
    
for m in range(1, reversed_months[month] + 1):
    df = add_change_columns(df, year, m)

In [None]:
# FINAL FORM


# final updated  `produce_ntd_monthly_ridership_by_rtpa` function.
# may not need the upt_url

def produce_ntd_monthly_ridership_by_rtpa(
    #upt_url: str,
    year: int,
    month: str
) -> pd.DataFrame:
    """
    Query `mart_ntd.dim_monthly_ntd_ridership_with_adjustments` from the
    warehouse, filter to CA, merge in the ntd_id-to-RTPA crosswalk, then add
    the change columns and the full mode / type-of-service names.

    Args:
        year: passed through to `add_change_columns`.
        month: a value of `MONTH_DICT`; change columns are added for every
            month number from 1 up to and including the one it maps to.

    Returns:
        The CA rows with `RTPA`, `_merge`, `Mode_full`, `TOS_full` and
        whatever columns `add_change_columns` adds.

    Raises:
        ValueError: if any CA row has no matching `ntd_id` in the crosswalk.
    """
    full_upt = (tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
               ).rename(columns = {"mode_type_of_service_status": "Status"})
    
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)
    #full_upt.to_parquet(
    #    f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    #)
    ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)
    
    crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
    df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
    print(df._merge.value_counts())
    
    # Good, everything merged, as we want
    if len(df[df._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
        
    # Add new columns    
    # MONTH_DICT is inverted so the month passed in can be looked up to get its number
    reversed_months = {v:k for k, v in MONTH_DICT.items()}
    
    # NOTE(review): `add_change_columns` is not defined in this notebook; it presumably
    # is the original wide-format helper -- confirm it is importable before running.
    for m in range(1, reversed_months[month] + 1):
        df = add_change_columns(df, year, m)
    
    # The warehouse table names these columns `mode` and `tos` (lowercase),
    # so `df.Mode` / `df.TOS` would raise AttributeError.
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df


---
## Do I need `reversed_months` anymore?
RE: `produce_ntd_monthly_ridership_by_rtpa` function.


Since the new table uses a `yyyy-mm` format now, I don't think I need the reversed months anymore. What else needs to change?
`reversed_months` was needed for the `add_change_columns` function.  

**NO LONGER NEEDED SINCE `add_change_columns_v2` WORKS WITHOUT USING MONTH_DICT**


In [None]:
# what does reversed months do?

reversed_months = {v:k for k, v in MONTH_DICT.items()}

# what MONTH_DICT looks like initially.
# (no trailing commas: they would turn each statement into a throwaway one-element tuple)
print("this is MONTH_DICT")
display(MONTH_DICT)

# like the name implies, reverses the key values pairs. 
print("this is reversed_months")
display(reversed_months)


In [None]:
# what is the date format in the new df
# YYYY-MM
df["period_year_month"].sample(3)

In [None]:
# what is the date format in the old data?
# M/YYYY
old_march_2024_ridership.iloc[:,18:22].head()

## options
* change the df date format to match reversed_months format. (yyyy-mm to m/yyyy)
    * can use a mix of split, str replace, and reversing via slicing 
* change the `add_change_column` function to accept yyyy-mm
    * make a v2 of this function that accepts yyyy-mm, creates a new column, and applies some function by looking at the row's `period_year_month` and finding the `period_year_month` minus 1 year to calculate the difference.

# TEST update `add_change_columns` function
need to update this function so it can work with the `produce_ntd_monthly_ridership_by_rtpa` function

In [None]:
# updated to match new year-month format 

def add_change_columns_v2(
    df: pd.DataFrame,
    year: int,
    month: int
) -> pd.DataFrame:
    """
    Draft updated to accept yyyy-mm column labels.

    Adds `change_1yr_<yyyy-mm>` (the selected month's column minus the same
    month one year earlier) to `df` in place, then passes the frame to
    `get_percent_change`.

    Args:
        df: frame that has one column per yyyy-mm label.
        year: year of the month to compare.
        month: month number (an int, or a numeric string such as "05").

    Returns:
        The frame returned by `get_percent_change`.
    """    
    # Zero-pad the month so the label is "2023-05" rather than "2023-5",
    # matching the yyyy-mm format of `period_year_month`.
    month_label = f"{int(month):02d}"
    ntd_month_col = f"{year}-{month_label}"
    prior_year_col = f"{int(year)-1}-{month_label}"
        
    df[f"change_1yr_{ntd_month_col}"] = df[ntd_month_col] - df[prior_year_col]
    df = get_percent_change(df, ntd_month_col, prior_year_col)
    
    return df

In [None]:
# can i use a lambda function to create a new column that:
# checks every row (x), if the row has a `period_year_month` value, does it have a `period_year_month` -1 year value in the same column? 
    # if true, then subtract `period_year_month` by `period_year_month`-1 year.
    # else return 0
year = 2023
month = "05"

current_y_m = f"{year}-{month}"
prior_y_m = f"{int(year)-1}-{month}"

# using current_y_m works and prior_year_col works
df[df["period_year_month"].str.contains(prior_y_m)].head(3)



In [None]:
# first attempt at what im trying to do

# create a new column called "change 1yr"
# at each row in "period_month_year", do what? take the value of that row, then mimus the value of the previous period_mont_year -1

#this doesnt work becaues im not indicating how X is used, 
#df["change_1yr"] = df["period_year_month"].apply(lambda x: (df["period_year_month"].str.contains(current_y_m)["upt"]) - (df["period_year_month"].str.contains(prior_year_col)["upt"]))

#pseudo code (kept as comments: it is not valid Python and would stop the cell with a SyntaxError)
# df["change_1yr"] = df["period_year_month"].apply(lambda x: (upt at x for current_y_m, agency, tos and mode)  - 
#                                                  (upt at x for prior_y_m, current_y_m, agency, tos and mode))



In [None]:
# how can i see the specific UPT for a specific agency, mode & TOS and date?

#uses .loc at current_y_m UPT, position 0, minus .loc at previous_year_col UPT, position o
# but is is the same agency/mode/tos in both sides of the equation?
df.loc[df["period_year_month"] == current_y_m,"upt"].values[0] - df.loc[df["period_year_month"] == prior_y_m,"upt"].values[0]

In [None]:
df.loc[
        (df["period_year_month"] == current_y_m) &
        (df["ntd_id"] == df["ntd_id"]) &
        (df["mode"] == df["mode"]) &
        (df["tos"] == df["tos"]), "upt"
    ].head()

In [None]:
df.sample(3)

In [None]:
df.loc[df["period_year_month"] == prior_y_m, "upt"][:10]


In [None]:
def test_difference_func(row, current_y_m, prior_y_m ):
    current_ym_val =  df.loc[
        (df["period_year_month"] == current_y_m) &
        (df["ntd_id"] == row["ntd_id"]) &
        (df["mode"] == row["mode"]) &
        (df["tos"] == row["tos"]), "upt"
    ]
    
    prev_ym_val =  df.loc[
        (df["period_year_month"] == prior_y_m) &
        (df["ntd_id"] == row["ntd_id"]) &
        (df["mode"] == row["mode"]) &
        (df["tos"] == row["tos"]), "upt"
    ]
    
    return (current_ym_val - prev_ym_val)

In [None]:
test_df = df[:50].copy()

In [None]:
test_df.columns

In [None]:
year = 2023
month = "05"

current_y_m = f"{year}-{month}"
prior_y_m = f"{int(year)-1}-{month}"

display(
    current_y_m,
    prior_y_m
)

In [None]:
test_df["test_col"] = test_df.apply(lambda row: test_difference_func(row, current_y_m="2023-05", prior_y_m="2022-05"), axis=1)

So I'm having trouble coming up with a `change 1yr` column, now that the dataset is long (instead of the previous wide format).

What I was going for was something like:
- adding a new column to the dataset called `change 1yr` that takes the `UPT` value of the `ntd_id, mode, tos` for the current `period_year_month` (ex. 2024-05) row, and subtracts the `UPT` value from the previous year's `period_year_month` (ex. 2023-05) row of the same `ntd_id, mode, tos`.

- I thought I had something with a `.apply(lambda x:)` method, but wasn't getting anywhere.

Any suggestions on how I should approach this?


---

In [None]:
# what is the most recent period_year_month? as of 7/2/2024\
# should expect to see values up tp May 2024
display(
    ca[ca["period_year_month"] == "2024-04"].shape,
    # as of 7/2, zero rows for May 2024. but expect ntd to update that data soon.
    # 7/15, still no values for May 2024
    ca[ca["period_year_month"] == "2024-05"].shape
)

# Tiffany method to get change 1yr working

[see link to her old GH script](https://github.com/CityOfLosAngeles/covid19-indicators/blob/master/processing_utils/utils.py#L306-L315)

The .shift() method in Pandas is used to shift the values of a DataFrame or Series by a specified number of periods along the given axis.
When you use .shift(), the data in the DataFrame or Series is moved up or down, while the index/columns stay in place. can move the entire dataframe up/down/left/right, or just a row or column in any direction.

Per Tiffany
- "i think you could perhaps make this idea work. that snippet uses .shift to get you a previous observation and a post observation. in this case, it's the previous day and the day after. but for you, `you're looking at a previous year (same month)`."

```
sort_cols = ["county", "state", "fips", "date"]
group_cols = ["county", "state", "fips"]

df = df.assign(
        previous_day = (df.sort_values(sort_cols)
                        .groupby(group_cols)["new_cases"]
                        .apply(lambda x: x.shift(1))
                       ),
        post_day = (df.sort_values(sort_cols)
                    .groupby(group_cols)["new_cases"]
                    .apply(lambda x: x.shift(-1))
                   ),
    )
```

In [None]:
# df of just riverside county transpo agency in select years
rcta = ca[(ca["ntd_id"].isin(["90218", "90006","90216"])) & (ca["period_year"].isin(["2024","2023", "2022", "2021"]))]


In [None]:
# `rcta` is a filtered slice of `ca`; rebinding to the frame returned by `astype`
# avoids assigning columns on that slice (SettingWithCopyWarning).
rcta = rcta.astype({"period_year": int, "period_month": int})

In [None]:
display(
    rcta.columns,
    rcta.shape,
    rcta.dtypes,
    rcta["agency"].value_counts()
)

In [None]:
# list of column names to sort the df by
# lets sort by mode, tos, period_year_month
sort_cols = ["mode", "tos", "period_year_month"]
group_cols = ["mode", "tos"]

current_y_m = f"{year}-{month}"
prior_y_m = f"{int(year)-1}-{month}"

In [None]:
previous_day = (rcta.sort_values(sort_cols) #sort df by oldest date
                        .groupby(group_cols)["upt"] #returns only upt values
                        .apply(lambda x: x.shift(1)) #shifts upt values down by 1 step
                       )

next_day = (rcta.sort_values(sort_cols) #sort df by oldest date
                        .groupby(group_cols)["upt"] #returns only upt values
                        .apply(lambda x: x.shift(-1)) #shifts upt values up by 1 step
                       )
display(
    type(previous_day),
    rcta.sort_values(sort_cols).groupby(group_cols)["upt"].head(),
    previous_day.head(),
    next_day.head()

)
# result is a series of the "upt" col shifted down 1 place

In [None]:
# put together, what happens?
# `.assign` returns a new DF with two extra columns, previous_day and next_day,
# filled with the upt values shifted down one step and up one step respectively.

# Bind the result to a name: a groupby selection cannot be assigned to.
# `GroupBy.shift` keeps the original row index, so the shifted values line up with `rcta`.
rcta_shifted = rcta.assign(
    previous_day = (rcta.sort_values(sort_cols) #sort df by oldest date
                        .groupby(group_cols)["upt"] #returns only upt values
                        .shift(1) #shifts upt values down by 1 step
                       ),
    next_day = (rcta.sort_values(sort_cols) #sort df by oldest date
                        .groupby(group_cols)["upt"] #returns only upt values
                        .shift(-1) #shifts upt values up by 1 step
                       ),

)
The above code shifts the UPT data down/up 1 step

In [None]:
# How can i adapt this code to shift the just year?
# Group by month as well and sort by year inside each group: shift(1) then always
# lands on the same month of the previous year in the data. Without `period_month`
# in the grouping, the first year of each month would receive the last year of the
# month before it.

sort_cols2 =  ["ntd_id","mode", "tos","period_month", "period_year"]
group_cols2 = ["ntd_id","mode", "tos","period_month"]

rcta = rcta.assign(
    # previous year's UPT for the same agency / mode / tos / month (NaN for the first year)
    previous_y_m_upt = (rcta.sort_values(sort_cols2) #sort df by oldest date
                        .groupby(group_cols2)["upt"]
                        .shift(1) #shifts upt values down by 1 step
                       ),
    # a callable is evaluated on the frame being built, i.e. after previous_y_m_upt exists;
    # reading rcta["previous_y_m_upt"] here would fail on a fresh kernel
    change_1yr = lambda d: d["upt"] - d["previous_y_m_upt"]
                  )


In [None]:
display(
    #easy to see the differences when the DF is sorte the same way,
    rcta.sort_values(sort_cols2).head(),
    
    #does the math work out when the df is not sorted? YES IT IS!!
    rcta.sample(5)
)

# FINAL `get_percent_change_v2` function

initial code
```
def get_percent_change(
    df: pd.DataFrame, 
    current_col: str, 
    prior_col: str
) -> pd.DataFrame:
    
    df[f"pct_change_1yr_{current_col}"] = (
        (df[current_col] - df[prior_col])
        .divide(df[current_col])
        .round(4)
    )
    
    return df
```


In [None]:
def get_percent_change_v2(
    df: pd.DataFrame, 
) -> pd.DataFrame:
    """
    Add a `pct_change_1yr` column for the long-format warehouse table
    `dim_monthly_ntd_ridership_with_adjustments`.

    The column is (`upt` - `previous_y_m_upt`) / `upt`, rounded to 4 decimals.
    The column is written onto `df` itself and the same frame is returned.
    """
    current_upt = df["upt"]
    yearly_diff = current_upt.sub(df["previous_y_m_upt"])

    # NOTE(review): the denominator is the current `upt`, as in the original
    # `get_percent_change`; confirm this is intended rather than the prior-year value.
    df["pct_change_1yr"] = yearly_diff.div(current_upt).round(4)

    return df

In [None]:
#it works!!
# `rcta` carries the `previous_y_m_upt` column that `get_percent_change_v2` reads;
# the earlier `test_df` (a 50-row slice of `df`) does not. A copy is passed because
# the function writes its new column onto the frame it receives.
test_df = get_percent_change_v2(rcta.copy())

test_df.head()

## ~How to deal with `inf` and `NaN`~ 
* `inf` when dividing by zero
* or `NaN` when 0 minus 0

SOLUTION! Already have `remove zero upt rows` function in notebooks



# FINAL `add_change_columns_v2` function

In [None]:
def add_change_columns_v2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add year-over-year UPT columns to the long-format warehouse table
    `dim_monthly_ntd_ridership_with_adjustments`.

    For each ntd_id / mode / tos / period_month, rows are ordered by
    period_year and each row is paired with the row of the year before.

    Args:
        df: frame with `ntd_id`, `mode`, `tos`, `period_month`, `period_year`
            and `upt` columns. It is not modified.

    Returns:
        A new frame with `period_year` / `period_month` cast to int, plus:
        - `previous_y_m_upt`: UPT of the same month one year earlier (NaN when
          that year is not in the data),
        - `change_1yr`: `upt` minus `previous_y_m_upt`,
        - the column added by `get_percent_change_v2`.
    """
    # One group per agency / mode / type of service / calendar month, so that
    # consecutive rows inside a group are the same month in successive years.
    # Grouping without `period_month` would pair the first year of each month
    # with the last year of the month before it.
    group_cols2 = ["ntd_id", "mode", "tos", "period_month"]
    sort_cols2 = group_cols2 + ["period_year"]

    # astype returns a new frame, so the caller's DataFrame is left untouched
    df = df.astype({"period_year": int, "period_month": int})

    ordered = df.sort_values(sort_cols2)
    grouped = ordered.groupby(group_cols2)

    # GroupBy.shift keeps the original row index, so results align back onto `df`
    previous_upt = grouped["upt"].shift(1)
    previous_year = grouped["period_year"].shift(1)

    # keep the shifted value only when it really comes from the year before
    # (guards against a missing year inside a group)
    is_prior_year = (ordered["period_year"] - previous_year) == 1

    df = df.assign(
        previous_y_m_upt = previous_upt.where(is_prior_year)
    )

    df["change_1yr"] = (df["upt"] - df["previous_y_m_upt"])
    
    df = get_percent_change_v2(df)
    
    return df

In [None]:
ca.shape

In [None]:
#test to ensure `add_change_columns_v2` works
test_df = add_change_columns_v2(rcta)

big_test = add_change_columns_v2(ca)

really_big_test = add_change_columns_v2(full_upt)

In [None]:
#it works!
display(
    test_df.shape,
    big_test.shape,
    really_big_test.shape,
    test_df.columns
)

display(
    rcta.shape,
    ca.shape,
    full_upt.shape
)

# ensuring the `.assign(NTD_MODES and NTD_TOS)` works.

In [None]:
#it works
more_test = test_df.assign(
        Mode_full = test_df["mode"].map(NTD_MODES),
        TOS_full = test_df["tos"].map(NTD_TOS)
    )
more_test.columns

# FINAL produce_ntd_monthly_ridership_by_rtpa_v2 function

now that `add_change_col_v2` & `get_percent_change_v2` are working. lets put it all together

In [None]:
def produce_ntd_monthly_ridership_by_rtpa_v2(
    #df: pd.DataFrame,
    year: int,
    month: str
) -> pd.DataFrame:
    """
    Build the CA monthly ridership table from the long-format warehouse table
    `dim_monthly_ntd_ridership_with_adjustments`.

    Steps: query the warehouse, drop rows with no agency, save that full table
    as parquet under `GCS_FILE_PATH`, filter to CA urbanized areas, attach the
    RTPA from the crosswalk, then add the change columns and the full
    mode / type-of-service names.

    Args:
        year: used only in the name of the saved parquet file.
        month: used only in the name of the saved parquet file.

    Returns:
        The CA rows with `RTPA`, `_merge`, the columns added by
        `add_change_columns_v2`, `Mode_full` and `TOS_full`.

    Raises:
        ValueError: if any CA row has no matching `ntd_id` in the crosswalk.
    """
    full_upt = (
        tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
    ).rename(columns = {"mode_type_of_service_status": "Status"})
    
    #updating month & year to int is already in add_change_columns_v2. keeping here for now.
    #full_upt[["period_year","period_month"]] = full_upt[["period_year","period_month"]].astype(int)
    
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)
    
    full_upt.to_parquet(
        f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    )
    
    # na=False: rows with a missing uza_name are excluded instead of putting NaN in the mask.
    # regex=False: ", CA" is a literal substring.
    # Blank agency names were already dropped from full_upt above.
    ca = full_upt[
        full_upt["uza_name"].str.contains(", CA", regex=False, na=False)
    ].reset_index(drop=True)
    
    crosswalk = pd.read_csv(
        "gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
    df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA.
        # Repeated (ntd_id, RTPA) pairs are dropped so they cannot duplicate ridership rows.
        crosswalk[["ntd_id", "RTPA"]].drop_duplicates(),
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
    print(df._merge.value_counts())
    
    # every CA row must have found its RTPA
    if (df._merge == "left_only").any():
        raise ValueError("There are unmerged rows to crosswalk")
    
    df = add_change_columns_v2(df)
    
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df
    

# Full Run test of `produce_ntd_monthly...v2`

In [None]:
full_run_test = produce_ntd_monthly_ridership_by_rtpa_v2(YEAR, MONTH)
# it works!!
# also saving the temp GCS folder!

full_run_test.to_parquet(f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet")

In [None]:
display(
    full_run_test.columns,
    full_run_test.sample(3),
    full_run_test["reporter_type"].sort_values().unique()
)

# as of 7/22/2024. still only see '2024-04' as the latest date. need may 2024

# FINAL `save_rtpa_outputs_v2`

In [None]:
# NOT FINAL
def save_rtpa_outputs_v2(
    df: pd.DataFrame, 
    year: int, 
    month: str,
    upload_to_public: bool = False
):
    """
    Export a csv for each RTPA into a folder.
    Zip that folder. 
    Upload zipped file to GCS.

    Args:
        df: ridership frame with `RTPA`, `ntd_id` and `_merge` columns.
        year, month: used to name the local folder `./{year}_{month}` and the zip file.
        upload_to_public: when True, the zip is also uploaded under `PUBLIC_GCS`.
    """
    # to_csv does not create missing folders, so make sure the output folder exists
    os.makedirs(f"./{year}_{month}", exist_ok=True)

    for i in df["RTPA"].unique():
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()

        (df[df["RTPA"] == i]
         .sort_values("ntd_id")
         .drop(columns = "_merge")
         .to_csv(
            f"./{year}_{month}/{rtpa_snakecase}.csv",
            index = False)
        )
        
    # Zip this folder, and save zipped output to GCS
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    print("Zipped folder")
    
    # `fs` is not defined in the visible part of this notebook, so the GCS filesystem handle is created here
    fs = gcsfs.GCSFileSystem()

    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )
    
    if upload_to_public:
        # NOTE(review): PUBLIC_GCS is not defined or imported in this notebook;
        # it must be in scope before calling with upload_to_public=True.
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )
    
    print("Uploaded to GCS")
    
    return

## Test `save_rtpa_outputs_v2`

initial code
```
def save_rtpa_outputs(
    df: pd.DataFrame, year: int, month: str,
    upload_to_public: bool = False
):
    """
    Export a csv for each RTPA into a folder.
    Zip that folder. 
    Upload zipped file to GCS.
    """
    for i in df.RTPA.unique():
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()

        (df[df.RTPA == i]
         .sort_values("NTD ID")
         .drop(columns = "_merge")
         .to_csv(
            f"./{year}_{month}/{rtpa_snakecase}.csv",
            index = False)
        )
       
    # Zip this folder, and save zipped output to GCS
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    print("Zipped folder")
    
    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )
    
    if upload_to_public:
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )
    
    print("Uploaded to GCS")
    
    return
```

In [None]:
# test to read in local excel sheet, and add it to another sheet
cover_sheet = pd.read_excel("./cover_sheet_template.xlsx")

In [None]:
type(cover_sheet)

In [None]:
#testing how to append a sheet to an existing workbook
#this works by itself
with pd.ExcelWriter("./2024_April/butte_county_association_of_governments.xlsx", mode ="a") as writer:
    cover_sheet.to_excel(writer, sheet_name = "1. Cover Sheet")

In [None]:
# TEst to str replace and title case the columns name

test_df = full_run_test.copy()

test_df.columns


In [None]:
test_df.columns = test_df.columns.str.replace("_", " ").str.strip().str.title()

#changes most of the columns but realized some of the columns 
# need to change seperately
list(test_df.columns)

In [None]:
col_dict ={
    'Uace Cd': "UACE Code",
    'Dt': "Date",
    'Ntd Id': "NTD ID",
    'Tos': "Type of Service",
    'Legacy Ntd Id': "Legacy NTD ID",
    'Upt': "UPT",
    'Vrm': "VRM",
    'Vrh': "VRH",
    'Voms': "VOMS",
    'Rtpa': "RTPA",
    'Previous Y M Upt': "Previous Year/Month UPT",
    'Change 1Yr': "Change in 1 Year UPT",
    'Pct Change 1Yr': "Percent Change in 1 Year UPT",
    'Tos Full': "Type of Service Full Name"
}

test_df = test_df.rename(columns=col_dict)

list(test_df.columns)

In [None]:
# TEST VERSION

# updated to address feature requests raised by MTC and other RTPAs
# x save out as excel,
# x include a cover sheet/ READ ME 
# x unsnakecase the columns
# x use dict to rename remaining columns

def save_rtpa_outputs_v2(
    df: pd.DataFrame, 
    year: int, 
    month: str,
    upload_to_public: bool = False
):
    """
    Export one Excel workbook per RTPA into the local `./{year}_{month}/` folder,
    zip that folder, and upload the zipped file to GCS.

    Each workbook contains:
    - "RTPA Ridership Data": that RTPA's rows, sorted by `ntd_id`, with
      human-readable column names.
    - "READ ME": the cover sheet read from `./cover_sheet_template.xlsx`.

    Args:
        df: ridership data; must include the `RTPA`, `ntd_id`, `_merge`
            and `ts` columns.
        year: used with `month` to name the local folder and the zip file.
        month: see `year`.
        upload_to_public: when True, also upload the zip under `PUBLIC_GCS`.

    Relies on `fs`, `GCS_FILE_PATH` and `PUBLIC_GCS` being defined in the
    notebook before this function is called.
    """
    col_dict = {
        'Uace Cd': "UACE Code",
        'Dt': "Date",
        'Ntd Id': "NTD ID",
        'Tos': "Type of Service",
        'Legacy Ntd Id': "Legacy NTD ID",
        'Upt': "UPT",
        'Vrm': "VRM",
        'Vrh': "VRH",
        'Voms': "VOMS",
        'Rtpa': "RTPA",
        'Previous Y M Upt': "Previous Year/Month UPT",
        'Change 1Yr': "Change in 1 Year UPT",
        'Pct Change 1Yr': "Percent Change in 1 Year UPT",
        'Tos Full': "Type of Service Full Name"
    }

    output_dir = f"./{year}_{month}"
    # `to_excel` does not create missing folders; exist_ok keeps re-runs safe.
    os.makedirs(output_dir, exist_ok=True)

    # The README template is the same for every RTPA, so read it only once.
    cover_sheet = pd.read_excel(
        "./cover_sheet_template.xlsx",
        index_col = "NTD Monthly Ridership by RTPA"
    )

    for i in df["RTPA"].unique():
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()
        workbook_path = f"{output_dir}/{rtpa_snakecase}.xlsx"

        (df[df["RTPA"] == i]
         .sort_values("ntd_id")
         # Excel cannot store timezone-aware datetimes, so "ts" is dropped
         # along with the merge indicator.
         .drop(columns = ["_merge","ts"])
         # snake_case -> Title Case
         .rename(columns=lambda x: x.replace("_"," ").title().strip())
         # fix acronyms / abbreviations the title-casing gets wrong
         .rename(columns=col_dict)
         .to_excel(
            workbook_path, sheet_name = "RTPA Ridership Data",
            index = False)
        )

        # Append the README cover sheet to the workbook that was just written.
        with pd.ExcelWriter(workbook_path, mode ="a") as writer:
            cover_sheet.to_excel(writer, sheet_name = "READ ME")

    # Zip this folder, and save zipped output to GCS
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    print("Zipped folder")

    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )

    if upload_to_public:
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )

    print("Uploaded to GCS")

    return

In [None]:
# Double check the current working directory, since the output paths
# used in this notebook are relative ("./...").
os.getcwd()

In [None]:
# The output directory must exist for `save_rtpa_outputs_v2` to work.

# Confirmed this makes a sub-directory within /ntd, e.g. ntd/2024_April.

# exist_ok=True so re-running this cell (or Restart & Run All with a leftover
# folder) does not raise FileExistsError.
os.makedirs(f"./{YEAR}_{MONTH}/", exist_ok=True)

In [None]:
# GCS filesystem handle; the save functions in this notebook read it as a global.
fs = gcsfs.GCSFileSystem()

# Works!
# The workbooks appear in the new 2024_April directory
# and the zip was also uploaded to the temporary GCS path.

# Takes a really long time since writing to Excel is a lot slower than csv,
# so the call is left commented out.
#save_rtpa_outputs_v2(full_run_test,YEAR, MONTH, upload_to_public = False)

# Test `remove_local_outputs` function

**NO CHANGES NEEDED**

In [None]:
def remove_local_outputs(year: int, month: str):
    """
    Delete the local `{year}_{month}/` folder and the `{year}_{month}.zip`
    archive from the current working directory.
    """
    output_stem = f"{year}_{month}"
    folder_path = f"{output_stem}/"
    zip_path = f"{output_stem}.zip"

    shutil.rmtree(folder_path)
    os.remove(zip_path)

In [None]:
# This ran really fast; no changes are needed.
# Deletes the local output folder and zip created by the save step above.
remove_local_outputs(YEAR, MONTH)

# Test updating `save_rtpa_outputs` with additional tabs for aggregation

It works!

## FINAL `sum_by_group`

In [None]:
# ripped from NB, will need this for `save_rtpa_outputs_v3`

# will move this to scrip and remove from nb, then add from...import sum_by_group

def sum_by_group(
    df: pd.DataFrame,
    group_cols: list) -> pd.DataFrame:
    """
    Total the ridership columns for each combination of `group_cols` and
    reporting period.

    Because the data is already long, this replaces the old sum_by_group,
    make_long and assemble_long_df functions.

    Returns one row per group/period with summed `upt`, `previous_y_m_upt`
    and `change_1yr`, plus `pct_change_1yr` recomputed from the sums and
    scaled from a decimal to a whole-number percentage.
    """
    period_cols = ['period_year', 'period_month', 'period_year_month']
    sum_cols = ["upt", "previous_y_m_upt", "change_1yr"]

    totals = (
        df.groupby(group_cols + period_cols)
        .agg({col: "sum" for col in sum_cols})
        .reset_index()
    )

    # Percentages cannot be summed, so recompute them from the totals.
    totals = get_percent_change_v2(totals)

    # decimal -> whole number
    totals["pct_change_1yr"] = totals["pct_change_1yr"] * 100

    return totals

In [None]:
def save_rtpa_outputs_v3(
    df: pd.DataFrame, 
    year: int, 
    month: str,
    upload_to_public: bool = False
):
    """
    Export one Excel workbook per RTPA into the local `./{year}_{month}/` folder,
    zip that folder, and upload the zipped file to GCS.

    Each workbook contains:
    - "RTPA Ridership Data": that RTPA's rows, sorted by `ntd_id`, with
      human-readable column names.
    - "READ ME": the cover sheet read from `./cover_sheet_template.xlsx`.
    - "Aggregated by Agency" / "Aggregated by Mode" / "Aggregated by TOS":
      `sum_by_group` totals computed from that RTPA's rows only.

    Args:
        df: long-format ridership data; must include `RTPA`, `ntd_id`,
            `agency`, `mode`, `tos`, `_merge`, `ts` and the columns
            `sum_by_group` aggregates.
        year: used with `month` to name the local folder and the zip file.
        month: see `year`.
        upload_to_public: when True, also upload the zip under `PUBLIC_GCS`.

    Relies on `fs`, `GCS_FILE_PATH`, `PUBLIC_GCS` and `sum_by_group` being
    defined in the notebook before this function is called.
    """
    col_dict = {
        'Uace Cd': "UACE Code",
        'Dt': "Date",
        'Ntd Id': "NTD ID",
        'Tos': "Type of Service",
        'Legacy Ntd Id': "Legacy NTD ID",
        'Upt': "UPT",
        'Vrm': "VRM",
        'Vrh': "VRH",
        'Voms': "VOMS",
        'Rtpa': "RTPA",
        'Previous Y M Upt': "Previous Year/Month UPT",
        'Change 1Yr': "Change in 1 Year UPT",
        'Pct Change 1Yr': "Percent Change in 1 Year UPT",
        'Tos Full': "Type of Service Full Name"
    }

    # Grouping columns for the three aggregation tabs.
    agency_cols = ["ntd_id", "agency", "RTPA"]
    mode_cols = ["mode", "RTPA"]
    tos_cols = ["tos", "RTPA"]

    output_dir = f"./{year}_{month}"
    # `to_excel` does not create missing folders; exist_ok keeps re-runs safe.
    os.makedirs(output_dir, exist_ok=True)

    # The README template is the same for every RTPA, so read it only once.
    cover_sheet = pd.read_excel(
        "./cover_sheet_template.xlsx",
        index_col = "NTD Monthly Ridership by RTPA"
    )

    for i in df["RTPA"].unique():
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()
        workbook_path = f"{output_dir}/{rtpa_snakecase}.xlsx"

        # Everything written to this workbook comes from this RTPA's rows only.
        rtpa_df = df[df["RTPA"] == i]

        (rtpa_df
         .sort_values("ntd_id")
         # Excel cannot store timezone-aware datetimes, so "ts" is dropped
         # along with the merge indicator.
         .drop(columns = ["_merge","ts"])
         # snake_case -> Title Case
         .rename(columns=lambda x: x.replace("_"," ").title().strip())
         # fix acronyms / abbreviations the title-casing gets wrong
         .rename(columns=col_dict)
         .to_excel(
            workbook_path, sheet_name = "RTPA Ridership Data",
            index = False)
        )

        # Aggregate the filtered rows, not the full df; otherwise every
        # workbook would carry the totals of every other RTPA as well.
        by_agency_long = sum_by_group(rtpa_df, agency_cols)
        by_mode_long = sum_by_group(rtpa_df, mode_cols)
        by_tos_long = sum_by_group(rtpa_df, tos_cols)

        # Append the README and aggregation tabs to the workbook written above.
        with pd.ExcelWriter(workbook_path, mode ="a") as writer:
            cover_sheet.to_excel(writer, sheet_name = "READ ME")
            by_agency_long.to_excel(writer, sheet_name = "Aggregated by Agency")
            by_mode_long.to_excel(writer, sheet_name = "Aggregated by Mode")
            by_tos_long.to_excel(writer, sheet_name = "Aggregated by TOS")

    # Zip this folder, and save zipped output to GCS
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    print("Zipped folder")

    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )

    if upload_to_public:
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )

    print("Uploaded to GCS")

    return

## FINAL `save_rtpa_outputs` with aggregation tabs

In [None]:
def save_rtpa_outputs(
    df: pd.DataFrame, 
    year: int, 
    month: str,
    upload_to_public: bool = False
):
    """
    Export one Excel workbook per RTPA into the local `./{year}_{month}/` folder,
    zip that folder, and upload the zipped file to GCS.

    Each workbook contains:
    - "RTPA Ridership Data": that RTPA's rows, sorted by `ntd_id`, with
      human-readable column names.
    - "READ ME": the cover sheet read from `./cover_sheet_template.xlsx`.
    - "Aggregated by Agency" / "Aggregated by Mode" / "Aggregated by TOS":
      `sum_by_group` totals computed from that RTPA's rows only.

    Args:
        df: long-format ridership data; must include `RTPA`, `ntd_id`,
            `agency`, `mode`, `tos`, `_merge`, `ts` and the columns
            `sum_by_group` aggregates.
        year: used with `month` to name the local folder and the zip file.
        month: see `year`.
        upload_to_public: when True, also upload the zip under `PUBLIC_GCS`.

    Relies on `fs`, `GCS_FILE_PATH`, `PUBLIC_GCS` and `sum_by_group` being
    defined in the notebook before this function is called.
    """
    col_dict = {
        'Uace Cd': "UACE Code",
        'Dt': "Date",
        'Ntd Id': "NTD ID",
        'Tos': "Type of Service",
        'Legacy Ntd Id': "Legacy NTD ID",
        'Upt': "UPT",
        'Vrm': "VRM",
        'Vrh': "VRH",
        'Voms': "VOMS",
        'Rtpa': "RTPA",
        'Previous Y M Upt': "Previous Year/Month UPT",
        'Change 1Yr': "Change in 1 Year UPT",
        'Pct Change 1Yr': "Percent Change in 1 Year UPT",
        'Tos Full': "Type of Service Full Name"
    }

    # Grouping columns for the three aggregation tabs.
    agency_cols = ["ntd_id", "agency", "RTPA"]
    mode_cols = ["mode", "RTPA"]
    tos_cols = ["tos", "RTPA"]

    output_dir = f"./{year}_{month}"
    # `to_excel` does not create missing folders; exist_ok keeps re-runs safe.
    os.makedirs(output_dir, exist_ok=True)

    # The README template is the same for every RTPA, so read it only once.
    cover_sheet = pd.read_excel(
        "./cover_sheet_template.xlsx",
        index_col = "NTD Monthly Ridership by RTPA"
    )

    for i in df["RTPA"].unique():
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()
        workbook_path = f"{output_dir}/{rtpa_snakecase}.xlsx"

        # Everything written to this workbook comes from this RTPA's rows only.
        rtpa_df = df[df["RTPA"] == i]

        (rtpa_df
         .sort_values("ntd_id")
         # Excel cannot store timezone-aware datetimes, so "ts" is dropped
         # along with the merge indicator.
         .drop(columns = ["_merge","ts"])
         # snake_case -> Title Case
         .rename(columns=lambda x: x.replace("_"," ").title().strip())
         # fix acronyms / abbreviations the title-casing gets wrong
         .rename(columns=col_dict)
         .to_excel(
            workbook_path, sheet_name = "RTPA Ridership Data",
            index = False)
        )

        # Aggregate the filtered rows, not the full df; otherwise every
        # workbook would carry the totals of every other RTPA as well.
        by_agency_long = sum_by_group(rtpa_df, agency_cols)
        by_mode_long = sum_by_group(rtpa_df, mode_cols)
        by_tos_long = sum_by_group(rtpa_df, tos_cols)

        # Append the README and aggregation tabs to the workbook written above.
        with pd.ExcelWriter(workbook_path, mode ="a") as writer:
            cover_sheet.to_excel(writer, sheet_name = "READ ME")
            by_agency_long.to_excel(writer, sheet_name = "Aggregated by Agency")
            by_mode_long.to_excel(writer, sheet_name = "Aggregated by Mode")
            by_tos_long.to_excel(writer, sheet_name = "Aggregated by TOS")

    # Zip this folder, and save zipped output to GCS
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    print("Zipped folder")

    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )

    if upload_to_public:
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )

    print("Uploaded to GCS")

    return

In [None]:
# Full test run of the v3 exporter; writes locally and uploads only to the
# temporary GCS path (no public upload).
save_rtpa_outputs_v3(full_run_test,YEAR, MONTH, upload_to_public = False)

It works! Every Excel export now has a tab for each aggregation: by agency, by mode, and by type of service (TOS).

# ---
# Things to Move Over

These are the final versions of the changes to move over to the script.

## Functions to Move over

In [None]:
def get_percent_change(
    df: pd.DataFrame, 
) -> pd.DataFrame:
    """
    Add `pct_change_1yr`: the change in UPT relative to the prior-period value,
    as a decimal rounded to 4 places.

    Updated to work with the warehouse
    `dim_monthly_ntd_ridership_with_adjustments` long data format.

    Args:
        df: must contain the `upt` and `previous_y_m_upt` columns.

    Returns:
        The same DataFrame object, with `pct_change_1yr` added in place.

    Rows where `previous_y_m_upt` is missing produce NaN; rows where it is 0
    produce inf/-inf (or NaN when `upt` is also 0).
    """
    # Percent change is measured against the baseline (prior) value,
    # not against the current value.
    df["pct_change_1yr"] = (
        (df["upt"] - df["previous_y_m_upt"])
        .divide(df["previous_y_m_upt"])
        .round(4)
    )
    
    return df

In [None]:
def sum_by_group(
    df: pd.DataFrame,
    group_cols: list) -> pd.DataFrame:
    """
    Aggregate ridership to one row per `group_cols` combination per period.

    The data is long to begin with, so this replaces the old sum_by_group,
    make_long and assemble_long_df functions.

    `upt`, `previous_y_m_upt` and `change_1yr` are summed; `pct_change_1yr`
    is then recomputed from those sums and multiplied by 100.
    """
    keys = group_cols + ['period_year', 'period_month', 'period_year_month']

    sums = df.groupby(keys).agg(
        {
            "upt": "sum",
            "previous_y_m_upt": "sum",
            "change_1yr": "sum",
        }
    )
    grouped_df = sums.reset_index()

    # Recompute the percent change from the aggregated values.
    grouped_df = get_percent_change(grouped_df)

    # Convert the decimal to a whole-number percentage.
    pct_as_decimal = grouped_df["pct_change_1yr"]
    grouped_df["pct_change_1yr"] = pct_as_decimal * 100

    return grouped_df

In [None]:
def add_change_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add year-over-year change columns to the long-format ridership data.

    Works with the warehouse `dim_monthly_ntd_ridership_with_adjustments`
    long data format. Adds three columns:
    1. `previous_y_m_upt`: UPT of the preceding row within each
       (ntd_id, mode, tos) group
    2. `change_1yr`: `upt` minus `previous_y_m_upt`
    3. `pct_change_1yr`: added by `get_percent_change`

    Note: `period_year` and `period_month` are cast to int on the input
    frame itself before the new frame is built.
    """

    # Sorting by month before year places the same calendar month of
    # consecutive years next to each other inside each group, so a shift of 1
    # picks up the prior year's value (assuming no year is missing for a group).
    sort_cols2 =  ["ntd_id","mode", "tos","period_month", "period_year"]
    group_cols2 = ["ntd_id","mode", "tos"]
    
    df[["period_year","period_month"]] = df[["period_year","period_month"]].astype(int)

    # `groupby().shift()` keeps the original row index, so `assign` can align
    # the result back onto df regardless of the sort order. The earlier
    # `.apply(lambda x: x.shift(1))` form can prepend the group keys to the
    # index on newer pandas versions, which breaks this alignment.
    df = df.assign(
        previous_y_m_upt = (df.sort_values(sort_cols2)
                        .groupby(group_cols2)["upt"] 
                        .shift(1)
                       )
    )

    df["change_1yr"] = (df["upt"] - df["previous_y_m_upt"])
    
    df = get_percent_change(df)
    
    return df

In [None]:
def produce_ntd_monthly_ridership_by_rtpa(
    year: int,
    month: str
) -> pd.DataFrame:
    """
    Build the California monthly ridership table with RTPA assignments.

    Works with the warehouse `dim_monthly_ntd_ridership_with_adjustments`
    long data format. Steps:
    1. Pull the full table from the warehouse and keep rows with an agency.
    2. Save that full table to `GCS_FILE_PATH` as
       `ntd_monthly_ridership_{year}_{month}.parquet`.
    3. Keep rows whose `uza_name` contains ", CA".
    4. Left-merge the NTD ID -> RTPA crosswalk on `ntd_id`.
    5. Add the change columns and the full mode / type-of-service names.

    Args:
        year: used with `month` only to name the parquet file.
        month: see `year`.

    Raises:
        ValueError: if any California row has no match in the crosswalk.
    """
    full_upt = (
        tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
    ).rename(columns = {"mode_type_of_service_status": "Status"})
    
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)
    
    full_upt.to_parquet(
        f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    )
    
    # Rows without an agency were already removed above. na=False treats a
    # missing uza_name as "not California" instead of leaving NaN in the mask.
    ca = full_upt[
        full_upt["uza_name"].str.contains(", CA", na=False)
    ].reset_index(drop=True)
    
    crosswalk = pd.read_csv(
        "gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    # have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
    df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
    print(df._merge.value_counts())
    
    # Every California row must be assigned an RTPA.
    if len(df[df._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    df = add_change_columns(df)
    
    df = df.assign(
        Mode_full = df["mode"].map(NTD_MODES),
        TOS_full = df["tos"].map(NTD_TOS)
    )
    
    return df

In [None]:
def save_rtpa_outputs(
    df: pd.DataFrame, 
    year: int, 
    month: str,
    upload_to_public: bool = False
):
    """
    Export one Excel workbook per RTPA into the local `./{year}_{month}/` folder,
    zip that folder, and upload the zipped file to GCS.

    Each workbook contains:
    - "RTPA Ridership Data": that RTPA's rows, sorted by `ntd_id`, with
      human-readable column names.
    - "READ ME": the cover sheet read from `./cover_sheet_template.xlsx`.
    - "Aggregated by Agency" / "Aggregated by Mode" / "Aggregated by TOS":
      `sum_by_group` totals computed from that RTPA's rows only.

    Args:
        df: long-format ridership data; must include `RTPA`, `ntd_id`,
            `agency`, `mode`, `tos`, `_merge`, `ts` and the columns
            `sum_by_group` aggregates.
        year: used with `month` to name the local folder and the zip file.
        month: see `year`.
        upload_to_public: when True, also upload the zip under `PUBLIC_GCS`.

    Relies on `fs`, `GCS_FILE_PATH`, `PUBLIC_GCS` and `sum_by_group` being
    defined before this function is called.
    """
    col_dict = {
        'Uace Cd': "UACE Code",
        'Dt': "Date",
        'Ntd Id': "NTD ID",
        'Tos': "Type of Service",
        'Legacy Ntd Id': "Legacy NTD ID",
        'Upt': "UPT",
        'Vrm': "VRM",
        'Vrh': "VRH",
        'Voms': "VOMS",
        'Rtpa': "RTPA",
        'Previous Y M Upt': "Previous Year/Month UPT",
        'Change 1Yr': "Change in 1 Year UPT",
        'Pct Change 1Yr': "Percent Change in 1 Year UPT",
        'Tos Full': "Type of Service Full Name"
    }

    # Grouping columns for the three aggregation tabs.
    agency_cols = ["ntd_id", "agency", "RTPA"]
    mode_cols = ["mode", "RTPA"]
    tos_cols = ["tos", "RTPA"]

    output_dir = f"./{year}_{month}"
    # `to_excel` does not create missing folders; exist_ok keeps re-runs safe.
    os.makedirs(output_dir, exist_ok=True)

    # The README template is the same for every RTPA, so read it only once.
    cover_sheet = pd.read_excel(
        "./cover_sheet_template.xlsx",
        index_col = "NTD Monthly Ridership by RTPA"
    )

    for i in df["RTPA"].unique():
        # Filename should be snakecase
        rtpa_snakecase = i.replace(' ', '_').lower()
        workbook_path = f"{output_dir}/{rtpa_snakecase}.xlsx"

        # Everything written to this workbook comes from this RTPA's rows only.
        rtpa_df = df[df["RTPA"] == i]

        (rtpa_df
         .sort_values("ntd_id")
         # Excel cannot store timezone-aware datetimes, so "ts" is dropped
         # along with the merge indicator.
         .drop(columns = ["_merge","ts"])
         # snake_case -> Title Case
         .rename(columns=lambda x: x.replace("_"," ").title().strip())
         # fix acronyms / abbreviations the title-casing gets wrong
         .rename(columns=col_dict)
         .to_excel(
            workbook_path, sheet_name = "RTPA Ridership Data",
            index = False)
        )

        # Aggregate the filtered rows, not the full df; otherwise every
        # workbook would carry the totals of every other RTPA as well.
        by_agency_long = sum_by_group(rtpa_df, agency_cols)
        by_mode_long = sum_by_group(rtpa_df, mode_cols)
        by_tos_long = sum_by_group(rtpa_df, tos_cols)

        # Append the README and aggregation tabs to the workbook written above.
        with pd.ExcelWriter(workbook_path, mode ="a") as writer:
            cover_sheet.to_excel(writer, sheet_name = "READ ME")
            by_agency_long.to_excel(writer, sheet_name = "Aggregated by Agency")
            by_mode_long.to_excel(writer, sheet_name = "Aggregated by Mode")
            by_tos_long.to_excel(writer, sheet_name = "Aggregated by TOS")

    # Zip this folder, and save zipped output to GCS
    shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
    print("Zipped folder")

    fs.upload(
        f"./{year}_{month}.zip", 
        f"{GCS_FILE_PATH}{year}_{month}.zip"
    )

    if upload_to_public:
        fs.upload(
            f"./{year}_{month}.zip",
            f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
        )

    print("Uploaded to GCS")

    return

## Misc. Script changes 

## Notebook Functions and changes to move over

In [None]:
# Earliest reporting year to keep (inclusive).
MIN_YEAR = 2018

# Updated to filter for period_year >= MIN_YEAR while reading.
# NOTE(review): `rtpa` is not defined in this cell; it must be set before
# this runs — confirm where it comes from.
df = pd.read_parquet(
    f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet",
    filters = [[("RTPA", "==", rtpa), ("period_year", ">=", MIN_YEAR)]]
).drop(
    # Updated column names to match the new df: drop the coded columns...
    columns = ["mode", "tos"]
# ...and let the full-name columns take over the short names.
).rename(columns = {"Mode_full": "Mode", "TOS_full": "TOS"})

In [None]:
# Grouping columns for each aggregation. "Mode" and "TOS" are the renamed
# full-name columns from the cell above.
agency_cols = ["ntd_id", "agency", "RTPA"]
mode_cols = ["Mode", "RTPA"]
tos_cols = ["TOS", "RTPA"]

by_agency_long = sum_by_group(df, agency_cols)
by_mode_long = sum_by_group(df, mode_cols)
by_tos_long = sum_by_group(df, tos_cols)

# Removing zero-UPT rows still works with the new data.
# `remove_zero_upt_rows` is not defined in this cell; it must be defined or imported beforehand.
by_agency_long_no_zero_upt = remove_zero_upt_rows(by_agency_long)

In [None]:
def make_line_chart(
    df: pd.DataFrame, 
    y_col: str,
    color_col: str,
) -> alt.Chart:
    """
    Faceted line chart of `y_col` over `period_year_month`, one panel per
    `color_col` value (2 panels per row, independent y scales).

    Only rows where `y_col` is greater than 0 are plotted. X-axis ticks are
    limited to periods containing "-01" or "-06".

    Uses `cp`, `labeling`, `WIDTH` and `HEIGHT` from the surrounding notebook.
    """
    plot_df = df[df[y_col] > 0].dropna(subset = y_col)

    # Only label a subset of periods so the axis stays readable.
    tick_markers = ["-01", "-06"]
    x_label = [
        period for period in plot_df.period_year_month.unique()
        if any(marker in period for marker in tick_markers)
    ]

    x_encoding = alt.X(
        "period_year_month:O",
        axis=alt.Axis(values = x_label),
        title = "Date"
    )
    y_encoding = alt.Y(y_col, title = labeling(y_col))
    color_encoding = alt.Color(
        color_col,
        title = "",
        scale = alt.Scale(
            range = cp.CALITP_CATEGORY_BRIGHT_COLORS +
            cp.CALITP_CATEGORY_BOLD_COLORS
        )
    )

    single_panel = (
        alt.Chart(plot_df)
        .mark_line()
        .encode(
            x = x_encoding,
            y = y_encoding,
            color = color_encoding,
            tooltip = ["period_year_month", y_col, color_col, "RTPA"]
        )
        .properties(width = WIDTH, height = HEIGHT)
    )

    faceted = (
        single_panel
        .facet(color_col, columns=2, title = "")
        .resolve_scale(y="independent")
    )

    chart = faceted.properties(
        title = f"{labeling(y_col)} by {labeling(color_col)}"
    ).interactive()

    return chart

In [None]:
def make_bar_chart(
    df: pd.DataFrame, 
    y_col: str,
    color_col: str,
) -> alt.Chart:
    """
    Faceted bar chart of `y_col` over `period_year_month`, one panel per
    `color_col` value (2 panels per row, shared x scale, independent y scales).

    `y_col` must be "change_1yr" or "pct_change_1yr"; any other value raises
    KeyError when the y-axis title is looked up. X-axis ticks are limited to
    periods containing "-01", "-03", "-06" or "-09".

    Uses `cp`, `labeling`, `WIDTH` and `HEIGHT` from the surrounding notebook.
    """
    # Shortened y-axis titles for the change columns.
    shorten_dict = {
        "change_1yr": "Change",
        "pct_change_1yr":"Change",
    }

    # For the change column, we are missing everything prior to 2023.
    # Rows with a missing y_col are intentionally NOT dropped here.

    # Open questions:
    # - need a flag for y_col >, <, 0, missing?
    # - count how many agencies fall in those categories, then look at those agencies
    # - present as a table

    tick_markers = ["-01", "-03", "-06", "-09"]
    x_label = [
        period for period in df.period_year_month.unique()
        if any(marker in period for marker in tick_markers)
    ]

    single_panel = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x = alt.X(
                "period_year_month:O",
                axis=alt.Axis(values = x_label),
                title = "Date"
            ),
            y = alt.Y(y_col, title = shorten_dict[y_col]),
            color = alt.Color(
                color_col,
                title = "",
                scale = alt.Scale(
                    range = cp.CALITP_CATEGORY_BRIGHT_COLORS +
                    cp.CALITP_CATEGORY_BOLD_COLORS
                )
            ),
            tooltip = ["period_year_month", y_col, color_col, "RTPA"]
        )
        .properties(width = WIDTH, height = HEIGHT)
    )

    faceted = (
        single_panel
        .facet(color_col, columns=2, title = "")
        .resolve_scale(x="shared", y="independent")
    )

    chart = faceted.properties(
        title = f"{labeling(y_col)} by {labeling(color_col)}"
    ).interactive()

    return chart