In [1]:
import pandas as pd
from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query
from shared_utils.rt_dates import MONTH_DICT
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS, YEAR, MONTH

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### what is the grain of this dataset?
Each row holds the metric values (upt/vrm/vrh/crm/voms) for a specific **tos** of a specific **mode**, for a specific **agency**, in a specific **year/month**.

# Test reading in `dim_monthly_ntd_ridership_with_adjustments` from warehouse

In [None]:
monthly_ridership = (
    tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
)

In [None]:
# reading in old monthly ridership parquet for comparison
old_march_2024_ridership = pd.read_parquet("gs://calitp-analytics-data/data-analyses/ntd/ntd_monthly_ridership_2024_March.parquet")

In [None]:
# what is in this table?
# all states
display(
    monthly_ridership.shape,
    # monthly_ridership.info,
    monthly_ridership.head(3),
)

In [None]:
print(monthly_ridership.dtypes)

In [None]:
# are there NaN values in uza name? YES
monthly_ridership["uza_name"].isna().sum()

In [None]:
# Fill missing UZA names with a placeholder so the `.str.contains` filter
# further down yields a pure boolean mask. Assign the result back rather than
# calling `fillna(..., inplace=True)` on the selected column: the in-place form
# acts on an intermediate object and is not guaranteed to update
# `monthly_ridership` (pandas warns about it, and it is a no-op under
# Copy-on-Write).
monthly_ridership["uza_name"] = monthly_ridership["uza_name"].fillna("none")

In [None]:
# double check for na
monthly_ridership["uza_name"].isna().sum()

In [None]:
# can i now filter by uza CA, becaue there are zero NaN values? YES
uza_ca = monthly_ridership[monthly_ridership["uza_name"].str.contains(", CA")]
display(
    uza_ca.shape,
    uza_ca["uza_name"].value_counts()
)

In [None]:
# Where are the UPT values? The UPT page for this table used to be very wide.
# Look at a single agency by NTD ID; this cell uses "90154" for LA Metro.
# `ntd_id` is object (string) dtype, so it is compared to a string literal.
la_metro = monthly_ridership[monthly_ridership["ntd_id"] == "90154"]
# Compare the size of the single-agency subset with the CA-wide subset.
display(la_metro.shape, uza_ca.shape)

In [None]:
# filter la_metro data to only see current and previous year (2024 and 2023)

la_2324 = la_metro[la_metro["period_year"].isin(["2023", "2024"])]

In [None]:
# does this 23/24 data match whats in the NTD report? YES
# confirm that data is now long instead of wide
la_2324[la_2324["period_year_month"] == "2024-03"][
    ["period_year_month", "mode", "tos", "upt"]
].sort_values(by="period_year_month", ascending=True)

# What to modify in the `produce_ntd_monthly_ridership_by_rtpa` function
step-by-step

In [3]:
# 1. reading in the data & renaming column
# intead of reading in the data via an excel doc by URL, intead will query the warehouse directly

#GOOD TO GO
full_upt = (tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()).rename(columns = {"mode_type_of_service_status": "Status"})


In [None]:
#1.1 setting `ntd_id` to str

# NTD_ID COL IS ALREADY OBJECT/STR DATA TYPE
# full_upt["ntd_id"] = full_upt["ntd_id"].astype(str)

In [4]:
#2 filter df to get rid of na rows
# GOOD TO GO!
full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

In [None]:
#3 new DF of just CA and not-blank agency names
# filter UZA name col for CA and not blank agency names.

#GOOD TO GO!
ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)

In [None]:
display(
    ca.shape,
    ca.head()
)

In [None]:
#4 read in ntd_id-to-RTPA crosswalk
# same as before, no changes

#GOOD TO GO!
crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
crosswalk.columns

In [None]:
#5 merge CA df and crosswalk

# GOOD TO GO
df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
print(df._merge.value_counts())

In [None]:
#6 flag to see if there are any unmerged rows 

# GOOD TO GO
if len(df[df._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")

In [None]:
#7 invert MONTH_DICT (values become keys), then call `add_change_columns`
# once for every m from 1 through `reversed_months[month]`.
# NOTE(review): `month`, `year` and `add_change_columns` are not defined or
# imported in any cell above this one (the import cell brings in upper-case
# `MONTH` / `YEAR`), so this cell presumably fails on a fresh kernel -- confirm.

# Add new columns
reversed_months = {v:k for k, v in MONTH_DICT.items()}

for m in range(1, reversed_months[month] + 1):
    df = add_change_columns(df, year, m)

In [None]:
# FINAL FORM


# final updated  `produce_ntd_monthly_ridership_by_rtpa` function.
# may not need the upt_url

def produce_ntd_monthly_ridership_by_rtpa(
    #upt_url: str,
    year: int,
    month: str
) -> pd.DataFrame:
    """
    Query monthly NTD ridership from the warehouse, filter to CA,
    merge in the RTPA crosswalk, and add change / label columns.

    The data comes from `mart_ntd.dim_monthly_ntd_ridership_with_adjustments`
    (long format: one row per agency / mode / tos / year-month), not from
    the NTD Excel URL that the earlier version downloaded.

    Parameters
    ----------
    year, month
        Reporting period. Kept so existing callers keep working; the
        long-format change calculation does not need them. They only
        appear in the (currently disabled) parquet export below.

    Returns
    -------
    pd.DataFrame
        CA rows with `RTPA`, the merge indicator `_merge`, the columns
        added by `add_change_columns_v2`, and `Mode_full` / `TOS_full`.

    Raises
    ------
    ValueError
        If any CA row has no match in the crosswalk.
    """
    full_upt = (
        tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
    ).rename(columns={"mode_type_of_service_status": "Status"})

    # Drop rows without an agency name once, up front.
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)
    #full_upt.to_parquet(
    #    f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    #)

    # `na=False` treats a missing UZA name as "not CA". The original mask
    # only tolerated NaN because it was AND-ed with a second (redundant)
    # `agency.notna()` filter.
    is_ca = full_upt["uza_name"].str.contains(", CA", na=False)
    ca = full_upt[is_ca].reset_index(drop=True)

    crosswalk = pd.read_csv(
        "gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv",
        dtype={"NTD ID": "str"}
    # rename the NTD ID column to match the warehouse table
    ).rename(columns={"NTD ID": "ntd_id"})

    df = pd.merge(
        ca,
        # Merging on too many columns can create problems
        # because csvs and dtypes aren't stable / consistent
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        on="ntd_id",
        how="left",
        indicator=True
    )

    print(df._merge.value_counts())

    # Every CA row must find an RTPA; anything else means the crosswalk
    # needs to be updated.
    if len(df[df._merge == "left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")

    # The table is long, so the year-over-year change is computed in one
    # pass instead of once per month column as in the wide-format version.
    # `add_change_columns_v2` is the single-argument helper defined in the
    # "FINAL `add_change_col` function" section of this notebook; that cell
    # must have been run before this function is called.
    df = add_change_columns_v2(df)

    # Warehouse column names are lower case (`mode`, `tos`).
    df = df.assign(
        Mode_full=df["mode"].map(NTD_MODES),
        TOS_full=df["tos"].map(NTD_TOS)
    )

    return df


---
## Do I need `reversed_months` anymore?
RE: `produce_ntd_monthly_ridership_by_rtpa` function.


Since the new table uses a `yyyy-mm` format now, I don't think I need `reversed_months` anymore. What else needs to change?
`reversed_months` was needed for the `add_change_columns` function.



In [None]:
# What does reversed_months do?
# It swaps the keys and values of MONTH_DICT.
reversed_months = {v: k for k, v in MONTH_DICT.items()}

# What MONTH_DICT looks like initially.
# (The trailing commas after each call were removed: they only wrapped the
# call results in throwaway tuples.)
print("this is MONTH_DICT")
display(MONTH_DICT)

# Like the name implies, the key/value pairs are reversed.
print("this is reversed_months")
display(reversed_months)


In [None]:
# what is the date format in the new df
# YYYY-MM
df["period_year_month"].sample(3)

In [None]:
# what is the date format in the old data?
# M/YYYY
old_march_2024_ridership.iloc[:,18:22].head()

## options
* change the df date format to match the old format used with `reversed_months` (yyyy-mm to m/yyyy)
    * can use a mix of split, str replace, and reversing via slicing
* change the `add_change_columns` function to accept yyyy-mm
    * make a v2 of this function that accepts yyyy-mm, creates a new column, and applies some function that looks at each row's `period_year_month`, finds the matching `period_year_month` minus 1 year, and calculates the difference.

# update `add_change_columns` function
need to update this function so it can work with the `produce_ntd_monthly_ridership_by_rtpa` function

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
# updated to match new year-month format 

def add_change_columns_v2(
    df: pd.DataFrame,
    year: int,
    month: int
) -> pd.DataFrame:
    """
    Early draft: add 1-year change columns for one `yyyy-mm` period.

    Looks up two columns of `df` by label -- `{year}-{mm}` and
    `{year - 1}-{mm}` -- so it only works on a frame that has one column
    per period. It writes `change_1yr_{year}-{mm}` onto `df` in place and
    then passes the frame through `get_percent_change`.

    `month` may be an int (5) or a string ("5" / "05"); it is zero-padded
    so the label really is `yyyy-mm`.

    NOTE(review): `get_percent_change` is not defined in this notebook;
    presumably it comes from the original script -- confirm.
    NOTE: a later cell defines `add_change_columns_v2(df)` for the long
    format; once that cell runs it replaces this definition.
    """
    # Zero-pad so an int month such as 5 yields "2024-05", not "2024-5".
    month_label = f"{int(month):02d}"
    ntd_month_col = f"{year}-{month_label}"
    prior_year_col = f"{int(year)-1}-{month_label}"

    df[f"change_1yr_{ntd_month_col}"] = df[ntd_month_col] - df[prior_year_col]
    df = get_percent_change(df, ntd_month_col, prior_year_col)

    return df

In [None]:
# can i use a lambda function to create a new column that:
# checks every row (x), if the row has a `period_year_month` value, does it have a `period_year_month` -1 year value in the same column? 
    # if true, then subtract `period_year_month` by `period_year_month`-1 year.
    # else return 0
year = 2023
month = "05"

current_y_m = f"{year}-{month}"
prior_y_m = f"{int(year)-1}-{month}"

# using current_y_m works and prior_year_col works
df[df["period_year_month"].str.contains(prior_y_m)].head(3)



In [None]:
# first attempt at what im trying to do

# create a new column called "change 1yr"
# at each row in "period_year_month": take the upt value of that row, then
# subtract the upt value of the same period_year_month one year earlier

# this doesnt work because im not indicating how x is used:
#df["change_1yr"] = df["period_year_month"].apply(lambda x: (df["period_year_month"].str.contains(current_y_m)["upt"]) - (df["period_year_month"].str.contains(prior_year_col)["upt"]))

# pseudo code -- kept as a comment because it is not valid Python; left
# uncommented it raises SyntaxError and stops "Run All" at this cell.
# df["change_1yr"] = df["period_year_month"].apply(lambda x: (upt at x for current_y_m, agency, tos and mode)  -
#                                                  (upt at x for prior_y_m, current_y_m, agency, tos and mode))



In [None]:
# how can i see the specific UPT for a specific agency, mode & TOS and date?

#uses .loc at current_y_m UPT, position 0, minus .loc at previous_year_col UPT, position o
# but is is the same agency/mode/tos in both sides of the equation?
df.loc[df["period_year_month"] == current_y_m,"upt"].values[0] - df.loc[df["period_year_month"] == prior_y_m,"upt"].values[0]

In [None]:
df.loc[
        (df["period_year_month"] == current_y_m) &
        (df["ntd_id"] == df["ntd_id"]) &
        (df["mode"] == df["mode"]) &
        (df["tos"] == df["tos"]), "upt"
    ].head()

In [None]:
df.sample(3)

In [None]:
df.loc[df["period_year_month"] == prior_y_m, "upt"][:10]


In [None]:
def test_difference_func(row, current_y_m, prior_y_m ):
    current_ym_val =  df.loc[
        (df["period_year_month"] == current_y_m) &
        (df["ntd_id"] == row["ntd_id"]) &
        (df["mode"] == row["mode"]) &
        (df["tos"] == row["tos"]), "upt"
    ]
    
    prev_ym_val =  df.loc[
        (df["period_year_month"] == prior_y_m) &
        (df["ntd_id"] == row["ntd_id"]) &
        (df["mode"] == row["mode"]) &
        (df["tos"] == row["tos"]), "upt"
    ]
    
    return (current_ym_val - prev_ym_val)

In [None]:
test_df = df[:50].copy()

In [None]:
test_df.columns

In [None]:
year = 2023
month = "05"

current_y_m = f"{year}-{month}"
prior_y_m = f"{int(year)-1}-{month}"

display(
    current_y_m,
    prior_y_m
)

In [None]:
test_df["test_col"] = test_df.apply(lambda row: test_difference_func(row, current_y_m="2023-05", prior_y_m="2022-05"), axis=1)

So I'm having trouble coming up with a `change 1yr` column now that the dataset is long (instead of the previous wide format).

What I was going for was something like:
- adding a new column to the dataset called `change 1yr` that takes the `UPT` value of the `ntd_id, mode, tos` for the current `period_year_month` row (ex. 2024-05) and subtracts the `UPT` value from the previous year's `period_year_month` row (ex. 2023-05) of the same `ntd_id, mode, tos`.

- I thought I had something with a `.apply(lambda x:)` method, but wasn't getting anywhere.

Any suggestions on how I should approach this?


---

In [None]:
# what is the most recent period_year_month? as of 7/2/2024\
# should expect to see values up tp May 2024
display(
    ca[ca["period_year_month"] == "2024-04"].shape,
    # as of 7/2, zero rows for May 2024. but expect ntd to update that data soon.
    # 7/15, still no values for May 2024
    ca[ca["period_year_month"] == "2024-05"].shape
)

# Tiffany method to get change 1yr working

[see link to her old GH script](https://github.com/CityOfLosAngeles/covid19-indicators/blob/master/processing_utils/utils.py#L306-L315)

The .shift() method in Pandas is used to shift the values of a DataFrame or Series by a specified number of periods along the given axis.
When you use .shift(), the data in the DataFrame or Series is moved up or down, while the index/columns stay in place. can move the entire dataframe up/down/left/right, or just a row or column in any direction.

Per Tiffany
- "i think you could perhaps make this idea work. that snippet uses .shift to get you a previous observation and a post observation. in this case, it's the previous day and the day after. but for you, `you're looking at a previous year (same month)`."

```
sort_cols = ["county", "state", "fips", "date"]
group_cols = ["county", "state", "fips"]

df = df.assign(
        previous_day = (df.sort_values(sort_cols)
                        .groupby(group_cols)["new_cases"]
                        .apply(lambda x: x.shift(1))
                       ),
        post_day = (df.sort_values(sort_cols)
                    .groupby(group_cols)["new_cases"]
                    .apply(lambda x: x.shift(-1))
                   ),
    )
```

In [None]:
# df of just riverside county transpo agency in select years
rcta = ca[(ca["ntd_id"].isin(["90218", "90006","90216"])) & (ca["period_year"].isin(["2024","2023", "2022", "2021"]))]


In [None]:
# Cast the year/month columns to int. `rcta` was produced by boolean-filtering
# `ca`, so assigning columns back onto it triggers SettingWithCopyWarning.
# `DataFrame.astype` with a column -> dtype mapping returns a new frame instead.
rcta = rcta.astype({"period_year": int, "period_month": int})

In [None]:
display(
    rcta.columns,
    rcta.shape,
    rcta.dtypes,
    rcta["agency"].value_counts()
)

In [None]:
# list of column names to sort the df by
# lets sort by mode, tos, period_year_month
sort_cols = ["mode", "tos", "period_year_month"]
group_cols = ["mode", "tos"]

current_y_m = f"{year}-{month}"
prior_y_m = f"{int(year)-1}-{month}"

In [None]:
previous_day = (rcta.sort_values(sort_cols) #sort df by oldest date
                        .groupby(group_cols)["upt"] #returns only upt values
                        .apply(lambda x: x.shift(1)) #shifts upt values down by 1 step
                       )

next_day = (rcta.sort_values(sort_cols) #sort df by oldest date
                        .groupby(group_cols)["upt"] #returns only upt values
                        .apply(lambda x: x.shift(-1)) #shifts upt values up by 1 step
                       )
display(
    type(previous_day),
    rcta.sort_values(sort_cols).groupby(group_cols)["upt"].head(),
    previous_day.head(),
    next_day.head()

)
# result is a series of the "upt" col shifted down 1 place

In [None]:
# Put together, what happens?
# `assign` returns a copy of rcta with two extra columns, previous_day and
# next_day, holding upt shifted down one row and up one row within each group.
# The result is bound to a new name: a `groupby(...)["upt"]` expression cannot
# be the target of an assignment (it raises TypeError).
rcta_shifted = rcta.assign(
    previous_day=(
        rcta.sort_values(sort_cols)      # oldest period first within each group
        .groupby(group_cols)["upt"]      # upt values only
        .shift(1)                        # previous row's upt within the group
    ),
    next_day=(
        rcta.sort_values(sort_cols)      # oldest period first within each group
        .groupby(group_cols)["upt"]      # upt values only
        .shift(-1)                       # next row's upt within the group
    ),
)

rcta_shifted.head()
The above code shifts the UPT data down/up 1 step

In [None]:
# How can I adapt the shift so it compares against the same month one year
# earlier?
#
# Sort by agency / mode / tos / month and then by year, and treat every
# (agency, mode, tos, month) combination as its own series. `period_month`
# has to be part of the group: otherwise the first year of each month picks
# up the last year of the previous month instead of NaN.
sort_cols2 = ["ntd_id", "mode", "tos", "period_month", "period_year"]
group_cols2 = ["ntd_id", "mode", "tos", "period_month"]

rcta = rcta.assign(
    # upt of the previous year for the same agency / mode / tos / month;
    # the earliest year of every series gets NaN.
    previous_y_m_upt=(
        rcta.sort_values(sort_cols2)     # oldest year first within each series
        .groupby(group_cols2)["upt"]
        .shift(1)                        # previous row's upt within the series
    ),
    # A callable is evaluated against the frame that already contains
    # `previous_y_m_upt`. Referencing rcta["previous_y_m_upt"] directly here
    # raises KeyError on a fresh kernel, because the column does not exist
    # on `rcta` yet.
    change_1yr=lambda frame: frame["upt"] - frame["previous_y_m_upt"],
)


In [None]:
display(
    #easy to see the differences when the DF is sorte the same way,
    rcta.sort_values(sort_cols2).head(),
    
    #does the math work out when the df is not sorted? YES IT IS!!
    rcta.sample(5)
)

# FINAL `add_change_col` function

In [None]:
def add_change_columns_v2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add prior-year UPT and 1-year UPT change to long-format ridership data.

    Works with the warehouse `dim_monthly_ntd_ridership_with_adjustments`
    long data format. Each (ntd_id, mode, tos, period_month) combination is
    treated as one series ordered by year, and two columns are added:

    1. `previous_y_m_upt` -- `upt` of the same series exactly one year
       earlier; NaN when that year is not present (the first year of a
       series, or a gap in the data).
    2. `change_1yr` -- `upt` minus `previous_y_m_upt`.

    Parameters
    ----------
    df
        Frame with `ntd_id`, `mode`, `tos`, `period_month`, `period_year`
        and `upt` columns. `period_year` may be int or numeric strings.
        The index must be unique, since results are aligned back on it.

    Returns
    -------
    pd.DataFrame
        A new frame in the original row order with the two extra columns;
        the input frame is not modified.
    """
    # `period_month` belongs in the group. Grouping only by agency / mode /
    # tos and shifting would hand the first year of each month the last
    # year of the previous month.
    series_cols = ["ntd_id", "mode", "tos", "period_month"]

    ordered = (
        df.assign(_period_year_num=pd.to_numeric(df["period_year"]))
        .sort_values(series_cols + ["_period_year_num"])
    )
    by_series = ordered.groupby(series_cols)

    # `GroupBy.shift` keeps the original index, so the result aligns back to
    # `df` regardless of row order.
    prior_upt = by_series["upt"].shift(1)
    prior_year = by_series["_period_year_num"].shift(1)

    # Keep the shifted value only when it really is the previous year.
    is_previous_year = prior_year == ordered["_period_year_num"] - 1
    prior_upt = prior_upt.where(is_previous_year)

    df = df.assign(previous_y_m_upt=prior_upt)
    df["change_1yr"] = df["upt"] - df["previous_y_m_upt"]

    return df

In [None]:
ca.shape

In [None]:
#test to ensure `add_change_columns_v2` works
add_change_columns_v2(ca)