# Test reading in `dim_monthly_ntd_ridership_with_adjustments` from warehouse

In [1]:
import pandas as pd
from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query


In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
monthly_ridership = (
    tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
)

In [4]:
# what is in this table?
# all states
display(
    monthly_ridership.shape,
    # monthly_ridership.info,
    monthly_ridership.head(3),
)

(610362, 21)

Unnamed: 0,uza_name,uace_cd,_dt,ts,ntd_id,year,reporter_type,agency,mode_type_of_service_status,mode,service_type,_3_mode,tos,legacy_ntd_id,period_year_month,period_year,period_month,upt,vrm,vrh,voms
0,"Miami--Fort Lauderdale, FL",56602,2024-05-06,2024-05-06 22:08:22.483961+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Fixed Route,Rail,DO,4034,2013-01,2013,1,0,0,0,0
1,"Miami--Fort Lauderdale, FL",56602,2024-05-06,2024-05-06 22:08:22.483961+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Fixed Route,Rail,DO,4034,2020-05,2020,5,0,0,0,0
2,"Miami--Fort Lauderdale, FL",56602,2024-05-06,2024-05-06 22:08:22.483961+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Fixed Route,Rail,DO,4034,2006-04,2006,4,701883,89706,8795,15


In [5]:
print(monthly_ridership.dtypes)

uza_name                                    object
uace_cd                                     object
_dt                                         object
ts                             datetime64[ns, UTC]
ntd_id                                      object
year                                         int64
reporter_type                               object
agency                                      object
mode_type_of_service_status                 object
mode                                        object
service_type                                object
_3_mode                                     object
tos                                         object
legacy_ntd_id                               object
period_year_month                           object
period_year                                 object
period_month                                object
upt                                          int64
vrm                                          int64
vrh                            

In [7]:
# are there NaN values in uza name?
monthly_ridership["uza_name"].isna().sum()

1602

In [8]:
# fill the NaN values with something
monthly_ridership["uza_name"].fillna("none", inplace=True)

In [9]:
# double check for na
monthly_ridership["uza_name"].isna().sum()

0

In [65]:
# can i filter by uza CA? YES
uza_ca = monthly_ridership[monthly_ridership["uza_name"].str.contains(", CA")]
display(
    uza_ca.shape,
    uza_ca["uza_name"].value_counts()
)

(91047, 21)

Los Angeles--Long Beach--Anaheim, CA             19224
San Francisco--Oakland, CA                       10680
Sacramento, CA                                    8277
San Diego, CA                                     6141
Riverside--San Bernardino, CA                     3738
Stockton, CA                                      2937
Vallejo, CA                                       2136
Seaside--Monterey--Pacific Grove, CA              1869
Santa Rosa, CA                                    1869
Lake Tahoe, CA-NV                                 1869
Modesto, CA                                       1869
San Luis Obispo, CA                               1602
Palmdale--Lancaster, CA                           1602
Hanford, CA                                       1335
San Jose, CA                                      1335
Santa Cruz, CA                                    1335
Visalia, CA                                       1335
Oxnard--San Buenaventura (Ventura), CA            1335
Indio--Pal

In [25]:
# where are the UTP values? the UTP page for this table used to be very wide
# look at a specific agency by ntd id, LA Metro "9154"
# what is the dtype of ntd_id... its object, so use str.contrains
la_metro = monthly_ridership[monthly_ridership["ntd_id"] == "90154"]
display(la_metro.shape, la_metro.columns)

(1869, 21)

Index(['uza_name', 'uace_cd', '_dt', 'ts', 'ntd_id', 'year', 'reporter_type',
       'agency', 'mode_type_of_service_status', 'mode', 'service_type',
       '_3_mode', 'tos', 'legacy_ntd_id', 'period_year_month', 'period_year',
       'period_month', 'upt', 'vrm', 'vrh', 'voms'],
      dtype='object')

In [36]:
# filter la_metro data to only see current and previous year (2024 and 2023)

la_2324 = la_metro[la_metro["period_year"].isin(["2023", "2024"])]

In [53]:
# does this 23/24 data match whats in the NTD report? YES
# confirm that data is now long instead of wide
la_2324[la_2324["period_year_month"] == "2024-03"][
    ["period_year_month", "mode", "tos", "upt"]
].sort_values(by="period_year_month", ascending=True)

Unnamed: 0,period_year_month,mode,tos,upt
285539,2024-03,DR,DO,62325
320379,2024-03,HR,DO,1874721
330324,2024-03,LR,DO,3842909
531362,2024-03,MB,DO,18852906
531852,2024-03,MB,PT,925436
554507,2024-03,RB,DO,380625
604674,2024-03,VP,PT,125149


In [57]:
# how to modify the `produce_ntd_monthly_ridership_by_rtpa` function

full_upt = (tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()).rename(columns = {"mode_type_of_service_status": "Status"})


In [59]:
display(
    full_upt.shape,
    full_upt.columns
)

(610362, 21)

Index(['uza_name', 'uace_cd', '_dt', 'ts', 'ntd_id', 'year', 'reporter_type',
       'agency', 'Status', 'mode', 'service_type', '_3_mode', 'tos',
       'legacy_ntd_id', 'period_year_month', 'period_year', 'period_month',
       'upt', 'vrm', 'vrh', 'voms'],
      dtype='object')

In [60]:
full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

In [61]:
full_upt.shape

(610362, 21)

In [63]:
ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)

In [64]:
ca.shape

(91047, 21)

In [73]:
crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
crosswalk.columns

Index(['ntd_id', 'Legacy NTD ID', 'Agency', 'UZA Name', 'RTPA_open_data',
       'RTPA'],
      dtype='object')

In [76]:
df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
print(df._merge.value_counts())

both          91047
left_only         0
right_only        0
Name: _merge, dtype: int64


In [92]:
# updated to match new year-month format 

def add_change_columns(
    df: pd.DataFrame,
    year: int,
    month: int
) -> pd.DataFrame:
    """
    """    
    ntd_month_col = f"{year}-{month}"
    prior_year_col = f"{int(year)-1}-{month}"
        
    df[f"change_1yr_{ntd_month_col}"] = df[ntd_month_col] - df[prior_year_col]
    df = get_percent_change(df, ntd_month_col, prior_year_col)
    
    return df

In [94]:
from shared_utils.rt_dates import MONTH_DICT
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS, YEAR, MONTH

# final updated  `produce_ntd_monthly_ridership_by_rtpa` function.
# may not need the upt_url

def produce_ntd_monthly_ridership_by_rtpa(
    #upt_url: str,
    year: int,
    month: str
) -> pd.DataFrame:
    """
    Import NTD data from url, filter to CA, 
    merge in crosswalk, and save individual csvs.
    """
    full_upt = (tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
               ).rename(columns = {"mode_type_of_service_status": "Status"})
    
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)
    #full_upt.to_parquet(
    #    f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    #)
    ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)
    
    crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
    df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
    print(df._merge.value_counts())
    
    # Good, everything merged, as we want
    if len(df[df._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
        
    # Add new columns    
   # reversed_months = {v:k for k, v in MONTH_DICT.items()}
    
    for m in range(1, reversed_months[month] + 1):
        df = add_change_columns(df, year, m)
    
    df = df.assign(
        Mode_full = df.Mode.map(NTD_MODES),
        TOS_full = df.TOS.map(NTD_TOS)
    )
    
    return df


In [91]:
ca["period_year_month"].value_counts().head()

2002-12    341
2010-10    341
2006-12    341
2023-08    341
2008-02    341
Name: period_year_month, dtype: int64

---
## Do I need `reserved_months` anymore?
RE: produce_ntd_monthly_ridership_by_rtpa function.


Since the new table uses a `yyyy-mm` format now. i dont think i need the revesre months anymore. what else needs to change to the the function working again.

In [87]:
# do i need reversed_months anymore?

reversed_months = {v:k for k, v in MONTH_DICT.items()}

In [88]:
reversed_months

{'January': 1,
 'February': 2,
 'March': 3,
 'April': 4,
 'May': 5,
 'June': 6,
 'July': 7,
 'August': 8,
 'September': 9,
 'October': 10,
 'November': 11,
 'December': 12}

In [97]:
produce_ntd_monthly_ridership_by_rtpa(year=2024, month="03")

both          91047
left_only         0
right_only        0
Name: _merge, dtype: int64


KeyError: '03'