# Test reading in `dim_monthly_ntd_ridership_with_adjustments` from warehouse

In [1]:
import pandas as pd
from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query


In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
monthly_ridership = (
    tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
)

In [4]:
# what is in this table?
# all states
display(
    monthly_ridership.shape,
    # monthly_ridership.info,
    monthly_ridership.head(3),
)

(613184, 22)

Unnamed: 0,uza_name,uace_cd,_dt,ts,ntd_id,year,reporter_type,agency,mode_type_of_service_status,mode,mode_full_name,service_type,_3_mode,tos,legacy_ntd_id,period_year_month,period_year,period_month,upt,vrm,vrh,voms
0,"Miami--Fort Lauderdale, FL",56602,2024-07-01,2024-07-01 21:44:31.635354+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Automated Guideway,Fixed Route,Rail,DO,4034,2019-03,2019,3,0,0,0,0
1,"Miami--Fort Lauderdale, FL",56602,2024-07-01,2024-07-01 21:44:31.635354+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Automated Guideway,Fixed Route,Rail,DO,4034,2003-08,2003,8,617327,79614,7231,18
2,"Miami--Fort Lauderdale, FL",56602,2024-07-01,2024-07-01 21:44:31.635354+00:00,40034,2024,Full Reporter,County of Miami-Dade,Inactive,AG,Automated Guideway,Fixed Route,Rail,DO,4034,2015-12,2015,12,0,0,0,0


In [5]:
print(monthly_ridership.dtypes)

uza_name                                    object
uace_cd                                     object
_dt                                         object
ts                             datetime64[ns, UTC]
ntd_id                                      object
year                                         int64
reporter_type                               object
agency                                      object
mode_type_of_service_status                 object
mode                                        object
mode_full_name                              object
service_type                                object
_3_mode                                     object
tos                                         object
legacy_ntd_id                               object
period_year_month                           object
period_year                                 object
period_month                                object
upt                                          int64
vrm                            

In [6]:
# are there NaN values in uza name?
monthly_ridership["uza_name"].isna().sum()

1608

In [7]:
# fill the NaN values with something
monthly_ridership["uza_name"].fillna("none", inplace=True)

In [8]:
# double check for na
monthly_ridership["uza_name"].isna().sum()

0

In [9]:
# can i filter by uza CA? YES
uza_ca = monthly_ridership[monthly_ridership["uza_name"].str.contains(", CA")]
display(
    uza_ca.shape,
    uza_ca["uza_name"].value_counts()
)

(91924, 22)

Los Angeles--Long Beach--Anaheim, CA             19296
San Francisco--Oakland, CA                       10720
Sacramento, CA                                    8308
San Diego, CA                                     6164
Riverside--San Bernardino, CA                     3752
Stockton, CA                                      2948
Vallejo, CA                                       2144
Santa Rosa, CA                                    1876
Lake Tahoe, CA-NV                                 1876
Modesto, CA                                       1876
Seaside--Monterey--Pacific Grove, CA              1876
San Luis Obispo, CA                               1608
Palmdale--Lancaster, CA                           1608
Oxnard--San Buenaventura (Ventura), CA            1340
San Jose, CA                                      1340
Santa Cruz, CA                                    1340
Visalia, CA                                       1340
Hanford, CA                                       1340
Indio--Pal

In [10]:
# where are the UTP values? the UTP page for this table used to be very wide
# look at a specific agency by ntd id, LA Metro "9154"
# what is the dtype of ntd_id... its object, so use str.contrains
la_metro = monthly_ridership[monthly_ridership["ntd_id"] == "90154"]
display(la_metro.shape, la_metro.columns)

(1876, 22)

Index(['uza_name', 'uace_cd', '_dt', 'ts', 'ntd_id', 'year', 'reporter_type',
       'agency', 'mode_type_of_service_status', 'mode', 'mode_full_name',
       'service_type', '_3_mode', 'tos', 'legacy_ntd_id', 'period_year_month',
       'period_year', 'period_month', 'upt', 'vrm', 'vrh', 'voms'],
      dtype='object')

In [11]:
# filter la_metro data to only see current and previous year (2024 and 2023)

la_2324 = la_metro[la_metro["period_year"].isin(["2023", "2024"])]

In [12]:
# does this 23/24 data match whats in the NTD report? YES
# confirm that data is now long instead of wide
la_2324[la_2324["period_year_month"] == "2024-03"][
    ["period_year_month", "mode", "tos", "upt"]
].sort_values(by="period_year_month", ascending=True)

Unnamed: 0,period_year_month,mode,tos,upt
286606,2024-03,DR,DO,62986
321629,2024-03,HR,DO,1874721
331792,2024-03,LR,DO,3842909
533543,2024-03,MB,DO,18852906
533571,2024-03,MB,PT,929537
556725,2024-03,RB,DO,380625
607325,2024-03,VP,PT,125149


In [13]:
# how to modify the `produce_ntd_monthly_ridership_by_rtpa` function

full_upt = (tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()).rename(columns = {"mode_type_of_service_status": "Status"})


In [14]:
display(
    full_upt.shape,
    full_upt.columns
)

(613184, 22)

Index(['uza_name', 'uace_cd', '_dt', 'ts', 'ntd_id', 'year', 'reporter_type',
       'agency', 'Status', 'mode', 'mode_full_name', 'service_type', '_3_mode',
       'tos', 'legacy_ntd_id', 'period_year_month', 'period_year',
       'period_month', 'upt', 'vrm', 'vrh', 'voms'],
      dtype='object')

In [15]:
full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)

In [16]:
full_upt.shape

(613184, 22)

In [17]:
ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)

In [18]:
ca.shape

(91924, 22)

In [19]:
crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
crosswalk.columns

Index(['ntd_id', 'Legacy NTD ID', 'Agency', 'UZA Name', 'RTPA_open_data',
       'RTPA'],
      dtype='object')

In [20]:
df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
print(df._merge.value_counts())

both          91924
left_only         0
right_only        0
Name: _merge, dtype: int64


In [21]:
# updated to match new year-month format 

def add_change_columns(
    df: pd.DataFrame,
    year: int,
    month: int
) -> pd.DataFrame:
    """
    """    
    ntd_month_col = f"{year}-{month}"
    prior_year_col = f"{int(year)-1}-{month}"
        
    df[f"change_1yr_{ntd_month_col}"] = df[ntd_month_col] - df[prior_year_col]
    df = get_percent_change(df, ntd_month_col, prior_year_col)
    
    return df

In [22]:
from shared_utils.rt_dates import MONTH_DICT
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS, YEAR, MONTH

# final updated  `produce_ntd_monthly_ridership_by_rtpa` function.
# may not need the upt_url

def produce_ntd_monthly_ridership_by_rtpa(
    #upt_url: str,
    year: int,
    month: str
) -> pd.DataFrame:
    """
    Import NTD data from url, filter to CA, 
    merge in crosswalk, and save individual csvs.
    """
    full_upt = (tbls.mart_ntd.dim_monthly_ntd_ridership_with_adjustments() >> collect()
               ).rename(columns = {"mode_type_of_service_status": "Status"})
    
    full_upt = full_upt[full_upt.agency.notna()].reset_index(drop=True)
    #full_upt.to_parquet(
    #    f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
    #)
    ca = full_upt[(full_upt["uza_name"].str.contains(", CA")) & 
            (full_upt.agency.notna())].reset_index(drop=True)
    
    crosswalk = pd.read_csv(
        f"gs://calitp-analytics-data/data-analyses/ntd/ntd_id_rtpa_crosswalk.csv", 
        dtype = {"NTD ID": "str"}
    #have to rename NTD ID col to match the dim table
    ).rename(columns={"NTD ID": "ntd_id"})
    
    df = pd.merge(
        ca,
        # Merging on too many columns can create problems 
        # because csvs and dtypes aren't stable / consistent 
        # for NTD ID, Legacy NTD ID, and UZA
        crosswalk[["ntd_id", "RTPA"]],
        #change on = value to ntd_id
        on = "ntd_id",
        how = "left",
        indicator = True
    )
    
    print(df._merge.value_counts())
    
    # Good, everything merged, as we want
    if len(df[df._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
        
    # Add new columns    
   # reversed_months = {v:k for k, v in MONTH_DICT.items()}
    
    for m in range(1, reversed_months[month] + 1):
        df = add_change_columns(df, year, m)
    
    df = df.assign(
        Mode_full = df.Mode.map(NTD_MODES),
        TOS_full = df.TOS.map(NTD_TOS)
    )
    
    return df


In [23]:
ca["period_year_month"].value_counts().head()

2002-08    343
2007-06    343
2002-01    343
2005-02    343
2004-03    343
Name: period_year_month, dtype: int64

In [30]:
# what is the most recent period_year_month? as of 7/2/2024\
# should expect to see values up tp April 2024
display(
    ca[ca["period_year_month"] == "2024-04"].shape,
    # as of 7/2, zero rows for May 2024. but expect ntd to update that data soon.
    ca[ca["period_year_month"] == "2024-05"].shape
)

(343, 22)

(0, 22)

---
## Do I need `reserved_months` anymore?
RE: produce_ntd_monthly_ridership_by_rtpa function.


Since the new table uses a `yyyy-mm` format now. i dont think i need the revesre months anymore. what else needs to change to the the function working again.

In [24]:
# do i need reversed_months anymore?

reversed_months = {v:k for k, v in MONTH_DICT.items()}

In [25]:
reversed_months

{'January': 1,
 'February': 2,
 'March': 3,
 'April': 4,
 'May': 5,
 'June': 6,
 'July': 7,
 'August': 8,
 'September': 9,
 'October': 10,
 'November': 11,
 'December': 12}

In [26]:
produce_ntd_monthly_ridership_by_rtpa(year=2024, month="03")

both          91924
left_only         0
right_only        0
Name: _merge, dtype: int64


KeyError: '03'