In [1]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import gcsfs
import geopandas as gpd
import pandas as pd
import annual_ridership_module

from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, select, show_query
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

# NOTE(review): this rebinds the GCS_FILE_PATH name imported from `update_vars`
# above; confirm the two values agree, or drop one of them.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Do not truncate rows or columns when DataFrames are displayed in this notebook.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Work plan

## COMPLETE ~~Need List of all annual reporters~~
- Full, Reduced, Rural reporters all submit an annual Report to NTD
- used `dim_annual_service_agencies`

## COMPLETE ~~Need to assign rural and reduced annual reporters to RTPAs~~ 
- Full reporters are already mapped to RTPAs via the monthly report
- but reduced and rural reporters need to be mapped to RTPAs
- sjoined a map of CA census designated places to RTPA map, then merged with `dim_annual_service_agencies` to attach RTPA info

## COMPLETE ~~Export processed data to parquet~~
- to be used for aggregation

## COMPLETE ~~Make new ntd_id to RTPA crosswalk~~
- to be used for future analyses and merges

## COMPLETE ~~Import formulas from monthly report for aggregation~~
- modified `produce_annual_ntd_ridership_data_by_rtpa`

## Test using data from new warehouse table `fct_service_data_and_operating_expenses_time_series_by_mode_upt`
- https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.fct_service_data_and_operating_expenses_time_series_by_mode_upt
---

## read in `fct_service_data_and_operating_expenses_time_series_by_mode_upt`

In [None]:
# New warehouse table: UPT by mode, type of service (tos) and report year.
# It contains historical / inactive agencies; rows are kept from 2018 onward.
get_ntd_time_series = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
    >> filter(
        # NV is included to get Lake Tahoe Transportation back
        _.state.str.contains("CA") | _.state.str.contains("NV"),
        _.year >= "2018",
        # `!= None` (not `is not None`) so the comparison becomes part of the
        # siuba expression instead of being evaluated eagerly by Python
        _.city != None,
        (
            _.primary_uza_name.str.contains(", CA")
            | _.primary_uza_name.str.contains("CA-NV")
            | _.primary_uza_name.str.contains("California Non-UZA")
            # something about Paso: this UZA name appears on a Paso Robles agency
            | _.primary_uza_name.str.contains("El Paso, TX--NM")
        ),
    )
    >> select(
        "agency_name",
        "agency_status",
        "city",
        "legacy_ntd_id",
        "mode",
        "ntd_id",
        "reporter_type",
        "reporting_module",
        "service",
        "state",
        "uace_code",
        "primary_uza_name",
        "uza_population",
        "year",
        "upt",
    )
    >> collect()
)

# Sum UPT per agency / mode / service / year; columns that are not group keys
# (other than `upt`) are dropped by the aggregation.
time_series_group_cols = [
    "agency_name",
    "agency_status",
    "city",
    "state",
    "ntd_id",
    "primary_uza_name",
    "reporter_type",
    "mode",
    "service",
    "year",
]

ntd_time_series = (
    get_ntd_time_series.groupby(time_series_group_cols)
    .agg({"upt": "sum"})
    .sort_values(by="ntd_id")
    .reset_index()
)

# `.info()` prints its summary and returns None, so it is called directly
# rather than passed to display(), which would render a stray `None`.
get_ntd_time_series.info()
ntd_time_series.info()

display(
    ntd_time_series["state"].value_counts(),
    ntd_time_series["primary_uza_name"].value_counts(),
)

In [None]:
# Which UZA names do not contain ", CA"? Who has a UZA in Texas / New Mexico?!
not_ca_uza = ~ntd_time_series["primary_uza_name"].str.contains(", CA")
ntd_time_series.loc[not_ca_uza, "primary_uza_name"].value_counts()

In [None]:
# Rows whose UZA is "El Paso, TX--NM": just 1 agency, Paso Robles Transit Services
is_el_paso_uza = ntd_time_series["primary_uza_name"].str.contains("El Paso, TX--NM")
ntd_time_series.loc[is_el_paso_uza]

In [None]:
# How many distinct cities are there in ntd_time_series?
# Wondering whether all of these cities will be in the crosswalk.
ntd_time_series["city"].nunique() # 183. should be enough


## See `explore_rtpa_ntd_crosswalk` notebook to see how the new rtpa to ntd id crosswalk was made

## read in crosswalk from gcs

In [None]:
# Read the 2022-2023 annual report extract and the ntd_id/RTPA crosswalk from GCS.
annual_report_path = f"{GCS_FILE_PATH}annual_report_data_2022-2023.parquet"
crosswalk_path = f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"

test_data = pd.read_parquet(annual_report_path)
rtpa_ntd_crosswalk = pd.read_parquet(crosswalk_path)

display(test_data.head(), rtpa_ntd_crosswalk.head())

## test merge of `ntd_time_series` with `rtpa ntd crosswalk`

In [None]:
# Left-join the crosswalk onto the time series. `indicator=True` adds a `_merge`
# column recording whether each time-series row found a crosswalk match.
ntd_time_series_rtpa = ntd_time_series.merge(
    rtpa_ntd_crosswalk,
    on="ntd_id",
    how="left",
    indicator=True,
)

# `.info()` prints its summary and returns None, so it is called directly
# rather than passed to display(), which would render a stray `None`.
ntd_time_series_rtpa.info()
ntd_time_series_rtpa["_merge"].value_counts()

In [None]:
# Merge looks good to go; preview the first rows of the merged frame.
ntd_time_series_rtpa.head()

## Testing the `add_change_column` from module
- use on `ntd_time_series_rtpa` to add new column
1. previous UPT from prev year/ntd_id/mode/tos
2. change upt % from prev year/ntd_id/mode/tos


In [None]:
# Sort order applied before shifting: year comes right after ntd_id so each
# agency's rows are in chronological order.
sort_cols2 = ["ntd_id", "year", "service", "mode"]

# One group per agency / mode / type of service.
group_cols2 = ["ntd_id", "mode", "service"]

In [None]:
# Check that the sort order looks right before applying the shift.
ntd_time_series_rtpa.sort_values(sort_cols2).head()

In [None]:
# Preview the previous-year UPT within each ntd_id / mode / service group.
# GroupBy.shift keeps the sorted frame's own row index; `.apply(lambda x: x.shift(1))`
# emits the pandas `group_keys` warning and, on newer pandas, prepends the
# group keys to the index.
ntd_time_series_rtpa.sort_values(sort_cols2).groupby(group_cols2)["upt"].shift(1).head()

In [None]:
# Previous year's UPT for the same ntd_id / mode / service.
# The shift is computed on a sorted copy; GroupBy.shift returns a Series on the
# original row index, so `assign` lines the values up row-for-row.
# `.apply(lambda x: x.shift(1))` is avoided: it triggers the pandas `group_keys`
# warning and, on newer pandas, returns an index prefixed with the group keys,
# which no longer aligns with the frame being assigned to.
test = ntd_time_series_rtpa.assign(
    previous_y_upt=(
        ntd_time_series_rtpa.sort_values(sort_cols2)
        .groupby(group_cols2)["upt"]
        .shift(1)
    )
)

In [None]:
# Spot-check a single agency (ntd_id 90253), ordered by year / mode / service.
is_check_agency = test["ntd_id"] == "90253"
test.loc[is_check_agency].sort_values(by=["year", "mode", "service"])

In [None]:
# Run the module's `add_change_columns` on the merged frame.
ntd_rtpa_change_cols = annual_ridership_module.add_change_columns(ntd_time_series_rtpa)

In [None]:
# Same agency (ntd_id 90253) after the module function, ordered by year / service / mode.
is_check_agency_changed = ntd_rtpa_change_cols["ntd_id"] == "90253"
ntd_rtpa_change_cols.loc[is_check_agency_changed].sort_values(by=["year", "service", "mode"])

## Full Test `produce_annual_ntd_ridership_data_by_rtpa` function
- need to update to new NTD data table.


In [None]:
# ntd_ids whose city used to be patched by hand inside the function.
check_id = ["90253", "90227", "90259", "90286"]

in_time_series = ntd_time_series_rtpa["ntd_id"].isin(check_id)
in_crosswalk = rtpa_ntd_crosswalk["ntd_id"].isin(check_id)

display(
    ntd_time_series_rtpa.loc[in_time_series, "city_x"].unique(),
    rtpa_ntd_crosswalk.loc[in_crosswalk, "city"].unique(),
)

# cities match, no need for the dictionary update anymore in the function

In [33]:
def produce_annual_ntd_ridership_data_by_rtpa():
    """
    Build the annual NTD ridership dataset with RTPA assignments.

    Steps:
    1. Query `fct_service_data_and_operating_expenses_time_series_by_mode_upt`
       from the warehouse, keeping report years 2018 onward for CA / NV rows
       with a city and a primary UZA matching one of the patterns below.
    2. Sum `upt` per agency / mode / service / year.
    3. Left-merge the `ntd_id_rtpa_crosswalk_all_reporter_types` crosswalk on
       `ntd_id`. Crosswalk copies of columns that exist on both sides are
       renamed with an `xwalk_` prefix.
    4. Add the change columns via `annual_ridership_module.add_change_columns`.

    Returns
    -------
    pandas.DataFrame
        The aggregated warehouse rows with the crosswalk columns, the merge
        indicator column `_merge`, and whatever columns `add_change_columns`
        appends.

    Raises
    ------
    ValueError
        If any aggregated warehouse row has no match in the crosswalk.
    """
    print("ingest annual ridership data from warehouse")

    ntd_service_raw = (
        tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
        >> filter(
            # NV is included to get Lake Tahoe Transportation back
            _.state.str.contains("CA") | _.state.str.contains("NV"),
            _.year >= "2018",
            # `!= None` (not `is not None`) so the comparison becomes part of
            # the siuba expression instead of being evaluated eagerly by Python
            _.city != None,
            (
                _.primary_uza_name.str.contains(", CA")
                | _.primary_uza_name.str.contains("CA-NV")
                | _.primary_uza_name.str.contains("California Non-UZA")
                # something about Paso
                | _.primary_uza_name.str.contains("El Paso, TX--NM")
            ),
        )
        >> select(
            "agency_name",
            "agency_status",
            "city",
            "legacy_ntd_id",
            "mode",
            "ntd_id",
            "reporter_type",
            "reporting_module",
            "service",
            "state",
            "uace_code",
            "primary_uza_name",
            "uza_population",
            "year",
            "upt",
        )
        >> collect()
    )

    # Columns that are not group keys (other than `upt`) are dropped here.
    group_cols = [
        "agency_name",
        "agency_status",
        "city",
        "state",
        "ntd_id",
        "primary_uza_name",
        "reporter_type",
        "mode",
        "service",
        "year",
    ]

    ntd_service = (
        ntd_service_raw.groupby(group_cols)
        .agg({"upt": "sum"})
        .sort_values(by="ntd_id")
        .reset_index()
    )

    print("read in new `ntd_id_to_rtpa_all_reporter_types` crosswalk")

    ntd_to_rtpa_crosswalk = pd.read_parquet(
        f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"
    )

    print("merge ntd data to crosswalk")

    # Merge on ntd_id only: agency name, reporter type and city sometimes change
    # or are inconsistent between sources, which could cause fanout.
    # NOTE(review): this assumes the crosswalk has one row per ntd_id; duplicate
    # ntd_ids in the crosswalk would duplicate ridership rows -- confirm.
    xwalk_renames = {
        # warehouse side of the overlapping columns keeps the plain name
        "agency_name_x": "agency_name",
        "agency_status_x": "agency_status",
        "city_x": "city",
        "state_x": "state",
        "reporter_type_x": "reporter_type",
        # crosswalk side is prefixed so both versions stay inspectable
        "agency_name_y": "xwalk_agency_name",
        "reporter_type_y": "xwalk_reporter_type",
        "agency_status_y": "xwalk_agency_status",
        "city_y": "xwalk_city",
        "state_y": "xwalk_state",
    }

    ntd_data_by_rtpa = ntd_service.merge(
        ntd_to_rtpa_crosswalk,
        how="left",
        on="ntd_id",
        indicator=True,
    ).rename(columns=xwalk_renames)

    print(ntd_data_by_rtpa["_merge"].value_counts())

    # Every warehouse row must find an RTPA; report which ntd_ids did not.
    unmatched_ids = ntd_data_by_rtpa.loc[
        ntd_data_by_rtpa["_merge"] == "left_only", "ntd_id"
    ]
    if not unmatched_ids.empty:
        raise ValueError(
            "There are unmerged rows to crosswalk: "
            f"ntd_id {unmatched_ids.unique().tolist()}"
        )

    print("add `change_column` to data")
    ntd_data_by_rtpa = annual_ridership_module.add_change_columns(ntd_data_by_rtpa)

    return ntd_data_by_rtpa

# will add step to export data to gcs in final script

In [34]:
# Run the full pipeline: warehouse query, crosswalk merge and change columns.
annual_ridership_report_data = produce_annual_ntd_ridership_data_by_rtpa()


ingest annual ridership data from warehouse
read in new `ntd_id_to_rtpa_all_reporter_types` crosswalk
merge ntd data to crosswalk
both          4038
left_only        0
right_only       0
Name: _merge, dtype: int64
add `change_column` to data


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1))


In [23]:
# Compare the function output with the manually merged frame; details are only
# printed when the two shapes differ.
# Requires `ntd_time_series_rtpa` from the test-merge cell earlier in the notebook.
if annual_ridership_report_data.shape != ntd_time_series_rtpa.shape:
    print(f"""
{annual_ridership_report_data.shape} vs {ntd_time_series_rtpa.shape}.
{annual_ridership_report_data.columns}
{ntd_time_series_rtpa.columns}
""")

# gtg, ntd_time_series_rtpa doesnt apply the change 1 year calculations, but the lengths are the same

NameError: name 'ntd_time_series_rtpa' is not defined

In [35]:
# Column names of the final report frame.
annual_ridership_report_data.columns

Index(['agency_name', 'agency_status', 'city', 'state', 'ntd_id',
       'primary_uza_name', 'reporter_type', 'mode', 'service', 'year', 'upt',
       'xwalk_agency_name', 'xwalk_reporter_type', 'xwalk_agency_status',
       'xwalk_city', 'xwalk_state', 'RTPA', '_merge', 'previous_y_upt',
       'change_1yr', 'pct_change_1yr'],
      dtype='object')

In [36]:
# Report years present in the final frame.
annual_ridership_report_data["year"].unique()

array(['2018', '2019', '2020', '2021', '2022', '2023'], dtype=object)

In [37]:
# Preview rows for agencies whose name contains "Sac".
name_has_sac = annual_ridership_report_data["agency_name"].str.contains("Sac")
annual_ridership_report_data.loc[name_has_sac].head()

Unnamed: 0,agency_name,agency_status,city,state,ntd_id,primary_uza_name,reporter_type,mode,service,year,upt,xwalk_agency_name,xwalk_reporter_type,xwalk_agency_status,xwalk_city,xwalk_state,RTPA,_merge,previous_y_upt,change_1yr,pct_change_1yr
444,Sacramento Regional Transit District,Active,Sacramento,CA,90019,"Sacramento, CA",Full Reporter,DR,PT,2023,0.0,Sacramento Regional Transit District,Full Reporter,Active,Sacramento,CA,Sacramento Area Council of Governments,both,0.0,0.0,
445,Sacramento Regional Transit District,Active,Sacramento,CA,90019,"Sacramento, CA",Full Reporter,DR,TX,2022,0.0,Sacramento Regional Transit District,Full Reporter,Active,Sacramento,CA,Sacramento Area Council of Governments,both,0.0,0.0,
446,Sacramento Regional Transit District,Active,Sacramento,CA,90019,"Sacramento, CA",Full Reporter,DR,TX,2023,0.0,Sacramento Regional Transit District,Full Reporter,Active,Sacramento,CA,Sacramento Area Council of Governments,both,0.0,0.0,
447,Sacramento Regional Transit District,Active,Sacramento,CA,90019,"Sacramento, CA",Full Reporter,DR,TN,2018,0.0,Sacramento Regional Transit District,Full Reporter,Active,Sacramento,CA,Sacramento Area Council of Governments,both,,,
448,Sacramento Regional Transit District,Active,Sacramento,CA,90019,"Sacramento, CA",Full Reporter,DT,PT,2018,0.0,Sacramento Regional Transit District,Full Reporter,Active,Sacramento,CA,Sacramento Area Council of Governments,both,,,


### export report data to GCS, as parquet,
- to be read in by portfolio

In [38]:
# Write the report frame to GCS as parquet, under GCS_FILE_PATH.
annual_ridership_report_data.to_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

## read in data from GCS to make sure things work 

In [39]:
# Read the exported file back from GCS to confirm the round trip works.
report_data_check= pd.read_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

In [40]:
# Print column dtypes and non-null counts for the frame read back from GCS.
report_data_check.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4038 entries, 0 to 4037
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   agency_name          4038 non-null   object  
 1   agency_status        4038 non-null   object  
 2   city                 4038 non-null   object  
 3   state                4038 non-null   object  
 4   ntd_id               4038 non-null   object  
 5   primary_uza_name     4038 non-null   object  
 6   reporter_type        4038 non-null   object  
 7   mode                 4038 non-null   object  
 8   service              4038 non-null   object  
 9   year                 4038 non-null   object  
 10  upt                  4038 non-null   float64 
 11  xwalk_agency_name    4038 non-null   object  
 12  xwalk_reporter_type  4038 non-null   object  
 13  xwalk_agency_status  4038 non-null   object  
 14  xwalk_city           4038 non-null   object  
 15  xwalk_state          