In [46]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import gcsfs
import geopandas as gpd
import pandas as pd
import annual_ridership_module

from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, select, show_query
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Work plan

## COMPLETE ~~Need List of all annual reporters~~
- Full, Reduced, Rural reporters all submit an annual Report to NTD
- used `dim_annual_service_agencies`

## COMPLETE ~~Need to assign rural and reduced annual reporters to RTPAs~~ 
- Full reporters are already mapped to RTPAs via the monthly report
- but reduced and rurap reporters need to be mapped to RTPAs
- sjoined a map of CA census designated places to RTPA map, then merged with `dim_annual_service_agenices` to attach RTPA info

## COMPLETE ~~Export processed data to parquet~~
- to be used for aggregation

## COMPLTE ~~Make new ntd_id to RTPA crosswalk~~
- to be used for future analyses and merges

## COMPLETE ~~Import formulas from monthly report for aggregation~~
- modified `produce_annual_ntd_ridership_data_by_rtpa`

## Test using data from new warehouse table `fct_service_data_and_operating_expenses_time_series_by_mode_upt`
- https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.fct_service_data_and_operating_expenses_time_series_by_mode_upt
---

## read in `fct_service_data_and_operating_expenses_time_series_by_mode_upt`

In [42]:
# new warehouse table
# includes upt, mode, tos and report year starting from 2018
# contains historical,inactive agencies

get_ntd_time_series = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
    >> filter(_.state.str.contains("CA") | 
              _.state.str.contains("NV"), # to get lake Tahoe Transportation back
              _.year >= "2018",
              _.city != None,
              _.primary_uza_name.str.contains(", CA") | 
              _.primary_uza_name.str.contains("CA-NV") |
              _.primary_uza_name.str.contains("California Non-UZA") | 
              _.primary_uza_name.str.contains("El Paso, TX--NM") # something about Paso 
             )
    >> select(
        'agency_name',
        'agency_status',
        'city',
        'legacy_ntd_id',
        'mode',
        'ntd_id',
        'reporter_type',
        'reporting_module',
        'service',
        'state',
        'uace_code',
        'primary_uza_name',
        'uza_population',
        'year',
        'upt',
    )
    >> collect()
)

#what does group by look like
ntd_time_series = get_ntd_time_series.groupby(
    [
        "agency_name",
        'agency_status',
        "city",
        "state",
        "ntd_id",
        'primary_uza_name',
        "reporter_type",
        "mode",
        "service",
        "year"
    ]
).agg({
    "upt":"sum"
}).sort_values(by="ntd_id").reset_index()

display(
    get_ntd_time_series.info(),
    ntd_time_series.info(),
    ntd_time_series["state"].value_counts(),
    ntd_time_series["primary_uza_name"].value_counts()
    
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4050 entries, 0 to 4049
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agency_name       4050 non-null   object 
 1   agency_status     4050 non-null   object 
 2   city              4050 non-null   object 
 3   legacy_ntd_id     3324 non-null   object 
 4   mode              4050 non-null   object 
 5   ntd_id            4038 non-null   object 
 6   reporter_type     4050 non-null   object 
 7   reporting_module  3768 non-null   object 
 8   service           4050 non-null   object 
 9   state             4050 non-null   object 
 10  uace_code         4050 non-null   int64  
 11  primary_uza_name  4050 non-null   object 
 12  uza_population    4050 non-null   int64  
 13  year              4050 non-null   object 
 14  upt               2642 non-null   float64
dtypes: float64(1), int64(2), object(12)
memory usage: 474.7+ KB
<class 'pandas.core.frame.Dat

None

None

CA    3996
NV      42
Name: state, dtype: int64

Los Angeles--Long Beach--Anaheim, CA               1116
California Non-UZA                                  876
San Francisco--Oakland, CA                          258
Sacramento, CA                                      240
San Diego, CA                                       156
Modesto, CA                                          90
Riverside--San Bernardino, CA                        90
Stockton, CA                                         78
San Jose, CA                                         60
Visalia, CA                                          54
Santa Rosa, CA                                       48
Vallejo, CA                                          48
Thousand Oaks, CA                                    42
San Luis Obispo, CA                                  42
Lake Tahoe, CA-NV                                    42
Santa Barbara, CA                                    42
Oxnard--San Buenaventura (Ventura), CA               42
Madera, CA                                      

In [3]:
ntd_time_series[~ntd_time_series["primary_uza_name"].str.contains(", CA")]["primary_uza_name"].value_counts()
# who has uza in texas new mex?!

California Non-UZA    146
El Paso, TX--NM         2
Name: primary_uza_name, dtype: int64

In [43]:
ntd_time_series[ntd_time_series["primary_uza_name"].str.contains("El Paso, TX--NM")] # just 1 agency, PAso Robles Transit Services

Unnamed: 0,agency_name,agency_status,city,state,ntd_id,primary_uza_name,reporter_type,mode,service,year,upt
1872,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,DR,PT,2020,0.0
1873,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,MB,PT,2022,0.0
1874,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,DR,PT,2021,0.0
1875,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,MB,PT,2020,0.0
1876,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,MB,PT,2019,0.0
1877,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,MB,PT,2021,0.0
1878,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,DR,PT,2023,0.0
1879,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,DR,PT,2022,0.0
1880,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,MB,PT,2023,0.0
1881,Paso Robles Transit Services (PE),Inactive,Paso Robles,CA,90195,"El Paso, TX--NM",Reduced Reporter,DR,PT,2018,0.0


In [5]:
# how many cities in ntd_time_serires are there?
# wonder if all the cities will be in the crosswalk
ntd_time_series["city"].nunique() # 183. should be enough


184

## See `explore_rtpa_ntd_crosswalk` notebook to see how the new rtpa to ntd id crosswalk was made

## read in crosswalk from gcs

In [6]:
test_data = pd.read_parquet(f"{GCS_FILE_PATH}annual_report_data_2022-2023.parquet")
rtpa_ntd_crosswalk = pd.read_parquet(f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet")

display(
    test_data.head(),
    rtpa_ntd_crosswalk.head()
)

Unnamed: 0,report_year,ntd_id,agency,reporter_type,organization_type,city,state,primary_uza_name,actual_vehicles_passenger_car_revenue_hours,actual_vehicles_passenger_car_revenue_miles,unlinked_passenger_trips_upt,cdp_name,RTPA,_merge
0,2022,90003,San Francisco Bay Area Rapid Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,Oakland,CA,"San Francisco--Oakland, CA",2486029.0,78554914.0,38224072.0,Oakland,Metropolitan Transportation Commission,both
1,2023,90003,"San Francisco Bay Area Rapid Transit District,...",Full Reporter,Independent Public Agency or Authority of Tran...,Oakland,CA,"San Francisco--Oakland, CA",2724074.0,85233749.0,50764402.0,Oakland,Metropolitan Transportation Commission,both
2,2022,90004,Golden Empire Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,Bakersfield,CA,"Bakersfield, CA",292325.0,3758507.0,3201046.0,Bakersfield,Kern Council of Governments,both
3,2023,90004,Golden Empire Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,Bakersfield,CA,"Bakersfield, CA",289338.0,3924016.0,3293593.0,Bakersfield,Kern Council of Governments,both
4,2023,90006,Santa Cruz Metropolitan Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,Santa Cruz,CA,"Santa Cruz, CA",214748.0,2975126.0,3350026.0,Santa Cruz,Santa Cruz County Regional Transportation Comm...,both


Unnamed: 0,ntd_id,agency_name,reporter_type,agency_status,city,state,RTPA
0,90003,San Francisco Bay Area Rapid Transit District ...,Full Reporter,Active,Oakland,CA,Metropolitan Transportation Commission
1,90004,Golden Empire Transit District (GET),Full Reporter,Active,Bakersfield,CA,Kern Council of Governments
2,90006,Santa Cruz Metropolitan Transit District (SCMTD),Full Reporter,Active,Santa Cruz,CA,Santa Cruz County Regional Transportation Comm...
3,90007,City of Modesto (MAX),Full Reporter,Inactive,Modesto,CA,Stanislaus Council of Governments
4,90008,City of Santa Monica (BBB) - Department of Tra...,Full Reporter,Active,Santa Monica,CA,Southern California Association of Governments


## test merge of `ntd_time_series` with `rtpa ntd crosswalk`

In [7]:
ntd_time_series_rtpa = ntd_time_series.merge(
    rtpa_ntd_crosswalk,
    on="ntd_id",
    how="left",
    indicator=True
)
display(
    ntd_time_series_rtpa.info(),
    ntd_time_series_rtpa["_merge"].value_counts()
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 673 entries, 0 to 672
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   agency_name_x     673 non-null    object  
 1   agency_status_x   673 non-null    object  
 2   city_x            673 non-null    object  
 3   state_x           673 non-null    object  
 4   ntd_id            673 non-null    object  
 5   primary_uza_name  673 non-null    object  
 6   reporter_type_x   673 non-null    object  
 7   mode              673 non-null    object  
 8   service           673 non-null    object  
 9   upt               673 non-null    float64 
 10  agency_name_y     673 non-null    object  
 11  reporter_type_y   673 non-null    object  
 12  agency_status_y   673 non-null    object  
 13  city_y            673 non-null    object  
 14  state_y           673 non-null    object  
 15  RTPA              673 non-null    object  
 16  _merge            673 non-

None

both          673
left_only       0
right_only      0
Name: _merge, dtype: int64

In [15]:
# gtg
ntd_time_series_rtpa.head()

Unnamed: 0,agency_name_x,agency_status_x,city_x,state_x,ntd_id,primary_uza_name,reporter_type_x,mode,service,upt,agency_name_y,reporter_type_y,agency_status_y,city_y,state_y,RTPA,_merge
0,San Francisco Bay Area Rapid Transit District ...,Active,Oakland,CA,90003,"San Francisco--Oakland, CA",Full Reporter,DR,PT,0.0,San Francisco Bay Area Rapid Transit District ...,Full Reporter,Active,Oakland,CA,Metropolitan Transportation Commission,both
1,San Francisco Bay Area Rapid Transit District ...,Active,Oakland,CA,90003,"San Francisco--Oakland, CA",Full Reporter,HR,DO,444622064.0,San Francisco Bay Area Rapid Transit District ...,Full Reporter,Active,Oakland,CA,Metropolitan Transportation Commission,both
2,San Francisco Bay Area Rapid Transit District ...,Active,Oakland,CA,90003,"San Francisco--Oakland, CA",Full Reporter,MB,PT,0.0,San Francisco Bay Area Rapid Transit District ...,Full Reporter,Active,Oakland,CA,Metropolitan Transportation Commission,both
3,San Francisco Bay Area Rapid Transit District ...,Active,Oakland,CA,90003,"San Francisco--Oakland, CA",Full Reporter,MG,PT,3369844.0,San Francisco Bay Area Rapid Transit District ...,Full Reporter,Active,Oakland,CA,Metropolitan Transportation Commission,both
4,San Francisco Bay Area Rapid Transit District ...,Active,Oakland,CA,90003,"San Francisco--Oakland, CA",Full Reporter,YR,DO,7104589.0,San Francisco Bay Area Rapid Transit District ...,Full Reporter,Active,Oakland,CA,Metropolitan Transportation Commission,both


## Full Test `produce_annual_ntd_ridership_data_by_rtpa` function
- need to updat to new NTD data table.


In [31]:
check_id=["90253","90227","90259","90286"]

display(
    ntd_time_series_rtpa[ntd_time_series_rtpa["ntd_id"].isin(check_id)]["city_x"].unique(),
    rtpa_ntd_crosswalk[rtpa_ntd_crosswalk["ntd_id"].isin(check_id)]["city"].unique()
)

# cities match, no need for the dictionary update anymore in the function

array(['Moorpark', 'Bell', 'Cerritos', 'Montery Park'], dtype=object)

array(['Moorpark', 'Bell', 'Cerritos', 'Montery Park'], dtype=object)

In [47]:
def produce_annual_ntd_ridership_data_by_rtpa():
    """
    Function that ingest ridership data from `dim_annual_service_agencies`, filters for CA agencies.
    Merges in ntd_id_to_RTPA_crosswalk.
    
    """
    from annual_ridership_module import add_change_columns
    
    print("ingest annual ridership data from warehouse")
    
    ntd_service = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
    >> filter(_.state.str.contains("CA") | 
              _.state.str.contains("NV"), # to get lake Tahoe Transportation back
              _.year >= "2018",
              _.city != None,
              _.primary_uza_name.str.contains(", CA") | 
              _.primary_uza_name.str.contains("CA-NV") |
              _.primary_uza_name.str.contains("California Non-UZA") | 
              _.primary_uza_name.str.contains("El Paso, TX--NM") # something about Paso 
             )
    >> select(
        'agency_name',
        'agency_status',
        'city',
        'legacy_ntd_id',
        'mode',
        'ntd_id',
        'reporter_type',
        'reporting_module',
        'service',
        'state',
        'uace_code',
        'primary_uza_name',
        'uza_population',
        'year',
        'upt',
    )
    >> collect())
    
    ntd_service = ntd_service.groupby(
        [
            "agency_name",
            'agency_status',
            "city",
            "state",
            "ntd_id",
            'primary_uza_name',
            "reporter_type",
            "mode",
            "service",
            "year"
        ]
    ).agg({
        "upt":"sum"
    }).sort_values(by="ntd_id").reset_index()
        
    #print("clean up warehouse data")
    
    #update_dict={
    #    "90227":"Moorpark",
    #    "90253":"Bell Gardens",
    #    "90259":"Cerritos",
    #    "90286":"Monterey Park",
    #}
    
    #for i, v in update_dict.items():
    #    ntd_service.loc[ntd_service["ntd_id"]== i,"city"] = v
    
    print("read in new `ntd_id_to_rtpa_all_reporter_types` crosswalk") 
    
    ntd_to_rtpa_crosswalk = pd.read_parquet(f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet")
        
    print("merge ntd data to crosswalk")
    
    ntd_data_by_rtpa = ntd_service.merge(
    ntd_to_rtpa_crosswalk,
    how="left",
    on=[
        "ntd_id",
        #"agency", # sometime agency name change, but ntd id stays the same. causing unmerged rows
        #"reporter_type",
        #"city"
    ],
    indicator=True
    ).rename(
    columns={
       "actual_vehicles_passenger_car_revenue_hours":"vrh",
       "actual_vehicles_passenger_car_revenue_miles":"vrm",
       "unlinked_passenger_trips_upt":"upt" 
    }
    )
    
    print(ntd_data_by_rtpa._merge.value_counts())
        
    if len(ntd_data_by_rtpa[ntd_data_by_rtpa._merge=="left_only"]) > 0:
        raise ValueError("There are unmerged rows to crosswalk")
    
    print("add `change_column` to data")
    ntd_data_by_rtpa = add_change_columns(ntd_data_by_rtpa)
    
    return ntd_data_by_rtpa

# will add step to export data to gcs in final script

In [48]:
annual_ridership_report_data = produce_annual_ntd_ridership_data_by_rtpa()

ingest annual ridership data from warehouse
read in new `ntd_id_to_rtpa_all_reporter_types` crosswalk
merge ntd data to crosswalk
both          4038
left_only        0
right_only       0
Name: _merge, dtype: int64
add `change_column` to data


KeyError: 'report_year'

### export report data to GCS, as parquet,
- to be read in by portfolio

In [None]:
# annual_ridership_report_data.to_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

## read in data from GCS to make sure things work 

In [None]:
annual_ridership_report_data= pd.read_parquet(f"{GCS_FILE_PATH}annual_ridership_report_data.parquet")

In [None]:
annual_ridership_report_data['RTPA'].value_counts()