In [1]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import gcsfs
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, show_query, select
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

# GCS folder holding the NTD analysis files used by this notebook.
# NOTE(review): this rebinds the `GCS_FILE_PATH` imported from `update_vars`
# a few lines above -- confirm the two values agree, or drop one of them.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Do not truncate rows or columns when a dataframe is displayed.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Work plan

## Need List of all annual reporters
- Full, Reduced, Rural reporters all submit an annual Report to NTD

## Need to assign rural and reduced annual reporters to RTPAs
- Full reporters are already mapped to RTPAs via the monthly report
- but reduced and rural reporters need to be mapped to RTPAs

### How to traverse operators to RTPA?
- find all the operators in a GTFS schedule. 
- do any of the rural operators appear in a schedule?
    - if so, then what is the RTPA of that schedule!!!!!
    - ntd > operator > schedule > RTPA 

## Read in `dim_annual_service_agencies` to get metrics
but need to add RTPA info to this data

In [2]:
# `dim_annual_service_agencies` is the annual report module.
# It includes UZA, VRM, VRH and UPT; report years include 2022-2023.

# Columns kept from the warehouse table: identifiers first, then the metrics.
service_cols = [
    "report_year",
    "ntd_id",
    "agency",
    "reporter_type",
    "organization_type",
    "city",
    "primary_uza_name",
    "actual_vehicles_passenger_car_revenue_hours",
    "actual_vehicles_passenger_car_revenue_miles",
    "unlinked_passenger_trips_upt",
]

# Keep only agencies whose primary UZA name contains ", CA", then pull the
# result into a pandas DataFrame.
ntd_service = (
    tbls.mart_ntd.dim_annual_service_agencies()
    >> filter(_.primary_uza_name.str.contains(", CA"))
    >> select(*service_cols)
    >> collect()
)
ntd_service.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 10 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   report_year                                  315 non-null    int64  
 1   ntd_id                                       315 non-null    object 
 2   agency                                       315 non-null    object 
 3   reporter_type                                315 non-null    object 
 4   organization_type                            315 non-null    object 
 5   city                                         312 non-null    object 
 6   primary_uza_name                             315 non-null    object 
 7   actual_vehicles_passenger_car_revenue_hours  315 non-null    float64
 8   actual_vehicles_passenger_car_revenue_miles  315 non-null    float64
 9   unlinked_passenger_trips_upt                 315 non-null    float64
dtypes:

In [6]:
# How many rows does each reporter type contribute?
ntd_service.loc[:, "reporter_type"].value_counts()

Full Reporter       165
Reduced Reporter    148
Rural Reporter        2
Name: reporter_type, dtype: int64

### Does the `ntd_id-to-rtpa crosswalk` include rural/reduced reporter info?

In [3]:
# Read the NTD ID -> RTPA crosswalk from GCS.
#
# `dtype` is applied while the CSV is parsed, i.e. before `.rename()` runs.
# The rename below maps "NTD ID" -> "ntd_id", so the header in the file is
# presumably "NTD ID"; keying the override only on "ntd_id" would then leave
# the ID column to type inference. Both spellings are listed so the column is
# read as a string whichever header the file actually uses, keeping it
# comparable with the string `ntd_id` in the other dataframes.
ntd_rtpa_crosswalk = pd.read_csv(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk.csv",
    dtype={"NTD ID": "str", "ntd_id": "str"},
).rename(columns={"NTD ID": "ntd_id"})

ntd_rtpa_crosswalk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ntd_id          122 non-null    object
 1   Legacy NTD ID   111 non-null    object
 2   Agency          122 non-null    object
 3   UZA Name        121 non-null    object
 4   RTPA_open_data  122 non-null    object
 5   RTPA            122 non-null    object
dtypes: object(6)
memory usage: 5.8+ KB


In [4]:
# are all agencies in the ntd_service in the crosswalk? NO
check = ntd_service.merge(ntd_rtpa_crosswalk, on="ntd_id", how="left", indicator=True)

# the rural reporters are not in the ntd_rtpa_crosswalk
check[check["_merge"] == "left_only"]["reporter_type"].value_counts()


Reduced Reporter    117
Rural Reporter        1
Name: reporter_type, dtype: int64

In [5]:
# crosswalk[["UZA Name", "RTPA"]].sort_values(by="UZA Name")

Some UZA Names can go to multiple RTPAs
- some agencies in the Sacramento UZA go to Sac and Placer RTPA
- some agencies in LA UZA go to Metro or OCTA


In [7]:
# just rural/reduced reporters
rural_reduced_reporters = ntd_service[ntd_service["reporter_type"]!="Full Reporter"].sort_values(by="primary_uza_name")

In [8]:
#rural_reduced_reporters.to_csv("rural_reduced_reporters.csv")

## Get all operators from RTPA using function from Tiffany

via [slack thread](https://cal-itp.slack.com/archives/C02H6JUSS9L/p1729102048291249)

initial work done in `rtpa_operator_explore.ipynb`

- rip `create_gtfs_dataset_key_to_organization_crosswalk` function from `gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py`
- rip any date from GCS (latest is 2025-01-15) `calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk`
- find the corresponding `gtfs_key-to-org crosswalk` by date (2025-01-15) via `calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk`

can use these 2 dataframes to go from `schedule_gtfs_dataset_key (operator)` > `ntd_id`

A single schedule feed can have multiple operators. This is how you identify 

In [9]:
# need this to run `create_gtfs_dataset_key_to_organization_crosswalk`
from shared_utils.schedule_rt_utils import sample_gtfs_dataset_key_to_organization_crosswalk
from segment_speed_utils import helpers

In [10]:
def create_gtfs_dataset_key_to_organization_crosswalk(
    analysis_date: str
) -> pd.DataFrame:
    """
    Link the schedule feeds found on `analysis_date` to their organizations.

    Scheduled trips are imported for the date (feed key and feed name only),
    passed to `sample_gtfs_dataset_key_to_organization_crosswalk` to look up
    the requested dataset / organization columns, and the two tables are
    inner-joined on `schedule_gtfs_dataset_key`.

    Args:
        analysis_date: date string handed to both helper calls
            (this notebook uses e.g. "2025-01-15").

    Returns:
        DataFrame with the feed key and name plus the crosswalk columns.

    NOTE(review): the output in this notebook shows `caltrans_district_x`
    and `caltrans_district_y`, so the column appears to arrive from two
    sources somewhere along the way -- confirm which one should be kept.
    """
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["gtfs_dataset_key", "name"],
        get_pandas = True
    )
    # the schedule_rt_utils function expects the key to be called simply
    # `gtfs_dataset_key`, so use that name while building the crosswalk
    trips = trips.rename(
        columns = {"schedule_gtfs_dataset_key": "gtfs_dataset_key"}
    )

    # columns requested from the dataset and organization dimension tables
    dataset_cols = ["key", "source_record_id", "base64_url"]
    organization_cols = [
        "source_record_id", "name",
        "itp_id", "caltrans_district",
        "ntd_id_2022",
    ]

    org_crosswalk = sample_gtfs_dataset_key_to_organization_crosswalk(
        trips,
        analysis_date,
        quartet_data = "schedule",
        dim_gtfs_dataset_cols = dataset_cols,
        dim_organization_cols = organization_cols
    )

    # switch back to the schedule-prefixed key name for the join
    trips_schedule_key = trips.rename(
        columns = {"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
    )

    return trips_schedule_key.merge(
        org_crosswalk,
        on = "schedule_gtfs_dataset_key",
        how = "inner"
    )

In [16]:
# Build the schedule feed -> organization dataframe.
# The date matches the saved crosswalk file read below.
gtfs_to_org = create_gtfs_dataset_key_to_organization_crosswalk("2025-01-15")
gtfs_to_org.info()

# Load the saved GTFS key -> organization crosswalk for that same date.
gtfs_key_orgs = pd.read_parquet(
    path="gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2025-01-15.parquet"
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 0 to 215
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   schedule_gtfs_dataset_key      216 non-null    object 
 1   name                           216 non-null    object 
 2   schedule_source_record_id      216 non-null    object 
 3   base64_url                     216 non-null    object 
 4   organization_source_record_id  216 non-null    object 
 5   organization_name              216 non-null    object 
 6   itp_id                         204 non-null    float64
 7   caltrans_district_x            214 non-null    object 
 8   ntd_id_2022                    172 non-null    object 
 9   caltrans_district_y            216 non-null    object 
dtypes: float64(1), object(9)
memory usage: 18.6+ KB


## dataframes so far
- gtfs_to_org,
- gtfs_key_orgs
- ntd_rtpa_crosswalk 
- rural_reduced_reporters
       
try merging `gtfs_to_org` and `rural_reduced_reporters` to find the schedule of the rural operators

then merge with ntd_rtpa_crosswalk to get ntd id > schedule > rtpa. then fill down ?

In [35]:
# Columns from the schedule crosswalk needed to link an NTD ID to a feed.
keep_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "organization_name",
    "ntd_id_2022",
]

# Attach schedule feed info to every NTD service row. A left join keeps the
# agencies with no matching feed; `_merge` records which rows matched.
ntd_metrics_to_sched = ntd_service.merge(
    gtfs_to_org[keep_cols],
    left_on="ntd_id",
    right_on="ntd_id_2022",
    how="left",
    indicator=True,
)

# `DataFrame.info()` prints its summary and returns None, so wrapping it in
# `display()` only added a stray "None" to the cell output.
ntd_metrics_to_sched.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 345 entries, 0 to 344
Data columns (total 15 columns):
 #   Column                                       Non-Null Count  Dtype   
---  ------                                       --------------  -----   
 0   report_year                                  345 non-null    int64   
 1   ntd_id                                       345 non-null    object  
 2   agency                                       345 non-null    object  
 3   reporter_type                                345 non-null    object  
 4   organization_type                            345 non-null    object  
 5   city                                         341 non-null    object  
 6   primary_uza_name                             345 non-null    object  
 7   actual_vehicles_passenger_car_revenue_hours  345 non-null    float64 
 8   actual_vehicles_passenger_car_revenue_miles  345 non-null    float64 
 9   unlinked_passenger_trips_upt                 345 non-null    floa

None

In [36]:
# Some rows are left_only, meaning some operators did not match to a schedule.
ntd_metrics_to_sched.loc[:, "_merge"].value_counts()

both          257
left_only      88
right_only      0
Name: _merge, dtype: int64

In [37]:
# what are the reporter types for these left_only operators?
ntd_metrics_to_sched[ntd_metrics_to_sched["_merge"]=="left_only"]["reporter_type"].value_counts() # 65 reduced, 23 full reporters

# also checked "both" and got mix of full, reduced and rural
# may need to check a different crosswalk data

Reduced Reporter    65
Full Reporter       23
Name: reporter_type, dtype: int64

In [38]:
# looking into the cities of the left_only merges
ntd_metrics_to_sched[ntd_metrics_to_sched["_merge"]=="left_only"][["agency","reporter_type","city"]].value_counts()



agency                                                                          reporter_type     city            
Los Angeles County                                                              Reduced Reporter  Alhambra            9
Access Services                                                                 Full Reporter     El Monte            2
City of Davis, dba: Davis Community Transit                                     Reduced Reporter  Davis               2
Easy Lift Transportation                                                        Reduced Reporter  Goleta              2
County of Ventura                                                               Reduced Reporter  Ventura             2
City of Whittier                                                                Reduced Reporter  Whittier            2
City of South Pasadena                                                          Reduced Reporter  South Pasadena      2
City of Santa Fe Springs                     

In [39]:
# if i sort by city, can i match copy the RTPA of the good rows to the left_only rows?
ntd_metrics_to_sched[["agency","city","schedule_gtfs_dataset_key","name","_merge"]].sort_values(by=["city","agency"], ascending=True)

Unnamed: 0,agency,city,schedule_gtfs_dataset_key,name,_merge
237,City of Agoura Hills,Agoura Hills,,,left_only
238,City of Agoura Hills,Agoura Hills,,,left_only
239,"City of Alhambra, dba: Alhambra Community Transit",Alhambra,9471fbb88956076b211cc4a8df1a536c,Alhambra Schedule,both
240,"City of Alhambra, dba: Alhambra Community Tran...",Alhambra,9471fbb88956076b211cc4a8df1a536c,Alhambra Schedule,both
282,Los Angeles County,Alhambra,,,left_only
284,Los Angeles County,Alhambra,2606479845d2cf0077fac54ff25a3a69,LADPW Schedule,both
286,Los Angeles County,Alhambra,,,left_only
288,Los Angeles County,Alhambra,,,left_only
290,Los Angeles County,Alhambra,,,left_only
291,Los Angeles County,Alhambra,,,left_only


In [45]:
# Swap the indicator from the schedule merge for a fresh one, then attach the
# RTPA crosswalk by NTD ID. A left join keeps agencies with no crosswalk row.
# NOTE(review): the row count grows from 345 to 347 in the outputs, which
# suggests some `ntd_id` values repeat in the crosswalk -- confirm.
metrics_sched_rtpa = pd.merge(
    ntd_metrics_to_sched.drop(columns=["_merge"]),
    ntd_rtpa_crosswalk,
    how="left",
    on="ntd_id",
    indicator=True,
)

metrics_sched_rtpa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 347 entries, 0 to 346
Data columns (total 20 columns):
 #   Column                                       Non-Null Count  Dtype   
---  ------                                       --------------  -----   
 0   report_year                                  347 non-null    int64   
 1   ntd_id                                       347 non-null    object  
 2   agency                                       347 non-null    object  
 3   reporter_type                                347 non-null    object  
 4   organization_type                            347 non-null    object  
 5   city                                         343 non-null    object  
 6   primary_uza_name                             347 non-null    object  
 7   actual_vehicles_passenger_car_revenue_hours  347 non-null    float64 
 8   actual_vehicles_passenger_car_revenue_miles  347 non-null    float64 
 9   unlinked_passenger_trips_upt                 347 non-null    floa

In [53]:
# View agency, schedule name and RTPA side by side, ordered by city then RTPA.
metrics_sched_rtpa.loc[
    :, ["report_year", "ntd_id", "agency", "reporter_type", "city", "name", "RTPA"]
].sort_values(by=["city", "RTPA"])

Unnamed: 0,report_year,ntd_id,agency,reporter_type,city,name,RTPA
239,2023,90246,City of Agoura Hills,Reduced Reporter,Agoura Hills,,
240,2022,90246,City of Agoura Hills,Reduced Reporter,Agoura Hills,,
241,2023,90247,"City of Alhambra, dba: Alhambra Community Transit",Reduced Reporter,Alhambra,Alhambra Schedule,
242,2022,90247,"City of Alhambra, dba: Alhambra Community Tran...",Reduced Reporter,Alhambra,Alhambra Schedule,
281,2023,90269,"Los Angeles County, dba: LA County Public Works",Reduced Reporter,Alhambra,,
282,2022,90269,Los Angeles County Dept. of Public Works - Ath...,Reduced Reporter,Alhambra,,
283,2022,90270,Los Angeles County Department of Public Works ...,Reduced Reporter,Alhambra,,
284,2023,90270,Los Angeles County,Reduced Reporter,Alhambra,,
285,2022,90271,Los Angeles County Department of Public Works ...,Reduced Reporter,Alhambra,LADPW Schedule,
286,2023,90271,Los Angeles County,Reduced Reporter,Alhambra,LADPW Schedule,
