In [1]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import gcsfs
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, show_query, select
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

# Root GCS folder for the NTD analysis files read in this notebook.
# NOTE(review): this re-assigns the GCS_FILE_PATH name imported from
# update_vars in the import block — confirm the override is intentional.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Never truncate rows or columns when a frame is displayed.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Work plan

## Need List of all annual reporters
- Full, Reduced, Rural reporters all submit an annual Report to NTD

## Need to assign rural and reduced annual reporters to RTPAs
- Full reporters are already mapped to RTPAs via the monthly report
- but reduced and rural reporters need to be mapped to RTPAs

### How to traverse operators to RTPA?
- find all the operators in a GTFS schedule. 
- do any of the rural operators appear in a schedule?
    - if so, then what is the RTPA of that schedule!!!!!
    - ntd > operator > schedule > RTPA 

In [2]:
# dim_annual_service_agencies is the annual report module.
# It includes UZA, VRM, VRH and UPT.
# Report years include 2022-2023.
# NOTE(review): keeping only rows whose primary_uza_name contains ", CA" may
# drop reporters without a California UZA name — confirm none are lost.
ntd_service_cols = [
    "report_year",
    "ntd_id",
    "agency",
    "reporter_type",
    "organization_type",
    "city",
    "primary_uza_name",
    "actual_vehicles_passenger_car_revenue_hours",
    "actual_vehicles_passenger_car_revenue_miles",
    "unlinked_passenger_trips_upt",
]

ntd_service = (
    tbls.mart_ntd.dim_annual_service_agencies()
    >> filter(_.primary_uza_name.str.contains(", CA"))
    >> select(*ntd_service_cols)
    >> collect()
)

# Print the column / dtype / non-null summary of the collected frame.
ntd_service.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 10 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   report_year                                  315 non-null    int64  
 1   ntd_id                                       315 non-null    object 
 2   agency                                       315 non-null    object 
 3   reporter_type                                315 non-null    object 
 4   organization_type                            315 non-null    object 
 5   city                                         312 non-null    object 
 6   primary_uza_name                             315 non-null    object 
 7   actual_vehicles_passenger_car_revenue_hours  315 non-null    float64
 8   actual_vehicles_passenger_car_revenue_miles  315 non-null    float64
 9   unlinked_passenger_trips_upt                 315 non-null    float64
dtypes:

In [3]:
# Read the NTD ID -> RTPA crosswalk from GCS.
# `dtype` has to name the column as it appears in the CSV header ("NTD ID");
# the rename to `ntd_id` only happens after the file has been parsed, so a
# dtype keyed on "ntd_id" would never be applied.
ntd_rtpa_crosswalk = pd.read_csv(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk.csv", dtype={"NTD ID": "str"}
).rename(columns={"NTD ID": "ntd_id"})

ntd_rtpa_crosswalk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ntd_id          122 non-null    object
 1   Legacy NTD ID   111 non-null    object
 2   Agency          122 non-null    object
 3   UZA Name        121 non-null    object
 4   RTPA_open_data  122 non-null    object
 5   RTPA            122 non-null    object
dtypes: object(6)
memory usage: 5.8+ KB


In [40]:
# Are all agencies in ntd_service in the crosswalk? NO
# Merge against `ntd_rtpa_crosswalk`; a bare `crosswalk` name is not defined
# at notebook scope, so the cell would fail on a fresh kernel.
check = ntd_service.merge(
    ntd_rtpa_crosswalk, on="ntd_id", how="left", indicator=True
)

# Reporter types of the ntd_service rows whose ntd_id found no match in the crosswalk.
check.loc[check["_merge"] == "left_only", "reporter_type"].value_counts()



Reduced Reporter    117
Rural Reporter        1
Name: reporter_type, dtype: int64

In [5]:
# crosswalk[["UZA Name", "RTPA"]].sort_values(by="UZA Name")

Some UZA Names can go to multiple RTPAs
- some agencies in the Sacramento UZA go to Sac and Placer RTPA
- some agencies in LA UZA go to Metro or OCTA


## Maybe it's easier to find the RTPA of the rural/reduced reporter agencies
- how many rural/reduced reporters are there?


In [6]:
# Number of ntd_service rows for each reporter type.
ntd_service.loc[:, "reporter_type"].value_counts()

Full Reporter       165
Reduced Reporter    148
Rural Reporter        2
Name: reporter_type, dtype: int64

In [8]:
# Every ntd_service row that is not a Full Reporter, ordered by UZA name.
rural_reduced_reporters = (
    ntd_service
    .loc[lambda frame: frame["reporter_type"] != "Full Reporter"]
    .sort_values(by="primary_uza_name")
)

In [9]:
#rural_reduced_reporters.to_csv("rural_reduced_reporters.csv")

## Get all operators from RTPA using function from Tiffany

via [slack thread](https://cal-itp.slack.com/archives/C02H6JUSS9L/p1729102048291249)

initial work done in `rtpa_operator_explore.ipynb`

- rip `create_gtfs_dataset_key_to_organization_crosswalk` function from `gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py`
- rip any date from GCS (latest is 2025-01-15) `calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk`
- find the corresponding crosswalk by date (2025-01-15) via `calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk`

can use these 2 dataframes to go from `schedule_gtfs_dataset_key (operator)` > `ntd_id`

A single schedule feed can have multiple operators, so one `schedule_gtfs_dataset_key` can map to several `ntd_id`s.

In [26]:
# need this to run `create_gtfs_dataset_key_to_organization_crosswalk`
from shared_utils.schedule_rt_utils import sample_gtfs_dataset_key_to_organization_crosswalk
from segment_speed_utils import helpers

In [27]:
def create_gtfs_dataset_key_to_organization_crosswalk(
    analysis_date: str
) -> pd.DataFrame:
    """
    Build a schedule-feed-to-organization crosswalk for one analysis date.

    Scheduled trips are imported for `analysis_date` (feed key and name only),
    organization attributes are looked up for those feeds, and the two are
    inner-joined on `schedule_gtfs_dataset_key`.

    Args:
        analysis_date: date string passed through to both the scheduled-trips
            import and the organization lookup.

    Returns:
        DataFrame of the imported feed columns joined to the lookup columns;
        feeds with no row in the lookup are dropped by the inner join.
    """
    schedule_key = "schedule_gtfs_dataset_key"
    generic_key = "gtfs_dataset_key"

    # The schedule_rt_utils function needs the key column to be called
    # simply `gtfs_dataset_key`, so rename it before the lookup.
    operators = helpers.import_scheduled_trips(
        analysis_date,
        columns=[generic_key, "name"],
        get_pandas=True,
    ).rename(columns={schedule_key: generic_key})

    # Get base64_url, organization_source_record_id and organization_name
    # (plus the other requested organization columns).
    org_lookup = sample_gtfs_dataset_key_to_organization_crosswalk(
        operators,
        analysis_date,
        quartet_data="schedule",
        dim_gtfs_dataset_cols=["key", "source_record_id", "base64_url"],
        dim_organization_cols=[
            "source_record_id",
            "name",
            "itp_id",
            "caltrans_district",
            "ntd_id_2022",
        ],
    )

    # Restore the schedule-prefixed key name, then attach the organization info.
    operators_by_schedule_key = operators.rename(
        columns={generic_key: schedule_key}
    )
    return operators_by_schedule_key.merge(
        org_lookup, on=schedule_key, how="inner"
    )

In [41]:
# One analysis date shared by both crosswalk sources so they stay in sync.
ANALYSIS_DATE = "2025-01-15"

# GTFS schedule feed -> organization frame for that date.
gtfs_to_org = create_gtfs_dataset_key_to_organization_crosswalk(ANALYSIS_DATE)

# Saved GTFS key -> organization crosswalk for the same date, read from GCS.
gtfs_key_orgs = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/"
    f"gtfs_key_organization_{ANALYSIS_DATE}.parquet"
)

## dataframes so far
- gtfs_to_org,
- gtfs_key_orgs
- ntd_rtpa_crosswalk 
- rural_reduced_reporters
       
try merging `gtfs_to_org` and `rural_reduced_reporters` to find the schedule of the rural operators

then merge with ntd_rtpa_crosswalk to get ntd id > schedule > rtpa. then fill down ?

In [62]:
# Columns from gtfs_to_org needed to link a schedule feed to an NTD ID.
keep_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "organization_name",
    "ntd_id_2022",
]

# Left merge keeps every gtfs_to_org row; the `_merge` indicator flags the
# rows whose ntd_id_2022 has no matching ntd_id in ntd_service.
merge = gtfs_to_org[keep_cols].merge(
    ntd_service,
    left_on="ntd_id_2022",
    right_on="ntd_id",
    how="left",
    indicator=True,
)

# DataFrame.info() prints its summary and returns None, so call it on its own
# rather than passing it to display(), which would render a stray `None`.
merge.info()
display(
    gtfs_to_org.columns,
    rural_reduced_reporters.columns,
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 343 entries, 0 to 342
Data columns (total 15 columns):
 #   Column                                       Non-Null Count  Dtype   
---  ------                                       --------------  -----   
 0   schedule_gtfs_dataset_key                    343 non-null    object  
 1   name                                         343 non-null    object  
 2   organization_name                            343 non-null    object  
 3   ntd_id_2022                                  299 non-null    object  
 4   report_year                                  257 non-null    float64 
 5   ntd_id                                       257 non-null    object  
 6   agency                                       257 non-null    object  
 7   reporter_type                                257 non-null    object  
 8   organization_type                            257 non-null    object  
 9   city                                         253 non-null    obje

Index(['schedule_gtfs_dataset_key', 'name', 'schedule_source_record_id',
       'base64_url', 'organization_source_record_id', 'organization_name',
       'itp_id', 'caltrans_district_x', 'ntd_id_2022', 'caltrans_district_y'],
      dtype='object')

Index(['report_year', 'ntd_id', 'agency', 'reporter_type', 'organization_type',
       'city', 'primary_uza_name',
       'actual_vehicles_passenger_car_revenue_hours',
       'actual_vehicles_passenger_car_revenue_miles',
       'unlinked_passenger_trips_upt'],
      dtype='object')

None

In [67]:
# Show the column sets of the two GTFS-to-organization frames for comparison.
display(
    gtfs_to_org.columns,
    gtfs_key_orgs.columns,
)

Index(['schedule_gtfs_dataset_key', 'name', 'schedule_source_record_id',
       'base64_url', 'organization_source_record_id', 'organization_name',
       'itp_id', 'caltrans_district_x', 'ntd_id_2022', 'caltrans_district_y'],
      dtype='object')

Index(['schedule_gtfs_dataset_key', 'name', 'schedule_source_record_id',
       'base64_url', 'organization_source_record_id', 'organization_name',
       'caltrans_district', 'counties_served', 'hq_city', 'hq_county',
       'is_public_entity', 'is_publicly_operating', 'funding_sources',
       'on_demand_vehicles_at_max_service', 'vehicles_at_max_service',
       'number_of_state_counties', 'primary_uza_name', 'density',
       'number_of_counties_with_service', 'state_admin_funds_expended',
       'service_area_sq_miles', 'population', 'service_area_pop',
       'subrecipient_type', 'primary_uza_code', 'reporter_type',
       'organization_type', 'voms_pt', 'voms_do', 'year'],
      dtype='object')

In [64]:
merge["_merge"].value_counts()
# so there are some left-only rows, meaning some gtfs_to_org rows found no
# matching ntd_id in ntd_service (either ntd_id_2022 is null or it is not
# present in the ntd_service frame)

both          257
left_only      86
right_only      0
Name: _merge, dtype: int64

In [65]:
# Preview only the first rows: with display.max_rows set to None, a bare
# `merge` would render every row of the frame into the notebook output.
merge.head(10)

Unnamed: 0,schedule_gtfs_dataset_key,name,organization_name,ntd_id_2022,report_year,ntd_id,agency,reporter_type,organization_type,city,primary_uza_name,actual_vehicles_passenger_car_revenue_hours,actual_vehicles_passenger_car_revenue_miles,unlinked_passenger_trips_upt,_merge
0,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,City of Ojai,91058.0,2023.0,91058.0,City of Ojai,Reduced Reporter,"City, County or Local Government Unit or Depar...",Ojai,"Oxnard--San Buenaventura (Ventura), CA",4316.0,41642.0,48294.0,both
1,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Ventura County Transportation Commission,90164.0,2023.0,90164.0,Ventura County Transportation Commission,Full Reporter,Independent Public Agency or Authority of Tran...,Camarillo,"Oxnard--San Buenaventura (Ventura), CA",79809.0,1618834.0,419280.0,both
2,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Ventura County Transportation Commission,90164.0,2022.0,90164.0,Ventura County Transportation Commission,Full Reporter,Independent Public Agency or Authority of Tran...,Camarillo,"Oxnard--San Buenaventura (Ventura), CA",79396.0,1625487.0,365883.0,both
3,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Gold Coast Transit District,90035.0,2023.0,90035.0,Gold Coast Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,Oxnard,"Oxnard--San Buenaventura (Ventura), CA",226242.0,2725175.0,3043329.0,both
4,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Gold Coast Transit District,90035.0,2022.0,90035.0,Gold Coast Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,Oxnard,"Oxnard--San Buenaventura (Ventura), CA",219068.0,2634165.0,2337201.0,both
5,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,City of Simi Valley,90050.0,2023.0,90050.0,"City of Simi Valley, dba: Simi Valley Transit",Reduced Reporter,"City, County or Local Government Unit or Depar...",Simi Valley,"Simi Valley, CA",35925.0,528382.0,206836.0,both
6,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,City of Simi Valley,90050.0,2022.0,90050.0,"City of Simi Valley, dba: Simi Valley Transit",Reduced Reporter,"City, County or Local Government Unit or Depar...",Simi Valley,"Simi Valley, CA",34553.0,510664.0,164630.0,both
7,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,City of Moorpark,90227.0,2023.0,90227.0,"City of Moorpark, dba: Moorpark City Transit",Reduced Reporter,"City, County or Local Government Unit or Depar...",Moorpark,"Thousand Oaks, CA",5804.0,82493.0,26853.0,both
8,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,City of Moorpark,90227.0,2022.0,90227.0,City of Moorpark,Reduced Reporter,"City, County or Local Government Unit or Depar...",,"Thousand Oaks, CA",5665.0,79923.0,27822.0,both
9,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,City of Thousand Oaks,90165.0,2023.0,90165.0,"City of Thousand Oaks, dba: Thousand Oaks Transit",Reduced Reporter,"City, County or Local Government Unit or Depar...",Thousand Oaks,"Thousand Oaks, CA",36354.0,554092.0,191874.0,both
