In [1]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import annual_ridership_module
import gcsfs
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, select, show_query
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

# Notebook-level configuration.
# NOTE: this assignment replaces the GCS_FILE_PATH imported from update_vars.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Show every row and column when a frame is displayed.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Explore updating RTPA-to-NTD_id crosswalk

Current rtpa/ntd_id crosswalk only contains full system reporters. This notebook will explore updating the crosswalk to include all reporter types (full, reduced, and rural system reporters).

- Start by getting the list of NTD reporters from the warehouse table `dim_annual_service_agencies`
- Then get a list of all California cities and all RTPAs, and merge them to get a list of cities and the RTPA each one falls in
- Then merge the list of NTD reporters to the list of cities/RTPAs, using the `city` column. The result should be a list of NTD reporters with their RTPAs
---

## Read in `dim_annual_service_agencies` to get metrics

In [2]:
# dim_annual_service_agencies is the annual report module.
# Original notes: includes UZA, VRM, VRH, UPT; report years include 2022-2023.
# Only California rows are kept, and only the columns listed below.
service_cols = [
    "report_year",
    "ntd_id",
    "agency",
    "reporter_type",
    "organization_type",
    "city",
    "state",
    "primary_uza_name",
    "actual_vehicles_passenger_car_revenue_hours",
    "actual_vehicles_passenger_car_revenue_miles",
    "unlinked_passenger_trips_upt",
]

ca_service_query = (
    tbls.mart_ntd.dim_annual_service_agencies()
    >> filter(_.state == "CA")
    >> select(*service_cols)
)

# Execute the lazy warehouse query and pull the result into a DataFrame.
ntd_service = ca_service_query >> collect()

## Manual Update to `ntd_service`

In [3]:
# Manual city overrides, keyed by ntd_id.
update_dict = {
    "90227": "Moorpark",
    "90253": "Bell Gardens",
    "90259": "Cerritos",
    "90286": "Monterey Park",
}

# Overwrite `city` on every row belonging to each listed ntd_id.
for ntd_id, corrected_city in update_dict.items():
    is_target_agency = ntd_service["ntd_id"] == ntd_id
    ntd_service.loc[is_target_agency, "city"] = corrected_city

## merging GDF of city in `ntd_service` to RTPA boundaries

Census Designated Places (CDPs) 2010 - California map
- https://data.sacog.org/datasets/SACOG::census-designated-places-cdps-2010-california/about


RTPA map open data

- https://www.lab.data.ca.gov/dataset/regional-transportation-planning-agencies

In [4]:
# RTPA boundary layer (GeoJSON download), trimmed to name, label and geometry.
rtpa_url = "https://cecgis-caenergy.opendata.arcgis.com/api/download/v1/items/3a83743378be4e7f84c8230889c01dea/geojson?layers=0"
rtpa_keep_cols = ["RTPA", "LABEL_RTPA", "geometry"]

rtpa_map = (
    gpd.read_file(rtpa_url)[rtpa_keep_cols]
    .to_crs("ESRI:102600")  # reproject now so the later sjoin uses matching CRS
)

In [5]:
# California Census Designated Places (2010); the layer includes cities and CDPs.
cdp_url = "https://services6.arcgis.com/YBp5dUuxCMd8W1EI/arcgis/rest/services/California_Census_Designated_Places_2010/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
keep_cdp_col = ["FID", "NAME10", "NAMELSAD10", "geometry"]

# Friendlier names for the two census name columns.
cdp_renames = {"NAME10": "cdp_name", "NAMELSAD10": "name_lsad"}

cdp_map = gpd.read_file(cdp_url)[keep_cdp_col].rename(columns=cdp_renames)

## get centroid of CDPs

In [6]:
# Compute centroids in ESRI:102600. Only the centroid series is reprojected;
# `cdp_map`'s own geometry column is not reassigned here.
cdp_centroids = cdp_map["geometry"].to_crs("ESRI:102600").centroid
cdp_map["centroid"] = cdp_centroids

# Point version of the CDP layer: the centroid column becomes the active geometry.
cdp_points = cdp_map.set_geometry("centroid", drop=True)

## sjoin `cdp_points` to `rtpa_map` to get cities-to-RTPA crosswalk

In [7]:
# Sanity check before the spatial join: both layers should be ESRI:102600.
rtpa_map.crs == cdp_points.crs

True

In [8]:
# Attach an RTPA to every CDP centroid (the point layer includes cities and CDPs).
# A left join keeps CDPs that fall outside every RTPA polygon so they can be
# patched by hand afterwards.
#
# Row counts observed while trying the alternatives:
#   left  + intersects -> 1523 rows, 1521 matched, 2 unmatched (Avalon and City of San Francisco)
#   inner + intersects -> 1521 rows, 1521 matched
#   left  + within     -> 1523 rows, 1521 matched, 2 unmatched
#   inner + within     -> 1521 rows, 1521 matched
city_to_rtpa = cdp_points.sjoin(rtpa_map, how="left", predicate="intersects")

### fix unmerged CDP rtpa rows

In [9]:
# The two CDPs the spatial join left without an RTPA, with the
# (RTPA, LABEL_RTPA) values to fill in by hand.
unmatched_cdp_rtpa = {
    "Avalon": ("Southern California Association of Governments", "SCAG"),
    "San Francisco": ("Metropolitan Transportation Commission", "MTC"),
}

for cdp, rtpa_values in unmatched_cdp_rtpa.items():
    is_cdp = city_to_rtpa["cdp_name"] == cdp
    city_to_rtpa.loc[is_cdp, ["RTPA", "LABEL_RTPA"]] = rtpa_values

# Show the patched rows to confirm the fix.
city_to_rtpa[city_to_rtpa["cdp_name"].isin(list(unmatched_cdp_rtpa))]

Unnamed: 0,FID,cdp_name,name_lsad,geometry,index_right,RTPA,LABEL_RTPA
862,863,Avalon,Avalon city,POINT (510003.406 7421706.792),,Southern California Association of Governments,SCAG
889,890,San Francisco,San Francisco city,POINT (-777550.185 9040909.841),,Metropolitan Transportation Commission,MTC


## merge `ntd_service` to `city_to_rtpa`

In [10]:
# Look up each reporter's RTPA through its city name.
# A left join keeps every reporter row; `_merge` flags the ones with no CDP match.
# Caveats seen with this join: some reporters' cities are not in the
# city_to_rtpa list, and duplicate CDP names fan rows out 1:m.
cdp_rtpa_lookup = city_to_rtpa[["cdp_name", "RTPA"]]

ntd_data_to_rtpa = pd.merge(
    ntd_service,
    cdp_rtpa_lookup,
    how="left",
    left_on="city",
    right_on="cdp_name",
    indicator=True,
)

### manual updates to `ntd_data_to_rtpa`

In [11]:
# Reporter rows that came out of the merge without an RTPA (9 rows when last run).
missing_rtpa = ntd_data_to_rtpa["RTPA"].isna()
ntd_data_to_rtpa.loc[
    missing_rtpa, ["report_year", "agency", "city", "cdp_name", "RTPA"]
]

Unnamed: 0,report_year,agency,city,cdp_name,RTPA
285,2022,Palos Verdes Peninsula Transit Authority,Palos Verdes Peninsula,,
306,2023,County of Ventura,Ventura,,
307,2022,County of Ventura,Ventura,,
322,2022,Chemehuevi Indian Tribe,Havasu Lake,,
323,2023,Chemehuevi Indian Tribe,Havasu Lake,,
326,2023,North Fork Rancheria of Mono Indians of Califo...,North Fork,,
327,2022,North Fork Rancheria of Mono Indians of Califo...,North Fork,,
406,2023,"City of McFarland, dba: McFarland City Transit...",Mcfarland,,
407,2022,City of McFarland,Mcfarland,,


In [12]:
# Manual (cdp_name, RTPA) assignments for reporter cities that had no CDP match.
# Stored under its own name so it does not overwrite the ntd_id -> city
# `update_dict` used earlier for `ntd_service`; reusing that name would make
# re-running the earlier cell write these tuples into the `city` column.
rtpa_update_dict = {
    "Mcfarland": ("Mcfarland", "Kern Council of Governments"),
    "Ventura": ("Ventura", "Southern California Association of Governments"),
    # cdp_name set to "Rolling Hills" to match other entries for this agency
    "Palos Verdes Peninsula": (
        "Rolling Hills",
        "Southern California Association of Governments",
    ),
    # aka Lake Havasu. Shares a zip code with Needles, so update to SCAG
    "Havasu Lake": ("Havasu Lake", "Southern California Association of Governments"),
    # in Madera County, so update to Madera County Transportation Commission
    "North Fork": ("North Fork", "Madera County Transportation Commission"),
}

# Apply each fix to every row whose reported city matches the key.
# The column indexer is a list, which is the unambiguous way to select two columns.
for city_name, cdp_and_rtpa in rtpa_update_dict.items():
    is_city = ntd_data_to_rtpa["city"] == city_name
    ntd_data_to_rtpa.loc[is_city, ["cdp_name", "RTPA"]] = cdp_and_rtpa

In [13]:
# Re-run the NaN-RTPA check; an empty result means every row now has an RTPA.
still_missing_rtpa = ntd_data_to_rtpa[ntd_data_to_rtpa["RTPA"].isna()]
still_missing_rtpa[["report_year", "agency", "city", "cdp_name", "RTPA"]]

Unnamed: 0,report_year,agency,city,cdp_name,RTPA


### Duplicate NTD ID fixes
- review ntd_id's with more than 2 rows. there are some CDPs with duplicate names.
- remove the rows with non-matching UZA to RTPA names

In [14]:
# ntd_ids to inspect for fanned-out (duplicated) rows.
check_ntd_id = ["90256", "90287"]

is_flagged_id = ntd_data_to_rtpa["ntd_id"].isin(check_ntd_id)
ntd_data_to_rtpa.loc[is_flagged_id]

Unnamed: 0,report_year,ntd_id,agency,reporter_type,organization_type,city,state,primary_uza_name,actual_vehicles_passenger_car_revenue_hours,actual_vehicles_passenger_car_revenue_miles,unlinked_passenger_trips_upt,cdp_name,RTPA,_merge
225,2022,90256,City of Burbank,Reduced Reporter,"City, County or Local Government Unit or Depar...",Burbank,CA,"Los Angeles--Long Beach--Anaheim, CA",27008.0,243964.0,161971.0,Burbank,Metropolitan Transportation Commission,both
226,2022,90256,City of Burbank,Reduced Reporter,"City, County or Local Government Unit or Depar...",Burbank,CA,"Los Angeles--Long Beach--Anaheim, CA",27008.0,243964.0,161971.0,Burbank,Southern California Association of Governments,both
227,2023,90256,City of Burbank,Reduced Reporter,"City, County or Local Government Unit or Depar...",Burbank,CA,"Los Angeles--Long Beach--Anaheim, CA",28827.0,271752.0,171517.0,Burbank,Metropolitan Transportation Commission,both
228,2023,90256,City of Burbank,Reduced Reporter,"City, County or Local Government Unit or Depar...",Burbank,CA,"Los Angeles--Long Beach--Anaheim, CA",28827.0,271752.0,171517.0,Burbank,Southern California Association of Governments,both
285,2022,90287,Palos Verdes Peninsula Transit Authority,Reduced Reporter,Independent Public Agency or Authority of Tran...,Palos Verdes Peninsula,CA,"Los Angeles--Long Beach--Anaheim, CA",16411.0,242914.0,122042.0,Rolling Hills,Southern California Association of Governments,left_only
286,2023,90287,Palos Verdes Peninsula Transit Authority,Reduced Reporter,Independent Public Agency or Authority of Tran...,Rolling Hills,CA,"Los Angeles--Long Beach--Anaheim, CA",18785.0,258339.0,131176.0,Rolling Hills,Madera County Transportation Commission,both
287,2023,90287,Palos Verdes Peninsula Transit Authority,Reduced Reporter,Independent Public Agency or Authority of Tran...,Rolling Hills,CA,"Los Angeles--Long Beach--Anaheim, CA",18785.0,258339.0,131176.0,Rolling Hills,Southern California Association of Governments,both


In [15]:
# Rows to drop: the duplicate-name matches that attached the wrong RTPA
# to ntd_id 90256 (MTC) and ntd_id 90287 (Madera County).
ntd_ids = ntd_data_to_rtpa["ntd_id"]
rtpa_names = ntd_data_to_rtpa["RTPA"]

wrong_rtpa_90256 = (ntd_ids == "90256") & (
    rtpa_names == "Metropolitan Transportation Commission"
)
wrong_rtpa_90287 = (ntd_ids == "90287") & (
    rtpa_names == "Madera County Transportation Commission"
)

# Keep every row that matches neither drop condition.
keep_rows = ~wrong_rtpa_90256 & ~wrong_rtpa_90287
ntd_data_to_rtpa = ntd_data_to_rtpa.loc[keep_rows]

# Check again: the most frequent ntd_ids and their row counts.
print(ntd_data_to_rtpa["ntd_id"].value_counts().head())

90003    2
91005    2
90276    2
90277    2
90278    2
Name: ntd_id, dtype: int64


## final checks

In [16]:
# With a left join and the duplicated rows removed, the merged frame must have
# exactly one row per original `ntd_service` row. Fail loudly if it does not.
assert len(ntd_service) == len(ntd_data_to_rtpa), (
    f"row count changed: ntd_service={len(ntd_service)}, "
    f"ntd_data_to_rtpa={len(ntd_data_to_rtpa)}"
)

# `.info()` prints its report and returns None, so it is called on its own
# instead of being passed to display() (which would render a stray `None`).
# Every column should be fully populated except primary_uza_name, because the
# rural reporters don't get a UZA name.
ntd_data_to_rtpa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421 entries, 0 to 423
Data columns (total 14 columns):
 #   Column                                       Non-Null Count  Dtype   
---  ------                                       --------------  -----   
 0   report_year                                  421 non-null    int64   
 1   ntd_id                                       421 non-null    object  
 2   agency                                       421 non-null    object  
 3   reporter_type                                421 non-null    object  
 4   organization_type                            421 non-null    object  
 5   city                                         421 non-null    object  
 6   state                                        421 non-null    object  
 7   primary_uza_name                             328 non-null    object  
 8   actual_vehicles_passenger_car_revenue_hours  421 non-null    float64 
 9   actual_vehicles_passenger_car_revenue_miles  421 non-null    floa

True

None

## Create new `ntd_id_to_rtpa_crosswalk` file!

In [22]:
# Build the crosswalk: one row per ntd_id (the first occurrence is kept),
# limited to the identifying columns plus the assigned RTPA.
crosswalk_cols = ["ntd_id", "agency", "reporter_type", "city", "RTPA"]
ntd_data_to_rtpa_cleaned = (
    ntd_data_to_rtpa[crosswalk_cols]
    .drop_duplicates(subset=["ntd_id"])
    .reset_index(drop=True)
)

# `.info()` prints its report and returns None, so it is called on its own
# instead of being passed to display() (which would render a stray `None`).
ntd_data_to_rtpa_cleaned.info()

display(
    ntd_data_to_rtpa_cleaned["ntd_id"].value_counts().head(),  # each ntd_id should appear once
    ntd_data_to_rtpa_cleaned["reporter_type"].value_counts(),
    ntd_data_to_rtpa_cleaned.head(),
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ntd_id         213 non-null    object
 1   agency         213 non-null    object
 2   reporter_type  213 non-null    object
 3   city           213 non-null    object
 4   RTPA           213 non-null    object
dtypes: object(5)
memory usage: 8.4+ KB


None

90003    1
90277    1
90279    1
90281    1
90282    1
Name: ntd_id, dtype: int64

Reduced Reporter    84
Full Reporter       83
Rural Reporter      46
Name: reporter_type, dtype: int64

Unnamed: 0,ntd_id,agency,reporter_type,city,RTPA
0,90003,"San Francisco Bay Area Rapid Transit District,...",Full Reporter,Oakland,Metropolitan Transportation Commission
1,90004,Golden Empire Transit District,Full Reporter,Bakersfield,Kern Council of Governments
2,90006,Santa Cruz Metropolitan Transit District,Full Reporter,Santa Cruz,Santa Cruz County Regional Transportation Comm...
3,90008,"City of Santa Monica, dba: Big Blue Bus",Full Reporter,Santa Monica,Southern California Association of Governments
4,90009,San Mateo County Transit District,Full Reporter,San Carlos,Metropolitan Transportation Commission


# Save crosswalk to GCS
- as `.parquet` and `.csv`

In [23]:
# Writing to GCS is opt-in so that "Run All" does not overwrite the crosswalk
# files by accident. Set to True to (re)generate the files that the
# read-back cell below loads.
SAVE_CROSSWALK_TO_GCS = False

if SAVE_CROSSWALK_TO_GCS:
    ntd_data_to_rtpa_cleaned.to_parquet(
        f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"
    )
    # index=False: the frame's index was reset to a plain RangeIndex, so writing
    # it would only add an unnamed column when the CSV is read back.
    ntd_data_to_rtpa_cleaned.to_csv(
        f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.csv",
        index=False,
    )


## Test reading in data from gcs

In [24]:
# Read both saved copies of the crosswalk back from GCS.
xwalk_parquet = pd.read_parquet(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"
)
xwalk_csv = pd.read_csv(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.csv"
)

# The two formats should hold the same number of rows.
row_counts_match = len(xwalk_csv) == len(xwalk_parquet)
display(row_counts_match)

True