In [56]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import annual_ridership_module
import gcsfs
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from siuba import _, collect, count, filter, select, show_query
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

# NOTE: this deliberately replaces the GCS_FILE_PATH imported from update_vars
# with the NTD analysis folder used by this notebook.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Show complete frames (no row/column/width truncation) in cell outputs.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Explore updating RTPA-to-NTD_id crosswalk

Current rtpa/ntd_id crosswalk only contains full system reporters. This notebook will explore updating the crosswalk to include all reporter types (full, reduced, and rural system reporters).

- Start by getting the list of NTD reporters from the warehouse tables `dim_annual_service_agencies` AND `fct_service_data_and_operating_expenses_time_series_by_mode_upt`
- Then get a list of all California cities and all RTPAs, and spatially join them to get a list of cities and the RTPA each one falls in
- Then merge the list of NTD reporters to the list of cities/RTPAs using the `city` column. The result should be a list of NTD reporters with their RTPAs
---

## Read in `dim_annual_service_agencies` to get agency info

In [2]:
# dim_annual_service_agencies is the annual report module.
# It includes UZA, VRM, VRH and UPT.
# Report years include 2022-2023.

get_ntd_service = (
    tbls.mart_ntd.dim_annual_service_agencies()
    >> filter(_.state == "CA")
    >> select(
        "report_year",
        "ntd_id",
        "agency",
        "reporter_type",
        "organization_type",
        "city",
        "state",
        "primary_uza_name",
        "actual_vehicles_passenger_car_revenue_hours",
        "actual_vehicles_passenger_car_revenue_miles",
        "unlinked_passenger_trips_upt",
    )
    >> collect()
)

# Collapse the report years into one row per agency name / ntd_id / reporter type /
# city / UZA, summing UPT.
# dropna=False is required: by default groupby silently discards every row that has
# a null in any grouping key, so a reporter without e.g. a primary_uza_name would
# vanish from the crosswalk input without any warning.
ntd_service = (
    get_ntd_service.groupby(
        [
            "agency",
            "ntd_id",
            "reporter_type",
            "city",
            "primary_uza_name",
        ],
        dropna=False,
    )
    .agg({"unlinked_passenger_trips_upt": "sum"})
    .reset_index()
)

ntd_service.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   agency                        236 non-null    object 
 1   ntd_id                        236 non-null    object 
 2   reporter_type                 236 non-null    object 
 3   city                          236 non-null    object 
 4   primary_uza_name              236 non-null    object 
 5   unlinked_passenger_trips_upt  236 non-null    float64
dtypes: float64(1), object(5)
memory usage: 11.2+ KB


In [57]:
# Some ntd_ids show up on more than one row, apparently because the agency name
# differs between report years (e.g. with and without the "dba:" suffix).
ntd_service.sort_values(by="ntd_id").head(10)

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt
198,"San Francisco Bay Area Rapid Transit District, dba: SF BART",90003,Full Reporter,Oakland,"San Francisco--Oakland, CA",50764402.0
197,San Francisco Bay Area Rapid Transit District,90003,Full Reporter,Oakland,"San Francisco--Oakland, CA",38224072.0
138,Golden Empire Transit District,90004,Full Reporter,Bakersfield,"Bakersfield, CA",6494639.0
212,Santa Cruz Metropolitan Transit District,90006,Full Reporter,Santa Cruz,"Santa Cruz, CA",6187917.0
108,"City of Santa Monica, dba: Big Blue Bus",90008,Full Reporter,Santa Monica,"Los Angeles--Long Beach--Anaheim, CA",14101648.0
206,"San Mateo County Transit District, dba: SamTrans",90009,Full Reporter,San Carlos,"San Francisco--Oakland, CA",8773845.0
205,San Mateo County Transit District,90009,Full Reporter,San Carlos,"San Francisco--Oakland, CA",7128074.0
115,"City of Torrance, dba: Torrance Transit System",90010,Full Reporter,Torrance,"Los Angeles--Long Beach--Anaheim, CA",3962066.0
203,"San Joaquin Regional Transit District, dba: San Joaquin RTD",90012,Full Reporter,Stockton,"Stockton, CA",2301789.0
202,San Joaquin Regional Transit District,90012,Full Reporter,Stockton,"Stockton, CA",1839753.0


## Manual Update to `ntd_service`

In [3]:
# Inspect the row(s) for NTD ID 90227 before applying the manual city fixes.
ntd_service.loc[ntd_service["ntd_id"].eq("90227")]

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt
88,"City of Moorpark, dba: Moorpark City Transit",90227,Reduced Reporter,Moorpark,"Thousand Oaks, CA",26853.0


In [4]:
# Manual city values keyed by ntd_id.
update_dict = {
    "90227": "Moorpark",
    "90253": "Bell Gardens",
    "90259": "Cerritos",
    "90286": "Monterey Park",
}

# Overwrite `city` on every row of each listed reporter.
for ntd_id, corrected_city in update_dict.items():
    is_target_reporter = ntd_service["ntd_id"] == ntd_id
    ntd_service.loc[is_target_reporter, "city"] = corrected_city

## Read in data from `fct_service_data_and_operating_expenses_time_series_by_mode_upt`

In [5]:
# New warehouse table.
# Includes UPT, mode, type of service and report year; filtered here to 2018 onward.
# Contains historical / inactive agencies.

get_ntd_time_series = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
    >> filter(_.state == "CA",
              _.year >= "2018",
              _.city != None,
             )
    >> select(
        'agency_name',
        'agency_status',
        'city',
        'legacy_ntd_id',
        'mode',
        'ntd_id',
        'reporter_type',
        'reporting_module',
        'service',
        'state',
        'uace_code',
        'primary_uza_name',
        'uza_population',
        'year',
        'upt',
    )
    >> collect()
)

# Collapse modes, services and years into one row per agency, summing UPT.
# dropna=False keeps agencies that have a null in any grouping key (for example
# agency_status or primary_uza_name); the groupby default would silently drop
# them, and this frame is what the crosswalk is later built from.
ntd_time_series = (
    get_ntd_time_series.groupby(
        [
            "agency_name",
            "agency_status",
            "city",
            "ntd_id",
            "primary_uza_name",
            "reporter_type",
        ],
        dropna=False,
    )
    .agg({"upt": "sum"})
    .sort_values(by="ntd_id")
    .reset_index()
)

ntd_time_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agency_name       253 non-null    object 
 1   agency_status     253 non-null    object 
 2   city              253 non-null    object 
 3   ntd_id            253 non-null    object 
 4   primary_uza_name  253 non-null    object 
 5   reporter_type     253 non-null    object 
 6   upt               253 non-null    float64
dtypes: float64(1), object(6)
memory usage: 14.0+ KB


## Read in GDF of Census Designated Places (aka cities) and RTPA boundaries

Census Designated Places (CDPs) 2010 - California map
- https://data.sacog.org/datasets/SACOG::census-designated-places-cdps-2010-california/about


RTPA map open data

- https://www.lab.data.ca.gov/dataset/regional-transportation-planning-agencies

In [6]:
# RTPA boundary layer, downloaded as GeoJSON.
rtpa_url = "https://cecgis-caenergy.opendata.arcgis.com/api/download/v1/items/3a83743378be4e7f84c8230889c01dea/geojson?layers=0"
rtpa_columns = ["RTPA", "LABEL_RTPA", "geometry"]

# Keep only the name/label/geometry columns and reproject to ESRI:102600
# so the layer lines up with the CDP centroids in the spatial join later.
rtpa_map = gpd.read_file(rtpa_url)[rtpa_columns].to_crs("ESRI:102600")

In [7]:
# California Census Designated Places (2010); the layer holds both cities and CDPs.
cdp_url = "https://services6.arcgis.com/YBp5dUuxCMd8W1EI/arcgis/rest/services/California_Census_Designated_Places_2010/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
keep_cdp_col = ["FID", "NAME10", "NAMELSAD10", "geometry"]

cdp_raw = gpd.read_file(cdp_url)
cdp_map = cdp_raw[keep_cdp_col].rename(
    columns={"NAME10": "cdp_name", "NAMELSAD10": "name_lsad"}
)

## get centroid of CDPs to get point geom instead of polygons

In [8]:
# Reproject the polygons to ESRI:102600 only for the centroid computation;
# cdp_map's own geometry column is left in its original CRS.
projected_polygons = cdp_map["geometry"].to_crs("ESRI:102600")
cdp_map["centroid"] = projected_polygons.centroid

# Point layer: the centroid column takes over as the active geometry.
cdp_points = cdp_map.set_geometry("centroid", drop=True)

## sjoin `cdp_points` to `rtpa_map` to get a cities-to-RTPA crosswalk

In [9]:
# Sanity check before the spatial join: the point and polygon layers must share a CRS.
cdp_points.crs == rtpa_map.crs
# both are ESRI:102600

True

In [10]:
# Attach to every CDP/city centroid the RTPA polygon it falls in.
# Row counts observed for the how/predicate combinations that were tried:
#   left  + intersects -> 1523 rows, 1521 CDPs matched; 2 CDPs got no RTPA
#                         (Avalon and City of San Francisco)
#   inner + intersects -> 1521 rows, 1521 CDPs matched
#   left  + within     -> 1523 rows, 1521 CDPs matched; 2 CDPs got no RTPA
#   inner + within     -> 1521 rows, 1521 CDPs matched
# A left join is used so the two unmatched CDPs stay in the frame.
city_to_rtpa = cdp_points.sjoin(  # cdp_points includes cities and CDPs
    rtpa_map,
    how="left",
    predicate="intersects",
)

### fix unmerged CDP rtpa rows

In [11]:
# The two CDPs the spatial join left without an RTPA get their values set by hand.
rtpa_columns_to_fill = ["RTPA", "LABEL_RTPA"]

# Avalon -> SCAG
is_avalon = city_to_rtpa["cdp_name"] == "Avalon"
city_to_rtpa.loc[is_avalon, rtpa_columns_to_fill] = (
    "Southern California Association of Governments",
    "SCAG",
)

# San Francisco -> MTC
is_san_francisco = city_to_rtpa["cdp_name"] == "San Francisco"
city_to_rtpa.loc[is_san_francisco, rtpa_columns_to_fill] = (
    "Metropolitan Transportation Commission",
    "MTC",
)

# Show both rows to confirm the fix.
city_to_rtpa[city_to_rtpa["cdp_name"].isin(["Avalon", "San Francisco"])]

Unnamed: 0,FID,cdp_name,name_lsad,geometry,index_right,RTPA,LABEL_RTPA
862,863,Avalon,Avalon city,POINT (510003.406 7421706.792),,Southern California Association of Governments,SCAG
889,890,San Francisco,San Francisco city,POINT (-777550.185 9040909.841),,Metropolitan Transportation Commission,MTC


## merge `ntd_service` to `city_to_rtpa`

In [12]:
# Left join keeps every reporter row:
#  - reporters whose city is not in city_to_rtpa come back with a NaN RTPA
#  - duplicate CDP names fan a reporter out into several rows (1:m)
# `indicator=True` adds the `_merge` column used to find the unmatched rows.
ntd_data_to_rtpa = pd.merge(
    ntd_service,
    city_to_rtpa[["cdp_name", "RTPA"]],
    how="left",
    left_on="city",
    right_on="cdp_name",
    indicator=True,
)
ntd_data_to_rtpa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238 entries, 0 to 237
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   agency                        238 non-null    object  
 1   ntd_id                        238 non-null    object  
 2   reporter_type                 238 non-null    object  
 3   city                          238 non-null    object  
 4   primary_uza_name              238 non-null    object  
 5   unlinked_passenger_trips_upt  238 non-null    float64 
 6   cdp_name                      232 non-null    object  
 7   RTPA                          232 non-null    object  
 8   _merge                        238 non-null    category
dtypes: category(1), float64(1), object(7)
memory usage: 17.1+ KB


In [44]:
# Preview the first rows of the annual-service reporters with their merged RTPA.
ntd_data_to_rtpa.head()

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt,cdp_name,RTPA,_merge
0,Access Services,90157,Full Reporter,El Monte,"Los Angeles--Long Beach--Anaheim, CA",6472858.0,El Monte,Southern California Association of Governments,both
1,Alameda-Contra Costa Transit District,90014,Full Reporter,Oakland,"San Francisco--Oakland, CA",29347581.0,Oakland,Metropolitan Transportation Commission,both
2,"Alameda-Contra Costa Transit District, dba: AC...",90014,Full Reporter,Oakland,"San Francisco--Oakland, CA",35190057.0,Oakland,Metropolitan Transportation Commission,both
3,Altamont Corridor Express,90182,Full Reporter,Stockton,"Stockton, CA",796250.0,Stockton,San Joaquin Council of Governments,both
4,Anaheim Transportation Network,90211,Full Reporter,Anaheim,"Los Angeles--Long Beach--Anaheim, CA",7187312.0,Anaheim,Southern California Association of Governments,both


## merge `ntd_time_series` to `city_to_rtpa`

In [13]:
# Same left join as for ntd_service, applied to the time-series reporters:
#  - reporters whose city is not in city_to_rtpa come back with a NaN RTPA
#  - duplicate CDP names fan a reporter out into several rows (1:m)
alt_ntd_to_rtpa = pd.merge(
    ntd_time_series,
    city_to_rtpa[["cdp_name", "RTPA"]],
    how="left",
    left_on="city",
    right_on="cdp_name",
    indicator=True,
)
alt_ntd_to_rtpa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255 entries, 0 to 254
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   agency_name       255 non-null    object  
 1   agency_status     255 non-null    object  
 2   city              255 non-null    object  
 3   ntd_id            255 non-null    object  
 4   primary_uza_name  255 non-null    object  
 5   reporter_type     255 non-null    object  
 6   upt               255 non-null    float64 
 7   cdp_name          248 non-null    object  
 8   RTPA              248 non-null    object  
 9   _merge            255 non-null    category
dtypes: category(1), float64(1), object(8)
memory usage: 20.3+ KB


## check for unmerged values 

In [17]:
# Matched ("both") vs. unmatched ("left_only") row counts for each merged frame.
display(
    *[
        merged_frame["_merge"].value_counts()
        for merged_frame in (ntd_data_to_rtpa, alt_ntd_to_rtpa)
    ]
)

both          232
left_only       6
right_only      0
Name: _merge, dtype: int64

both          248
left_only       7
right_only      0
Name: _merge, dtype: int64

### manual updates to `ntd_data_to_rtpa` & `alt_ntd_to_rtpa`

In [16]:
# Reporter rows that came out of the merge without an RTPA.
rtpa_is_missing = ntd_data_to_rtpa["RTPA"].isna()
ntd_data_to_rtpa.loc[rtpa_is_missing, ["agency", "city", "cdp_name", "RTPA"]]

Unnamed: 0,agency,city,cdp_name,RTPA
14,Chemehuevi Indian Tribe,Havasu Lake,,
15,Chemehuevi Indian Tribe,Havasu Lake,,
132,County of Ventura,Ventura,,
180,North Fork Rancheria of Mono Indians of Califo...,North Fork,,
181,North Fork Rancheria of Mono Indians of Califo...,North Fork,,
185,Palos Verdes Peninsula Transit Authority,Palos Verdes Peninsula,,


In [18]:
# Time-series reporter rows without an RTPA, de-duplicated and ordered by city.
alt_rtpa_is_missing = alt_ntd_to_rtpa["RTPA"].isna()
(
    alt_ntd_to_rtpa.loc[alt_rtpa_is_missing, ["agency_name", "city", "cdp_name", "RTPA"]]
    .drop_duplicates()
    .sort_values(by="city")
)

Unnamed: 0,agency_name,city,cdp_name,RTPA
243,Chemehuevi Indian Tribe (CTA) - Reality and Pl...,Havasu Lake,,
234,City of McFarland - Public Works Department,Mcfarland,,
168,City of Monterey Park - Public Works Department,Montery Park,,
245,North Fork Rancheria of Mono Indians of Califo...,North Fork,,
95,Paso Robles Transit Services (PE),Paso Robles,,
68,"DAVE Transportation Services, Inc.",Sherman Oaks,,
181,County of Ventura (PWATD) - Public Works,Ventura,,


In [19]:
# Distinct city names that still need a manual RTPA assignment.
alt_ntd_to_rtpa.loc[alt_ntd_to_rtpa["RTPA"].isna(), "city"].unique()

array(['Sherman Oaks', 'Paso Robles', 'Montery Park', 'Ventura',
       'Mcfarland', 'Havasu Lake', 'North Fork'], dtype=object)

In [20]:
# Look for any CDP whose name contains "Sherman".
city_to_rtpa[city_to_rtpa["cdp_name"].str.contains("Sherman")]

# Sherman Oaks does not exist as a city in the city-to-RTPA list.

Unnamed: 0,FID,cdp_name,name_lsad,geometry,index_right,RTPA,LABEL_RTPA


In [21]:
# Manual (cdp_name, RTPA) values, keyed by the reporter's `city`, for the cities
# that found no match in city_to_rtpa.

update_dict = {
    "Mcfarland": ("Mcfarland", "Kern Council of Governments"),
    "Ventura": ("Ventura", "Southern California Association of Governments"),
    # Rolling Hills is used to match the other entries for this agency
    "Palos Verdes Peninsula": ("Rolling Hills", "Southern California Association of Governments"),
    # aka Lake Havasu; shares a zip code with Needles, so assigned to SCAG
    "Havasu Lake": ("Havasu Lake", "Southern California Association of Governments"),
    # in Madera County
    "North Fork": ("North Fork", "Madera County Transportation Commission"),
    "Montery Park": ("Monterey Park", "Southern California Association of Governments"),
    "Paso Robles": ("Paso Robles", "San Luis Obispo Council of Governments"),
    "Sherman Oaks": ("Sherman Oaks", "Southern California Association of Governments"),
}

# Apply each fix to both merged frames.
columns_to_fill = ["cdp_name", "RTPA"]
for city_name, cdp_and_rtpa in update_dict.items():
    for merged_frame in (ntd_data_to_rtpa, alt_ntd_to_rtpa):
        merged_frame.loc[merged_frame["city"] == city_name, columns_to_fill] = cdp_and_rtpa


In [23]:
# Re-check for missing RTPAs after the manual fixes; both frames should come back empty.
still_missing_service = ntd_data_to_rtpa.loc[
    ntd_data_to_rtpa["RTPA"].isna(), ["agency", "city", "cdp_name", "RTPA"]
]
still_missing_time_series = alt_ntd_to_rtpa.loc[
    alt_ntd_to_rtpa["RTPA"].isna(), ["agency_name", "city", "cdp_name", "RTPA"]
]
display(still_missing_service, still_missing_time_series)

Unnamed: 0,agency,city,cdp_name,RTPA


Unnamed: 0,agency_name,city,cdp_name,RTPA


### Duplicate NTD ID fixes
- review ntd_id's with more than one row. Some CDPs share the same name, so one reporter can match two RTPAs.
- remove the rows with non-matching UZA to RTPA names

In [24]:
# Look for ntd_ids that appear on more than one row.
alt_ntd_to_rtpa["ntd_id"].value_counts().head()

90287    2
90256    2
90003    1
90293    1
90280    1
Name: ntd_id, dtype: int64

In [25]:
# Reporters that matched two RTPAs because their CDP name exists in two regions.
check_ntd_id = [
    "90256",  # City of Burbank: matched to MTC and SCAG, there is a "Burbank" in both areas
    "90287",  # Palos Verdes Peninsula Transit Authority: similarly, shows in 2 RTPAs
]

duplicated_service_rows = ntd_data_to_rtpa[ntd_data_to_rtpa["ntd_id"].isin(check_ntd_id)]
duplicated_time_series_rows = alt_ntd_to_rtpa[alt_ntd_to_rtpa["ntd_id"].isin(check_ntd_id)]
display(duplicated_service_rows, duplicated_time_series_rows)

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt,cdp_name,RTPA,_merge
34,City of Burbank,90256,Reduced Reporter,Burbank,"Los Angeles--Long Beach--Anaheim, CA",333488.0,Burbank,Metropolitan Transportation Commission,both
35,City of Burbank,90256,Reduced Reporter,Burbank,"Los Angeles--Long Beach--Anaheim, CA",333488.0,Burbank,Southern California Association of Governments,both
185,Palos Verdes Peninsula Transit Authority,90287,Reduced Reporter,Palos Verdes Peninsula,"Los Angeles--Long Beach--Anaheim, CA",122042.0,Rolling Hills,Southern California Association of Governments,left_only
186,Palos Verdes Peninsula Transit Authority,90287,Reduced Reporter,Rolling Hills,"Los Angeles--Long Beach--Anaheim, CA",131176.0,Rolling Hills,Madera County Transportation Commission,both
187,Palos Verdes Peninsula Transit Authority,90287,Reduced Reporter,Rolling Hills,"Los Angeles--Long Beach--Anaheim, CA",131176.0,Rolling Hills,Southern California Association of Governments,both


Unnamed: 0,agency_name,agency_status,city,ntd_id,primary_uza_name,reporter_type,upt,cdp_name,RTPA,_merge
137,City of Burbank - Community Development-Transp...,Active,Burbank,90256,"Los Angeles--Long Beach--Anaheim, CA",Reduced Reporter,1156840.0,Burbank,Metropolitan Transportation Commission,both
138,City of Burbank - Community Development-Transp...,Active,Burbank,90256,"Los Angeles--Long Beach--Anaheim, CA",Reduced Reporter,1156840.0,Burbank,Southern California Association of Governments,both
169,Palos Verdes Peninsula Transit Authority (PVPTA),Active,Rolling Hills,90287,"Los Angeles--Long Beach--Anaheim, CA",Reduced Reporter,703745.0,Rolling Hills,Madera County Transportation Commission,both
170,Palos Verdes Peninsula Transit Authority (PVPTA),Active,Rolling Hills,90287,"Los Angeles--Long Beach--Anaheim, CA",Reduced Reporter,703745.0,Rolling Hills,Southern California Association of Governments,both


In [26]:
# Masks for the wrong-RTPA duplicates: Burbank matched to MTC, and
# Palos Verdes Peninsula Transit Authority matched to Madera.
remove_1 = ntd_data_to_rtpa["ntd_id"].eq("90256") & ntd_data_to_rtpa["RTPA"].eq(
    "Metropolitan Transportation Commission"
)
remove_2 = ntd_data_to_rtpa["ntd_id"].eq("90287") & ntd_data_to_rtpa["RTPA"].eq(
    "Madera County Transportation Commission"
)

# Keep only the rows flagged by neither mask.
ntd_data_to_rtpa = ntd_data_to_rtpa.loc[~remove_1 & ~remove_2]


In [27]:
# Same wrong-RTPA duplicates, removed from the time-series frame.
remove_3 = alt_ntd_to_rtpa["ntd_id"].eq("90256") & alt_ntd_to_rtpa["RTPA"].eq(
    "Metropolitan Transportation Commission"
)
remove_4 = alt_ntd_to_rtpa["ntd_id"].eq("90287") & alt_ntd_to_rtpa["RTPA"].eq(
    "Madera County Transportation Commission"
)

# Keep only the rows flagged by neither mask.
alt_ntd_to_rtpa = alt_ntd_to_rtpa.loc[~remove_3 & ~remove_4]


In [28]:
# Confirm that each checked reporter is now tied to a single RTPA.
display(
    *[
        merged_frame[merged_frame["ntd_id"].isin(check_ntd_id)]
        for merged_frame in (ntd_data_to_rtpa, alt_ntd_to_rtpa)
    ]
)

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt,cdp_name,RTPA,_merge
35,City of Burbank,90256,Reduced Reporter,Burbank,"Los Angeles--Long Beach--Anaheim, CA",333488.0,Burbank,Southern California Association of Governments,both
185,Palos Verdes Peninsula Transit Authority,90287,Reduced Reporter,Palos Verdes Peninsula,"Los Angeles--Long Beach--Anaheim, CA",122042.0,Rolling Hills,Southern California Association of Governments,left_only
187,Palos Verdes Peninsula Transit Authority,90287,Reduced Reporter,Rolling Hills,"Los Angeles--Long Beach--Anaheim, CA",131176.0,Rolling Hills,Southern California Association of Governments,both


Unnamed: 0,agency_name,agency_status,city,ntd_id,primary_uza_name,reporter_type,upt,cdp_name,RTPA,_merge
138,City of Burbank - Community Development-Transp...,Active,Burbank,90256,"Los Angeles--Long Beach--Anaheim, CA",Reduced Reporter,1156840.0,Burbank,Southern California Association of Governments,both
170,Palos Verdes Peninsula Transit Authority (PVPTA),Active,Rolling Hills,90287,"Los Angeles--Long Beach--Anaheim, CA",Reduced Reporter,703745.0,Rolling Hills,Southern California Association of Governments,both


## final checks

In [39]:
# With a left join and the fanned-out duplicates removed, each merged frame
# should have exactly as many rows as the frame it was built from.
# (ntd_data_to_rtpa.info() can be used to confirm every column has data; primary
# UZA is the expected exception, since rural reporters don't get a UZA name.)
service_rows_match = len(ntd_service) == len(ntd_data_to_rtpa)
time_series_rows_match = len(ntd_time_series) == len(alt_ntd_to_rtpa)
display(service_rows_match, time_series_rows_match)

True

True

In [42]:
# Preview the annual-service frame after the manual fixes and duplicate removal.
ntd_data_to_rtpa.head()

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt,cdp_name,RTPA,_merge
0,Access Services,90157,Full Reporter,El Monte,"Los Angeles--Long Beach--Anaheim, CA",6472858.0,El Monte,Southern California Association of Governments,both
1,Alameda-Contra Costa Transit District,90014,Full Reporter,Oakland,"San Francisco--Oakland, CA",29347581.0,Oakland,Metropolitan Transportation Commission,both
2,"Alameda-Contra Costa Transit District, dba: AC...",90014,Full Reporter,Oakland,"San Francisco--Oakland, CA",35190057.0,Oakland,Metropolitan Transportation Commission,both
3,Altamont Corridor Express,90182,Full Reporter,Stockton,"Stockton, CA",796250.0,Stockton,San Joaquin Council of Governments,both
4,Anaheim Transportation Network,90211,Full Reporter,Anaheim,"Los Angeles--Long Beach--Anaheim, CA",7187312.0,Anaheim,Southern California Association of Governments,both


In [40]:
# Compare how many distinct NTD IDs each candidate frame covers.
service_id_count = ntd_data_to_rtpa["ntd_id"].nunique()
time_series_id_count = alt_ntd_to_rtpa["ntd_id"].nunique()

if service_id_count != time_series_id_count:
    print(
        f" \n"
        f"    # of NTD ID in ntd_data_to_rtpa: {service_id_count}\n"
        f"    # of NTD ID in alt_ntd_to_rtpa: {time_series_id_count}\n"
        f"    "
    )
else:
    print("same amount of NTD_ID, use either list")


 
    # of NTD ID in ntd_data_to_rtpa: 167
    # of NTD ID in alt_ntd_to_rtpa: 253
    


## Create new `ntd_id_to_rtpa_crosswalk` file!

In [63]:
# Build the crosswalk from the time-series frame: keep the identifying columns
# and the first row found for each ntd_id.
ntd_data_to_rtpa_cleaned = (
    alt_ntd_to_rtpa[["ntd_id", "agency_name", "reporter_type", "city", "RTPA"]]
    .drop_duplicates(subset=["ntd_id"])
    .reset_index(drop=True)
)

# DataFrame.info() prints its report and returns None, so it is called on its own;
# passing it to display() would render a stray "None" in the output.
ntd_data_to_rtpa_cleaned.info()

display(
    ntd_data_to_rtpa_cleaned["ntd_id"].value_counts().head(),
    ntd_data_to_rtpa_cleaned["reporter_type"].value_counts(),
    ntd_data_to_rtpa_cleaned["agency_name"].value_counts().head(),
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ntd_id         253 non-null    object
 1   agency_name    253 non-null    object
 2   reporter_type  253 non-null    object
 3   city           253 non-null    object
 4   RTPA           253 non-null    object
dtypes: object(5)
memory usage: 10.0+ KB


None

90003    1
90293    1
90280    1
90281    1
90282    1
Name: ntd_id, dtype: int64

Full Reporter        108
Reduced Reporter      93
Rural Reporter        50
Building Reporter      1
Separate Service       1
Name: reporter_type, dtype: int64

Los Angeles County Metropolitan Transportation Authority (LACMTA)    2
San Francisco Bay Area Rapid Transit District (BART)                 1
City of West Covina - Public Services Department                     1
City of Lawndale                                                     1
City of Lynwood - Public Works Department                            1
Name: agency_name, dtype: int64

In [62]:
# Display the finished crosswalk.
ntd_data_to_rtpa_cleaned

Unnamed: 0,ntd_id,agency_name,reporter_type,city,RTPA
0,90003,San Francisco Bay Area Rapid Transit District (BART),Full Reporter,Oakland,Metropolitan Transportation Commission
1,90004,Golden Empire Transit District (GET),Full Reporter,Bakersfield,Kern Council of Governments
2,90006,Santa Cruz Metropolitan Transit District (SCMTD),Full Reporter,Santa Cruz,Santa Cruz County Regional Transportation Commission
3,90007,City of Modesto (MAX),Full Reporter,Modesto,Stanislaus Council of Governments
4,90008,City of Santa Monica (BBB) - Department of Transportation,Full Reporter,Santa Monica,Southern California Association of Governments
5,90009,San Mateo County Transit District (SMCTD),Full Reporter,San Carlos,Metropolitan Transportation Commission
6,90010,City of Torrance (TTS) - Transit Department,Full Reporter,Torrance,Southern California Association of Governments
7,90012,San Joaquin Regional Transit District (RTD),Full Reporter,Stockton,San Joaquin Council of Governments
8,90013,Santa Clara Valley Transportation Authority (VTA),Full Reporter,San Jose,Metropolitan Transportation Commission
9,90014,Alameda-Contra Costa Transit District,Full Reporter,Oakland,Metropolitan Transportation Commission


# Save crosswalk to GCS
- as `.parquet` and `.csv`

In [64]:
# Write the crosswalk to GCS in both formats.
ntd_data_to_rtpa_cleaned.to_parquet(f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet")

# index=False: the frame only carries a default RangeIndex, and writing it would add
# a meaningless unnamed first column, making the CSV schema differ from the parquet.
ntd_data_to_rtpa_cleaned.to_csv(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.csv",
    index=False,
)


## Test reading in data from gcs

In [65]:
# Read both saved files back and confirm they hold the same number of rows.
crosswalk_path_stem = f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types"

xwalk_parquet = pd.read_parquet(f"{crosswalk_path_stem}.parquet")
xwalk_csv = pd.read_csv(f"{crosswalk_path_stem}.csv")

row_counts_match = len(xwalk_parquet) == len(xwalk_csv)
display(row_counts_match)

True

## Investigate agencies in RTPAs that dont match their cities
- Roseville
- Placer
- Tahoe
- El Dorado

In [67]:
# Agency-name fragments to look up in the saved crosswalk.
name_check = ["Roseville", "Placer", "Tahoe", "El Dorado"]

# str.contains treats the joined string as a regex alternation: any fragment matches.
name_pattern = "|".join(name_check)
xwalk_parquet.loc[xwalk_parquet["agency_name"].str.contains(name_pattern)]  # these align correctly.

Unnamed: 0,ntd_id,agency_name,reporter_type,city,RTPA
78,90168,City of Roseville - Public Works,Reduced Reporter,Roseville,Placer County Transportation Planning Agency
96,90196,County of Placer (PCT/TART) - Department of Public Works,Full Reporter,Auburn,Placer County Transportation Planning Agency
118,90229,El Dorado County Transit Authority (EDCTA),Reduced Reporter,Diamond Springs,El Dorado County Transportation Commission


In [75]:
# Where is Tahoe?
# Search every frame for "Tahoe": agency names in both reporter lists, CDP names
# in the city-to-RTPA list, and RTPA names in the saved crosswalk.

display(
    ntd_service[ntd_service["agency"].str.contains("Tahoe")],
    ntd_time_series[ntd_time_series["agency_name"].str.contains("Tahoe")],
    city_to_rtpa[city_to_rtpa["cdp_name"].str.contains("Tahoe")],
    xwalk_parquet[xwalk_parquet["RTPA"].str.contains("Tahoe")]
)

# investigated in BG, Tahoe Transportation District is HQ in Stateline, NV. with UZA = "Lake Tahoe, CA-NV"
# (the reporter frames are filtered to state == "CA", which is consistent with it not appearing above)

Unnamed: 0,agency,ntd_id,reporter_type,city,primary_uza_name,unlinked_passenger_trips_upt


Unnamed: 0,agency_name,agency_status,city,ntd_id,primary_uza_name,reporter_type,upt


Unnamed: 0,FID,cdp_name,name_lsad,geometry,index_right,RTPA,LABEL_RTPA
163,164,Sunnyside-Tahoe City,Sunnyside-Tahoe City CDP,POINT (-45717.780 9536944.668),37.0,Tahoe Regional Planning Agency,TRPA
171,172,Tahoe Vista,Tahoe Vista CDP,POINT (-15139.458 9572604.932),37.0,Tahoe Regional Planning Agency,TRPA
651,652,South Lake Tahoe,South Lake Tahoe city,POINT (4877.149 9459931.525),37.0,Tahoe Regional Planning Agency,TRPA


Unnamed: 0,ntd_id,agency_name,reporter_type,city,RTPA
