In [17]:
import sys

sys.path.append("../")  # up one level

import os
import shutil

import gcsfs
import geopandas as gpd
import pandas as pd
from annual_ridership_report import annual_ridership_module
from calitp_data_analysis.sql import get_engine, to_snakecase
from calitp_data_analysis.tables import tbls
from segment_speed_utils.project_vars import PUBLIC_GCS
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

# Notebook configuration: GCS folder used for the crosswalk files, plus pandas
# display limits.
# NOTE(review): this assignment rebinds the `GCS_FILE_PATH` name that was just
# imported from `update_vars`; confirm the two values are meant to agree.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ntd/"

# Lift the row / column / column-width display limits so frames render in full.
for _display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.max_colwidth",
):
    pd.set_option(_display_option, None)

# Explore updating RTPA-to-NTD_id crosswalk

Current rtpa/ntd_id crosswalk only contains full system reporters. This notebook will explore updating the crosswalk to include all reporter types (full, reduced, and rural system reporters).

- Start by getting the list of NTD reporters from the warehouse tables `dim_annual_service_agencies` AND `fct_service_data_and_operating_expenses_time_series_by_mode_upt`
- Then get a list of all California cities and all RTPAs, and merge them to get a list of cities and the RTPA each city falls in
- Then merge the list of NTD reporters to the list of cities/RTPAs, using the `city` column. The result should be a list of NTD reporters with their RTPAs
---

## Read in `dim_annual_service_agencies` to get agency info

In [None]:
# Collapse the agency rows to one row per agency / ntd_id / reporter type /
# city / UZA combination, summing unlinked passenger trips within each group.
# NOTE(review): `get_ntd_service` is not assigned in any cell shown in this
# notebook (only `get_ntd_service_2` is, further down). Confirm where it comes
# from; otherwise this cell raises NameError on a fresh kernel.
agency_group_cols = ["agency", "ntd_id", "reporter_type", "city", "primary_uza_name"]

ntd_service = (
    get_ntd_service.groupby(agency_group_cols)[["unlinked_passenger_trips_upt"]]
    .sum()
    .reset_index()
)

ntd_service.info()

In [7]:
# Engine returned by `calitp_data_analysis.sql.get_engine`; the SQL cells below
# open their connections from it.
db_engine = get_engine()

In [None]:
# Read the rows of `mart_ntd.dim_annual_service_agencies` whose state is 'CA'
# into `get_ntd_service_2`, keeping only the columns listed in the SELECT.
# The connection is opened from `db_engine` and closed when the block exits.
with db_engine.connect() as connection:
    # The query text is sent to the warehouse as written.
    query = """
        SELECT
            report_year,
            ntd_id,
            agency,
            reporter_type,
            organization_type,
            city,
            state,
            primary_uza_name,
            actual_vehicles_passenger_car_revenue_hours,
            actual_vehicles_passenger_car_revenue_miles,
            unlinked_passenger_trips_upt 
        FROM 
            cal-itp-data-infra.mart_ntd.dim_annual_service_agencies
        WHERE 
            state = 'CA'
            """
    get_ntd_service_2 = pd.read_sql(query, connection)

In [None]:
# Print the column dtypes and non-null counts of the raw query result.
get_ntd_service_2.info()

In [None]:
# Same aggregation as `ntd_service`, but built from the SQL result above:
# one row per agency / ntd_id / reporter type / city / UZA combination with
# unlinked passenger trips summed within each group.
agency_group_cols = ["agency", "ntd_id", "reporter_type", "city", "primary_uza_name"]

ntd_service_2 = (
    get_ntd_service_2.groupby(agency_group_cols)[["unlinked_passenger_trips_upt"]]
    .sum()
    .reset_index()
)

ntd_service_2.info()

In [None]:
# Check that the SQL-based frames are identical to the earlier ones, both for
# the raw rows and for the aggregated rows (`DataFrame.equals` returns a bool).
# NOTE(review): `get_ntd_service` is not assigned in any cell shown in this
# notebook; confirm where it comes from, otherwise this raises NameError on a
# fresh kernel.
display(get_ntd_service.equals(get_ntd_service_2), ntd_service.equals(ntd_service_2))

In [None]:
# Preview the first 10 rows ordered by ntd_id.
# some duplicate rows exist because the agency name differs through the years?!
ntd_service.sort_values(by="ntd_id").head(10)

## Manual Update to `ntd_service`

In [None]:
# Inspect the rows for ntd_id "90227", the first ID patched in the next cell.
ntd_service[ntd_service["ntd_id"] == "90227"]

In [None]:
# ntd_id -> city name to write into `ntd_service["city"]` for that reporter.
# NOTE(review): the name `update_dict` is reused later in the notebook for a
# dictionary with a different shape (city -> (cdp_name, RTPA)).
update_dict = {
    "90227": "Moorpark",
    "90253": "Bell Gardens",
    "90259": "Cerritos",
    "90286": "Monterey Park",
}

# Overwrite the city on every row belonging to each listed ntd_id (in place).
for target_ntd_id, corrected_city in update_dict.items():
    is_target_reporter = ntd_service["ntd_id"] == target_ntd_id
    ntd_service.loc[is_target_reporter, "city"] = corrected_city

## Read in data from `fct_service_data_and_operating_expenses_time_series_by_mode_upt`

In [19]:
# Read the UPT time-series-by-mode table into `get_ntd_time_series_2`.
# Filters applied by the WHERE clause below:
#   - state matches 'CA' or 'NV'
#   - year >= 2018
#   - city is not null
#   - primary_uza_name matches ', CA', 'CA-NV', 'California Non-UZA' or
#     'El Paso, TX--NM' (a later cell looks at which reporter carries the
#     El Paso UZA)
# The connection is opened from `db_engine` and closed when the block exits.
with db_engine.connect() as connection:
    # The query text is sent to the warehouse as written; the long backticked
    # name ending in `_1` is the alias given to the table in the FROM clause.
    query = """
        SELECT 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`agency_name`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`agency_status`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`city`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`legacy_ntd_id`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`mode`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`ntd_id`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`reporter_type`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`reporting_module`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`service`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`state`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`uace_code`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`primary_uza_name`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`uza_population`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`year`, 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`upt` 
        FROM 
            `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt` AS `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1` 
        WHERE 
            (regexp_contains(`mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`state`, 'CA') 
            OR regexp_contains(`mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`state`, 'NV')) 
            AND `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`year` >= 2018 
            AND `mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`city` IS NOT NULL 
            AND (regexp_contains(`mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`primary_uza_name`, ', CA') 
            OR regexp_contains(`mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`primary_uza_name`, 'CA-NV') 
            OR regexp_contains(`mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`primary_uza_name`, 'California Non-UZA') 
            OR regexp_contains(`mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt_1`.`primary_uza_name`, 'El Paso, TX--NM'))            
            """
    get_ntd_time_series_2 = pd.read_sql(query, connection)

In [23]:
# Compare the earlier time-series frame with the SQL-based one: exact equality,
# then the difference in row counts.
# `DataFrame.info()` prints its summary and returns None, so the last two
# arguments are what produce the two `None` values in this cell's output.
# NOTE(review): `get_ntd_time_series` is not assigned in any cell shown in this
# notebook; confirm where it comes from, otherwise this raises NameError on a
# fresh kernel.
display(
    get_ntd_time_series.equals(get_ntd_time_series_2),
    len(get_ntd_time_series) - len(get_ntd_time_series_2),
    get_ntd_time_series.info(),
    get_ntd_time_series_2.info(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1195 entries, 0 to 1194
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agency_name       1195 non-null   object 
 1   agency_status     1195 non-null   object 
 2   city              1195 non-null   object 
 3   legacy_ntd_id     966 non-null    object 
 4   mode              1195 non-null   object 
 5   ntd_id            1195 non-null   object 
 6   reporter_type     1195 non-null   object 
 7   reporting_module  1187 non-null   object 
 8   service           1195 non-null   object 
 9   state             1195 non-null   object 
 10  uace_code         1195 non-null   int64  
 11  primary_uza_name  1195 non-null   object 
 12  uza_population    1195 non-null   int64  
 13  year              1195 non-null   int64  
 14  upt               860 non-null    float64
dtypes: float64(1), int64(3), object(11)
memory usage: 140.2+ KB
<class 'pandas.core.frame.Dat

True

0

None

None

In [None]:
# Collapse the time series to one row per reporter (agency name / status /
# city / state / ntd_id / UZA / reporter type), summing `upt` within each
# group and ordering the result by ntd_id.
ntd_time_series = (
    get_ntd_time_series_2.groupby(
        [
            "agency_name",
            "agency_status",
            "city",
            "state",
            "ntd_id",
            "primary_uza_name",
            "reporter_type",
            # "mode", # will need this for actual report, but will cause fan out. dont need that to create the crosswalk
            # "service", # will need this for actual report
        ]
    )
    .agg({"upt": "sum"})
    .sort_values(by="ntd_id")
    .reset_index()
)

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own; passing it to display() only adds stray `None` outputs.
get_ntd_time_series_2.info()
ntd_time_series.info()

# Row counts per state and per primary UZA after the aggregation.
display(
    ntd_time_series["state"].value_counts(),
    ntd_time_series["primary_uza_name"].value_counts(),
)

In [None]:
# who is in tx?
# Aggregated rows whose primary UZA is El Paso.
# Paso Robles Transit Services, but in CA?
el_paso_uza_rows = ntd_time_series.loc[
    ntd_time_series["primary_uza_name"].eq("El Paso, TX--NM")
]

# Raw (un-aggregated) rows for ntd_id 90195, shown alongside for comparison.
ntd_90195_rows = get_ntd_time_series_2.loc[
    get_ntd_time_series_2["ntd_id"].eq("90195")
]

display(el_paso_uza_rows, ntd_90195_rows)

## Read in GDF of Census Designated Places (aka cities) and RTPA boundaries

Census Designated Places (CDPs) 2010 - California map
- https://data.sacog.org/datasets/SACOG::census-designated-places-cdps-2010-california/about


RTPA map open data

- https://www.lab.data.ca.gov/dataset/regional-transportation-planning-agencies

In [None]:
# RTPA map: download the GeoJSON layer, keep the name, label and geometry
# columns, and reproject to ESRI:102600 for the spatial join later.
rtpa_url = "https://cecgis-caenergy.opendata.arcgis.com/api/download/v1/items/3a83743378be4e7f84c8230889c01dea/geojson?layers=0"
rtpa_columns = ["RTPA", "LABEL_RTPA", "geometry"]

# (an earlier attempt used .set_crs("ESRI:102600", allow_override=True) instead)
rtpa_map = gpd.read_file(rtpa_url)[rtpa_columns].to_crs("ESRI:102600")

In [None]:
# California Census Designated Places (2010), includes cities and CDPs.
# Download the feature layer, keep the ID / name / geometry columns, and
# give the two name columns friendlier labels.
cdp_url = "https://services6.arcgis.com/YBp5dUuxCMd8W1EI/arcgis/rest/services/California_Census_Designated_Places_2010/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
keep_cdp_col = ["FID", "NAME10", "NAMELSAD10", "geometry"]
cdp_column_labels = {"NAME10": "cdp_name", "NAMELSAD10": "name_lsad"}

# (an earlier attempt appended .set_crs("ESRI:102600", allow_override=True))
cdp_raw = gpd.read_file(cdp_url)
cdp_map = cdp_raw[keep_cdp_col].rename(columns=cdp_column_labels)

## get centroid of CDPs to get point geom instead of polygons

In [None]:
# Add a "centroid" column to `cdp_map` (in place), computed from the geometry
# after reprojecting it to ESRI:102600.
cdp_map["centroid"] = (
    cdp_map["geometry"].to_crs("ESRI:102600").centroid
)  # the reprojection is only used to compute the centroid; cdp_map["geometry"] itself is not reprojected
# Build the point layer by making the centroid column the active geometry.
# NOTE(review): the `drop` keyword of `set_geometry` is reported as deprecated
# in recent geopandas releases - confirm against the pinned geopandas version.
cdp_points = cdp_map.set_geometry("centroid", drop=True)

## sjoin `cdp_points` to `rtpa_map` to get the cities-to-RTPA crosswalk

In [None]:
# Confirm both layers share a CRS before the spatial join (expected: True).
cdp_points.crs == rtpa_map.crs
# both are ESRI:102600

In [None]:
# Spatially join each CDP centroid to the RTPA polygon it falls in.
# A left join keeps CDPs that match no RTPA so they can be patched by hand.
city_to_rtpa = gpd.sjoin(
    cdp_points,  # includes cities and CDPs.
    rtpa_map,
    how="left",
    # how="inner",
    predicate="intersects",
    # predicate="within",
)
# Row counts observed for each how/predicate combination that was tried:
# left, intersects = 1523 rows, 1521 CDPs matched, 2 CDPs did not match to an RTPA (Avalon and City of San Francisco)
# inner, intersects = 1521 rows, 1521 CDPs matched
# left, within = 1523 rows, 1521 CDPs matched, 2 CDPs did not match to an RTPA
# inner, within = 1521 rows, 1521 CDPs matched

### fix unmerged CDP rtpa rows

In [None]:
# CDP name -> (RTPA, LABEL_RTPA) for the two places the spatial join left
# without an RTPA (Avalon and San Francisco).
unmatched_cdp_fixes = {
    "Avalon": ("Southern California Association of Governments", "SCAG"),
    "San Francisco": ("Metropolitan Transportation Commission", "MTC"),
}

# Write the RTPA name and label onto the matching rows (in place).
for unmatched_cdp, rtpa_name_and_label in unmatched_cdp_fixes.items():
    city_to_rtpa.loc[
        city_to_rtpa["cdp_name"] == unmatched_cdp, ("RTPA", "LABEL_RTPA")
    ] = rtpa_name_and_label

# check fix
city_to_rtpa[city_to_rtpa["cdp_name"].isin(list(unmatched_cdp_fixes))]

## merge `ntd_service` to `city_to_rtpa`

In [None]:
# Attach an RTPA to each reporter by matching the reporter's city to a CDP name.
# A left join keeps reporters whose city has no CDP match (RTPA stays NaN), and
# duplicate CDP names fan a reporter out into several rows; both are handled in
# later cells. `indicator=True` adds the `_merge` column used to check matches.
cdp_rtpa_lookup = city_to_rtpa[["cdp_name", "RTPA"]]

ntd_data_to_rtpa = pd.merge(
    ntd_service,
    cdp_rtpa_lookup,
    how="left",
    left_on="city",
    right_on="cdp_name",
    indicator=True,
)
ntd_data_to_rtpa.info()

In [None]:
# Preview the first rows of the merged frame.
ntd_data_to_rtpa.head()

## merge `ntd_time_series` to `city_to_rtpa`

In [None]:
# Same city -> CDP name match as for `ntd_service`, applied to the time-series
# reporters. A left join keeps reporters whose city has no CDP match (RTPA stays
# NaN), and duplicate CDP names fan a reporter out into several rows; both are
# handled in later cells. `indicator=True` adds the `_merge` column.
cdp_rtpa_lookup = city_to_rtpa[["cdp_name", "RTPA"]]

alt_ntd_to_rtpa = pd.merge(
    ntd_time_series,
    cdp_rtpa_lookup,
    how="left",
    left_on="city",
    right_on="cdp_name",
    indicator=True,
)
alt_ntd_to_rtpa.info()

## check for unmerged values 

In [None]:
# Count matched ("both") versus unmatched ("left_only") rows in each merge.
service_merge_counts = ntd_data_to_rtpa["_merge"].value_counts()
time_series_merge_counts = alt_ntd_to_rtpa["_merge"].value_counts()

display(service_merge_counts, time_series_merge_counts)

### manual updates to `ntd_data_to_rtpa` & `alt_ntd_to_rtpa`

In [None]:
# rows with NaN RTPAs
# 9 rows didnt get an RTPA
ntd_data_to_rtpa.loc[
    ntd_data_to_rtpa["RTPA"].isna(), ["agency", "city", "cdp_name", "RTPA"]
]

In [None]:
# rows with NaN RTPAs, de-duplicated and ordered by city
# 7 rows with empty RTPAs
(
    alt_ntd_to_rtpa.loc[
        alt_ntd_to_rtpa["RTPA"].isna(), ["agency_name", "city", "cdp_name", "RTPA"]
    ]
    .drop_duplicates()
    .sort_values(by="city")
)

In [None]:
# Distinct city values among the time-series rows that still lack an RTPA.
alt_ntd_to_rtpa.loc[alt_ntd_to_rtpa["RTPA"].isna(), "city"].unique()

In [None]:
# Look for any CDP whose name contains "Sherman".
city_to_rtpa[city_to_rtpa["cdp_name"].str.contains("Sherman")]

# Sherman Oaks doesn't even exist as a city in the city-to-rtpa list

In [None]:
# Manual fixes for reporters whose city did not match any CDP name.
# Maps the reporter's `city` value -> (cdp_name, RTPA) to write onto its rows.
scag_rtpa_name = "Southern California Association of Governments"

update_dict = {
    "Mcfarland": ("Mcfarland", "Kern Council of Governments"),
    "Ventura": ("Ventura", scag_rtpa_name),
    # to match other entries for this agency
    "Palos Verdes Peninsula": ("Rolling Hills", scag_rtpa_name),
    # aka Lake Havasu. shares zip code with Needles. so update to SCAG
    "Havasu Lake": ("Havasu Lake", scag_rtpa_name),
    # in Madera County
    "North Fork": ("North Fork", "Madera County Transportation Commission"),
    # NOTE(review): key is spelled "Montery Park" - presumably this mirrors the
    # spelling found in the source `city` column; confirm.
    "Montery Park": ("Monterey Park", scag_rtpa_name),
    "Paso Robles": ("Paso Robles", "San Luis Obispo Council of Governments"),
    "Sherman Oaks": ("Sherman Oaks", scag_rtpa_name),
    "Stateline": ("Stateline", "Tahoe Regional Planning Agency"),
}

# Apply every fix to both merged frames (in place).
for reported_city, cdp_and_rtpa in update_dict.items():
    for merged_frame in (ntd_data_to_rtpa, alt_ntd_to_rtpa):
        merged_frame.loc[
            merged_frame["city"] == reported_city, ("cdp_name", "RTPA")
        ] = cdp_and_rtpa

In [None]:
# check for NaN RTPAs again; both frames should now come back empty
service_still_missing = ntd_data_to_rtpa.loc[
    ntd_data_to_rtpa["RTPA"].isna(), ["agency", "city", "cdp_name", "RTPA"]
]  # all fixed!!
time_series_still_missing = alt_ntd_to_rtpa.loc[
    alt_ntd_to_rtpa["RTPA"].isna(), ["agency_name", "city", "cdp_name", "RTPA"]
]

display(service_still_missing, time_series_still_missing)

### Duplicate NTD ID fixes
- review ntd_id's with more than 2 rows. there are some CDPs with duplicate names.
- remove the rows with non-matching UZA to RTPA names

In [None]:
# Most frequent ntd_id values; looking for rows with more than 2 rows
alt_ntd_to_rtpa["ntd_id"].value_counts().head()

In [None]:
# ntd_ids that were matched to more than one RTPA because of duplicate CDP names.
check_ntd_id = [
    "90256",  # City of Burbank, matched to MTC and SCAG. There is a "Burbank" in both areas
    "90287",  # Palos Verdes Peninsula Transit Authority. Similarly, shows in 2 RTPAs.
]

# Show the rows for those ntd_ids in both merged frames.
display(
    ntd_data_to_rtpa[ntd_data_to_rtpa["ntd_id"].isin(check_ntd_id)],
    alt_ntd_to_rtpa[alt_ntd_to_rtpa["ntd_id"].isin(check_ntd_id)],
)

In [None]:
# identify conditions to drop rows by: the (ntd_id, RTPA) pairings produced by
# a same-named CDP in the wrong region
remove_1 = ntd_data_to_rtpa["ntd_id"].eq("90256") & ntd_data_to_rtpa["RTPA"].eq(
    "Metropolitan Transportation Commission"
)
remove_2 = ntd_data_to_rtpa["ntd_id"].eq("90287") & ntd_data_to_rtpa["RTPA"].eq(
    "Madera County Transportation Commission"
)

# Keep only the rows that match neither condition.
ntd_data_to_rtpa = ntd_data_to_rtpa.loc[~remove_1 & ~remove_2]

In [None]:
# Same clean-up for the time-series frame: flag the (ntd_id, RTPA) pairings
# produced by a same-named CDP in the wrong region.
remove_3 = alt_ntd_to_rtpa["ntd_id"].eq("90256") & alt_ntd_to_rtpa["RTPA"].eq(
    "Metropolitan Transportation Commission"
)
remove_4 = alt_ntd_to_rtpa["ntd_id"].eq("90287") & alt_ntd_to_rtpa["RTPA"].eq(
    "Madera County Transportation Commission"
)

# Keep only the rows that match neither condition.
alt_ntd_to_rtpa = alt_ntd_to_rtpa.loc[~remove_3 & ~remove_4]

In [None]:
# Re-display the rows for the ntd_ids in `check_ntd_id` after the removal, to
# confirm the mismatched RTPA rows are gone from both frames.
display(
    ntd_data_to_rtpa[ntd_data_to_rtpa["ntd_id"].isin(check_ntd_id)],
    alt_ntd_to_rtpa[alt_ntd_to_rtpa["ntd_id"].isin(check_ntd_id)],
)

## final checks

In [None]:
# Compare row counts before and after the merge for both reporter lists;
# each comparison displays True when the counts match.
display(
    len(ntd_service)
    == len(
        ntd_data_to_rtpa
    ),  # with a left join, the length should be the same now that the duplicated rows are removed
    # ntd_data_to_rtpa.info() confirms that every column has data, except for primary uza, because the rural reporters dont get a uza name
    len(ntd_time_series) == len(alt_ntd_to_rtpa),
)

In [None]:
# Preview the cleaned merge result.
ntd_data_to_rtpa.head()

In [None]:
# Compare how many distinct ntd_ids each reporter list ended up with.
service_id_count = ntd_data_to_rtpa["ntd_id"].nunique()
time_series_id_count = alt_ntd_to_rtpa["ntd_id"].nunique()

if service_id_count != time_series_id_count:
    # Report both counts so the larger list can be chosen for the crosswalk.
    print(
        " \n"
        f"    # of unique NTD ID in `ntd_data_to_rtpa` (dim_annual_service_agencies): {service_id_count}\n"
        f"    # of NTD ID in `alt_ntd_to_rtpa` (fct_service_data_and_operating_expenses_time_series_by_mode_upt): {time_series_id_count}\n"
        "    "
    )
else:
    print("same amount of NTD_ID, use either list")

## Create new `ntd_id_to_rtpa_crosswalk` file!

In [None]:
# Build the crosswalk from the time-series based list: keep the identifying
# columns plus RTPA, then keep the first row seen for each ntd_id.
ntd_data_to_rtpa_cleaned = (
    alt_ntd_to_rtpa[
        [
            "ntd_id",
            "agency_name",
            "reporter_type",
            "agency_status",
            "city",
            "state",
            "RTPA",
        ]
    ]
    .drop_duplicates(subset=["ntd_id"])
    .reset_index(drop=True)
)

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own; passing it to display() only adds a stray `None` output.
ntd_data_to_rtpa_cleaned.info()

# Sanity checks: top ntd_id counts, reporter-type mix, and top agency-name counts.
display(
    ntd_data_to_rtpa_cleaned["ntd_id"].value_counts().head(),
    ntd_data_to_rtpa_cleaned["reporter_type"].value_counts(),
    ntd_data_to_rtpa_cleaned["agency_name"].value_counts().head(),
)

In [None]:
# Display the full crosswalk (the display row limit was lifted in the first cell).
ntd_data_to_rtpa_cleaned

# Save crosswalk to GCS
- as `.parquet` and `.csv`

In [None]:
# Writes of the crosswalk to GCS, left commented out.
# NOTE(review): presumably disabled so a re-run does not overwrite the saved
# files - confirm. The read-back cell below expects both files to exist at
# these paths.
# ntd_data_to_rtpa_cleaned.to_parquet(f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet")
# ntd_data_to_rtpa_cleaned.to_csv(f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.csv")

## Test reading in data from gcs

In [None]:
# Read both saved copies of the crosswalk back from GCS.
xwalk_parquet = pd.read_parquet(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.parquet"
)
# Read `ntd_id` as text: without an explicit dtype, read_csv infers a numeric
# type for all-digit IDs, which would no longer compare equal to the string
# IDs used everywhere else in this notebook.
xwalk_csv = pd.read_csv(
    f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk_all_reporter_types.csv",
    dtype={"ntd_id": str},
)

# Both copies should hold the same number of rows (expected: True).
display(len(xwalk_parquet) == len(xwalk_csv))

In [None]:
# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own; passing it to display() only adds a stray `None` output.
xwalk_parquet.info()

# Sanity checks on the saved crosswalk: top ntd_id counts, reporter-type mix,
# and top agency-name counts.
display(
    xwalk_parquet["ntd_id"].value_counts().head(),
    xwalk_parquet["reporter_type"].value_counts(),
    xwalk_parquet["agency_name"].value_counts().head(),
)

## Double check these agencies. 
They appeared in other RTPAs previously.
- Roseville
- Placer
- Tahoe
- El Dorado

In [None]:
# Agency-name fragments to spot-check in the saved crosswalk.
name_check = ["Roseville", "Placer", "Tahoe", "El Dorado"]

# Joined with "|" so str.contains matches a name containing any of the fragments.
name_check_pattern = "|".join(name_check)

# these align correctly.
xwalk_parquet.loc[xwalk_parquet["agency_name"].str.contains(name_check_pattern)]

In [None]:
# Where is Tahoe?????
# Search term used against agency names, CDP names and RTPA names below.
city = "Tahoe"

# Trace the term through each stage of the pipeline.
display(
    ntd_service[ntd_service["agency"].str.contains(city)],  # initial warehouse table
    ntd_time_series[
        ntd_time_series["agency_name"].str.contains(city)
    ],  # updated warehouse table
    city_to_rtpa[
        city_to_rtpa["cdp_name"].str.contains(city)
    ],  # initial city to rtpa list
    xwalk_parquet[xwalk_parquet["RTPA"].str.contains(city)],  # final list
)

# Findings:
# investigated in BG (presumably BigQuery): Tahoe Transportation District is HQ in Stateline, NV, with UZA = "Lake Tahoe, CA-NV"
# warehouse tables were filtered for state = CA, so we were losing it
# adjusted the code to allow Stateline NV to be part of the list. GTG