## Incorporating `NTD` stuff directly into `crosswalks`
* How do I rerun everything and make sure the files are updated for all the dates?

In [1]:
from datetime import datetime

import _operators_prep as op_prep
import _report_utils
import _section1_utils as section1
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Warehouse
import os

from calitp_data_analysis.sql import query_sql
from calitp_data_analysis.tables import tbls
from siuba import *

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Incorporate `Crosswalk` work into the portfolio
* Use helpers function
* Grab latest analysis date automatically. 

In [4]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [5]:
op_profiles_df = pd.read_parquet(op_profiles_url)

In [19]:
test_organization = "City and County of San Francisco"

In [6]:
op_profiles_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date
0,014d0998350083249a9eb310635548c2,8,137,8,159,3771,79.5,23.72,2,0,6,8,0,0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-10-11


In [7]:
op_profiles_df1 = op_profiles_df.sort_values(
    by=["service_date"], ascending=False
).drop_duplicates(subset=["schedule_gtfs_dataset_key"])

In [8]:
len(op_profiles_df), len(op_profiles_df1)

(1947, 260)

In [9]:
op_profiles_df1.service_date.nunique()

14

In [10]:
most_recent_date = rt_dates.y2024_dates[-1]

In [20]:
sf_only = pd.read_parquet(
    op_profiles_url, filters=[[("organization_name", "==", test_organization)]]
)

# Keep only the most recent row
sf_only1 = sf_only.sort_values(by=["service_date"], ascending=False).head(1)

In [None]:
schedule_sf_key = sf_only1.schedule_gtfs_dataset_key.iloc[0]

In [30]:
crosswalk = helpers.import_schedule_gtfs_key_organization_crosswalk(most_recent_date)[
    crosswalk_cols
]

In [11]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "counties_served",
    "service_area_sq_miles",
    "hq_city",
    "uza_name",
    "service_area_pop",
    "organization_type",
    "primary_uza",
    "reporter_type",
]

In [22]:
crosswalk.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
0,1770249a5a2e770ca90628434d4934b1,,,Ojai,,,County or Local Government Unit or Department of Transportation,,Rural Reporter


In [13]:
op_profiles_df2 = pd.merge(
    op_profiles_df1, crosswalk, on="schedule_gtfs_dataset_key", how="inner"
)

In [17]:
len(op_profiles_df2), op_profiles_df2.schedule_gtfs_dataset_key.nunique()

(121, 121)

In [15]:
op_profiles_df2.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
0,ff72e90ec439c37fe3ce0d3273a0073e,9,139,10,159,3771,85.34,23.72,2,0,8,10,0,0,SLO Schedule,recMM99msxjmc6PPv,City of San Luis Obispo,2024-04-17,San Luis Obispo,22.0,San Luis Obispo,"San Luis Obispo, CA",46997.0,County or Local Government Unit or Department of Transportation,,Full Reporter


In [18]:
len(op_profiles_df1), op_profiles_df1.schedule_gtfs_dataset_key.nunique()

(260, 260)

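#### Why aren't all the values merging??
* Likely cause (to confirm): `op_profiles_df1` keeps the latest row per `schedule_gtfs_dataset_key` across 14 different service dates, while `crosswalk` is loaded for `most_recent_date` only, so keys that are absent on that date are dropped by the inner merge (121 of 260 remain).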

### Incorporate NTD with Crosswalk script.

#### Running `crosswalk_gtfs_dataset_key_to_organization` after my changes.
* Using only a few test dates.
* Discovered repeated itp_id and operators -> what to do? 

In [None]:
january_og = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-01-17.parquet"
)

In [None]:
january_og.name.nunique(), len(january_og)

In [None]:
january_og.head(1)

In [None]:
january_og.columns

In [None]:
january_test = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-01-17_AH_TESTING.parquet"
)

In [None]:
january_test.columns

In [None]:
january_test.name.nunique(), january_test.agency_name.nunique(), january_test.ntd_id_2022.nunique()

In [None]:
len(january_test)

In [None]:
january_test.head(1)

In [None]:
june_24_test = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-06-12_AH_TESTING.parquet"
)

In [None]:
june_24_test.name.nunique(), june_24_test.agency_name.nunique(), june_24_test.ntd_id_2022.nunique()

In [None]:
len(june_24_test)

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [None]:
crosswalk_may_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-05-26.parquet"

In [None]:
crosswalk_may = pd.read_parquet(crosswalk_may_url)

In [None]:
crosswalk_may.shape

In [None]:
crosswalk_may.itp_id.nunique()

In [None]:
crosswalk_may.itp_id.value_counts().head()

In [None]:
crosswalk_may.name.nunique()

In [None]:
crosswalk_may.loc[crosswalk_may.itp_id == 127]

In [None]:
crosswalk_may.loc[crosswalk_may.itp_id == 331]

In [None]:
crosswalk_mar_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-03-13_AH_TESTING.parquet"

In [None]:
crosswalk_mar = pd.read_parquet(crosswalk_mar_url)

In [None]:
crosswalk_june = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-06-12.parquet"
)

In [None]:
len(crosswalk_june), crosswalk_june.itp_id.nunique(), crosswalk_june.name.nunique()

In [None]:
crosswalk_june.itp_id.value_counts().head()

In [None]:
crosswalk_may.loc[crosswalk_may.itp_id == 214]

#### Merges

In [None]:
import sys

sys.path.append("../gtfs_funnel")
import crosswalk_gtfs_dataset_key_to_organization

In [None]:
final_ntd = crosswalk_gtfs_dataset_key_to_organization.merge_ntd_mobility(2022)

In [None]:
final_ntd.shape

In [None]:
crosswalk_mar.head(1)

In [None]:
crosswalk_mar.name.nunique()

In [None]:
crosswalk_mar.ntd_id.nunique()

In [None]:
crosswalk_mar.ntd_id_2022.value_counts().head(10)

In [None]:
crosswalk_mar.loc[crosswalk_mar.ntd_id_2022 == "99454"]

In [None]:
crosswalk_mar.loc[crosswalk_mar.ntd_id_2022 == "90154"]

In [None]:
len(crosswalk_mar)

In [None]:
pd.merge(
    crosswalk_mar,
    final_ntd,
    left_on=["ntd_id_2022"],
    right_on=["ntd_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(crosswalk_mar, final_ntd, on=["ntd_id"], how="outer", indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
pd.merge(
    crosswalk_mar,
    final_ntd,
    left_on=["name"],
    right_on=["agency_name"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

### Find the NTD IDs in `dim_annual_ntd_agency_service`

In [None]:
ntd_agency_service = tbls.mart_ntd.dim_annual_ntd_agency_service() >> collect()

In [None]:
ntd_agency_service.head(1)

In [None]:
ntd_agency_service.ntd_id.nunique()

In [None]:
ntd_agency_service.agency_name.nunique()

In [None]:
len(ntd_agency_service)

In [None]:
len(ntd_agency_service.drop_duplicates(subset=["ntd_id", "agency_name"]))

In [None]:
ntd_agency_service2 = ntd_agency_service.drop_duplicates(
    subset=["ntd_id", "agency_name"]
)

In [None]:
ntd_agency_service.year.unique()

#### Merging NTD Agency Service with the Crosswalk

In [None]:
pd.merge(
    crosswalk_mar,
    ntd_agency_service2,
    left_on=["ntd_id_2022", "organization_name"],
    right_on=["ntd_id", "agency_name"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    crosswalk_mar, ntd_agency_service2, on=["ntd_id"], how="outer", indicator=True
)[["_merge"]].value_counts()

In [None]:
len(crosswalk_mar)

#### Compare the original NTD table versus NTD Agency Service

In [None]:
agency_service_id = set(ntd_agency_service2.ntd_id.unique().tolist())
agency_profile_id = set(final_ntd.ntd_id.unique().tolist())
agency_profile_id - agency_service_id

In [None]:
pd.merge(final_ntd, ntd_agency_service2, on=["ntd_id"], how="outer", indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
len(final_ntd)

In [None]:
pd.merge(
    final_ntd,
    ntd_agency_service2,
    on=["ntd_id", "agency_name"],
    how="left",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
len(agency_profile_id)

In [None]:
len(agency_service_id - agency_profile_id)

In [None]:
agency_service_agency = set(ntd_agency_service.agency_name.unique().tolist())
agency_profile_agency = set(final_ntd.agency_name.unique().tolist())

In [None]:
agency_profile_agency - agency_service_agency

### Checkout NTD

In [None]:
# Query the NTD agency-information table for California rows flagged as
# current and materialize the result with collect(). No year filter is
# applied in this query.
ntd_test = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.state == "CA", _._is_current == True)
    >> collect()
)

In [None]:
ntd_test.shape

In [None]:
ntd_test.head(1)

In [None]:
ntd_test.year.unique()

In [None]:
# NOTE(review): `ntd` is not assigned in any visible cell of this notebook
# (only `ntd_test` is) — presumably it came from a since-deleted cell; confirm
# and define it before this cell so the notebook survives Restart & Run All.
# Sorts on every column, placing missing values last.
ntd2 = ntd.sort_values(by=list(ntd.columns), na_position="last")

In [None]:
ntd.shape

In [None]:
ntd.loc[ntd.agency_name == "Kern Regional Transit"]

In [None]:
ntd2.loc[ntd2.agency_name == "Kern Regional Transit"]

In [None]:
ntd.loc[ntd.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd2.loc[ntd2.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd.agency_name.value_counts().head(10)

In [None]:
# Collapse to one row per agency_name.
# NOTE: GroupBy.first() takes the first non-null value of each column within
# a group, so a resulting row may combine values from several source rows.
ntd3 = ntd2.groupby("agency_name").first().reset_index()

In [None]:
ntd3.loc[ntd3.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd3.head(1).T

In [None]:
# Same NTD agency-information query as above, additionally restricted to
# year == 2022 (California rows flagged as current), then collected.
ntd4 = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> collect()
)

In [None]:
ntd4.head(1).T

### Checkout `mobility` 
* Need this because there are additional columns here that aren't in NTD.

In [None]:
mob_og = tbls.mart_transit_database.dim_mobility_mart_providers() >> collect()

In [None]:
mob_og.head(1).T

In [None]:
# Pull a subset of columns from the mobility mart providers table and
# materialize the result with collect().
mob = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> select(
        _.agency_name,
        _.counties_served,
        _.hq_city,
        _.hq_county,
        _.is_public_entity,
        _.is_publicly_operating,
        _.funding_sources,
        _.on_demand_vehicles_at_max_service,
        _.vehicles_at_max_service,
    )
    >> collect()
)

In [None]:
mob.head(1).T

In [None]:
mob.agency_name.value_counts().head(10)

In [None]:
mob.loc[mob.agency_name == "Kern Regional Transit"]

In [None]:
mob2 = mob.sort_values(
    by=["on_demand_vehicles_at_max_service", "vehicles_at_max_service"],
    ascending=[False, False],
)

In [None]:
mob2.loc[mob2.agency_name == "Kern Regional Transit"]

In [None]:
# Collapse to one row per agency_name, relying on the descending sort of
# `mob2` by the two vehicle-count columns.
# NOTE: GroupBy.first() takes the first non-null value of each column within
# a group, so a resulting row may combine values from several source rows.
mob3 = mob2.groupby("agency_name").first().reset_index()

In [None]:
mob3.loc[mob3.agency_name == "Kern Regional Transit"]

### I made a minor change to loading `mobility` warehouse data (just deleting an unnecessary line). Otherwise, the functions are good to go.

In [None]:
def merge_ntd_mobility(year: int) -> pd.DataFrame:
    """
    Combine the NTD and mobility provider tables into one row per agency.

    Loads NTD data for `year` with `section1.load_ntd` and the mobility data
    with `section1.load_mobility`, inner-joins them on `agency_name`, rewrites
    a handful of agency names, strips surrounding whitespace from the names,
    and keeps the first row for each resulting `agency_name`.

    Parameters
    ----------
    year : int
        Passed through to `section1.load_ntd`.

    Returns
    -------
    pd.DataFrame
        The joined table with a fresh 0..n-1 index and unique `agency_name`.
    """
    # Names rewritten after the join (presumably to match the spelling used
    # by the datasets this table is merged with later — confirm).
    renamed_agencies = {
        "City of Fairfield, California": "City of Fairfield",
        "Livermore / Amador Valley Transit Authority": "Livermore-Amador Valley Transit Authority",
        "Nevada County Transit Services": "Nevada County",
        "Omnitrans": "OmniTrans",
    }

    ntd_df = section1.load_ntd(year)
    mobility_df = section1.load_mobility()

    # Mobility is the left table, NTD the right; only shared agency names survive.
    joined = mobility_df.merge(ntd_df, how="inner", on="agency_name")

    # Rename first, then strip, in that order.
    joined["agency_name"] = (
        joined["agency_name"].replace(renamed_agencies).str.strip()
    )

    deduped = joined.drop_duplicates(subset=["agency_name"])
    return deduped.reset_index(drop=True)

In [None]:
m1 = merge_ntd_mobility(2022)

In [None]:
m1.shape

In [None]:
m1.agency_name.nunique()

In [None]:
m1.agency_name.value_counts().head()

In [None]:
m1.loc[m1.agency_name == "Redding Area Bus Authority"]

### Checkout Route Typology
* All this work lives in another script `gtfs_funnel/route_typologies.py`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
SCHED_GCS

In [None]:
ROUTE_TYPOLOGY

In [None]:
apr_24_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_2024-04-19.parquet"

In [None]:
apr_24_df = pd.read_parquet(apr_24_url)

In [None]:
apr_24_df.head(2)