## Incorporating `NTD` stuff directly into `crosswalks`
* How do I rerun everything and make sure the files are updated for all the dates?

In [1]:
from datetime import datetime

import _operators_prep as op_prep
import _report_utils
import _section1_utils as section1
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Warehouse
import os
from calitp_data_analysis.sql import query_sql
from calitp_data_analysis.tables import tbls
from siuba import *

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Incorporate NTD with Crosswalk script.

#### Running `crosswalk_gtfs_datasetkey_to_organization` after my changes.
* Using only a few test dates.
* Discovered repeated itp_id and operators -> what to do? 

In [22]:
january_test = pd.read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-01-17_AH_TESTING.parquet")

In [23]:
january_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 0 to 167
Data columns (total 35 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   schedule_gtfs_dataset_key          168 non-null    object 
 1   name                               168 non-null    object 
 2   schedule_source_record_id          168 non-null    object 
 3   base64_url                         168 non-null    object 
 4   organization_source_record_id      168 non-null    object 
 5   organization_name                  168 non-null    object 
 6   itp_id                             165 non-null    float64
 7   caltrans_district                  167 non-null    object 
 8   ntd_id_x                           142 non-null    object 
 9   ntd_id_2022                        142 non-null    object 
 10  agency_name                        135 non-null    object 
 11  counties_served                    101 non-null    object 

In [35]:
january_test.name.nunique(), january_test.agency_name.nunique(), january_test.ntd_id_2022.nunique()

(168, 128, 133)

In [36]:
january_test.ntd_id_x.nunique()

133

In [8]:
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [9]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

'crosswalk/gtfs_key_organization'

In [10]:
crosswalk_may_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-05-26.parquet"

In [11]:
crosswalk_may = pd.read_parquet(crosswalk_may_url)

In [12]:
crosswalk_may.shape

(101, 8)

In [38]:
crosswalk_may.itp_id.nunique()

90

In [40]:
crosswalk_may.itp_id.value_counts().head()

127.00    2
343.00    2
360.00    2
164.00    2
331.00    2
Name: itp_id, dtype: int64

In [39]:
crosswalk_may.name.nunique()

101

In [41]:
crosswalk_may.loc[crosswalk_may.itp_id == 127]

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district
41,ca270cd1ac30a9ec5336a11bc9223c41,Bay Area 511 Golden Gate Ferry Schedule,recnMK3h6zvqw3SFa,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1HRg==,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",127.0,04 - Oakland
42,aea4108997c66a74fbdae27b34b69fde,Bay Area 511 Golden Gate Transit Schedule,recCNNGH8SHfXBKvv,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1HRw==,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",127.0,04 - Oakland


In [42]:
crosswalk_may.loc[crosswalk_may.itp_id == 331]

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district
11,c3499b856c717e5706299664fb1c5261,Tahoe Transportation District GMV Schedule,recAFSynoBmVNer8r,aHR0cHM6Ly90YWhvZS5zeW5jcm9tYXRpY3MuY29tL2d0ZnM=,rec3u4aMplqObcoTR,Tahoe Transportation District,331.0,03 - Marysville
72,07d3b79f14cec8099119e1eb649f065b,Tahoe Transportation District Schedule,recgyP2nm59f3KKXo,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3Rmcy90YWhvZS1jYS11cy90YWhvZS1jYS11cy56aXA=,rec3u4aMplqObcoTR,Tahoe Transportation District,331.0,03 - Marysville


In [13]:
crosswalk_mar_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-03-13_AH_TESTING.parquet"

In [14]:
crosswalk_mar= pd.read_parquet(crosswalk_mar_url)

In [43]:
crosswalk_june = pd.read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-06-12.parquet")

In [44]:
len(crosswalk_june), crosswalk_june.itp_id.nunique(), crosswalk_june.name.nunique()

(160, 142, 160)

In [45]:
crosswalk_june.itp_id.value_counts().head()

214.00    2
127.00    2
162.00    2
481.00    2
63.00     2
Name: itp_id, dtype: int64

In [46]:
# NOTE(review): itp_id 214 was surfaced by the `crosswalk_june` value counts in the
# previous cell, but this filter runs against `crosswalk_may` — confirm whether
# `crosswalk_june` was intended here.
crosswalk_may.loc[crosswalk_may.itp_id == 214]

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district
19,3834987b7e9b1f81a5a91cf274bb3ed8,Mountain Transit GMV Schedule,rec9sSbtaPxxEtBkX,aHR0cHM6Ly9tb3VudGFpbnRyYW5zaXQuc3luY3JvbWF0aWNzLmNvbS9ndGZz,recHbquam1bWEwC3P,Mountain Area Regional Transit Authority,214.0,08 - San Bernardino
75,5ca5d244836397b178993c9bdc4dfb00,Mountain Transit Schedule,recCvLW5YvXVhOzG5,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3Rmcy9iaWdiZWFyLWNhLXVzL2JpZ2JlYXItY2EtdXMuemlw,recHbquam1bWEwC3P,Mountain Area Regional Transit Authority,214.0,08 - San Bernardino


#### Merges

In [None]:

import sys

sys.path.append("../gtfs_funnel")
import crosswalk_gtfs_dataset_key_to_organization

In [None]:
final_ntd = crosswalk_gtfs_dataset_key_to_organization.merge_ntd_mobility(2022)

In [15]:
final_ntd.shape

(228, 25)

In [16]:
crosswalk_mar.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district,ntd_id,ntd_id_2022
0,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,recrAG7e0oOiR6FiP,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,rec7EN71rsZxDFxZd,Ventura County Transportation Commission,380.0,07 - Los Angeles,90164,90164


In [26]:
crosswalk_mar.name.nunique()

171

In [27]:
crosswalk_mar.ntd_id.nunique()

135

In [32]:
crosswalk_mar.ntd_id_2022.value_counts().head(10)

99454    2
90154    2
90148    2
90173    2
91098    2
90016    2
90259    2
90280    2
91092    2
90164    1
Name: ntd_id_2022, dtype: int64

In [33]:
crosswalk_mar.loc[crosswalk_mar.ntd_id_2022 == "99454"]

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district,ntd_id,ntd_id_2022
3,4383eb1cca04093020f1583f57f32d9b,Desert Roadrunner GMV Schedule,rec4i7pXkVh7Z74N9,aHR0cHM6Ly9yaWRlcHZ2dGEuY29tL2d0ZnM=,recGcv4NidDjwVSiN,Palo Verde Valley Transit Agency,238.0,08 - San Bernardino,9R02-99454,99454
149,ac9384d5e25378d1898ca522070cef66,Desert Roadrunner Schedule,reclg968KFmeD1FDV,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3Rmcy9wYWxvdmVyZGVfdmFsbGV5LWNhLXVzL3BhbG92ZXJkZV92YWxsZXktY2EtdXMuemlw,recGcv4NidDjwVSiN,Palo Verde Valley Transit Agency,238.0,08 - San Bernardino,9R02-99454,99454


In [34]:
crosswalk_mar.loc[crosswalk_mar.ntd_id_2022 == "90154"]

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district,ntd_id,ntd_id_2022
85,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,recX8JOPmBQM9aWLC,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX2J1cy9yYXcvbWFzdGVyL2d0ZnNfYnVzLnppcA==,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,182.0,07 - Los Angeles,90154,90154
86,2a0571758141f412b6a546fd70a65bf3,LA Metro Rail Schedule,recofCmylEKq2zuPr,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX3JhaWwvcmF3L21hc3Rlci9ndGZzX3JhaWwuemlw,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,182.0,07 - Los Angeles,90154,90154


In [25]:
len(crosswalk_mar)

171

In [21]:
pd.merge(crosswalk_mar,
        final_ntd,
        left_on = ["ntd_id_2022"],
        right_on = ["ntd_id"],
        how = "outer",
        indicator= True)[["_merge"]].value_counts()

_merge    
both          136
right_only     99
left_only      35
dtype: int64

In [17]:
pd.merge(crosswalk_mar,
        final_ntd,
        on = ["ntd_id"],
        how = "outer",
        indicator= True)[["_merge"]].value_counts()

_merge    
right_only    132
both          101
left_only      70
dtype: int64

In [None]:
pd.merge(crosswalk_mar,
        final_ntd,
        left_on = ["name"],
        right_on = ["agency_name"],
        how = "outer",
        indicator= True)[["_merge"]].value_counts()

_merge    
right_only    228
left_only     171
both            0
dtype: int64

### Find the NTD IDs in `dim_annual_ntd_agency_service`

In [None]:
ntd_agency_service = (
    tbls.mart_ntd.dim_annual_ntd_agency_service()
    >> collect()
)

In [None]:
ntd_agency_service.head(1)

In [None]:
ntd_agency_service.ntd_id.nunique()

In [None]:
ntd_agency_service.agency_name.nunique()

In [None]:
len(ntd_agency_service)

In [None]:
len(ntd_agency_service.drop_duplicates(subset = ['ntd_id','agency_name']))

In [None]:
ntd_agency_service2 = ntd_agency_service.drop_duplicates(subset = ['ntd_id','agency_name'])

In [None]:
ntd_agency_service.year.unique()

#### Merging NTD Agency Service with the Crosswalk

In [None]:
pd.merge(crosswalk_mar,
        ntd_agency_service2,
        left_on = ["ntd_id_2022", "organization_name"],
        right_on = ["ntd_id", "agency_name"],
        how = "outer",
        indicator= True)[["_merge"]].value_counts()

In [None]:
pd.merge(crosswalk_mar,
        ntd_agency_service2,
        on = ["ntd_id"],
        how = "outer",
        indicator= True)[["_merge"]].value_counts()

In [None]:
len(crosswalk_mar)

#### Compare the original NTD table versus NTD Agency Service

In [None]:
agency_service_id = set(ntd_agency_service2.ntd_id.unique().tolist())
agency_profile_id = set(final_ntd.ntd_id.unique().tolist())
agency_profile_id - agency_service_id

In [None]:
pd.merge(
        final_ntd,
    ntd_agency_service2,
        on = ["ntd_id"],
        how = "outer",
        indicator= True)[["_merge"]].value_counts()

In [None]:
len(final_ntd)

In [None]:
pd.merge(
       final_ntd,
       ntd_agency_service2,
        on = ["ntd_id", "agency_name"],
        how = "left",
        indicator= True)[["_merge"]].value_counts()

In [None]:
len(agency_profile_id)

In [None]:
len(agency_service_id-agency_profile_id)

In [None]:
agency_service_agency= set(ntd_agency_service.agency_name.unique().tolist())
agency_profile_agency = set(final_ntd.agency_name.unique().tolist())

In [None]:
agency_profile_agency - agency_service_agency

### Checkout NTD

In [None]:
ntd_test = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.state == "CA", _._is_current == True)
    >> collect()
)

In [None]:
ntd_test.shape

In [None]:
ntd_test.head(1)

In [None]:
ntd_test.year.unique()

In [None]:
# NOTE(review): `ntd` is not assigned in any earlier cell of this notebook (only
# `ntd_test` is), so this cell raises NameError on Restart & Run All — confirm
# which frame it should point at (presumably `ntd_test` or `section1.load_ntd(...)`).
# Sort on every column with nulls last so the most complete rows come first per agency.
ntd2 = ntd.sort_values(by=list(ntd.columns), na_position="last")

In [None]:
ntd.shape

In [None]:
ntd.loc[ntd.agency_name == "Kern Regional Transit"]

In [None]:
ntd2.loc[ntd2.agency_name == "Kern Regional Transit"]

In [None]:
ntd.loc[ntd.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd2.loc[ntd2.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd.agency_name.value_counts().head(10)

In [None]:
# Collapse to one row per agency_name.
# NOTE(review): GroupBy.first() takes the first non-null entry of each column within a
# group, so a result row can combine values from different source rows — confirm this
# is intended (versus keeping one whole row with drop_duplicates).
ntd3 = ntd2.groupby("agency_name").first().reset_index()

In [None]:
ntd3.loc[ntd3.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd3.head(1).T

In [None]:
ntd4 = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> collect()
)

In [None]:
ntd4.head(1).T

### Checkout `mobility` 
* Need this because there are additional columns here that aren't in NTD.

In [None]:
mob_og = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> collect()
)

In [None]:
mob_og.head(1).T

In [None]:
mob = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> select(
        _.agency_name,
        _.counties_served,
        _.hq_city,
        _.hq_county,
        _.is_public_entity,
        _.is_publicly_operating,
        _.funding_sources,
        _.on_demand_vehicles_at_max_service,
        _.vehicles_at_max_service,
    )
    >> collect()
)

In [None]:
mob.head(1).T

In [None]:
mob.agency_name.value_counts().head(10)

In [None]:
mob.loc[mob.agency_name == "Kern Regional Transit"]

In [None]:
mob2 = mob.sort_values(
    by=["on_demand_vehicles_at_max_service", "vehicles_at_max_service"],
    ascending=[False, False],
)

In [None]:
mob2.loc[mob2.agency_name == "Kern Regional Transit"]

In [None]:
mob3 = mob2.groupby('agency_name').first().reset_index()

In [None]:
mob3.loc[mob3.agency_name == "Kern Regional Transit"]

### I made a minor change to loading `mobility` warehouse data (just deleting an unnecessary line). Otherwise, the functions are good to go.

In [None]:
def merge_ntd_mobility(year:int)->pd.DataFrame:
    """
    Join mobility-provider data to NTD agency data and keep one row per agency.

    Loads NTD data with `section1.load_ntd(year)` and mobility data with
    `section1.load_mobility()`, inner-joins them on `agency_name`, normalizes
    a few agency names, and de-duplicates on `agency_name`.

    NOTE(review): this notebook also imports and calls
    `crosswalk_gtfs_dataset_key_to_organization.merge_ntd_mobility`; this cell
    redefines the same name locally — confirm the two stay in sync.

    Args:
        year: passed unchanged to `section1.load_ntd`.

    Returns:
        The merged DataFrame with the first row kept for each `agency_name`
        and the index reset to 0..n-1.
    """
    ntd = section1.load_ntd(year)
    mobility = section1.load_mobility()

    # Inner join: only agencies present in BOTH sources survive.
    m1 = pd.merge(mobility, ntd, how="inner", on="agency_name")

    # Maps the agency_name spelling found in the merged data to the preferred spelling.
    agency_dict = {
        "City of Fairfield, California": "City of Fairfield",
        "Livermore / Amador Valley Transit Authority": "Livermore-Amador Valley Transit Authority",
        "Nevada County Transit Services": "Nevada County",
        "Omnitrans": "OmniTrans",
    }

    # Strip whitespace BEFORE replacing: Series.replace with a dict matches whole
    # values exactly, so a name carrying stray leading/trailing whitespace would
    # otherwise miss its key and never be normalized.
    m1 = m1.assign(agency_name=m1.agency_name.str.strip().replace(agency_dict))

    # Keep the first occurrence of each (normalized) agency name.
    m1 = m1.drop_duplicates(subset=["agency_name"]).reset_index(drop=True)
    return m1

In [None]:
m1 = merge_ntd_mobility(2022)

In [None]:
m1.shape

In [None]:
m1.agency_name.nunique()

In [None]:
m1.agency_name.value_counts().head()

In [None]:
m1.loc[m1.agency_name == "Redding Area Bus Authority"]

### Checkout Route Typology
* All this work lives in another script `gtfs_funnel/route_typologies.py`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
SCHED_GCS

In [None]:
ROUTE_TYPOLOGY

In [None]:
apr_24_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_2024-04-19.parquet"

In [None]:
apr_24_df = pd.read_parquet(apr_24_url)

In [None]:
apr_24_df.head(2)