## Incorporating `NTD` data directly into `operator_profiles`

In [7]:
from datetime import datetime

import _operators_prep as op_prep
import _report_utils
import _section1_utils as section1
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [14]:
# Warehouse
import os

from calitp_data_analysis.sql import query_sql
from calitp_data_analysis.tables import tbls
from siuba import *

In [2]:
# Notebook display settings: show up to 100 columns, every row, full cell
# text, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### This is the dataframe I use for my GTFS Digest portfolio work.

In [3]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
# Load the frame in the same cell that builds its URL so that the
# `op_profiles.head(2)` cell below works on a fresh kernel
# (Restart Kernel -> Run All); it was previously never assigned in this notebook.
op_profiles = pd.read_parquet(op_profiles_url)

In [4]:
GTFS_DATA_DICT.digest_tables.dir

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [5]:
GTFS_DATA_DICT.digest_tables.operator_profiles

'digest/operator_profiles'

In [10]:
op_profiles.head(2)

Unnamed: 0,organization_name,name,service_date,schedule_gtfs_dataset_key,caltrans_district,sched_rt_category
0,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,2024-04-17,e681c3a8dafa2c80e5b8e2cdd01f917a,07 - Los Angeles,schedule_and_vp
1,Butte County Association of Governments,B-Line Schedule,2024-04-17,68aa06a25a32c83eb38c20c43977feff,03 - Marysville,schedule_and_vp


#### Figuring out where `operator_profiles` is created
* `operator_profiles` is stacked across all available dates [here](https://github.com/cal-itp/data-analyses/blob/ce779ede5c48f84e974615a457be77cba2f8b08b/gtfs_digest/merge_operator_data.py#L109).
* The file actually being stacked is `GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats`, which is created in [GTFS Funnel](https://github.com/cal-itp/data-analyses/blob/ce779ede5c48f84e974615a457be77cba2f8b08b/gtfs_funnel/operator_scheduled_stats.py#L147).

In [12]:
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [11]:
GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

'operator_profiles/operator_scheduled_stats'

In [29]:
# Build the path from the shared project config rather than hard-coding the
# bucket and folder, so this cell follows any change to SCHED_GCS or
# GTFS_DATA_DICT. With the values displayed above this resolves to the same
# operator_scheduled_stats_2024-05-26.parquet file.
may_24 = pd.read_parquet(
    f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats}_2024-05-26.parquet"
)

In [30]:
may_24.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name
0,015d67d5b75b5cf2b710bbadadfb75f5,14,467,34,481,11145,193.71,23.17,20,0,4,14,0,0,Bay Area 511 Marin Schedule,recNOb7pqBRlQVG5e,Marin County Transit District


#### `gtfs_funnel/operator_scheduled_stats`
* Add the `ntd` columns somewhere around lines 164-174.
* How often is the NTD data updated in our warehouse?
* So far I am only querying 2022 data.
* Where exactly should the NTD merge be added?

#### Check out NTD

In [15]:
def load_ntd(year: int) -> pd.DataFrame:
    """
    Load one year of California NTD agency information from the warehouse,
    returning a single row per agency.

    Some agencies appear more than once for the same year (for example once
    as a "Full Reporter" and once as a "Group Plan Sponsor"). For those, the
    record with the most populated fields is kept whole, so values from
    different records are never mixed into one row.

    Parameters
    ----------
    year : int
        NTD report year to filter on.

    Returns
    -------
    pd.DataFrame
        One row per `agency_name`, sorted by `agency_name`, with
        `agency_name` as the first column. Rows with a null `agency_name`
        are dropped.
    """
    df = (
        tbls.mart_ntd.dim_annual_ntd_agency_information()
        >> filter(_.year == year, _.state == "CA", _._is_current == True)
        >> select(
            _.number_of_state_counties,
            _.uza_name,
            _.density,
            _.number_of_counties_with_service,
            _.state_admin_funds_expended,
            _.service_area_sq_miles,
            _.population,
            _.service_area_pop,
            _.subrecipient_type,
            _.primary_uza,
            _.reporter_type,
            _.organization_type,
            _.agency_name,
            _.voms_pt,
            _.voms_do,
        )
        >> collect()
    )

    # Rank records by how many fields are filled in. Sorting on every column
    # with NaNs last is only a lexicographic sort and does not guarantee that
    # the most complete record comes first, and GroupBy.first() picks the
    # first non-null value per column, which can splice two records together.
    n_filled = df.notna().sum(axis=1).to_numpy()
    most_complete_first = (-n_filled).argsort(kind="stable")

    deduped = (
        df.iloc[most_complete_first]
        .dropna(subset=["agency_name"])
        .drop_duplicates(subset=["agency_name"], keep="first")
        .sort_values("agency_name")
        .reset_index(drop=True)
    )

    # Keep the previous output layout: agency_name leads, the remaining
    # columns follow in their queried order.
    other_cols = [c for c in deduped.columns if c != "agency_name"]
    return deduped[["agency_name"] + other_cols]

In [16]:
# Exploratory pull of the same query that `load_ntd` wraps, fixed to 2022
# and California, keeping only current records. Left un-deduplicated so the
# agencies that appear more than once can be inspected below.
ntd = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> select(
        _.number_of_state_counties,
        _.uza_name,
        _.density,
        _.number_of_counties_with_service,
        _.state_admin_funds_expended,
        _.service_area_sq_miles,
        _.population,
        _.service_area_pop,
        _.subrecipient_type,
        _.primary_uza,
        _.reporter_type,
        _.organization_type,
        _.agency_name,
        _.voms_pt,
        _.voms_do,
    )
    >> collect()
)

In [22]:
# Sort on every column (in column order) with NaNs placed last. This is a
# lexicographic sort, so the leftmost columns dominate the ordering.
ntd2 = ntd.sort_values(by=list(ntd.columns), na_position="last")

In [17]:
ntd.shape

(234, 15)

In [21]:
ntd.loc[ntd.agency_name == "Kern Regional Transit"]

Unnamed: 0,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,agency_name,voms_pt,voms_do
57,,,,,,,,,Rural General Public Transit,,Rural Reporter,Public Agency or Authority of Transit Service,Kern Regional Transit,40.0,
219,,,,,,,,,,,Group Plan Sponsor,County or Local Government Unit or Department of Transportation,Kern Regional Transit,,


In [23]:
ntd2.loc[ntd2.agency_name == "Kern Regional Transit"]

Unnamed: 0,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,agency_name,voms_pt,voms_do
57,,,,,,,,,Rural General Public Transit,,Rural Reporter,Public Agency or Authority of Transit Service,Kern Regional Transit,40.0,
219,,,,,,,,,,,Group Plan Sponsor,County or Local Government Unit or Department of Transportation,Kern Regional Transit,,


In [24]:
ntd.loc[ntd.agency_name == "San Luis Obispo Regional Transit Authority"]

Unnamed: 0,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,agency_name,voms_pt,voms_do
73,,"San Luis Obispo, CA",4034.0,,,130.0,56904.0,206008.0,,,Full Reporter,Public Agency or Authority of Transit Service,San Luis Obispo Regional Transit Authority,,41.0
143,,,,,,,,,,,Group Plan Sponsor,Public Agency or Authority of Transit Service,San Luis Obispo Regional Transit Authority,,


In [25]:
ntd2.loc[ntd2.agency_name == "San Luis Obispo Regional Transit Authority"]

Unnamed: 0,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,agency_name,voms_pt,voms_do
73,,"San Luis Obispo, CA",4034.0,,,130.0,56904.0,206008.0,,,Full Reporter,Public Agency or Authority of Transit Service,San Luis Obispo Regional Transit Authority,,41.0
143,,,,,,,,,,,Group Plan Sponsor,Public Agency or Authority of Transit Service,San Luis Obispo Regional Transit Authority,,


In [55]:
ntd.agency_name.value_counts().head(10)

Los Angeles County Metropolitan Transportation Authority     2
San Luis Obispo Regional Transit Authority                   2
Kern Regional Transit                                        2
Metropolitan Transportation Commission                       2
Humboldt Transit Authority                                   2
Quechan Indian Tribe                                         1
City of Covina                                               1
Mendocino Transit Authority                                  1
City of Arcata                                               1
Pomona Valley Transportation Authority                       1
Name: agency_name, dtype: int64

In [26]:
# Collapse to one row per agency. GroupBy.first() returns the first non-null
# value in each column, so a result row can combine values from more than
# one source row of the same agency.
ntd3 = ntd2.groupby("agency_name").first().reset_index()

In [27]:
ntd3.loc[ntd3.agency_name == "San Luis Obispo Regional Transit Authority"]

Unnamed: 0,agency_name,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,voms_pt,voms_do
199,San Luis Obispo Regional Transit Authority,,"San Luis Obispo, CA",4034.0,,,130.0,56904.0,206008.0,,,Full Reporter,Public Agency or Authority of Transit Service,,41.0


In [35]:
ntd3.head(1).T

Unnamed: 0,0
agency_name,Access Services
number_of_state_counties,
uza_name,"Los Angeles--Long Beach--Anaheim, CA"
density,7476.00
number_of_counties_with_service,
state_admin_funds_expended,
service_area_sq_miles,1621.00
population,12237376.00
service_area_pop,11638106.00
subrecipient_type,


In [37]:
# Same 2022 / California / current-record filter, but without a `select`, so
# every column of the NTD agency table is returned for inspection.
ntd4 = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> collect()
)

In [38]:
ntd4.head(1).T

Unnamed: 0,0
key,f31fc355c5863d32e69198799d15ee63
year,2022
ntd_id,90167
number_of_state_counties,
tam_tier,Tier II
personal_vehicles,
density,6334.00
uza_name,"Davis, CA"
tribal_area_name,
service_area_sq_miles,21.00


#### Check out `mobility`
* We need this table because it has additional columns that aren't in NTD.

In [49]:
# Pull the full mobility mart providers table (all columns, no filter) to
# see which fields are available.
mob_og = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> collect()
)

In [50]:
mob_og.head(1).T

Unnamed: 0,0
agency_name,City of Montebello
ntd_id,90041
hq_city,Montebello
hq_county,Los Angeles
counties_served,Los Angeles
agency_website,http://www.ridembl.com/
caltrans_district_id,7.00
caltrans_district_name,Los Angeles
is_public_entity,True
is_publicly_operating,True


In [32]:
# Pull only the mobility mart columns of interest, alongside `agency_name`,
# which is the column later used to join against NTD.
mob = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> select(
        _.agency_name,
        _.counties_served,
        _.hq_city,
        _.hq_county,
        _.is_public_entity,
        _.is_publicly_operating,
        _.funding_sources,
        _.on_demand_vehicles_at_max_service,
        _.vehicles_at_max_service,
    )
    >> collect()
)

In [34]:
mob.head(1).T

Unnamed: 0,0
agency_name,City of Montebello
counties_served,Los Angeles
hq_city,Montebello
hq_county,Los Angeles
is_public_entity,True
is_publicly_operating,True
funding_sources,5307
on_demand_vehicles_at_max_service,42.00
vehicles_at_max_service,87.00


In [54]:
mob.agency_name.value_counts().head(10)

Humboldt Transit Authority                                   2
Los Angeles County Metropolitan Transportation Authority     2
Kern Regional Transit                                        2
San Luis Obispo Regional Transit Authority                   2
Metropolitan Transportation Commission                       2
City of Rio Vista                                            1
City of Ojai                                                 1
City of Redondo Beach                                        1
Morongo Basin Transit Authority                              1
Solano County Transit                                        1
Name: agency_name, dtype: int64

In [40]:
mob.loc[mob.agency_name == "Kern Regional Transit"]

Unnamed: 0,agency_name,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service
74,Kern Regional Transit,,Bakersfield,,,,,,0.0
172,Kern Regional Transit,,Bakersfield,,,,,,40.0


In [43]:
# Derive `mob2` from `mob` instead of re-assigning it from itself: `mob2` is
# not defined anywhere earlier in the notebook, so the self-referential form
# fails on a fresh kernel and is not idempotent when re-run.
# Largest vehicle counts come first so the fullest duplicate row leads.
mob2 = mob.sort_values(
    by=["on_demand_vehicles_at_max_service", "vehicles_at_max_service"],
    ascending=[False, False],
)

In [44]:
mob2.loc[mob2.agency_name == "Kern Regional Transit"]

Unnamed: 0,agency_name,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service
172,Kern Regional Transit,,Bakersfield,,,,,,40.0
74,Kern Regional Transit,,Bakersfield,,,,,,0.0


In [45]:
# Collapse to one row per agency. GroupBy.first() takes the first non-null
# value per column within each agency, following the sort order of `mob2`.
mob3 = mob2.groupby('agency_name').first().reset_index()

In [46]:
mob3.loc[mob3.agency_name == "Kern Regional Transit"]

Unnamed: 0,agency_name,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service
142,Kern Regional Transit,,Bakersfield,,,,,,40.0


In [57]:
def merge_ntd_mobility(year: int) -> pd.DataFrame:
    """
    Inner-join the mobility providers table to one year of NTD agency
    information on `agency_name`, then normalize the agency names.

    Parameters
    ----------
    year : int
        NTD report year passed to `section1.load_ntd`.

    Returns
    -------
    pd.DataFrame
        Merged frame with one row per (normalized) `agency_name` and a
        fresh 0..n-1 index.

    NOTE(review): the renames in `agency_dict` are applied after the join,
    so they do not influence which rows match between the two tables;
    presumably they align names with a downstream dataset -- confirm.
    """
    ntd = section1.load_ntd(year)
    mobility = section1.load_mobility()

    m1 = pd.merge(mobility, ntd, how="inner", on="agency_name")

    agency_dict = {
        "City of Fairfield, California": "City of Fairfield",
        "Livermore / Amador Valley Transit Authority": "Livermore-Amador Valley Transit Authority",
        "Nevada County Transit Services": "Nevada County",
        "Omnitrans": "OmniTrans",
    }

    # Strip whitespace before applying the rename map: replace() matches
    # whole values exactly, so a padded name would otherwise miss its rename.
    m1.agency_name = m1.agency_name.str.strip().replace(agency_dict)

    # After normalization, keep the first row seen for each agency.
    m1 = m1.drop_duplicates(subset=["agency_name"]).reset_index(drop=True)
    return m1

In [58]:
m1 = merge_ntd_mobility(2022)

In [59]:
m1.shape

(228, 23)

In [60]:
m1.agency_name.nunique()

228

In [53]:
m1.agency_name.value_counts().head()

Redding Area Bus Authority                                   2
Access Services                                              1
Los Angeles County Dept. of Public Works - Lennox Shuttle    1
Lassen Transit Service Agency                                1
Livermore-Amador Valley Transit Authority                    1
Name: agency_name, dtype: int64

In [56]:
m1.loc[m1.agency_name == "Redding Area Bus Authority"]

Unnamed: 0,agency_name,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,voms_pt,voms_do
183,Redding Area Bus Authority,Shasta,Redding,Shasta,True,True,5307;5311;5337,,21.0,,"Redding, CA",1798.0,,,100.0,120602.0,126551.0,,,Full Reporter,Public Agency or Authority of Transit Service,21.0,
184,Redding Area Bus Authority,,Redding,,,,,,0.0,,,,,,,,,,,Group Plan Sponsor,Public Agency or Authority of Transit Service,,
