## Add NTD Data to Section 1

In [1]:
import os

import pandas as pd
from calitp_data_analysis.sql import query_sql
from calitp_data_analysis.tables import tbls
from shared_utils import catalog_utils
from siuba import *

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import yaml

GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
# Readable Dictionary
with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

### Operator Profiles - Keep only those that have RT data. 

In [4]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
op_profiles_df = (
    pd.read_parquet(op_profiles_url)
    .sort_values(by=["service_date"], ascending=False)
    .drop_duplicates(subset=["organization_name"])[
        [
            "schedule_gtfs_dataset_key",
            "organization_name",
            "organization_source_record_id",
            "name",
        ]
    ]
)

In [5]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
schd_vp_df = (
    pd.read_parquet(schd_vp_url)
    .sort_values(by=["service_date"], ascending=False)
    .drop_duplicates(subset=["organization_name"])[["organization_name"]]
)

In [6]:
op_profiles_df2 = pd.merge(schd_vp_df, op_profiles_df, on="organization_name")

In [7]:
op_profiles_df2.shape

(171, 4)

In [8]:
op_profiles_df2.head(1)

Unnamed: 0,organization_name,schedule_gtfs_dataset_key,organization_source_record_id,name
0,Orange County Transportation Authority,ecd018ad66f497fb8f188ed5a71b284b,recfma7GNR5lQTTTg,OCTA Schedule


In [9]:
# op_profiles_df.sort_values(by = ['organization_name'])

### NTD 
* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.dim_annual_ntd_agency_information
* https://www.transit.dot.gov/ntd/data-product/2022-annual-database-agency-information
* 

In [10]:
annual_ntd = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> select(
        _.number_of_state_counties,
        _.uza_name,
        _.density,
        _.number_of_counties_with_service,
        _.state_admin_funds_expended,
        _.service_area_sq_miles,
        _.population,
        _.service_area_pop,
        _.subrecipient_type,
        _.primary_uza,
        _.reporter_type,
        _.organization_type,
        _.agency_name,
        _.voms_pt,
        _.voms_do,
    )
    >> collect()
)

In [11]:
annual_ntd.agency_name.value_counts().head()

Humboldt Transit Authority                                   2
Los Angeles County Metropolitan Transportation Authority     2
San Luis Obispo Regional Transit Authority                   2
Kern Regional Transit                                        2
Metropolitan Transportation Commission                       2
Name: agency_name, dtype: int64

In [12]:
ntd_cols = list(annual_ntd.columns)

In [13]:
annual_ntd2 = annual_ntd.sort_values(by=ntd_cols, na_position='last')

In [14]:
annual_ntd3 = annual_ntd2.groupby('agency_name').first().reset_index()


In [15]:
annual_ntd.shape, annual_ntd.agency_name.nunique()

((234, 15), 229)

In [16]:
annual_ntd3.shape, annual_ntd3.agency_name.nunique()

((229, 15), 229)

In [17]:
annual_ntd.loc[annual_ntd.agency_name == "Kern Regional Transit"]

Unnamed: 0,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,agency_name,voms_pt,voms_do
38,,,,,,,,,,,Group Plan Sponsor,County or Local Government Unit or Department of Transportation,Kern Regional Transit,,
142,,,,,,,,,Rural General Public Transit,,Rural Reporter,Public Agency or Authority of Transit Service,Kern Regional Transit,40.0,


In [18]:
annual_ntd3.loc[annual_ntd3.agency_name == "Kern Regional Transit"]

Unnamed: 0,agency_name,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,voms_pt,voms_do
142,Kern Regional Transit,,,,,,,,,Rural General Public Transit,,Rural Reporter,Public Agency or Authority of Transit Service,40.0,


###  Mobility Marketplace provider map
* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.dim_mobility_mart_providers

In [19]:
mobility_marketplace = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
     >> select(
        _.agency_name,
        _.counties_served,
        _.hq_city,
        _.hq_county,
        _.is_public_entity,
        _.is_publicly_operating,
        _.funding_sources,
        _.on_demand_vehicles_at_max_service,
        _.vehicles_at_max_service
    )
    >> collect()
)

In [20]:
mobility_marketplace_cols = list(mobility_marketplace.columns)

In [21]:
mobility_marketplace2 = mobility_marketplace.sort_values(by=mobility_marketplace_cols, na_position='last')

In [22]:
mobility_marketplace2 = mobility_marketplace.sort_values(by=["on_demand_vehicles_at_max_service","vehicles_at_max_service"], ascending = [False, False])

In [23]:
mobility_marketplace3 = mobility_marketplace2.groupby('agency_name').first().reset_index()


In [24]:
mobility_marketplace2.loc[mobility_marketplace2.agency_name == "Kern Regional Transit"]

Unnamed: 0,agency_name,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service
64,Kern Regional Transit,,Bakersfield,,,,,,40.0
7,Kern Regional Transit,,Bakersfield,,,,,,0.0


In [25]:
mobility_marketplace3.loc[mobility_marketplace3.agency_name == "Kern Regional Transit"]

Unnamed: 0,agency_name,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service
142,Kern Regional Transit,,Bakersfield,,,,,,40.0


In [26]:
mobility_marketplace.shape

(234, 9)

In [27]:
# mobility_marketplace[["agency_name"]].sort_values(by=["agency_name"])

#### Merge mobility w/ NTD

In [28]:
mobility_ntd = pd.merge(
    mobility_marketplace3,
    annual_ntd3,
    how="outer",
    on="agency_name",
)

In [29]:
mobility_ntd.agency_name.value_counts().head()

Access Services                                                    1
Lake Transit Authority                                             1
Livermore / Amador Valley Transit Authority                        1
Long Beach Transit                                                 1
Los Angeles County Department of Public Works - Avocado Heights    1
Name: agency_name, dtype: int64

In [30]:
mobility_ntd.shape, mobility_ntd.agency_name.nunique()

((229, 23), 229)

In [39]:
mobility_ntd.agency_name = mobility_ntd.agency_name.str.strip()

#### Merge mobility w/ relevant operators

In [40]:
m1 = pd.merge(
    op_profiles_df2,
    mobility_ntd,
    how="left",
    left_on="organization_name",
    right_on="agency_name",
    indicator=True,
)

In [32]:
op_profiles_df2.shape, op_profiles_df2.organization_name.nunique()

((171, 4), 171)

In [41]:
m1._merge.value_counts()

both          122
left_only      50
right_only      0
Name: _merge, dtype: int64

In [42]:
sorted(list(mobility_ntd.agency_name.unique()))

['Access Services',
 'Alameda-Contra Costa Transit District',
 'Alpine County Local Transportation Commission',
 'Altamont Corridor Express',
 'Amador Regional Transit System',
 'Anaheim Transportation Network',
 'Antelope Valley Transit Authority',
 'Bishop Paiute Tribe',
 'Blue Lake Rancheria',
 'Bob Hope Airport',
 'Butte County Association of Governments',
 'Calaveras Transit Agency',
 'California Department of Transportation',
 'California Vanpool Authority',
 'Central Contra Costa Transit Authority',
 'Chemehuevi Indian Tribe',
 'City and County of San Francisco',
 'City of Agoura Hills',
 'City of Alhambra',
 'City of Arcadia',
 'City of Arcata',
 'City of Artesia',
 'City of Arvin',
 'City of Atascadero',
 'City of Auburn',
 'City of Avalon',
 'City of Azusa',
 'City of Baldwin Park',
 'City of Bell',
 'City of Bell Gardens',
 'City of Bellflower',
 'City of Beverly Hills',
 'City of Burbank',
 'City of Calabasas',
 'City of California City',
 'City of Camarillo',
 'City of Car

#### Manually change some names in mobility?
* Why are there repeated operators? 
* City of Fairfield, California -> change to City of Fairfield
* Glenn Transit Service -> Glenn County?
* Kern Regional Transit -> change to Kern County?
* Livermore / Amador Valley Transit Authority -> Livermore-Amador Valley Transit Authority
* Los Angeles County Metropolitan Transportation Authority -> change to Los Angeles County
* Nevada County Transit Services -> change to Nevada County
* 'Mountain Area Regional Transit Authority' -> change to Mountain View Transportation Management Association
* Sacramento Regional Transit District -> Sacramento County
* Omnitrans -> change to OmniTrans
* San Benito County LTA -> San Benito County Local Transportation Authority
* Solano County Transit -> Solano Transportation Authority
* 

In [43]:
m1.loc[m1._merge == "left_only"][["organization_name"]].sort_values(
by=["organization_name"])

Unnamed: 0,organization_name
14,Basin Transit
108,Capitol Corridor Joint Powers Authority
137,City of Banning
97,City of Beaumont
35,City of Clovis
1,City of Duarte
56,City of Elk Grove
131,City of Fairfield
79,City of La Puente
84,City of Lawndale


### Dim Organizations
* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.dim_organizations

In [None]:
#rganizations = tbls.mart_transit_database.dim_organizations() >> collect()

In [None]:
#organizations.head(2)