## Conundrum: `organization_name` and `name` have a many-to-many relationship

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [2]:
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
import _operators_prep as op_prep

In [5]:
# Combine the 2024 and 2023 date collections from rt_dates (2024 entries first).
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm it is still needed.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [6]:
# Location of the digest's route-level schedule / vehicle-position parquet,
# built from the directory and table name stored in GTFS_DATA_DICT.
digest_tables = GTFS_DATA_DICT.digest_tables
schd_vp_url = f"{digest_tables.dir}{digest_tables.route_schedule_vp}.parquet"

In [7]:
# Keep only rows whose sched_rt_category is "schedule_and_vp" or "schedule_only".
schd_vp_category_filter = [
    [("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]
]

# Only the identifying columns are needed to study the organization_name / name mapping.
schd_vp_columns = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "organization_name",
    "name",
    "sched_rt_category",
    "service_date",
]

schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=schd_vp_category_filter,
    columns=schd_vp_columns,
)

In [8]:
# Count the distinct service dates observed for every (organization_name, name) pair.
schd_vp_df.groupby(["organization_name", "name"])[["service_date"]].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,service_date
organization_name,name,Unnamed: 2_level_1
Alameda-Contra Costa Transit District,Bay Area 511 AC Transit Schedule,20
Amador Regional Transit System,Amador Schedule,20
Amtrak,Amtrak Schedule,2
Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,20
Basin Transit,Morongo Basin Schedule,20
Blue Lake Rancheria,Humboldt Schedule,7
Butte County Association of Governments,B-Line Schedule,16
Calaveras Transit Agency,Calaveras Schedule,18
Capitol Corridor Joint Powers Authority,Bay Area 511 Capitol Corridor Schedule,20
Central Contra Costa Transit Authority,Bay Area 511 County Connection Schedule,19


### The same `organization_name` can correspond to more than one `name`

In [9]:
# Number of distinct `name` values per organization, largest first (top 20 shown).
(
    schd_vp_df.groupby(["organization_name"])[["name"]]
    .nunique()
    .sort_values(by="name", ascending=False)
    .head(20)
)

Unnamed: 0_level_0,name
organization_name,Unnamed: 1_level_1
City of Roseville,2
Transit Joint Powers Authority for Merced County,2
City of San Luis Obispo,2
Tahoe Transportation District,2
Mountain Area Regional Transit Authority,2
Los Angeles County Metropolitan Transportation Authority,2
Palo Verde Valley Transit Agency,2
City of Downey,2
Victor Valley Transit Authority,2
Mission Bay Transportation Management Agency,2


In [10]:
# Every distinct (name, service_date) combination for City of Roseville, in date order.
(
    schd_vp_df.loc[
        schd_vp_df["organization_name"].eq("City of Roseville"),
        ["name", "service_date"],
    ]
    .drop_duplicates()
    .sort_values(by="service_date")
)

Unnamed: 0,name,service_date
125545,Roseville Schedule,2023-03-15
125546,Roseville Schedule,2023-04-12
125547,Roseville Schedule,2023-05-17
125548,Roseville Schedule,2023-06-14
125549,Roseville Schedule,2023-07-12
125550,Roseville Schedule,2023-08-15
125551,Roseville Schedule,2023-09-13
125552,Roseville Schedule,2023-10-11
125553,Roseville Schedule,2023-11-15
125554,Roseville Schedule,2023-12-13


#### What's the difference in the data between the two names?

In [11]:
# Every distinct (name, service_date) combination for Yosemite National Park, in date order.
(
    schd_vp_df.loc[
        schd_vp_df["organization_name"].eq("Yosemite National Park"),
        ["name", "service_date"],
    ]
    .drop_duplicates()
    .sort_values(by="service_date")
)

Unnamed: 0,name,service_date
65760,Yosemite Valley Shuttle Schedule,2024-02-14
65761,Yosemite Valley Shuttle Schedule,2024-03-13
65762,Yosemite Valley Shuttle Schedule,2024-04-17
65763,Yosemite Valley Shuttle Schedule,2024-05-22
65814,Mariposa Grove Shuttle Schedule,2024-05-22
65764,Yosemite Valley Shuttle Schedule,2024-06-12
65815,Mariposa Grove Shuttle Schedule,2024-06-12
65765,Yosemite Valley Shuttle Schedule,2024-07-17
65816,Mariposa Grove Shuttle Schedule,2024-07-17
65766,Yosemite Valley Shuttle Schedule,2024-08-14


### The same `name` can correspond to more than one `organization_name`

In [12]:
# Number of distinct organizations sharing each `name`, largest first (top 18 shown).
(
    schd_vp_df.groupby(["name"])[["organization_name"]]
    .nunique()
    .sort_values(by="organization_name", ascending=False)
    .head(18)
)

Unnamed: 0_level_0,organization_name
name,Unnamed: 1_level_1
VCTC GMV Schedule,7
Humboldt Schedule,4
San Diego Schedule,3
"TART, North Lake Tahoe Schedule",2
UCSC Schedule,2
SLO Schedule,2
Sacramento Schedule,2
Bay Area 511 Sonoma County Transit Schedule,2
Bay Area 511 SolTrans Schedule,2
Flixbus Schedule,2


In [33]:
# List every organization (with its district) that shares the "VCTC GMV Schedule" feed name.
(
    schd_vp_df.loc[
        schd_vp_df["name"].eq("VCTC GMV Schedule"),
        ["organization_name", "caltrans_district", "name"],
    ]
    .drop_duplicates()
    .sort_values(by="organization_name")
)

Unnamed: 0,organization_name,caltrans_district,name
14504,City of Camarillo,07 - Los Angeles,VCTC GMV Schedule
14508,City of Moorpark,07 - Los Angeles,VCTC GMV Schedule
14505,City of Ojai,07 - Los Angeles,VCTC GMV Schedule
14507,City of Simi Valley,07 - Los Angeles,VCTC GMV Schedule
14503,City of Thousand Oaks,07 - Los Angeles,VCTC GMV Schedule
14506,Gold Coast Transit District,07 - Los Angeles,VCTC GMV Schedule
14502,Ventura County Transportation Commission,07 - Los Angeles,VCTC GMV Schedule


### Bringing in Operator Profiles helps identify which values in `name` will actually have data when spinning up the charts

In [14]:
# Load the table returned by op_prep.operators_schd_vp_rt().
# NOTE(review): presumably one row per organization_name (the nunique/len cells
# below compare the two counts) — confirm against _operators_prep.
final_df = op_prep.operators_schd_vp_rt()

In [15]:
# Peek at the first two rows to see which columns are available.
final_df.iloc[:2]

Unnamed: 0,organization_name,name,service_date,schedule_gtfs_dataset_key,caltrans_district
0,Lake Transit Authority,Lake Schedule,2024-10-16,0a3c0b21c85fb09f8db91599e14dd7f7,01 - Eureka
1,Mendocino Transit Authority,Mendocino Schedule,2024-10-16,770072d7a8d356b529ef34fe01715bcb,01 - Eureka


In [16]:
# Distinct organizations in final_df; compare with the row count in the next cell.
final_df["organization_name"].nunique()

142

In [17]:
# Total rows in final_df; equal to the distinct-organization count when each organization appears once.
final_df.shape[0]

142

In [18]:
# Check whether City of Ojai (one of the organizations sharing "VCTC GMV Schedule") is present in final_df.
final_df.loc[final_df["organization_name"].eq("City of Ojai")]

Unnamed: 0,organization_name,name,service_date,schedule_gtfs_dataset_key,caltrans_district


In [32]:
# All final_df rows assigned to Caltrans District 07 (Los Angeles).
final_df.loc[final_df["caltrans_district"].eq("07 - Los Angeles")]

Unnamed: 0,organization_name,name,service_date,schedule_gtfs_dataset_key,caltrans_district
69,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,2024-10-16,802173591303c37b28d30d1ee341517b,07 - Los Angeles
70,City of Alhambra,Alhambra Schedule,2024-10-16,c7899a7a44b15b4b4dd0116aae3b1bbe,07 - Los Angeles
71,City of Arcadia,Arcadia Schedule,2023-12-13,a3a2a40ae51e523796f0be989b8b3493,07 - Los Angeles
72,City of Baldwin Park,Baldwin Park Schedule,2024-04-17,3a2aa5e411c107ea6867b5316f98000b,07 - Los Angeles
73,City of Bell,La Campana Schedule,2024-10-16,c9742056f6aee3d04faaf7226391deeb,07 - Los Angeles
74,City of Bell Gardens,Bell Gardens Schedule,2024-07-17,f4798e5de7526e2b6bfc95258401641b,07 - Los Angeles
75,City of Bellflower,Bellflower Bus Schedule,2024-10-16,e888f0fa71bc47939d2e46c6a17f3edc,07 - Los Angeles
76,City of Burbank,Burbank Schedule,2024-10-16,22b1fd6db336c11d8df960e58cf79d73,07 - Los Angeles
77,City of Calabasas,Calabasas Schedule,2024-10-16,1de16807999628e56b26db6197b622ab,07 - Los Angeles
78,City of Cerritos,Cerritos on Wheels Website Schedule,2024-10-16,c1cae4094e4940b2ff88730d9ae327b9,07 - Los Angeles


In [19]:
# Load operator profiles through the helper in _operators_prep.
# NOTE(review): `op_profile` is not used by the other cells visible here; the
# parquet is also read directly into `op_profiles_df_og` below — confirm which
# one is intended.
op_profile = op_prep.operator_profiles()

In [22]:
# Location of the digest's operator-profiles parquet, built from the directory
# and table name stored in GTFS_DATA_DICT.
digest_tables = GTFS_DATA_DICT.digest_tables
op_profiles_url = f"{digest_tables.dir}{digest_tables.operator_profiles}.parquet"

In [23]:
# Read the operator-profiles digest table directly from its parquet file
# (all columns, no row filters).
op_profiles_df_og = pd.read_parquet(op_profiles_url)

In [25]:
# Distinct caltrans_district values present in the operator profiles table.
op_profiles_df_og["caltrans_district"].unique()

array(['06 - Fresno', '04 - Oakland', '07 - Los Angeles',
       '03 - Marysville', '01 - Eureka', '08 - San Bernardino', None,
       '11 - San Diego', '05 - San Luis Obispo', '12 - Irvine',
       '10 - Stockton', '02 - Redding', '09 - Bishop'], dtype=object)

In [28]:
# Alphabetical list of the distinct organizations that the operator profiles
# table places in Caltrans District 07 (Los Angeles).
(
    op_profiles_df_og.loc[
        op_profiles_df_og["caltrans_district"].eq("07 - Los Angeles"),
        ["organization_name"],
    ]
    .drop_duplicates()
    .sort_values(by="organization_name")
)

Unnamed: 0,organization_name
2096,Antelope Valley Transit Authority
3454,City of Alhambra
4128,City of Arcadia
1178,City of Avalon
3347,City of Baldwin Park
3221,City of Bell
3716,City of Bell Gardens
3960,City of Bellflower
1327,City of Burbank
3276,City of Calabasas
