## Some operators switch names for the same gtfs_dataset_key

In [7]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [8]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [9]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [10]:
analysis_date = rt_dates.DATES["feb2025"]

In [11]:
import _operators_prep

In [12]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
    
schd_vp_df = (pd.read_parquet(schd_vp_url, 
                       filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
                       columns = [ "schedule_gtfs_dataset_key",
                                    "caltrans_district",
                                    "organization_name",
                                    "name",
                                    "sched_rt_category",
                                    "service_date",]
                                     )
                     )

schd_vp_df2 = schd_vp_df.loc[schd_vp_df.caltrans_district != '07 - Los Angeles / Ventura']
    
schd_vp_df2 = (
    schd_vp_df.dropna(subset="caltrans_district")
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        ascending=[True, True, False],
    )
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
    )

In [13]:
schd_vp_df2.sched_rt_category.value_counts()

schedule_and_vp    136
schedule_only      126
vp_only              0
Name: sched_rt_category, dtype: int64

In [14]:
ferry_ops = ["San Francisco Bay Area Water Emergency Transit Authority",
             "City of Alameda",
            "Golden Gate Bridge, Highway and Transportation District",
            "Southern California Regional Rail Authority",
            "San Bernardino County Transportation Authority"]

In [15]:
schd_vp_df2.loc[schd_vp_df2.organization_name.isin(ferry_ops)]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
42,82f30e22dafe8156367297eb9a316c57,04 - Oakland,City of Alameda,Bay Area 511 San Francisco Bay Ferry Schedule,schedule_and_vp,2025-02-12
57,aea4108997c66a74fbdae27b34b69fde,04 - Oakland,"Golden Gate Bridge, Highway and Transportation District",Bay Area 511 Golden Gate Transit Schedule,schedule_and_vp,2025-02-12
66,82f30e22dafe8156367297eb9a316c57,04 - Oakland,San Francisco Bay Area Water Emergency Transit Authority,Bay Area 511 San Francisco Bay Ferry Schedule,schedule_and_vp,2025-02-12
160,c4092405159366c705b62df938293a4e,07 - Los Angeles,Southern California Regional Rail Authority,Metrolink Schedule,schedule_only,2024-06-12
227,c4092405159366c705b62df938293a4e,08 - San Bernardino,San Bernardino County Transportation Authority,Metrolink Schedule,schedule_only,2024-06-12


In [16]:
len(schd_vp_df2)

262

In [17]:
schd_vp_df2.schedule_gtfs_dataset_key.nunique()

194

In [18]:
schd_vp_df2.schedule_gtfs_dataset_key.value_counts().head(20)

1770249a5a2e770ca90628434d4934b1    14
f74424acf8c41e4c1e9fd42838c4875c     4
a37760dde6b9fdcb76b82e57afab7274     4
a253a8d7acd57657bb98050f37dd6b0f     3
baeeb157e85a901e47b828ef9fe75091     3
40ead758629da2ad8a74dbc687652e5a     2
82f30e22dafe8156367297eb9a316c57     2
1c7027faabfeec976ea388973100bcf3     2
b34f8d2270968f55f23f80b267df1d5f     2
c2a40ce92e76ec5beb88c40df3cd3a67     2
fc6cd27871cce0092a08ccf68fb240a2     2
144acd9a0dc18e2cddce39e0fa6d42a0     2
84d3c3507a4c8df851d935b63908bbd5     2
5ed4b903a3c6049509b935883c440209     2
364d59b3aea55aec2962a0b3244a40e0     2
efbbd5293be71f7a5de0cf82b59febe1     2
f56b72b9b1c0bd141c27d2fe9874c346     2
09e16227fc42c4fe90204a9d11581034     2
714116de6e520beb2bdd4fcfb78cfe21     2
cc53a0dbf5df90e3009b9cb5d89d80ba     2
Name: schedule_gtfs_dataset_key, dtype: int64

In [19]:
schd_vp_df2.loc[schd_vp_df2.schedule_gtfs_dataset_key == "91af7482fde58c6261f386b732404e11"]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
13,91af7482fde58c6261f386b732404e11,02 - Redding,Redding Area Bus Authority,Redding Schedule,schedule_and_vp,2025-02-12
14,91af7482fde58c6261f386b732404e11,02 - Redding,Shasta County,Redding Schedule,schedule_and_vp,2025-02-12


In [20]:
schd_vp_df2.loc[schd_vp_df2.schedule_gtfs_dataset_key == "f74424acf8c41e4c1e9fd42838c4875c"]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
121,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles,City of Duarte,Foothill Schedule,schedule_and_vp,2024-12-11
152,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles,Foothill Transit,Foothill Schedule,schedule_and_vp,2024-12-11
178,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles / Ventura,City of Duarte,Foothill Schedule,schedule_and_vp,2025-02-12
207,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles / Ventura,Foothill Transit,Foothill Schedule,schedule_and_vp,2025-02-12


In [21]:
schd_vp_df2.loc[schd_vp_df2.schedule_gtfs_dataset_key == "1770249a5a2e770ca90628434d4934b1"]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
115,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Camarillo,VCTC GMV Schedule,schedule_and_vp,2024-12-11
136,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Moorpark,VCTC GMV Schedule,schedule_and_vp,2024-12-11
138,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Ojai,VCTC GMV Schedule,schedule_and_vp,2024-12-11
145,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Simi Valley,VCTC GMV Schedule,schedule_and_vp,2024-12-11
147,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Thousand Oaks,VCTC GMV Schedule,schedule_and_vp,2024-12-11
153,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,Gold Coast Transit District,VCTC GMV Schedule,schedule_and_vp,2024-12-11
162,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,Ventura County Transportation Commission,VCTC GMV Schedule,schedule_and_vp,2024-12-11
172,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles / Ventura,City of Camarillo,VCTC GMV Schedule,schedule_and_vp,2025-02-12
191,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles / Ventura,City of Moorpark,VCTC GMV Schedule,schedule_and_vp,2025-02-12
193,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles / Ventura,City of Ojai,VCTC GMV Schedule,schedule_and_vp,2025-02-12


In [22]:
schd_vp_all_cols_df = (pd.read_parquet(schd_vp_url, 
                       filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
                       
                     ))


In [23]:
schd_vp_all_cols_df = schd_vp_all_cols_df.loc[schd_vp_all_cols_df.organization_name.isin(
ferry_ops)]

In [24]:
schd_vp_all_cols_df.groupby(['organization_name','service_date']).agg({'route_id':'nunique'})

Unnamed: 0_level_0,Unnamed: 1_level_0,route_id
organization_name,service_date,Unnamed: 2_level_1
City of Alameda,2024-11-13,7
City of Alameda,2024-12-11,7
City of Alameda,2025-01-15,7
City of Alameda,2025-02-12,7
"Golden Gate Bridge, Highway and Transportation District",2023-03-15,14
"Golden Gate Bridge, Highway and Transportation District",2023-04-12,14
"Golden Gate Bridge, Highway and Transportation District",2023-05-17,14
"Golden Gate Bridge, Highway and Transportation District",2023-06-14,14
"Golden Gate Bridge, Highway and Transportation District",2023-07-12,14
"Golden Gate Bridge, Highway and Transportation District",2023-08-15,14


## Operators differ between D7 Los Angeles vs D7 LA/Ventura

In [26]:
schd_vp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 387498 entries, 0 to 387497
Data columns (total 6 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   schedule_gtfs_dataset_key  387498 non-null  object        
 1   caltrans_district          385857 non-null  object        
 2   organization_name          385857 non-null  object        
 3   name                       387498 non-null  object        
 4   sched_rt_category          387498 non-null  category      
 5   service_date               387498 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](1), object(4)
memory usage: 18.1+ MB


In [27]:
schd_vp_df = schd_vp_df.fillna("None")

In [28]:

d7_only = schd_vp_df.loc[schd_vp_df.caltrans_district.str.contains("07")]
    

In [30]:
d7_only2 = (
    d7_only.dropna(subset="caltrans_district")
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        ascending=[True, True, False],
    )
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
    )

In [31]:
d7_only2.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
0,802173591303c37b28d30d1ee341517b,07 - Los Angeles,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,schedule_and_vp,2024-12-11


In [32]:
d7_og = d7_only2.loc[d7_only2.caltrans_district == "07 - Los Angeles"]

In [43]:
d7_og.loc[d7_og.organization_name.str.contains("Calabasas")]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
10,1de16807999628e56b26db6197b622ab,07 - Los Angeles,City of Calabasas,Calabasas Schedule,schedule_only,2024-12-11


In [46]:
d7_og.loc[d7_og.organization_name.str.contains("Sierra Madre")]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
40,92d654fdb86639c8587210495827fb77,07 - Los Angeles,City of Sierra Madre,Sierra Madre Schedule,schedule_only,2024-12-11


In [47]:
d7_og.loc[d7_og.organization_name.str.contains("Avalon")]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
4,1c10c6bdc6d306a5a05bba52c00e3755,07 - Los Angeles,City of Avalon,Avalon Schedule,schedule_and_vp,2024-03-13


In [34]:
d7_og_orgs = set(list(d7_og.organization_name.unique()))

In [38]:
d7_og_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [33]:
d7_la_ventura = d7_only2.loc[d7_only2.caltrans_district != "07 - Los Angeles"]

In [45]:
d7_la_ventura.loc[d7_la_ventura.schedule_gtfs_dataset_key.str.contains("1de16807999628e56b26db6197b622ab")]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date


In [48]:
d7_la_ventura.loc[d7_la_ventura.schedule_gtfs_dataset_key.str.contains("1c10c6bdc6d306a5a05bba52c00e3755")]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date


In [35]:
d7_la_ventura_orgs = set(list(d7_la_ventura.organization_name.unique()))

In [39]:
d7_la_ventura_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [36]:
d7_la_ventura_orgs - d7_og_orgs

{'City of San Fernando'}

In [37]:
d7_og_orgs - d7_la_ventura_orgs 

{'City of Avalon',
 'City of Calabasas',
 'City of El Segundo',
 'City of La Puente',
 'City of Sierra Madre',
 'Southern California Regional Rail Authority'}

In [40]:
d7_la_ventura_sched - d7_og_sched

set()

In [41]:
d7_og_sched - d7_la_ventura_sched

set()

## Try to delete out 07 - Los Angeles