## Some operators appear under different organization names for the same `schedule_gtfs_dataset_key`

In [1]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, format floats
# with two decimals, and remove the row-count and column-width truncation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# One-element list holding the "feb2025" entry of rt_dates.DATES.
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
# Scalar form of the same "feb2025" entry of rt_dates.DATES.
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
import _operators_prep

In [6]:
# Location of the route-level schedule/vehicle-position digest table.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# Read only the identifying columns for rows categorised as "schedule_and_vp"
# or "schedule_only", then drop rows labelled '07 - Los Angeles / Ventura'.
# NOTE(review): presumably this removes an alternate District 7 label so each
# operator is counted once; the displayed rows use '07 - Los Angeles'. Confirm.
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
).loc[lambda frame: frame.caltrans_district != "07 - Los Angeles / Ventura"]

# One row per (organization_name, caltrans_district): rows without a district
# are dropped, and sorting service_date descending before de-duplicating keeps
# the most recent service_date for each pair.
schd_vp_df2 = (
    schd_vp_df.dropna(subset=["caltrans_district"])
    .sort_values(
        by=["caltrans_district", "organization_name", "service_date"],
        ascending=[True, True, False],
    )
    .drop_duplicates(subset=["organization_name", "caltrans_district"])
    .reset_index(drop=True)
)

In [7]:
# Number of de-duplicated operator rows in each sched_rt_category.
schd_vp_df2["sched_rt_category"].value_counts()

schedule_and_vp    106
schedule_only      102
vp_only              0
Name: sched_rt_category, dtype: int64

In [8]:
# Organization names to inspect in the cells that filter on this list.
# NOTE(review): despite the name, the last two entries show up with the
# "Metrolink Schedule" feed in the output below, so this is not ferries only.
ferry_ops = [
    "San Francisco Bay Area Water Emergency Transit Authority",
    "City of Alameda",
    "Golden Gate Bridge, Highway and Transportation District",
    "Southern California Regional Rail Authority",
    "San Bernardino County Transportation Authority",
]

In [9]:
# De-duplicated rows whose organization_name is in ferry_ops.
schd_vp_df2[schd_vp_df2["organization_name"].isin(ferry_ops)]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
42,82f30e22dafe8156367297eb9a316c57,04 - Oakland,City of Alameda,Bay Area 511 San Francisco Bay Ferry Schedule,schedule_and_vp,2025-02-12
57,aea4108997c66a74fbdae27b34b69fde,04 - Oakland,"Golden Gate Bridge, Highway and Transportation District",Bay Area 511 Golden Gate Transit Schedule,schedule_and_vp,2025-02-12
66,82f30e22dafe8156367297eb9a316c57,04 - Oakland,San Francisco Bay Area Water Emergency Transit Authority,Bay Area 511 San Francisco Bay Ferry Schedule,schedule_and_vp,2025-02-12
160,c4092405159366c705b62df938293a4e,07 - Los Angeles,Southern California Regional Rail Authority,Metrolink Schedule,schedule_only,2024-06-12
173,c4092405159366c705b62df938293a4e,08 - San Bernardino,San Bernardino County Transportation Authority,Metrolink Schedule,schedule_only,2024-06-12


In [10]:
# Number of rows, i.e. (organization_name, caltrans_district) pairs.
schd_vp_df2.shape[0]

208

In [11]:
# Distinct schedule feed keys; fewer than the row count means some keys are
# shared by more than one row.
schd_vp_df2["schedule_gtfs_dataset_key"].nunique()

185

In [12]:
# The 20 most frequent schedule feed keys and how many rows carry each one.
schd_vp_df2["schedule_gtfs_dataset_key"].value_counts().head(20)

1770249a5a2e770ca90628434d4934b1    7
baeeb157e85a901e47b828ef9fe75091    3
a253a8d7acd57657bb98050f37dd6b0f    3
48e137bc977da88970393f629c18432c    2
c2a40ce92e76ec5beb88c40df3cd3a67    2
70c8a8b71c815224299523bf2115924a    2
6fda78099793184fe08dd78945d188c0    2
f74424acf8c41e4c1e9fd42838c4875c    2
b34f8d2270968f55f23f80b267df1d5f    2
1ebafaca8716652559b2017b6eedc4ef    2
73c79ccbfd681df300489226a158b9db    2
09e16227fc42c4fe90204a9d11581034    2
91af7482fde58c6261f386b732404e11    2
82f30e22dafe8156367297eb9a316c57    2
a37760dde6b9fdcb76b82e57afab7274    2
c4092405159366c705b62df938293a4e    2
5ed4b903a3c6049509b935883c440209    1
1c7027faabfeec976ea388973100bcf3    1
40ead758629da2ad8a74dbc687652e5a    1
fc6cd27871cce0092a08ccf68fb240a2    1
Name: schedule_gtfs_dataset_key, dtype: int64

In [13]:
# Rows that share the feed key f74424acf8c41e4c1e9fd42838c4875c.
schd_vp_df2[
    schd_vp_df2["schedule_gtfs_dataset_key"] == "f74424acf8c41e4c1e9fd42838c4875c"
]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
121,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles,City of Duarte,Foothill Schedule,schedule_and_vp,2024-12-11
152,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles,Foothill Transit,Foothill Schedule,schedule_and_vp,2024-12-11


In [17]:
# Rows that share the most frequent feed key, 1770249a5a2e770ca90628434d4934b1.
schd_vp_df2[
    schd_vp_df2["schedule_gtfs_dataset_key"] == "1770249a5a2e770ca90628434d4934b1"
]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
115,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Camarillo,VCTC GMV Schedule,schedule_and_vp,2024-12-11
136,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Moorpark,VCTC GMV Schedule,schedule_and_vp,2024-12-11
138,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Ojai,VCTC GMV Schedule,schedule_and_vp,2024-12-11
145,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Simi Valley,VCTC GMV Schedule,schedule_and_vp,2024-12-11
147,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,City of Thousand Oaks,VCTC GMV Schedule,schedule_and_vp,2024-12-11
153,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,Gold Coast Transit District,VCTC GMV Schedule,schedule_and_vp,2024-12-11
162,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles,Ventura County Transportation Commission,VCTC GMV Schedule,schedule_and_vp,2024-12-11


In [14]:
# Re-read the digest table with the same category filter but no column
# projection, so every column is available.
schd_vp_all_cols_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
)


In [15]:
# Narrow the full-column table to the organizations listed in ferry_ops.
schd_vp_all_cols_df = schd_vp_all_cols_df[
    schd_vp_all_cols_df["organization_name"].isin(ferry_ops)
]

In [16]:
# Distinct route_id count for each organization on each service_date.
(
    schd_vp_all_cols_df
    .groupby(["organization_name", "service_date"])
    .agg({"route_id": "nunique"})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,route_id
organization_name,service_date,Unnamed: 2_level_1
City of Alameda,2024-11-13,7
City of Alameda,2024-12-11,7
City of Alameda,2025-01-15,7
City of Alameda,2025-02-12,7
"Golden Gate Bridge, Highway and Transportation District",2023-03-15,14
"Golden Gate Bridge, Highway and Transportation District",2023-04-12,14
"Golden Gate Bridge, Highway and Transportation District",2023-05-17,14
"Golden Gate Bridge, Highway and Transportation District",2023-06-14,14
"Golden Gate Bridge, Highway and Transportation District",2023-07-12,14
"Golden Gate Bridge, Highway and Transportation District",2023-08-15,14
