## Add schedule-only operators
* Added upon request.
* Test with Caltrans District 4 (D4) first to check that BART shows up as expected.

In [1]:
import pandas as pd
from shared_utils import catalog_utils

# Load the "gtfs_analytics_data" catalog; its `digest_tables` entries are used
# below to build the parquet paths.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [2]:
# Notebook display settings, all applied through the same `pd.set_option` API:
# show up to 100 columns, format floats with 2 decimals, and lift the row and
# column-width limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Editing `_operators_prep`

In [3]:
# Parquet path for the `route_schedule_vp` digest table, assembled from the
# catalog's directory and file-name entries.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [4]:
# Unfiltered read of the table, used to inspect which `sched_rt_category`
# values exist before filtering.
schd_vp_df = pd.read_parquet(schd_vp_url)

In [5]:
# Distinct schedule/real-time categories present in the table.
schd_vp_df["sched_rt_category"].unique()

['schedule_only', 'schedule_and_vp', 'vp_only']
Categories (3, object): ['schedule_only', 'vp_only', 'schedule_and_vp']

In [6]:
# Re-read the table with only the columns needed to identify an operator, and
# keep just the rows whose category is one of the two listed below.
schd_vp_categories = ["schedule_and_vp", "schedule_only"]
schd_vp_cols = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "organization_name",
    "name",
    "sched_rt_category",
    "service_date",
]
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    columns=schd_vp_cols,
    filters=[[("sched_rt_category", "in", schd_vp_categories)]],
)

In [7]:
# Row count after filtering.
schd_vp_df.shape[0]

140108

In [54]:
# Optional check, left disabled: uncomment to list every organization name in
# the filtered table.
# schd_vp_df.organization_name.unique()

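### First Drop
* Within each district and organization, sort by most recent `service_date` first, then by `sched_rt_category` so that `schedule_and_vp` comes first when several rows share that date.
* Keep the first row for each organization and district.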

In [59]:
# Pin the category order explicitly. A descending sort on a categorical column
# follows its category order, so without this the tie-break below would depend
# on whatever order the parquet file happens to store.
sched_rt_order = pd.CategoricalDtype(
    categories=["schedule_only", "vp_only", "schedule_and_vp"], ordered=True
)

# First drop: keep one row per (organization, district). The row kept is the
# one with the most recent service date; when several rows share that date,
# the descending category sort puts `schedule_and_vp` ahead of `schedule_only`.
drop_1 = (
    # Rows without a district are discarded. `subset` is passed as a list so
    # the call also works on pandas versions that do not accept a bare label.
    schd_vp_df.dropna(subset=["caltrans_district"])
    .astype({"sched_rt_category": sched_rt_order})
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
            "sched_rt_category",
        ],
        ascending=[True, True, False, False],
    )
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ],
        keep="first",
    )
    .reset_index(drop=True)
)

In [60]:
# Number of (organization, district) rows left after the first drop.
drop_1.shape[0]

177

In [61]:
# Compare distinct organizations against distinct `name` values; fewer names
# than organizations means some organizations share a `name`.
drop_1["organization_name"].nunique(), drop_1["name"].nunique()

(177, 161)

In [62]:
# Inspect the first-drop result ordered by district, organization, then name.
drop_1.sort_values(["caltrans_district", "organization_name", "name"])

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
0,6693efa56a541b6276da9b424f78a170,01 - Eureka,Blue Lake Rancheria,Humboldt Schedule,schedule_and_vp,2023-09-13
1,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Arcata,Humboldt Schedule,schedule_only,2023-10-11
2,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Eureka,Humboldt Schedule,schedule_and_vp,2024-04-17
3,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,Curry Public Transit,Curry Public Transit Schedule,schedule_only,2024-04-17
4,a73dea57836841cc1dfaa02585424deb,01 - Eureka,Humboldt Transit Authority,Humboldt Schedule,schedule_and_vp,2023-03-15
5,0a3c0b21c85fb09f8db91599e14dd7f7,01 - Eureka,Lake Transit Authority,Lake Schedule,schedule_and_vp,2024-04-17
6,770072d7a8d356b529ef34fe01715bcb,01 - Eureka,Mendocino Transit Authority,Mendocino Schedule,schedule_and_vp,2024-04-17
7,0d04ec340550e5a62b031a8e125e6658,01 - Eureka,POINT,Oregon POINT,schedule_only,2024-04-17
8,090b30e4249a7ec2b4c6a0923ed2f953,01 - Eureka,Redwood Coast Transit Authority,Redwood Coast Schedulel,schedule_and_vp,2024-04-17
9,e524db270831632bdcf71df1d7e74d25,02 - Redding,Lassen Transit Service Agency,Lassen Schedule,schedule_only,2024-04-17


In [70]:
# Second drop: organizations in the same district can share a `name`.
# Order rows so the latest service date leads each (district, name) group,
# then keep only that leading row.
drop_2_ordered = drop_1.sort_values(
    by=["caltrans_district", "name", "service_date"],
    ascending=[True, False, False],
)
drop_2 = drop_2_ordered.drop_duplicates(
    subset=["caltrans_district", "name"], keep="first"
).reset_index(drop=True)

In [82]:
# Inspect the second-drop result ordered by district and organization.
drop_2.sort_values(["caltrans_district", "organization_name"])

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
4,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Eureka,Humboldt Schedule,schedule_and_vp,2024-04-17
5,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,Curry Public Transit,Curry Public Transit Schedule,schedule_only,2024-04-17
3,0a3c0b21c85fb09f8db91599e14dd7f7,01 - Eureka,Lake Transit Authority,Lake Schedule,schedule_and_vp,2024-04-17
2,770072d7a8d356b529ef34fe01715bcb,01 - Eureka,Mendocino Transit Authority,Mendocino Schedule,schedule_and_vp,2024-04-17
1,0d04ec340550e5a62b031a8e125e6658,01 - Eureka,POINT,Oregon POINT,schedule_only,2024-04-17
0,090b30e4249a7ec2b4c6a0923ed2f953,01 - Eureka,Redwood Coast Transit Authority,Redwood Coast Schedulel,schedule_and_vp,2024-04-17
12,e524db270831632bdcf71df1d7e74d25,02 - Redding,Lassen Transit Service Agency,Lassen Schedule,schedule_only,2024-04-17
9,0d65d96d07115e28313f207d5ed0d3b2,02 - Redding,Modoc Transportation Agency,Sage Stage Schedule,schedule_only,2024-04-17
11,96a5b8c5bc22b3936ddee01cac7b5fb6,02 - Redding,Plumas Transit Systems,Plumas Schedule,schedule_only,2024-04-17
10,91af7482fde58c6261f386b732404e11,02 - Redding,Shasta County,Redding Schedule,schedule_and_vp,2024-04-17


In [72]:
# After the second drop, the row count, distinct organizations and distinct
# names should all agree.
drop_2.shape, drop_2["organization_name"].nunique(), drop_2["name"].nunique()

((161, 6), 161, 161)

### Merge with `op_profiles`

In [73]:
# Parquet path for the `operator_profiles` digest table.
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
op_cols = ["organization_name", "name", "service_date", "schedule_gtfs_dataset_key"]
# Ask the reader for just the needed columns instead of loading the whole table
# and subsetting afterwards (same approach as the `schd_vp_df` read).
op_profiles_df = pd.read_parquet(op_profiles_url, columns=op_cols)

# Order rows so that, for each name, the most recent service date comes first
op_profiles2 = op_profiles_df.sort_values(
    by=["name", "service_date"], ascending=[True, False]
)
# Drop duplicated names (keeps the most recent row for each name)
op_profiles3 = op_profiles2.drop_duplicates(subset=["name"])

# Drop duplicated organization names.
# NOTE(review): the frame is still ordered by `name` here, so for an organization
# with several names the row kept is the one whose name sorts first, not
# necessarily the most recent one -- confirm this is intended.
op_profiles4 = op_profiles3.drop_duplicates(subset=["organization_name"]).reset_index(
    drop=True
)

In [74]:
# Number of organizations left in the operator profiles after both dedupes.
op_profiles4.shape[0]

155

In [78]:
# Outer join on name, organization and dataset key; the `_merge` indicator
# column records whether each row came from one side or both.
m1 = op_profiles4.merge(
    drop_2,
    how="outer",
    on=["name", "organization_name", "schedule_gtfs_dataset_key"],
    indicator=True,
)

In [79]:
# How many rows matched on both sides vs. only one.
m1["_merge"].value_counts()

both          148
right_only     13
left_only       7
Name: _merge, dtype: int64

In [80]:
# Rows present only in `drop_2` (no match in the operator profiles).
m1[m1["_merge"] == "right_only"]

Unnamed: 0,organization_name,name,service_date_x,schedule_gtfs_dataset_key,caltrans_district,sched_rt_category,service_date_y,_merge
155,Redwood Coast Transit Authority,Redwood Coast Schedulel,NaT,090b30e4249a7ec2b4c6a0923ed2f953,01 - Eureka,schedule_and_vp,2024-04-17,right_only
156,POINT,Oregon POINT,NaT,0d04ec340550e5a62b031a8e125e6658,01 - Eureka,schedule_only,2024-04-17,right_only
157,Curry Public Transit,Curry Public Transit Schedule,NaT,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,schedule_only,2024-04-17,right_only
158,Amtrak,Amtrak Schedule,NaT,b9473e19aebf7ee2ec18623eb35762a1,03 - Marysville,schedule_only,2023-07-12,right_only
159,Sonoma-Marin Area Rail Transit District,Bay Area 511 Sonoma-Marin Area Rail Transit Schedule,NaT,0881af3822466784992a49f1cc57d38f,04 - Oakland,schedule_and_vp,2024-04-17,right_only
160,San Francisco International Airport,Bay Area 511 SFO AirTrain Schedule,NaT,2ff70dd1151d7532db40436f8228cd33,04 - Oakland,schedule_only,2024-04-17,right_only
161,Peninsula Corridor Joint Powers Board,Bay Area 511 Caltrain Schedule,NaT,9b4c9cf681dfcc10dffe9523b43598bf,04 - Oakland,schedule_and_vp,2024-04-17,right_only
162,City of Sierra Madre,Sierra Madre Schedule,NaT,92d654fdb86639c8587210495827fb77,07 - Los Angeles,schedule_only,2024-04-17,right_only
163,Los Angeles County Metropolitan Transportation Authority,LA Metro Rail Schedule,NaT,2a0571758141f412b6a546fd70a65bf3,07 - Los Angeles,schedule_and_vp,2024-04-17,right_only
164,Yosemite National Park,Yosemite Valley Shuttle Schedule,NaT,31152914d10e2d0977b8b2fabb167922,10 - Stockton,schedule_only,2024-04-17,right_only


In [86]:
# Rows present only in the operator profiles (no match in `drop_2`).
m1[m1["_merge"] == "left_only"]

Unnamed: 0,organization_name,name,service_date_x,schedule_gtfs_dataset_key,caltrans_district,sched_rt_category,service_date_y,_merge
6,City of Avalon,Avalon Schedule,2024-03-13,1c10c6bdc6d306a5a05bba52c00e3755,,,NaT,left_only
9,City of Banning,Banning Pass Schedule,2024-04-17,ebc783bace70899492d6206c352547d6,,,NaT,left_only
15,Dumbarton Bridge Regional Operations Consortium,Bay Area 511 Dumbarton Express Schedule,2024-03-13,107627829aee1cf434ca17849ce8395e,,,NaT,left_only
18,"Golden Gate Bridge, Highway and Transportation District",Bay Area 511 Golden Gate Transit Schedule,2024-03-13,aea4108997c66a74fbdae27b34b69fde,,,NaT,left_only
84,Los Angeles County Metropolitan Transportation Authority,LA Metro Bus Schedule,2024-04-17,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,,,NaT,left_only
100,Transit Joint Powers Authority for Merced County,Merced GMV Schedule,2024-04-17,bff13f8993ff18e43577db1f5596e014,,,NaT,left_only
116,Redwood Coast Transit Authority,Redwood Coast Schedule,2024-03-13,6b09003d3a547f37aba623fe8211b355,,,NaT,left_only


In [83]:
# Same outer join without the dataset key, to see whether the key alone
# explains the unmatched rows.
m2 = op_profiles4.merge(
    drop_2,
    how="outer",
    on=["name", "organization_name"],
    indicator=True,
)

In [84]:
# Match breakdown for the name + organization join.
m2["_merge"].value_counts()

both          148
right_only     13
left_only       7
Name: _merge, dtype: int64

In [85]:
# Rows present only in `drop_2` for the name + organization join.
m2[m2["_merge"] == "right_only"]

Unnamed: 0,organization_name,name,service_date_x,schedule_gtfs_dataset_key_x,schedule_gtfs_dataset_key_y,caltrans_district,sched_rt_category,service_date_y,_merge
155,Redwood Coast Transit Authority,Redwood Coast Schedulel,NaT,,090b30e4249a7ec2b4c6a0923ed2f953,01 - Eureka,schedule_and_vp,2024-04-17,right_only
156,POINT,Oregon POINT,NaT,,0d04ec340550e5a62b031a8e125e6658,01 - Eureka,schedule_only,2024-04-17,right_only
157,Curry Public Transit,Curry Public Transit Schedule,NaT,,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,schedule_only,2024-04-17,right_only
158,Amtrak,Amtrak Schedule,NaT,,b9473e19aebf7ee2ec18623eb35762a1,03 - Marysville,schedule_only,2023-07-12,right_only
159,Sonoma-Marin Area Rail Transit District,Bay Area 511 Sonoma-Marin Area Rail Transit Schedule,NaT,,0881af3822466784992a49f1cc57d38f,04 - Oakland,schedule_and_vp,2024-04-17,right_only
160,San Francisco International Airport,Bay Area 511 SFO AirTrain Schedule,NaT,,2ff70dd1151d7532db40436f8228cd33,04 - Oakland,schedule_only,2024-04-17,right_only
161,Peninsula Corridor Joint Powers Board,Bay Area 511 Caltrain Schedule,NaT,,9b4c9cf681dfcc10dffe9523b43598bf,04 - Oakland,schedule_and_vp,2024-04-17,right_only
162,City of Sierra Madre,Sierra Madre Schedule,NaT,,92d654fdb86639c8587210495827fb77,07 - Los Angeles,schedule_only,2024-04-17,right_only
163,Los Angeles County Metropolitan Transportation Authority,LA Metro Rail Schedule,NaT,,2a0571758141f412b6a546fd70a65bf3,07 - Los Angeles,schedule_and_vp,2024-04-17,right_only
164,Yosemite National Park,Yosemite Valley Shuttle Schedule,NaT,,31152914d10e2d0977b8b2fabb167922,10 - Stockton,schedule_only,2024-04-17,right_only


In [87]:
# Loosest join: match on organization name only.
m3 = op_profiles4.merge(
    drop_2,
    how="outer",
    on=["organization_name"],
    indicator=True,
)

In [88]:
# Match breakdown for the organization-only join.
m3["_merge"].value_counts()

both          151
right_only     10
left_only       4
Name: _merge, dtype: int64

In [89]:
# Preview the first two rows of the organization-only join.
m3.iloc[:2]

Unnamed: 0,organization_name,name_x,service_date_x,schedule_gtfs_dataset_key_x,schedule_gtfs_dataset_key_y,caltrans_district,name_y,sched_rt_category,service_date_y,_merge
0,City of Alhambra,Alhambra Schedule,2024-04-17,6894087758e4c76d3e591daee4c46dc9,6894087758e4c76d3e591daee4c46dc9,07 - Los Angeles,Alhambra Schedule,schedule_only,2024-04-17,both
1,Amador Regional Transit System,Amador Schedule,2024-04-17,36b8fbf12e4adc76b21651462b200860,36b8fbf12e4adc76b21651462b200860,10 - Stockton,Amador Schedule,schedule_only,2024-04-17,both
