# Missing operator debug

Quite a few GitHub issues have to do with missing operators.

Go through the initially downloaded schedule tables and the scripts in `gtfs_funnel` to see whether an operator is missing from the very start, or whether it drops out later because of values that are needed in several merges.

In [1]:
import geopandas as gpd
import pandas as pd

from segment_speed_utils import (gtfs_schedule_wrangling, 
                                 helpers, 
                                 time_series_utils)
from shared_utils import rt_dates
from update_vars import (GTFS_DATA_DICT, 
                         SEGMENT_GCS, 
                         RT_SCHED_GCS, 
                         SCHED_GCS, 
                         COMPILED_CACHED_VIEWS
                        )

# NOTE: the bare string literal below is disabled code kept for reference;
# it is never executed, so `merge_data` / `route_time_cols` are NOT defined.
'''
import sys
sys.path.append("../gtfs_digest/")
import merge_data

route_time_cols = merge_data.route_time_cols
'''

# All dates checked in this notebook: the 2025 dates followed by the 2024 dates.
analysis_date_list = rt_dates.y2025_dates + rt_dates.y2024_dates

In [2]:
def check_dates_available(df):
    """
    Print how many rows df holds for each service_date.

    Meant to be used with `df.pipe(...)` as a quick check of which
    dates are present. Only prints; returns None.
    """
    rows_per_date = df.service_date.value_counts()
    print(rows_per_date)
    return None

def feed_keys_in_downloaded_schedule_tables(
    date_list: list,
    gcs_bucket: str = COMPILED_CACHED_VIEWS,
    file: str = "",
    subset_feeds: list = None
):
    """
    Print, for one downloaded schedule table, how many rows per
    service_date belong to the given feed_keys.

    Args:
        date_list: dates to concatenate across.
        gcs_bucket: bucket/path prefix holding the table.
        file: name of the table to read (e.g. shapes, stops, stop_times).
        subset_feeds: feed_keys to keep. None is treated as an empty list,
            which matches the previous `[]` default.

    Only the feed_key column is requested; the service_date column that is
    counted is presumably added by concatenate_datasets_across_dates
    (TODO confirm). Only prints; returns None.
    """
    # Use a None sentinel instead of a mutable `[]` default so that the
    # default list object is never shared between calls.
    if subset_feeds is None:
        subset_feeds = []

    feed_rows = time_series_utils.concatenate_datasets_across_dates(
        gcs_bucket,
        file,
        date_list,
        data_type = "df",
        get_pandas=True,
        filters = [[("feed_key", "in", subset_feeds)]],
        columns = ["feed_key"],
    )

    feed_rows.pipe(check_dates_available)

    return None

In [3]:
def get_operator_key(operator_name: str) -> str:
    """
    Look up the gtfs_dataset_key for an operator by its schedule feed name.

    Reads the downloaded trips table across every date in the module-level
    `analysis_date_list`, keeps rows where name == operator_name, and
    returns the gtfs_dataset_key of the first matching row.

    Args:
        operator_name: exact value of the `name` column in trips,
            e.g. "Montebello Schedule".

    Returns:
        The first matching gtfs_dataset_key.

    Raises:
        IndexError: if no trips row matches operator_name on any date.
            The exception type is the same one `.iloc[0]` raised before,
            but the message now says which operator was not found.
    """
    TRIPS_FILE = GTFS_DATA_DICT.schedule_downloads.trips

    matching_keys = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        TRIPS_FILE,
        analysis_date_list,
        data_type = "df",
        get_pandas=True,
        filters = [[("name", "==", operator_name)]],
        columns = ["gtfs_dataset_key"],
    ).gtfs_dataset_key

    # Fail with an actionable message instead of pandas'
    # "single positional indexer is out-of-bounds".
    if matching_keys.empty:
        raise IndexError(
            f"No rows in downloaded trips with name == {operator_name!r} "
            f"for any of the {len(analysis_date_list)} analysis dates."
        )

    return matching_keys.iloc[0]

In [4]:
def check_downloaded_schedule_tables(
    date_list: list,
    one_operator_key: str
):
    """
    Check that one operator shows up in each downloaded schedule table.

    Trips are filtered by gtfs_dataset_key; the feed_keys found there are
    then used to look the operator up in shapes, stops and stop_times.
    For trips, also show a sample of route-direction pairs and the
    direction_id value counts (NaN included).

    Only prints / displays; returns None.
    """
    downloads = GTFS_DATA_DICT.schedule_downloads
    trips_file = downloads.trips
    # Tables that carry feed_key only; checked in this order.
    feed_key_tables = [
        ("shapes", downloads.shapes),
        ("stops", downloads.stops),
        ("stop_times", downloads.stop_times),
    ]

    operator_trips = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        trips_file,
        date_list,
        data_type="df",
        get_pandas=True,
        filters=[[("gtfs_dataset_key", "==", one_operator_key)]],
        columns=[
            "name", "gtfs_dataset_key", "feed_key",
            "route_id", "direction_id",
        ],
    )

    operator_feeds = operator_trips.feed_key.unique().tolist()
    print(f"feed keys for {one_operator_key}: {operator_feeds}")

    print("trips")
    check_dates_available(operator_trips)

    route_dir_pairs = operator_trips[["route_id", "direction_id"]].drop_duplicates()
    display(route_dir_pairs.head())
    display(operator_trips.direction_id.value_counts(dropna=False))

    for table_label, table_file in feed_key_tables:
        print(table_label)
        feed_keys_in_downloaded_schedule_tables(
            date_list,
            file=table_file,
            subset_feeds=operator_feeds,
        )

    return

In [5]:
def check_route_typologies(
    date_list: list,
    one_operator_key: str
):
    """
    Check whether one operator is present in the route typologies table.

    Rows are filtered by schedule_gtfs_dataset_key across date_list. Prints
    the per-date row counts, then shows a sample of route-direction pairs
    and the direction_id value counts (NaN included).

    Only prints / displays; returns None.
    """
    typologies_file = GTFS_DATA_DICT.schedule_tables.route_typologies

    operator_typologies = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        typologies_file,
        date_list,
        data_type="df",
        get_pandas=True,
        filters=[[("schedule_gtfs_dataset_key", "==", one_operator_key)]],
        columns=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    )

    print("route typologies")
    check_dates_available(operator_typologies)

    route_dir_pairs = operator_typologies[["route_id", "direction_id"]].drop_duplicates()
    display(route_dir_pairs.head())
    display(operator_typologies.direction_id.value_counts(dropna=False))

    return

In [6]:
def check_monthly_service_download(
    one_operator_key: str
):
    """
    Show the first few year-month combinations for which one operator
    appears in the monthly scheduled service table.

    Rows are filtered by schedule_gtfs_dataset_key, sorted by year
    (descending) then month (ascending), and de-duplicated.

    Only displays; returns None.
    """
    service_file = GTFS_DATA_DICT.schedule_tables.monthly_scheduled_service

    operator_months = pd.read_parquet(
        f"{SCHED_GCS}{service_file}.parquet",
        filters=[[("schedule_gtfs_dataset_key", "==", one_operator_key)]],
        columns=["schedule_gtfs_dataset_key", "month", "year"],
    )

    # Most recent year first; months in calendar order within each year.
    operator_months = operator_months.sort_values(
        ["year", "month"],
        ascending=[False, True],
    )
    operator_months = operator_months.drop_duplicates().reset_index(drop=True)

    display(operator_months.head())

    return
    

## Montebello

* `gtfs_funnel`: present in `published_operators.yml`
* schedule downloads: all feed_keys and stuff are present
* route typologies: present, matched on the operator's gtfs_dataset_key

The operator goes missing after this point, and it's because `direction_id` is NaN in trips, which means we DO NOT want to fill it in.

In [7]:
# Look up Montebello's gtfs_dataset_key from its schedule feed name.
montebello_key = get_operator_key("Montebello Schedule") 
# Value returned in the recorded run: "84d3c3507a4c8df851d935b63908bbd5"
montebello_key

'84d3c3507a4c8df851d935b63908bbd5'

In [8]:
check_downloaded_schedule_tables(
    analysis_date_list, 
    montebello_key
)

feed keys for 84d3c3507a4c8df851d935b63908bbd5: ['ff9e070d7a3789e930373a27fb3f98b4', 'c661339d3e18aea9cd1772c3d3f6b26e', '224bf24f6aa76263b3fc0da4bb7d8742', 'f2cbb9a2791d058588af2e6b577414f0', '5c226e42216e550ad4c0a68a5dd2d5a0']
trips
2025-01-15    8
2025-02-12    8
2024-01-17    8
2024-02-14    8
2024-03-13    8
2024-04-17    8
2024-05-22    8
2024-06-12    8
2024-07-17    8
2024-08-14    8
2024-09-18    8
2024-10-16    8
2024-11-13    8
2024-12-11    8
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,50,
23,40,
65,30,
82,10,
110,20,


NaN    112
Name: direction_id, dtype: int64

shapes
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stops
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stop_times
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64


In [9]:
check_route_typologies(analysis_date_list, montebello_key)

route typologies
2025-01-15    8
2025-02-12    8
2024-01-17    8
2024-02-14    8
2024-03-13    8
2024-04-17    8
2024-05-22    8
2024-06-12    8
2024-07-17    8
2024-08-14    8
2024-09-18    8
2024-10-16    8
2024-11-13    8
2024-12-11    8
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,10,0.0
1,20,0.0
2,30,0.0
3,40,0.0
4,50,0.0


0.0    112
Name: direction_id, dtype: int64

In [10]:
check_monthly_service_download(montebello_key)

Unnamed: 0,schedule_gtfs_dataset_key,month,year
0,84d3c3507a4c8df851d935b63908bbd5,1,2025
1,84d3c3507a4c8df851d935b63908bbd5,2,2025
2,84d3c3507a4c8df851d935b63908bbd5,1,2024
3,84d3c3507a4c8df851d935b63908bbd5,2,2024
4,84d3c3507a4c8df851d935b63908bbd5,3,2024


## Muni

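Muni should not be missing from the single-day downloads; it is probably missing because of the monthly-grain dataset. (Note: in the output below, Muni has no rows for 2024-06-12 in any of the single-day tables.)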

In [11]:
# Look up Muni's gtfs_dataset_key from its schedule feed name.
muni_key = get_operator_key("Bay Area 511 Muni Schedule")
# Value returned in the recorded run: "7cc0cb1871dfd558f11a2885c145d144"
muni_key

'7cc0cb1871dfd558f11a2885c145d144'

In [12]:
check_downloaded_schedule_tables(
    analysis_date_list, 
    muni_key
)

feed keys for 7cc0cb1871dfd558f11a2885c145d144: ['71b56f8c9033a28e4ca37a90bd9cc920', '8d28ffd77b0ce6366515729e249957ea', 'a54b3af296fc9ecf295244b8047f861d', '36cb7395061d5a38f2beeae8b1bcda0a', 'cad7d9dfbba521f1911ebdf004ad2007', '7f69c2fdaa134642f14064a0b64d1495', '3de56f52621869e0f5d56d999fe7a500', 'acdab4e68774c338b4b7a5be0f7a6d12', '366786e620b639368592570df334976c', 'c003d4f79fed1324631f458133f3e388']
trips
2025-01-15    140
2024-03-13    139
2024-09-18    138
2025-02-12    137
2024-01-17    136
2024-02-14    136
2024-04-17    136
2024-05-22    136
2024-07-17    136
2024-08-14    136
2024-10-16    136
2024-11-13    136
2024-12-11    136
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,1,0.0
86,14R,0.0
151,14R,1.0
205,12,0.0
249,12,1.0


0.0    889
1.0    889
Name: direction_id, dtype: int64

shapes
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stops
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stop_times
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64


In [13]:
check_route_typologies(analysis_date_list, muni_key)

route typologies
2025-01-15    139
2024-03-13    138
2024-09-18    137
2025-02-12    136
2024-01-17    135
2024-02-14    135
2024-04-17    135
2024-05-22    135
2024-07-17    135
2024-08-14    135
2024-10-16    135
2024-11-13    135
2024-12-11    135
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,1,0.0
1,1,1.0
2,12,0.0
3,12,1.0
4,14,0.0


1.0    889
0.0    876
Name: direction_id, dtype: int64

In [14]:
check_monthly_service_download(muni_key)

Unnamed: 0,schedule_gtfs_dataset_key,month,year
0,7cc0cb1871dfd558f11a2885c145d144,1,2025
1,7cc0cb1871dfd558f11a2885c145d144,2,2025
2,7cc0cb1871dfd558f11a2885c145d144,1,2024
3,7cc0cb1871dfd558f11a2885c145d144,2,2024
4,7cc0cb1871dfd558f11a2885c145d144,3,2024


## Metrolink

We already know this operator drops out in any script that uses `shape_array_key`, but it should still be present in the schedule download tables.

As expected:
* `shapes`: we cannot find any rows based on `feed_key`
* `route_typologies`: not present because we would have used most common shape to spatial join against roads.

In [15]:
# Look up Metrolink's gtfs_dataset_key from its schedule feed name.
metrolink_key = get_operator_key("Metrolink Schedule") 
# Value returned in the recorded run: "c4092405159366c705b62df938293a4e"
metrolink_key

'c4092405159366c705b62df938293a4e'

In [16]:
check_downloaded_schedule_tables(
    analysis_date_list, 
    metrolink_key
)

feed keys for c4092405159366c705b62df938293a4e: ['6a20a36fec1e51356175f529fc1bfac7', '8fc9cfe86b4e9e8c7cf508b2486605f1', '1fc19f46e82ceb8c94281566b39a6669', 'e87161fdb9478556c72f47f9a3f5b896']
trips
2025-01-15    14
2025-02-12    14
2024-01-17    14
2024-02-14    14
2024-03-13    14
2024-04-17    14
2024-05-22    14
2024-06-12    14
2024-07-17    14
2024-08-14    14
2024-09-18    14
2024-10-16    14
2024-11-13    14
2024-12-11    14
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,San Bernardino Line,1.0
2,Ventura County Line,0.0
3,Inland Emp.-Orange Co. Line,0.0
4,Orange County Line,1.0
5,Antelope Valley Line,0.0


1.0    98
0.0    98
Name: direction_id, dtype: int64

shapes
Series([], Name: service_date, dtype: int64)
stops
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stop_times
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64


In [17]:
check_route_typologies(analysis_date_list, metrolink_key)

route typologies
Series([], Name: service_date, dtype: int64)


Unnamed: 0,route_id,direction_id


Series([], Name: direction_id, dtype: int64)

In [18]:
check_monthly_service_download(metrolink_key)

Unnamed: 0,schedule_gtfs_dataset_key,month,year
0,c4092405159366c705b62df938293a4e,1,2025
1,c4092405159366c705b62df938293a4e,2,2025
2,c4092405159366c705b62df938293a4e,1,2024
3,c4092405159366c705b62df938293a4e,2,2024
4,c4092405159366c705b62df938293a4e,3,2024


## Santa Maria

* `gtfs_funnel`: this operator is also present in `published_operators.yml`.
* Similar to Montebello: `direction_id` is mostly NaN in trips. Filling NaNs (`fillna`) should not happen until the most downstream step.

In [19]:
# Look up Santa Maria's gtfs_dataset_key from its schedule feed name.
santa_maria_key = get_operator_key("Santa Maria Schedule") 
# Value returned in the recorded run: "73105f2d1cabc8170ab066d96863c5d5"
santa_maria_key

'73105f2d1cabc8170ab066d96863c5d5'

In [20]:
check_downloaded_schedule_tables(
    analysis_date_list, 
    santa_maria_key
)

feed keys for 73105f2d1cabc8170ab066d96863c5d5: ['253ed64830e600a63d55112c347823dc', 'fdb0383d9d3c18b23aa2afdfaf8b0c0b', '5392e99b9f95a8f2cae17074b0592ae2', '14dee63047914f25fd449d8d072976f9', '9f4e184b7ecc81bf155e258853ff7e4d', 'cb071c19da84a01e27d6a85d40df72df', '9d42d92b785fe9caf7b6f5394fb8601a', 'd7ca92a34b302e9091b47fb849f45380', 'c86a471a1a4c36bb8cf7da9d6a20f202']
trips
2024-09-18    17
2024-10-16    17
2024-11-13    17
2024-12-11    17
2025-01-15    16
2025-02-12    16
2024-07-17    16
2024-08-14    16
2024-01-17    14
2024-02-14    14
2024-03-13    14
2024-04-17    14
2024-05-22    14
2024-06-12    14
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,13X,
3,12X,
7,2,
12,1B,
15,1,


NaN    202
0.0     14
Name: direction_id, dtype: int64

shapes
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stops
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stop_times
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64


In [21]:
check_route_typologies(analysis_date_list, santa_maria_key)

route typologies
2024-09-18    17
2024-10-16    17
2024-11-13    17
2024-12-11    17
2024-07-17    16
2024-08-14    16
2025-01-15    15
2025-02-12    15
2024-01-17    14
2024-02-14    14
2024-03-13    14
2024-04-17    14
2024-05-22    14
2024-06-12    14
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,1,0.0
1,11,0.0
2,12X,0.0
3,13X,0.0
4,1B,0.0


0.0    214
Name: direction_id, dtype: int64

In [22]:
check_monthly_service_download(santa_maria_key)

Unnamed: 0,schedule_gtfs_dataset_key,month,year
0,73105f2d1cabc8170ab066d96863c5d5,1,2025
1,73105f2d1cabc8170ab066d96863c5d5,2,2025
2,73105f2d1cabc8170ab066d96863c5d5,1,2024
3,73105f2d1cabc8170ab066d96863c5d5,2,2024
4,73105f2d1cabc8170ab066d96863c5d5,3,2024
