# Missing operator debug

Quite a few GitHub issues have to do with missing operators.

Go through the initially downloaded schedule tables and the scripts in `gtfs_funnel` to see whether an operator is missing from the very start, or whether it drops out later because of values that are needed in several merges.

In [1]:
import geopandas as gpd
import pandas as pd

from segment_speed_utils import (gtfs_schedule_wrangling, 
                                 helpers, 
                                 time_series_utils)
from shared_utils import rt_dates
from update_vars import (GTFS_DATA_DICT, 
                         SEGMENT_GCS, 
                         RT_SCHED_GCS, 
                         SCHED_GCS, 
                         COMPILED_CACHED_VIEWS
                        )

# Disabled: would pull `route_time_cols` from gtfs_digest's merge_data module.
# import sys
# sys.path.append("../gtfs_digest/")
# import merge_data
#
# route_time_cols = merge_data.route_time_cols

# Every date checked in this notebook: the 2025 dates followed by the 2024 dates.
analysis_date_list = rt_dates.y2025_dates + rt_dates.y2024_dates

In [3]:
def check_dates_available(df) -> None:
    """
    Print the number of rows found for each `service_date` value in `df`.

    Meant as a quick visual check of which dates are present; nothing is
    returned.
    """
    rows_per_date = df["service_date"].value_counts()
    print(rows_per_date)

def feed_keys_in_downloaded_schedule_tables(
    date_list: list,
    gcs_bucket: str = COMPILED_CACHED_VIEWS,
    file: str = "",
    subset_feeds: list = []
) -> None:
    """
    Print per-date row counts for `subset_feeds` in one downloaded table.

    Reads only the `feed_key` column of `file` from `gcs_bucket` for every
    date in `date_list`, keeping rows whose `feed_key` is in `subset_feeds`,
    then prints the `service_date` counts of what came back.
    """
    # Keep only rows belonging to the feeds of interest.
    feed_filter = [[("feed_key", "in", subset_feeds)]]

    feed_rows = time_series_utils.concatenate_datasets_across_dates(
        gcs_bucket,
        file,
        date_list,
        data_type="df",
        get_pandas=True,
        filters=feed_filter,
        columns=["feed_key"],
    )

    check_dates_available(feed_rows)

## Montebello

* `gtfs_funnel`: present in `published_operators.yml`
* schedule downloads: the operator's feed_keys are present in trips, shapes, stops, and stop_times for every date
* route typologies: present as gtfs_dataset_key

The operator drops out after that point. The cause is `direction_id`: it is NaN for every row in trips (but 0.0 in route typologies, per the outputs below), which means we DO NOT want to fill it in.

In [14]:
def check_downloaded_schedule_tables(
    date_list: list,
    one_operator_key: str
) -> None:
    """
    Report whether one operator shows up in each downloaded schedule table.

    Loads trips for `one_operator_key` (matched on `gtfs_dataset_key`) across
    `date_list`, prints the operator's feed keys and the trips per-date
    counts, and displays a sample of route_id / direction_id pairs plus the
    direction_id value counts (NaN included). The feed keys found in trips
    are then used to print per-date counts for shapes, stops and stop_times.
    """
    downloads = GTFS_DATA_DICT.schedule_downloads
    trips_file = downloads.trips
    # Tables that carry feed_key only, checked in this order after trips.
    feed_key_tables = [
        ("shapes", downloads.shapes),
        ("stops", downloads.stops),
        ("stop_times", downloads.stop_times),
    ]

    operator_filter = [[("gtfs_dataset_key", "==", one_operator_key)]]
    trip_cols = ["name", "gtfs_dataset_key", "feed_key", "route_id", "direction_id"]

    trips = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        trips_file,
        date_list,
        data_type="df",
        get_pandas=True,
        filters=operator_filter,
        columns=trip_cols,
    )

    subset_feeds = trips.feed_key.unique().tolist()
    print(f"feed keys for {one_operator_key}: {subset_feeds}")

    print("trips")
    check_dates_available(trips)

    route_direction_pairs = trips[["route_id", "direction_id"]].drop_duplicates()
    display(route_direction_pairs.head())
    # dropna=False so that missing direction_id values are counted too.
    display(trips.direction_id.value_counts(dropna=False))

    for table_name, table_file in feed_key_tables:
        print(table_name)
        feed_keys_in_downloaded_schedule_tables(
            date_list,
            file=table_file,
            subset_feeds=subset_feeds,
        )
    
    

In [15]:
# Find the gtfs_dataset_key(s) that appear under the name "Montebello Schedule"
# in the downloaded trips tables across all analysis dates.
TRIPS_FILE = GTFS_DATA_DICT.schedule_downloads.trips

df = time_series_utils.concatenate_datasets_across_dates(
    COMPILED_CACHED_VIEWS, TRIPS_FILE, analysis_date_list,
    data_type="df",
    get_pandas=True,
    filters=[[("name", "==", "Montebello Schedule")]],
    columns=["name", "gtfs_dataset_key"],
)

df["gtfs_dataset_key"].unique()

array(['84d3c3507a4c8df851d935b63908bbd5'], dtype=object)

In [16]:
# The lookup above returned a single key: "84d3c3507a4c8df851d935b63908bbd5"
montebello_key = df["gtfs_dataset_key"].iloc[0]

In [17]:
# Is Montebello present in trips, shapes, stops and stop_times on every date?
check_downloaded_schedule_tables(
    date_list=analysis_date_list,
    one_operator_key=montebello_key,
)

feed keys for 84d3c3507a4c8df851d935b63908bbd5: ['ff9e070d7a3789e930373a27fb3f98b4', 'c661339d3e18aea9cd1772c3d3f6b26e', '224bf24f6aa76263b3fc0da4bb7d8742', 'f2cbb9a2791d058588af2e6b577414f0', '5c226e42216e550ad4c0a68a5dd2d5a0']
trips
2025-01-15    8
2025-02-12    8
2024-01-17    8
2024-02-14    8
2024-03-13    8
2024-04-17    8
2024-05-22    8
2024-06-12    8
2024-07-17    8
2024-08-14    8
2024-09-18    8
2024-10-16    8
2024-11-13    8
2024-12-11    8
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,50,
23,40,
65,30,
82,10,
110,20,


NaN    112
Name: direction_id, dtype: int64

shapes
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stops
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64
stop_times
2025-01-15    1
2025-02-12    1
2024-01-17    1
2024-02-14    1
2024-03-13    1
2024-04-17    1
2024-05-22    1
2024-06-12    1
2024-07-17    1
2024-08-14    1
2024-09-18    1
2024-10-16    1
2024-11-13    1
2024-12-11    1
Name: service_date, dtype: int64


In [18]:
def check_route_typologies(
    date_list: list,
    one_operator_key: str
) -> None:
    """
    Report whether one operator shows up in the route typologies table.

    Loads route typologies for `one_operator_key` (matched on
    `schedule_gtfs_dataset_key`) across `date_list`, prints the per-date
    counts, and displays a sample of route_id / direction_id pairs plus the
    direction_id value counts (NaN included).
    """
    typologies_file = GTFS_DATA_DICT.schedule_tables.route_typologies
    operator_filter = [[("schedule_gtfs_dataset_key", "==", one_operator_key)]]

    typologies = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        typologies_file,
        date_list,
        data_type="df",
        get_pandas=True,
        filters=operator_filter,
        columns=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    )

    print("route typologies")
    check_dates_available(typologies)

    route_direction_pairs = typologies[["route_id", "direction_id"]].drop_duplicates()
    display(route_direction_pairs.head())
    # dropna=False so that missing direction_id values are counted too.
    display(typologies.direction_id.value_counts(dropna=False))

In [19]:
# Is Montebello present in route typologies, and what does direction_id look like there?
check_route_typologies(
    date_list=analysis_date_list,
    one_operator_key=montebello_key,
)

route typologies
2025-01-15    8
2025-02-12    8
2024-01-17    8
2024-02-14    8
2024-03-13    8
2024-04-17    8
2024-05-22    8
2024-06-12    8
2024-07-17    8
2024-08-14    8
2024-09-18    8
2024-10-16    8
2024-11-13    8
2024-12-11    8
Name: service_date, dtype: int64


Unnamed: 0,route_id,direction_id
0,10,0.0
1,20,0.0
2,30,0.0
3,40,0.0
4,50,0.0


0.0    112
Name: direction_id, dtype: int64