# Route typologies - check that route grain merges well with schedule time-series

Modify the function to just grab operator-route-direction columns and see how well the merge performs.

If we're starting with scheduled trips to get route grain, we should see fewer `left_only` rows in the merge (schedule rows with no matching route typology).

In [1]:
import pandas as pd

from segment_speed_utils import time_series_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS
from shared_utils import rt_dates

In [5]:
def concatenate_schedule_by_route_direction_modified(
    date_list: list
) -> pd.DataFrame:
    """
    Stack the schedule route-direction metrics (from gtfs_funnel)
    across every date in date_list, reading only the
    operator-route-direction identifier columns.

    Exact duplicate rows are dropped, the index is reset, and a
    `year` column is derived from `service_date`.

    NOTE(review): `service_date` is not among the requested columns;
    presumably concatenate_datasets_across_dates adds it - confirm.
    """
    metrics_file = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics
    route_dir_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

    stacked = time_series_utils.concatenate_datasets_across_dates(
        RT_SCHED_GCS,
        metrics_file,
        date_list,
        data_type="df",
        columns=route_dir_cols,
    )

    return (
        stacked
        .drop_duplicates()
        .reset_index(drop=True)
        .assign(year=lambda frame: frame["service_date"].dt.year)
    )

In [6]:
def concatenate_new_route_typologies(year_list: list) -> pd.DataFrame:
    """
    Read the "_new" route typologies parquet for each year in
    year_list, tag each year's rows with an integer `year` column,
    and stack them into one dataframe with a fresh index.
    """
    export_stem = f"{GTFS_DATA_DICT.schedule_tables.route_typologies}_new"

    yearly_frames = []
    for one_year in year_list:
        year_label = str(one_year)
        parquet_path = f"{SCHED_GCS}{export_stem}_{year_label}.parquet"
        one_year_df = pd.read_parquet(parquet_path)
        yearly_frames.append(one_year_df.assign(year=int(one_year)))

    return pd.concat(yearly_frames, axis=0, ignore_index=True)

In [7]:
# Pool every analysis date that rt_dates lists for 2023, 2024 and 2025.
analysis_date_list = rt_dates.y2023_dates + rt_dates.y2024_dates + rt_dates.y2025_dates

# Schedule rows at operator-route-direction grain across all of those dates.
scheduled_df = concatenate_schedule_by_route_direction_modified(
    analysis_date_list
)
# Route typologies for the same three years, each row tagged with its `year`.
route_typo_df = concatenate_new_route_typologies([2023, 2024, 2025])

In [9]:
# Compare (rows, columns) of the schedule table and the route typologies table.
scheduled_df.shape, route_typo_df.shape

((93132, 5), (12991, 16))

In [11]:
# List the column dtypes of the route typologies table.
route_typo_df.dtypes

schedule_gtfs_dataset_key     object
name                          object
route_id                      object
route_type                    object
route_long_name               object
route_short_name              object
combined_name                 object
is_express                     int64
is_rapid                       int64
is_rail                        int64
is_ferry                       int64
is_local                       int64
common_shape_id               object
is_coverage                  float64
is_downtown_local            float64
year                           int64
dtype: object

In [14]:
# List the column dtypes of the schedule table.
scheduled_df.dtypes

schedule_gtfs_dataset_key            object
route_id                             object
direction_id                        float64
service_date                 datetime64[ns]
year                                  int64
dtype: object

In [15]:
# Outer-join the schedule rows to the route typologies on operator, route and
# year, then count rows by merge status (`both`, `left_only`, `right_only`).
# NOTE(review): in the recorded output `both` (98638) exceeds scheduled_df's
# row count (93132) while `left_only` is 0, which suggests some
# operator-route-year combinations repeat in route_typo_df - confirm.
scheduled_df.merge(
    route_typo_df,
    how="outer",
    on=["schedule_gtfs_dataset_key", "route_id", "year"],
    indicator=True,
)["_merge"].value_counts()
    

both          98638
right_only     1639
left_only         0
Name: _merge, dtype: int64

In [16]:
# Same outer join as the count check, but report each merge status
# (`both`, `left_only`, `right_only`) as a share of all merged rows.
scheduled_df.merge(
    route_typo_df,
    how="outer",
    on=["schedule_gtfs_dataset_key", "route_id", "year"],
    indicator=True,
)["_merge"].value_counts(normalize=True)
    

both          0.983655
right_only    0.016345
left_only     0.000000
Name: _merge, dtype: float64