# Sanity check: RT vs schedule metrics

1. Check that the metrics are calculated correctly.
This matters especially for metrics that look at % RT journey or % scheduled journey: we want to make sure we're counting the number of minutes with 1+ or 2+ vp correctly.

2. Drop outliers.
At first glance, there are unusually high values for `rt_service_minutes`. We should drop these before they get aggregated to the route-direction level.

In [1]:
import pandas as pd

from shared_utils import rt_dates
from segment_speed_utils import (gtfs_schedule_wrangling, 
                                 metrics
                                )
from segment_speed_utils.project_vars import RT_SCHED_GCS
from segment_speed_utils.time_series_utils import ROUTE_DIR_COLS
import yaml

# Date under review, looked up by key in the shared rt_dates catalog.
analysis_date = rt_dates.DATES["mar2024"]

# The export names used by later cells are read from this YAML config.
CONFIG_PATH = "./scripts/config.yml"

with open(CONFIG_PATH) as f:
    dict_inputs = yaml.safe_load(f)

In [2]:
# Config entries naming the trip-level and route-direction-level exports.
TRIP_EXPORT = dict_inputs["trip_metrics"]
ROUTE_EXPORT = dict_inputs["route_direction_metrics"]

# Load the trip-level parquet for the analysis date.
trip_parquet_path = f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet"
trip_df = pd.read_parquet(trip_parquet_path)

In [4]:
# Thresholds handed to derive_trip_comparison_metrics; presumably minutes of
# RT-minus-scheduled difference — TODO confirm against the metrics module.
early = -5
late = 5

trip_df = metrics.derive_trip_comparison_metrics(trip_df, early, late)

In [13]:
# NOTE(review): this cell carries execution count 13 but sits above the cell
# (count 12) that assigns route_df, so the notebook was run out of order. On a
# top-to-bottom run this line relies on route_df already existing — confirm,
# or move it below the cell that builds route_df.
route_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp', 'rt_service_minutes',
       'scheduled_service_minutes', 'total_vp', 'vp_in_shape', 'is_early',
       'is_ontime', 'is_late', 'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'name', 'base64_url', 'organization_source_record_id',
       'organization_name', 'caltrans_district'],
      dtype='object')

In [12]:
# Roll the trip-level table up by operator + route-direction columns, then
# derive the rt-vs-schedule metrics and attach operator identifiers.
# (What each helper does internally is not visible here — see
# segment_speed_utils.metrics / gtfs_schedule_wrangling.)
route_group_cols = ["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS

route_df = metrics.concatenate_peak_offpeak_allday_averages(
    trip_df,
    group_cols=route_group_cols,
    metric_type="rt_vs_schedule",
)
route_df = metrics.derive_rt_vs_schedule_metrics(route_df)
route_df = gtfs_schedule_wrangling.merge_operator_identifiers(
    route_df, [analysis_date]
)

In [15]:
# List the columns of the route-direction table built in the cell above.
route_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp', 'rt_service_minutes',
       'scheduled_service_minutes', 'total_vp', 'vp_in_shape', 'is_early',
       'is_ontime', 'is_late', 'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'name', 'base64_url', 'organization_source_record_id',
       'organization_name', 'caltrans_district'],
      dtype='object')

In [18]:
# Spot check one operator/route: early / on-time / late trip counts next to
# n_vp_trips for each direction and time period.
cols = [
    "name", "route_id", "time_period", "direction_id",
    "is_early", "is_ontime", "is_late", "n_vp_trips",
]

is_bbb_route_3705 = (
    (route_df["name"] == "Big Blue Bus Schedule")
    & (route_df["route_id"] == "3705")
)

route_df.loc[is_bbb_route_3705].sort_values(
    ["route_id", "direction_id", "time_period"]
)[cols]

Unnamed: 0,name,route_id,time_period,direction_id,is_early,is_ontime,is_late,n_vp_trips
7383,Big Blue Bus Schedule,3705,all_day,0.0,3,10,71,84
7316,Big Blue Bus Schedule,3705,offpeak,0.0,1,8,34,43
7317,Big Blue Bus Schedule,3705,peak,0.0,2,2,37,41
7384,Big Blue Bus Schedule,3705,all_day,1.0,2,15,67,84
7318,Big Blue Bus Schedule,3705,offpeak,1.0,1,8,32,41
7319,Big Blue Bus Schedule,3705,peak,1.0,1,7,35,43


In [None]:
# Compare the number of rows where the 2+ vp share is no larger than the
# 1+ vp share against the total row count; presumably these should match
# (rows with NaN in either column will not satisfy the comparison).
# NOTE(review): `df` is not assigned in the visible cells — confirm its source.
atleast2_le_atleast1 = (
    df.pct_rt_journey_atleast2_vp <= df.pct_rt_journey_atleast1_vp
)
df[atleast2_le_atleast1].shape, df.shape

In [None]:
# Cutoff of 2 hours, expressed in minutes; show how many rows exceed it.
rt_trip_cutoff = 2 * 60
df.loc[df.rt_service_minutes > rt_trip_cutoff].shape

In [None]:
# ~8% of trips have unusually high RT trip times (> 2 hrs)
# NOTE(review): both counts are hard-coded — presumably rows above the cutoff
# over total rows, copied from earlier cell output. They will not update if
# the data or the cutoff changes.
6798/86803

In [None]:
# Count the over-cutoff rows by their `sched_rt_category` value.
over_cutoff = df.rt_service_minutes > rt_trip_cutoff
df.loc[over_cutoff, "sched_rt_category"].value_counts()

In [None]:
# Show the dtype of every column in df.
df.dtypes

In [None]:
# Display the rows whose RT service minutes exceed the cutoff.
df.loc[df.rt_service_minutes > rt_trip_cutoff]

In [None]:
# NOTE(review): the export name is hard-coded here, while the top of the
# notebook reads a "route_direction_metrics" entry from the config into
# ROUTE_EXPORT — presumably the same export; verify and consider reusing it.
route_direction_metrics = "vp_route_dir/route_direction_metrics"

route_dir_parquet_path = (
    f"{RT_SCHED_GCS}{route_direction_metrics}_{analysis_date}.parquet"
)
df2 = pd.read_parquet(route_dir_parquet_path)

In [None]:
from segment_speed_utils import metrics, time_series_utils

ROUTE_DIR_COLS = time_series_utils.ROUTE_DIR_COLS

# Aggregate df by operator + route-direction columns using the project's
# weighted-average helper.
route_dir_group_cols = ["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS
route_dir = metrics.calculate_weighted_average_vp_schedule_metrics(
    df, route_dir_group_cols
)
    

In [None]:
# derive metrics without masking
def derive_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive vp-vs-schedule ratio metrics without masking any values.

    Ratios are returned as computed (only rounded), so values above 1
    and inf/NaN from zero or missing denominators stay visible for
    sanity checking.

    The input DataFrame is not modified; a new DataFrame is returned.

    Args:
        df: table with the columns vp_in_shape, total_vp,
            minutes_atleast1_vp, minutes_atleast2_vp,
            rt_service_minutes and scheduled_service_minutes.

    Returns:
        New DataFrame with vp_in_shape / total_vp filled with 0 and cast
        to int, the derived vp_per_minute and pct_* columns added,
        vp_per_minute and rt_service_minutes rounded to 2 decimals, and
        every column whose name contains "pct_" rounded to 3 decimals.
    """
    integrify = ["vp_in_shape", "total_vp"]

    # Build the integer columns on a new frame instead of writing them back
    # into the caller's DataFrame (the original assignment mutated the input).
    df = df.assign(**{
        c: df[c].fillna(0).astype("int") for c in integrify
    })

    df = df.assign(
        vp_per_minute = df.total_vp / df.rt_service_minutes,
        pct_in_shape = df.vp_in_shape / df.total_vp,
        pct_rt_journey_atleast1_vp = df.minutes_atleast1_vp / df.rt_service_minutes,
        pct_rt_journey_atleast2_vp = df.minutes_atleast2_vp / df.rt_service_minutes,
        pct_sched_journey_atleast1_vp = (df.minutes_atleast1_vp /
                                         df.scheduled_service_minutes),
        pct_sched_journey_atleast2_vp = (df.minutes_atleast2_vp /
                                         df.scheduled_service_minutes),
    )

    two_decimal_cols = [
        "vp_per_minute", "rt_service_minutes",
    ]

    # df is already a fresh frame here, so in-place column writes are safe.
    df[two_decimal_cols] = df[two_decimal_cols].round(2)

    # Picks up any column containing "pct_", including ones already present
    # on the input.
    three_decimal_cols = [
        c for c in df.columns if "pct_" in c
    ]

    df[three_decimal_cols] = df[three_decimal_cols].round(3)

    return df

In [None]:
# Apply the notebook-local derive_metrics (no masking) to the route-direction
# aggregates so that out-of-range ratios remain visible.
route_dir2 = derive_metrics(route_dir)

In [None]:
# Rows where the share of RT journey minutes with 2+ vp exceeds 1,
# largest first.
exceeds_one = route_dir2.pct_rt_journey_atleast2_vp > 1
route_dir2.loc[exceeds_one].sort_values(
    "pct_rt_journey_atleast2_vp", ascending=False
)

In [None]:
# Show the dtype of every column in route_dir2.
route_dir2.dtypes

In [None]:
# Compare RT and scheduled service minutes per row: a ratio (rounded to
# 2 decimals) and a difference (RT minus scheduled).
df = df.assign(
    rt_sched_journey_ratio=(
        df.rt_service_minutes / df.scheduled_service_minutes
    ).round(2),
    rt_sched_journey_difference=df.rt_service_minutes.sub(
        df.scheduled_service_minutes
    ),
)

In [None]:
# Bounds of the on-time window, applied to rt_sched_journey_difference
# (RT minus scheduled service minutes, computed in the preceding cell).
late = 5
early = -5

# Vectorized comparisons replace three row-wise df.apply(..., axis=1) passes.
# Results are identical, including NaN differences, which compare False and
# therefore get 0 in all three flags. Unlike the row-wise version, this also
# works on an empty frame.
journey_difference = df.rt_sched_journey_difference

df = df.assign(
    # strictly below the early bound
    is_early=journey_difference.lt(early).astype(int),
    # within [early, late], both ends inclusive
    is_ontime=journey_difference.between(early, late).astype(int),
    # strictly above the late bound
    is_late=journey_difference.gt(late).astype(int),
)

In [None]:
# Eyeball the three flags next to the difference they were derived from.
df[["is_late", "is_early", "is_ontime", "rt_sched_journey_difference"]]