# Metric 4: Reliable Prediction Accuracy

Use this equation: 

-60ln(`Time to Prediction`+1.3) < `Prediction Error` < 60ln(`Time to Prediction`+1.5)

## Rabbit Hole 
* time to prediction should be positive. Using `predicted arrival - _extract_ts` should yield positive time differences, but there are cases where negative time differences exists
* negative time difference means the predicted arrival is earlier than the `_extract_ts`, which definitely can be possible in the case of a delayed bus
* for now, just use absolute value to make sure it's positive

Summary Levels
* Route by hour of day/day of week
* Stops by  hour of day/day of week
* Route by stops

In [1]:
import numpy as  np
import pandas as pd

import utils
from segment_speed_utils.project_vars import (PREDICTIONS_GCS, 
                                              analysis_date)


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
df = pd.read_parquet(
    f"{PREDICTIONS_GCS}rt_sched_stop_times_{analysis_date}.parquet", 
)

In [3]:
def calculate_prediction_accuracy_parts(
    df: pd.DataFrame,
    metric_col: str = "reliable_accuracy",
) -> pd.DataFrame:
    """
    Time to prediction: current time until predicted stop arrival, 
    in minutes. 
    To be positive, we'll flip this, predicted arrival - extract_ts.
    
    Prediction error: actual stop arrival - predicted stop arrival,
    in seconds.
    
    Calculate metric:
    lower_bound = -60 ln(Time to Prediction+1.3) 
    upper_bound = 60ln(Time to Prediction+1.5)
    prediction_error = actual - predicted arrival 
    reliable_accuracy: 1 if prediction_error within [lower_bound, upper_bound],
    and 0 if it's not.
    
    Future TODO: if we implement prediction score, we will need to score
    the actual value (how close it is and also penalty), 
    not just dummy variable for yes/no.
    """
    
    df = df.assign(
        time_to_prediction_min = (
            abs(df.predicted_pacific - df._extract_ts_local)
            .dt.total_seconds().divide(60)),
        prediction_error = (
            (df.actual_stop_arrival_pacific - df.predicted_pacific)
            .dt.total_seconds())
    )

    time_to_predic = df.time_to_prediction_min.to_numpy()
    predic_error = df.prediction_error.to_numpy()
    
    lower_bound = -60 * np.log(time_to_predic + 1.3)
    upper_bound = 60 * np.log(time_to_predic + 1.5)
    
    #https://stackoverflow.com/questions/45314494/is-there-a-nice-way-to-check-if-numpy-array-elements-are-within-a-range
    acceptable_prediction = np.logical_and(
        lower_bound < predic_error, 
        predic_error < upper_bound)
    
    df = df.assign(
        metric_col = acceptable_prediction.astype(int)
    ).rename(columns = {"metric_col": metric_col})
    
    return df

In [4]:
def reliably_accurate_metric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Start with assembled RT stop_time_updates with 
    scheduled stop_times and also final_trip_updates columns.
    
    For a given stop, if there are predictions present 
    for the same minute as the actual arrival or after, exclude these.
    
    Apply the equation and check if the prediction error 
    falls within certain bounds.
    Get percent by dividing up predictions within the bounds over 
    all predictions for that stop.
    """
    timestamp_col = "_extract_ts_local"
    minute_cols = [f"{timestamp_col}_hour", f"{timestamp_col}_min"]
    metric_col = "reliable_accuracy"
    
    all_stop_cols = [
        "gtfs_dataset_key", "_gtfs_dataset_name", 
        "service_date", 
        "shape_id", "route_id",
        "trip_id", 
        "stop_id", "stop_sequence",
        "scheduled_arrival", "actual_stop_arrival_pacific", 
    ]

    df2 = utils.exclude_predictions_at_actual_stop_arrival(df)
    df2 = utils.exclude_predictions_after_actual_stop_arrival(
        df2, timestamp_col)
    
    df2 = calculate_prediction_accuracy_parts(
        df, 
        metric_col
    )    
    
    df3 = utils.parse_hour_min(df2, [timestamp_col])
    
    df4 = (df3.groupby(all_stop_cols)
       .agg({
           f"{timestamp_col}_hour": "size",
           metric_col: "sum"})
       .reset_index()
       .rename(columns = {
           f"{timestamp_col}_hour": "total_stop_predictions",
           metric_col: "num_accurate_predictions"})
      )

    df4[f"pct_{metric_col}"] = df4.num_accurate_predictions.divide(
            df4.total_stop_predictions)
    
    
    return df4

In [5]:
by_trip_stop = reliably_accurate_metric(df)

In [12]:
by_trip_stop.shape, by_trip_stop.trip_id.nunique(), by_trip_stop.stop_id.nunique()

((2967, 13), 372, 190)

In [15]:
by_trip_stop[by_trip_stop.trip_id == "9383943"].shape

(30, 13)

In [9]:
by_trip_stop.sample(5)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,shape_id,route_id,trip_id,stop_id,stop_sequence,scheduled_arrival,actual_stop_arrival_pacific,total_stop_predictions,num_accurate_predictions,pct_reliable_accuracy
1885,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_2485,1,t_5525637_b_79892_tn_10,75105,1.0,2023-03-15 15:51:00,2023-03-15 15:57:44,132,72,0.545455
2066,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_2485,1,t_5525669_b_79892_tn_1,75018,23.0,2023-03-15 15:32:00,2023-03-15 15:40:14,181,0,0.0
561,5c3e65766dda65958cf4da845286c0d5,Bay Area 511 Dumbarton Express TripUpdates,2023-03-15,DB0085,DB,9384010,58299,16.0,2023-03-15 18:28:00,2023-03-15 19:13:57,162,37,0.228395
2285,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_2494,2,t_5525665_b_79892_tn_3,75048,16.0,2023-03-15 17:31:00,2023-03-15 17:37:47,83,0,0.0
1672,5c3e65766dda65958cf4da845286c0d5,Bay Area 511 Dumbarton Express TripUpdates,2023-03-15,DB10050,DB1,9384005,53414,20.0,2023-03-15 13:18:00,2023-03-15 13:17:59,181,181,1.0


In [6]:
def quick_descriptives(df: pd.DataFrame, 
                       operator: str,
                       cols_to_describe: list):
    print(f"------------- {operator}-------------")
    subset_df = df[df._gtfs_dataset_name==operator] 
    
    for c in cols_to_describe:
        print(subset_df[c].describe())
        print("\n")

In [7]:
cols = [
    "num_accurate_predictions", 
    "total_stop_predictions",
    "pct_reliable_accuracy"]

for i in by_trip_stop._gtfs_dataset_name.unique():
    quick_descriptives(by_trip_stop, i, cols)


------------- Anaheim Resort TripUpdates-------------
count    265.000000
mean      18.792453
std       33.366740
min        0.000000
25%        0.000000
50%        0.000000
75%       17.000000
max      142.000000
Name: num_accurate_predictions, dtype: float64


count    265.000000
mean      72.822642
std       55.296743
min        2.000000
25%       23.000000
50%       64.000000
75%      113.000000
max      181.000000
Name: total_stop_predictions, dtype: float64


count    265.000000
mean       0.282106
std        0.396125
min        0.000000
25%        0.000000
50%        0.000000
75%        0.605263
max        1.000000
Name: pct_reliable_accuracy, dtype: float64


------------- Bay Area 511 Dumbarton Express TripUpdates-------------
count    1424.000000
mean       40.735955
std        58.752496
min         0.000000
25%         0.000000
50%         1.000000
75%        65.000000
max       233.000000
Name: num_accurate_predictions, dtype: float64


count    1424.000000
mean      145.76