# Metric 4: Reliable Prediction Accuracy

## Rabbit Hole 
* time to prediction should be positive. Using `predicted arrival - _extract_ts` should yield positive time differences, but there are cases where negative time differences exists
* negative time difference means the predicted arrival is earlier than the `_extract_ts`, which definitely can be possible in the case of a delayed bus
* for now, just use absolute value to make sure it's positive

Summary Levels
* Route by hour of day/day of week
* Stops by  hour of day/day of week
* Route by stops

In [1]:
import numpy as  np
import pandas as pd

import chart_utils
import utils
from segment_speed_utils.project_vars import PREDICTIONS_GCS

analysis_date = utils.analysis_date           


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
df = pd.read_parquet(
    f"{PREDICTIONS_GCS}rt_sched_stop_times_{analysis_date}.parquet", 
)

### Define Functions for Metrics

In [3]:
def calculate_prediction_accuracy_parts(
    df: pd.DataFrame,
    metric_col: str = "reliable_accuracy",
) -> pd.DataFrame:
    """
    Time to prediction: current time until predicted stop arrival, 
    in minutes. 
    To be positive, we'll flip this, predicted arrival - extract_ts.
    
    Prediction error: actual stop arrival - predicted stop arrival,
    in seconds.
    
    Calculate metric:
    lower_bound = -60 ln(Time to Prediction+1.3) 
    upper_bound = 60ln(Time to Prediction+1.5)
    prediction_error = actual - predicted arrival 
    reliable_accuracy: 1 if prediction_error within [lower_bound, upper_bound],
    and 0 if it's not.
    
    Future TODO: if we implement prediction score, we will need to score
    the actual value (how close it is and also penalty), 
    not just dummy variable for yes/no.
    """
    
    df = df.assign(
        time_to_prediction_min = (
            abs(df.predicted_pacific - df._extract_ts_local)
            .dt.total_seconds().divide(60)),
        prediction_error = (
            (df.actual_stop_arrival_pacific - df.predicted_pacific)
            .dt.total_seconds())
    )

    time_to_predic = df.time_to_prediction_min.to_numpy()
    predic_error = df.prediction_error.to_numpy()
    
    lower_bound = -60 * np.log(time_to_predic + 1.3)
    upper_bound = 60 * np.log(time_to_predic + 1.5)
    
    #https://stackoverflow.com/questions/45314494/is-there-a-nice-way-to-check-if-numpy-array-elements-are-within-a-range
    acceptable_prediction = np.logical_and(
        lower_bound < predic_error, 
        predic_error < upper_bound)
    
    df = df.assign(
        metric_col = acceptable_prediction.astype(int)
    ).rename(columns = {"metric_col": metric_col})
    
    return df

In [4]:
def reliably_accurate_metric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Start with assembled RT stop_time_updates with 
    scheduled stop_times and also final_trip_updates columns.
    
    For a given stop, if there are predictions present 
    for the same minute as the actual arrival or after, exclude these.
    
    Apply the equation and check if the prediction error 
    falls within certain bounds.
    Get percent by dividing up predictions within the bounds over 
    all predictions for that stop.
    """
    timestamp_col = "_extract_ts_local"
    minute_cols = utils.minute_cols(timestamp_col)
    metric_col = "reliable_accuracy"
    
    all_stop_cols = [
        "gtfs_dataset_key", "_gtfs_dataset_name", 
        "service_date", 
        "shape_id", "route_id",
        "trip_id", 
        "stop_id", "stop_sequence",
        "scheduled_arrival", "actual_stop_arrival_pacific", 
    ]

    df2 = utils.exclude_predictions_at_actual_stop_arrival(df)
    df2 = utils.exclude_predictions_after_actual_stop_arrival(
        df2, timestamp_col)
    
    df2 = calculate_prediction_accuracy_parts(
        df, 
        metric_col
    )    
    
    df3 = utils.parse_hour_min(df2, [timestamp_col])
    
    df4 = (df3.groupby(all_stop_cols)
       .agg({
           f"{timestamp_col}_hour": "size",
           metric_col: "sum"})
       .reset_index()
       .rename(columns = {
           f"{timestamp_col}_hour": "total_stop_predictions",
           metric_col: "num_accurate_predictions"})
      )

    df4[f"pct_{metric_col}"] = df4.num_accurate_predictions.divide(
            df4.total_stop_predictions)
    
    
    return df4

### Calculate Metric and Quick Descriptives

In [5]:
by_trip_stop = reliably_accurate_metric(df)

In [6]:
cols = [
    "num_accurate_predictions", 
    "total_stop_predictions",
    "pct_reliable_accuracy"]

In [7]:
for i in by_trip_stop._gtfs_dataset_name.unique():
    display(
        chart_utils.describe_to_df(
            by_trip_stop,
            i,
            cols,
        )
    )

Unnamed: 0,Measure,Num Accurate Predictions,Total Stop Predictions,Pct Reliable Accuracy
0,Count,2041.0,2041.0,2041.0
1,Mean,42.0,96.5,0.5
2,Std,53.0,72.9,0.4
3,Min,0.0,1.0,0.0
4,25%,1.0,40.0,0.0
5,50%,29.0,83.0,0.5
6,75%,58.0,120.0,0.8
7,Max,517.0,517.0,1.0


Unnamed: 0,Measure,Num Accurate Predictions,Total Stop Predictions,Pct Reliable Accuracy
0,Count,1490.0,1490.0,1490.0
1,Mean,76.9,245.9,0.4
2,Std,68.6,108.0,0.3
3,Min,1.0,6.0,0.0
4,25%,18.0,170.0,0.1
5,50%,54.0,249.0,0.3
6,75%,125.8,312.0,0.6
7,Max,294.0,692.0,1.0


Unnamed: 0,Measure,Num Accurate Predictions,Total Stop Predictions,Pct Reliable Accuracy
0,Count,514.0,514.0,514.0
1,Mean,128.5,192.6,0.7
2,Std,53.7,45.3,0.3
3,Min,1.0,1.0,0.0
4,25%,88.0,183.0,0.5
5,50%,138.0,201.0,0.8
6,75%,172.8,216.0,1.0
7,Max,236.0,288.0,1.0


Unnamed: 0,Measure,Num Accurate Predictions,Total Stop Predictions,Pct Reliable Accuracy
0,Count,1289.0,1289.0,1289.0
1,Mean,108.2,150.4,0.7
2,Std,55.5,31.2,0.3
3,Min,1.0,6.0,0.0
4,25%,62.0,136.0,0.4
5,50%,129.0,142.0,0.9
6,75%,137.0,151.0,1.0
7,Max,321.0,334.0,1.0


Unnamed: 0,Measure,Num Accurate Predictions,Total Stop Predictions,Pct Reliable Accuracy
0,Count,1203.0,1203.0,1203.0
1,Mean,44.4,81.4,0.6
2,Std,30.6,28.7,0.3
3,Min,0.0,1.0,0.0
4,25%,18.0,69.5,0.3
5,50%,41.0,89.0,0.6
6,75%,72.0,99.0,0.9
7,Max,98.0,159.0,1.0


In [8]:
charts_df = chart_utils.prep_df_for_chart(
    df = by_trip_stop,
    percentage_column = "pct_reliable_accuracy",
    columns_to_round = ["pct_reliable_accuracy"],
    columns_to_keep = [
        "_gtfs_dataset_name",
        "trip_id",
        "stop_id",
        "stop_sequence",
        "pct_reliable_accuracy",
    ],
)

In [9]:
for i in charts_df['Gtfs Dataset Name'].unique():
    display(chart_utils.scatter_plot_domain(
    charts_df,
    operator = i,
    x_col="Stop Sequence",
    y_col="Pct Reliable Accuracy",
    color_col="Rounded Pct Reliable Accuracy",
    dropdown_col="Trip Id",
    dropdown_col_title="Trip ID",))