# Metric 1: Update Completeness


### Rabbit Hole
* `_extract_ts_local` doesn't always lead up to the stop's actual arrival, or even the max(stop's predicted arrival). If we stop asking, should we penalize? 
* Right now, we'll only count the trip updates for as much as we're asking.
* If `_extract_ts` is not present, we're not asking for predictions at all, and that's a different issue.
* Notice that if we subset to prediction durations, we might lose a lot of rows.

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'

#import dask.dataframe as dd
#import dask_geopandas as dg
#import geopandas as gpd
import pandas as pd

import utils
from segment_speed_utils.project_vars import (PREDICTIONS_GCS, 
                                              analysis_date)

In [None]:
# Grouping keys used below to aggregate down to one row per trip-stop.
all_stop_cols = [
    "gtfs_dataset_key", "_gtfs_dataset_name", 
    "service_date", 
    "shape_id", "route_id",
    "trip_id", 
    "stop_id", "stop_sequence",
    "scheduled_arrival", "actual_stop_arrival_pacific", 
]

def atleast2_updates_by_trip_stop(
    df: pd.DataFrame,
    timestamp_col: str = "_extract_ts_local",
    metric_timestamp_col: str = "trip_update_timestamp_local"
) -> pd.DataFrame: 
    """
    For every trip-stop-minute combination,
    count the number of unique trip_update_timestamps.
    (Checked that this is 3 max).
    If that minute has at least 2, flag that as passing.
    
    Sum up the number of passing minutes for that stop and 
    calculate the percent. The denominator is the number of 
    minutes observed for that stop (`trip_min_elapsed`).
    
    Note: size here is used to count number of rows as denominator.
    But, if we are not asking for predictions (`_extract_ts`), 
    we are also not going to penalize operator for not having predictions
    leading up to the stop.
    
    Args:
        df: must contain every column in `all_stop_cols`, plus
            `{timestamp_col}_hour`, `{timestamp_col}_min`
            and `metric_timestamp_col`.
        timestamp_col: prefix of the hour / minute columns that 
            define a "minute" row.
        metric_timestamp_col: column whose unique values are counted
            within each minute.
    
    Returns:
        One row per trip-stop (`all_stop_cols`) with
        `trip_min_elapsed`, `atleast2_trip_updates` and 
        `pct_update_complete`.
        
    NOTE: pandas groupby drops groups whose key contains a null by 
    default, so rows with a missing value in any of `all_stop_cols` 
    (or the hour / minute columns) do not appear in the result.
    """
    minute_cols = [f"{timestamp_col}_hour", f"{timestamp_col}_min"]
    
    # Count for every stop-min, how many unique trip updates
    df2 = (df.groupby(all_stop_cols + minute_cols)
           .agg({metric_timestamp_col: "nunique"})
           .reset_index()
    )
    
    # 1 if it has at least 2 updates, 0 otherwise.
    # Easier to sum and calculate percent.
    # Vectorized comparison instead of a row-wise apply: faster, and it
    # always yields a Series, even when df2 has no rows.
    df2 = df2.assign(
        atleast2_trip_updates = (
            df2[metric_timestamp_col].ge(2).astype(int)
        )
    )    
    
    # Size: gets us number of rows (minutes) for that stop
    df3 = (df2.groupby(all_stop_cols)
           .agg({
               f"{timestamp_col}_hour": "size",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            f"{timestamp_col}_hour": "trip_min_elapsed"
    })
    
    df3 = df3.assign(
        pct_update_complete = df3.atleast2_trip_updates.divide(
            df3.trip_min_elapsed)
    ) 
    
    return df3

In [None]:
def update_completeness_metric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the update completeness metric by trip-stop.
    
    Input is the assembled RT stop_time_updates table, which also 
    carries scheduled stop_times and the final_trip_updates columns.
    
    Steps:
    1. Rows whose `_extract_ts` falls after the stop's 
       "actual stop arrival" (from final_trip_updates) are excluded.
    2. The `_extract_ts` column is parsed into hour / minute columns,
       which define each trip-stop-minute row.
    3. Unique trip update timestamps are counted per minute and
       rolled up to the trip-stop.
    """
    # Column names are set once here, in case they need correcting.
    # Rows are keyed off _extract_ts (as hour-minute combinations); 
    # the metric counts unique trip update timestamps.
    extract_ts = "_extract_ts_local"
    trip_update_ts = "trip_update_timestamp_local"
  
    before_arrival = utils.exclude_predictions_after_actual_stop_arrival(
        df, extract_ts)
    with_minutes = utils.parse_hour_min(before_arrival, [extract_ts])
    
    return atleast2_updates_by_trip_stop(
        with_minutes, 
        extract_ts,
        trip_update_ts
    )

In [None]:
# Load the stop_times table from the local parquet file, 
# then list the distinct _gtfs_dataset_name values it contains.
df = pd.read_parquet("rt_sched_stop_times.parquet")
df["_gtfs_dataset_name"].unique()

In [None]:
# Compute the update completeness metric; result has one row per trip-stop.
by_trip_stop = update_completeness_metric(df)

In [None]:
# Display the trip-stop level results.
by_trip_stop

In [None]:
# `df2` is not defined anywhere at notebook scope (it only exists as a 
# local inside the functions above), so this cell raised a NameError on a 
# fresh kernel. Rebuild it the same way update_completeness_metric does.
# NOTE(review): presumably this is the frame these cells were meant to 
# inspect -- confirm.
df2 = utils.exclude_predictions_after_actual_stop_arrival(
    df, "_extract_ts_local")

# Distribution of scheduled arrivals at stop_sequence 3
df2[df2.stop_sequence==3].scheduled_arrival.value_counts()

In [None]:
# Distribution of predicted arrivals at stop_sequence 3
df2.loc[df2["stop_sequence"] == 3, "predicted_pacific"].value_counts()

In [None]:
# Distribution of actual stop arrivals at stop_sequence 3
df2.loc[df2["stop_sequence"] == 3, "actual_stop_arrival_pacific"].value_counts()

In [None]:
# Distribution of extract timestamps at stop_sequence 3
df2.loc[df2["stop_sequence"] == 3, "_extract_ts_local"].value_counts()

In [None]:
# Distribution of trip update timestamps at stop_sequence 3
df2.loc[df2["stop_sequence"] == 3, "trip_update_timestamp_local"].value_counts()

"Actual" stop arrivals appears really close together.

If we care about `prediction_duration`, no rows will be kept because predictions within the `prediction_duration` do not exist. There are predictions for stop 4 when we're at stops 1, 2, and 3, but nothing between stop 3 and 4.

Based on the query set up for final updates, we want to get the `max(arrival_time_pacific)` for each stop. Equivalently, this would be `max(predicted_pacific)`.

In [None]:
# Columns to inspect when looking at a single trip-stop
cols = [
    "stop_sequence", "stop_id", 
    "predicted_pacific",
    "_extract_ts_local",
    "trip_update_timestamp_local",
    "prior_stop_arrival_pacific",
    "actual_stop_arrival_pacific",
]

# Pick 1 trip
# `one_trip` was referenced but never assigned in this notebook, so
# default to the first non-null trip_id. 
# NOTE(review): swap in the specific trip_id the notes below refer to.
one_trip = df.trip_id.dropna().iloc[0]

# All rows for stop_sequence 3 of that trip, ordered by extract time
df[(df.trip_id==one_trip) & 
   (df.stop_sequence == 3)
  ][cols].sort_values(
    ["_extract_ts_local"])

In [None]:
# Same trip and stop, keeping only rows extracted after the prior stop's
# arrival and no later than this stop's actual arrival.
is_trip_stop3 = (df.trip_id == one_trip) & (df.stop_sequence == 3)
after_prior_arrival = df._extract_ts_local > df.prior_stop_arrival_pacific
by_actual_arrival = df._extract_ts_local <= df.actual_stop_arrival_pacific

df.loc[is_trip_stop3 & after_prior_arrival & by_actual_arrival, cols]

In [None]:
# The final-updates query ignores nulls and populates by arrival_time,
# so the value we want for a stop is max(predicted arrival time).
# Here: the latest predicted arrival for the prior stop (stop_sequence 2).
stop2_rows = df.loc[
    (df.trip_id == one_trip) & (df.stop_sequence == 2), cols]
max_predicted_prior = stop2_rows["predicted_pacific"].max()
max_predicted_prior

In [None]:
# Latest trip update timestamp seen for stop_sequence 3 of this trip.
# Extra filter left disabled on purpose:
#   (df._extract_ts_local > max_predicted_prior)
stop3_rows = df.loc[
    (df.trip_id == one_trip) & (df.stop_sequence == 3), cols]
stop3_rows["trip_update_timestamp_local"].max()

In [None]:
# Latest actual stop arrival recorded for stop_sequence 3 of this trip.
stop3_rows = df.loc[
    (df.trip_id == one_trip) & (df.stop_sequence == 3), cols]
stop3_rows["actual_stop_arrival_pacific"].max()

In [None]:
# For stop_sequence 4 of this trip: which prior-stop arrival values 
# appear, and how often. Rows are ordered by extract time before counting.
stop4_rows = df.loc[
    (df.trip_id == one_trip) & (df.stop_sequence == 4), cols]
stop4_sorted = stop4_rows.sort_values(
    ["_extract_ts_local", "stop_sequence"])
stop4_sorted["prior_stop_arrival_pacific"].value_counts()