# Explore set up prior to calculating update completeness

## Rabbit Hole
* `_extract_ts_local` doesn't always lead up to the stop's actual arrival, or even the max(stop's predicted arrival). If we stop asking, should we penalize? 
* Right now, we'll only count the trip updates for as much as we're asking.
* If `_extract_ts` is not present, it means we're not asking at all, which is a different issue.

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
#import dask_geopandas as dg
#import geopandas as gpd
import pandas as pd

from segment_speed_utils.project_vars import PREDICTIONS_GCS, analysis_date

In [None]:
def import_scheduled_stop_times_geom(analysis_date: str, 
                                     filtering):
    """
    Read the `scheduled_stop_times_with_geom_{analysis_date}/` parquet
    dataset under `PREDICTIONS_GCS` with geopandas.

    Parameters
    ----------
    analysis_date : str
        Date string interpolated into the dataset path.
    filtering
        Passed straight through as the `filters` argument of
        `gpd.read_parquet` (e.g. `[[("trip_id", "==", some_trip)]]`).

    Returns
    -------
    The object returned by `gpd.read_parquet`.
    """
    # The notebook's module-level `import geopandas as gpd` is commented
    # out, so `gpd` would be undefined here. Import it locally; this runs
    # at call time, i.e. after the import cell has set USE_PYGEOS.
    import geopandas as gpd

    gdf = gpd.read_parquet(
        f"{PREDICTIONS_GCS}scheduled_stop_times_with_geom_{analysis_date}/", 
        filters = filtering
    )
    
    return gdf


In [2]:
# Read the local parquet file into `df`. A per-operator `filters=` argument
# was sketched for this call but is left disabled, so no row filter is applied.
df = pd.read_parquet("rt_sched_stop_times.parquet")

# Show which feed names are present in the table.
df["_gtfs_dataset_name"].unique()

array(['Anaheim Resort TripUpdates',
       'Bay Area 511 Dumbarton Express TripUpdates',
       'Bay Area 511 Fairfield and Suisun Transit TripUpdates'],
      dtype=object)

In [3]:
# Feed to inspect in detail.
operator = "Bay Area 511 Dumbarton Express TripUpdates"

# Example trip: the first trip_id found among this feed's rows.
one_trip = df.loc[
    df["_gtfs_dataset_name"] == operator, "trip_id"
].iloc[0]

In [None]:
# Keep the stop-level columns and collapse repeated rows.
# NOTE(review): `assemble_stop_times` is not imported in any cell visible
# here -- confirm where it comes from before running on a fresh kernel.
stop_df = df.loc[
    :,
    assemble_stop_times.stop_cols + [
        "_gtfs_dataset_name",
        "shape_id",
        "route_id",
        "stop_sequence",
        "actual_stop_arrival_pacific",
        "prior_stop_arrival_pacific",
        "scheduled_arrival",
    ],
].drop_duplicates()

In [None]:
# Scheduled stop_times with geometry, restricted to the example trip.
shape = import_scheduled_stop_times_geom(
    analysis_date,
    filtering = [[("trip_id", "==", one_trip)]],
)

# Rows of `stop_df` for the example trip of the chosen feed, ordered by
# stop_sequence and trimmed to the merge keys plus the arrival columns.
one_trip_df = (
    stop_df.loc[
        (stop_df["_gtfs_dataset_name"] == operator)
        & (stop_df["trip_id"] == one_trip)
    ]
    .sort_values(["trip_id", "stop_sequence"])
    .loc[:, [
        "shape_id", "stop_id", "stop_sequence",
        "scheduled_arrival",
        "actual_stop_arrival_pacific",
        "prior_stop_arrival_pacific",
    ]]
)

# Inner merge: only stops present in both tables are kept.
one_trip_with_shape = pd.merge(
    left = shape,
    right = one_trip_df,
    on = ["shape_id", "stop_id", "stop_sequence"],
    how = "inner",
)

In [None]:
# Cast the three arrival columns to strings, then open the interactive map
# with `stop_sequence` passed as the column to plot.
# NOTE(review): the string cast is presumably there so the timestamp values
# can be rendered on the map -- confirm.
one_trip_with_shape.astype({
    arrival_col: "str"
    for arrival_col in (
        "scheduled_arrival",
        "actual_stop_arrival_pacific",
        "prior_stop_arrival_pacific",
    )
}).explore(
    "stop_sequence",
    tiles = "CartoDB Positron",
)

In [4]:
import utils
import t1_update_completeness

In [5]:
# Compute the update-completeness metric from the full `df`, using the
# function provided by the imported `t1_update_completeness` module.
by_trip_stop = t1_update_completeness.update_completeness_metric(df)

In [6]:
# Display the metric table; its output includes `trip_min_elapsed`,
# `atleast2_trip_updates` and `pct_update_complete` per trip and stop.
by_trip_stop

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,shape_id,route_id,trip_id,stop_id,stop_sequence,scheduled_arrival,actual_stop_arrival_pacific,trip_min_elapsed,atleast2_trip_updates,pct_update_complete
0,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,1690ec23-5b3b-4ebe-92c2-89d093ac558b,f17cd7a6-d88c-4e61-a6a6-fe87a775d3c8,c99a46d2-5e55-47ee-b516-9aadb732fd19:1,2020,6.0,2023-03-15 06:40:29,2023-03-15 06:40:02,26,25,0.961538
1,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,1690ec23-5b3b-4ebe-92c2-89d093ac558b,f17cd7a6-d88c-4e61-a6a6-fe87a775d3c8,c99a46d2-5e55-47ee-b516-9aadb732fd19:1,2038,5.0,2023-03-15 06:39:14,2023-03-15 06:38:35,26,25,0.961538
2,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,1690ec23-5b3b-4ebe-92c2-89d093ac558b,f17cd7a6-d88c-4e61-a6a6-fe87a775d3c8,c99a46d2-5e55-47ee-b516-9aadb732fd19:1,3009,10.0,2023-03-15 06:44:21,2023-03-15 06:44:10,26,25,0.961538
3,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,1690ec23-5b3b-4ebe-92c2-89d093ac558b,f17cd7a6-d88c-4e61-a6a6-fe87a775d3c8,c99a46d2-5e55-47ee-b516-9aadb732fd19:1,3013,4.0,2023-03-15 06:38:34,2023-03-15 06:37:22,26,25,0.961538
4,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,1690ec23-5b3b-4ebe-92c2-89d093ac558b,f17cd7a6-d88c-4e61-a6a6-fe87a775d3c8,c99a46d2-5e55-47ee-b516-9aadb732fd19:1,3014,7.0,2023-03-15 06:41:37,2023-03-15 06:41:03,26,25,0.961538
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_915149,8,t_5525671_b_79892_tn_0,75290,15.0,2023-03-15 18:51:00,2023-03-15 18:47:51,25,24,0.960000
2963,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_915149,8,t_5525671_b_79892_tn_0,75294,19.0,2023-03-15 18:54:00,2023-03-15 18:53:50,22,21,0.954545
2964,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_915149,8,t_5525671_b_79892_tn_0,75296,21.0,2023-03-15 18:54:00,2023-03-15 19:00:56,20,19,0.950000
2965,9255cb4744d73d4a39f512180a7cf63a,Bay Area 511 Fairfield and Suisun Transit Trip...,2023-03-15,p_915149,8,t_5525671_b_79892_tn_0,75313,4.0,2023-03-15 18:36:00,2023-03-15 18:35:44,39,38,0.974359


In [None]:
# NOTE(review): `df2` is not assigned in any cell visible here -- presumably
# a subset of `df`; define it before running on a fresh kernel.
# Tally the `scheduled_arrival` values seen at stop_sequence 3.
df2.loc[df2["stop_sequence"] == 3, "scheduled_arrival"].value_counts()

In [None]:
# Tally the `predicted_pacific` values seen at stop_sequence 3.
df2.loc[df2["stop_sequence"] == 3, "predicted_pacific"].value_counts()

In [None]:
# Tally the `actual_stop_arrival_pacific` values seen at stop_sequence 3.
df2.loc[df2["stop_sequence"] == 3, "actual_stop_arrival_pacific"].value_counts()

In [None]:
# Tally the `_extract_ts_local` values seen at stop_sequence 3.
df2.loc[df2["stop_sequence"] == 3, "_extract_ts_local"].value_counts()

In [None]:
# Tally the `trip_update_timestamp_local` values seen at stop_sequence 3.
df2.loc[df2["stop_sequence"] == 3, "trip_update_timestamp_local"].value_counts()

"Actual" stop arrivals appears really close together.

If we care about `prediction_duration`, no rows will be kept because predictions within the `prediction_duration` do not exist. There are predictions for stop 4 when we're at stops 1, 2, and 3, but nothing between stop 3 and 4.

Based on the query set up for final updates, we want to get the `max(arrival_time_pacific)` for each stop. Equivalently, this would be `max(predicted_pacific)`.

In [None]:
# Columns used to compare prediction, extraction and arrival timestamps.
cols = [
    "stop_sequence",
    "stop_id",
    "predicted_pacific",
    "_extract_ts_local",
    "trip_update_timestamp_local",
    "prior_stop_arrival_pacific",
    "actual_stop_arrival_pacific",
]

# Pick 1 trip: its rows at stop_sequence 3, ordered by `_extract_ts_local`.
df.loc[
    (df["trip_id"] == one_trip) & (df["stop_sequence"] == 3),
    cols,
].sort_values("_extract_ts_local")

In [None]:
# Same trip and stop, keeping only rows whose `_extract_ts_local` is after
# `prior_stop_arrival_pacific` and at or before `actual_stop_arrival_pacific`.
df.loc[
    (df["trip_id"] == one_trip)
    & (df["stop_sequence"] == 3)
    & (df["_extract_ts_local"] > df["prior_stop_arrival_pacific"])
    & (df["_extract_ts_local"] <= df["actual_stop_arrival_pacific"]),
    cols,
]

In [None]:
# Based on the query, ignoring nulls and populating by
# arrival_time, we do want to use the max(predicted arrival time) 
# for that stop
max_predicted_prior = df[(df.trip_id==one_trip) & 
   (df.stop_sequence == 2)][cols].predicted_pacific.max()
max_predicted_prior

In [None]:
# Latest `trip_update_timestamp_local` for the example trip at stop_sequence 3.
# An extra condition (`_extract_ts_local > max_predicted_prior`) was tried
# here and is currently disabled.
df.loc[
    (df["trip_id"] == one_trip) & (df["stop_sequence"] == 3),
    cols,
]["trip_update_timestamp_local"].max()

In [None]:
# Latest `actual_stop_arrival_pacific` for the example trip at stop_sequence 3.
df.loc[
    (df["trip_id"] == one_trip) & (df["stop_sequence"] == 3),
    cols,
]["actual_stop_arrival_pacific"].max()

In [None]:
# For stop_sequence 4 of the example trip: order the rows by extraction time
# (then stop_sequence) and tally the `prior_stop_arrival_pacific` values.
df.loc[
    (df["trip_id"] == one_trip) & (df["stop_sequence"] == 4),
    cols,
].sort_values(
    ["_extract_ts_local", "stop_sequence"]
)["prior_stop_arrival_pacific"].value_counts()