# Metric 1: Vehicle Positions Update Completeness

### Rabbit Hole
* Current implementation:
   * start with stop time updates, and for each trip, find the actual arrival time at the first and last stop
   * use raw vehicle positions (vp), and only keep timestamps between the first/last stop arrivals -- this addresses the problem that vp may drift away from the shape during a layover while traveling back to the origin
   * check whether we observe at least 2 vp timestamps per minute for the trip duration
* The metric is no longer stop-level, but trip-level. To attach individual stop-level information, we would need to expand the vehicle positions rows to have a schema similar to stop time updates.
   * The Google Doc outlines a trip-level summary, so inflating the rows is undesirable for now.

In [1]:
import dask.dataframe as dd
import pandas as pd

from calitp_data_analysis.tables import tbls
from siuba import *

import chart_utils
import utils
from segment_speed_utils.project_vars import SEGMENT_GCS, PREDICTIONS_GCS
from shared_utils import schedule_rt_utils    
                    
analysis_date = "2023-03-15"


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas  # type: ignore


In [2]:
def first_and_last_actual_stop_arrival_by_trip(
    df: pd.DataFrame,
    trip_cols: list,
    timestamp_col: str = "actual_stop_arrival_pacific"
) -> pd.DataFrame:
    """
    For each trip, find the min/max of `timestamp_col`.
    These correspond to the first stop's actual arrival and the
    last stop's actual arrival.

    Args:
        df: stop-level rows holding `trip_cols` and `timestamp_col`.
        trip_cols: columns that together identify a trip.
        timestamp_col: column holding the actual stop arrival time.

    Returns:
        One row per trip with `trip_cols` plus
        `first_stop_arrival` (min) and `last_stop_arrival` (max).
    """
    # Compute both bounds in a single groupby pass using named
    # aggregation, rather than aggregating twice and merging the
    # two results back together on the same keys.
    trip_window = (
        df.groupby(trip_cols, observed=True, group_keys=False)
        .agg(
            first_stop_arrival=(timestamp_col, "min"),
            last_stop_arrival=(timestamp_col, "max"),
        )
        .reset_index()
    )

    return trip_window

In [3]:
def vp_tu_crosswalk(analysis_date: str) -> pd.DataFrame:
    """
    Get crosswalk to match vehicle positions and trip updates
    gtfs_dataset_keys.

    Args:
        analysis_date: date string; only rows whose validity window
            contains this date are kept.

    Returns:
        Unique pairs of `vehicle_positions_gtfs_dataset_key` and
        `trip_updates_gtfs_dataset_key`.
    """
    dim_provider_gtfs = (
        tbls.mart_transit_database.dim_provider_gtfs_data()
        >> select(
            _.vehicle_positions_gtfs_dataset_key,
            _.trip_updates_gtfs_dataset_key,
            _._valid_from, _._valid_to)
        >> distinct()
        >> collect()
    )

    # The filter below reads `_valid_from_local` / `_valid_to_local`;
    # presumably this helper adds those columns -- confirm against
    # shared_utils.schedule_rt_utils.
    dim_provider_gtfs = schedule_rt_utils.localize_timestamp_col(
        dim_provider_gtfs, ["_valid_from", "_valid_to"])

    # Parse the analysis date once and reuse it for both bounds
    analysis_ts = pd.to_datetime(analysis_date)

    dim_provider_gtfs2 = (
        dim_provider_gtfs[
            dim_provider_gtfs.vehicle_positions_gtfs_dataset_key.notna()]
        >> filter(
            _._valid_from_local <= analysis_ts,
            _._valid_to_local >= analysis_ts,
        )
        >> select(_.vehicle_positions_gtfs_dataset_key,
                  _.trip_updates_gtfs_dataset_key)
    # distinct() above ran while the validity columns were still
    # selected, so the same key pair can remain more than once after
    # they are dropped. A crosswalk with repeated pairs would fan out
    # any merge against it, so de-duplicate here.
    ).drop_duplicates()

    return dim_provider_gtfs2

In [4]:
def import_vp_with_crosswalk_to_tu(
    analysis_date: str,
) -> dd.DataFrame:
    """
    Read vehicle positions for `analysis_date` and attach the
    matching trip_updates_gtfs_dataset_key to every row.

    The vp `gtfs_dataset_key` column is renamed to
    `vehicle_positions_gtfs_dataset_key` in the result.
    """
    vp_path = f"{SEGMENT_GCS}vp_{analysis_date}.parquet"

    # Distinct vp dataset keys present in the file
    vp_keys = pd.read_parquet(
        vp_path, columns = ["gtfs_dataset_key"]
    ).drop_duplicates()

    # Rename the crosswalk's vp key so it lines up with the vp file
    key_lookup = vp_tu_crosswalk(analysis_date).rename(
        columns = {
            "vehicle_positions_gtfs_dataset_key": "gtfs_dataset_key"}
    )

    # Keep only vp datasets that have a crosswalk entry
    vp_to_tu = pd.merge(vp_keys, key_lookup, how = "inner")

    vp_positions = dd.read_parquet(
        vp_path,
        columns = ["gtfs_dataset_key", "trip_id",
                   "location_timestamp_local"],
    )

    vp_with_tu = vp_positions.merge(
        vp_to_tu,
        on = "gtfs_dataset_key",
        how = "inner"
    )

    return vp_with_tu.rename(columns = {
        "gtfs_dataset_key": "vehicle_positions_gtfs_dataset_key"})

In [5]:
def merge_vp_with_trip_window(
    vp: dd.DataFrame,
    trip_window: pd.DataFrame,
    vp_timestamp: str = "location_timestamp_local",
    arrival_cols: tuple = ("first_stop_arrival", "last_stop_arrival")
) -> dd.DataFrame:
    """
    Merge vehicle positions with trip_updates first/last
    actual stop arrivals, and only keep vp that occur
    between those 2 timestamps (bounds inclusive).

    Args:
        vp: vehicle positions with `trip_updates_gtfs_dataset_key`
            and `trip_id` columns.
        trip_window: one row per trip with the same two key columns
            plus the two arrival columns.
        vp_timestamp: vp timestamp column to compare.
        arrival_cols: (first arrival column, last arrival column).

    Returns:
        The filtered merge result, built with `dd.merge`, so it is a
        dask DataFrame rather than a pandas one.
    """
    first_stop, last_stop = arrival_cols[0], arrival_cols[1]

    df = dd.merge(
        vp,
        trip_window,
        on = ["trip_updates_gtfs_dataset_key", "trip_id"],
        how = "inner"
    )

    # vp timestamps that fall inside the trip's observed arrival window
    in_trip_window = (
        (df[vp_timestamp] >= df[first_stop]) &
        (df[vp_timestamp] <= df[last_stop])
    )

    return df[in_trip_window].reset_index(drop=True)

### Define Functions for Metrics

In [6]:
def atleast2_updates_by_trip(
    df: dd.DataFrame,
    trip_cols: list,
    timestamp_col: str = "location_timestamp_local",
) -> dd.DataFrame:
    """
    For every trip-minute combination, count the non-null
    `timestamp_col` values. If that minute has at least 2,
    flag it as passing.

    Args:
        df: vehicle positions rows with `trip_cols` and
            `timestamp_col`.
        trip_cols: columns that together identify a trip.
        timestamp_col: vp timestamp column to count.

    Returns:
        One row per trip with `trip_cols` plus
        `atleast2_trip_updates` (number of passing minutes),
        `trip_min_elapsed` (number of minutes with any vp) and
        `pct_update_complete` (their ratio).
    """
    # Project helpers: presumably these add hour/minute columns for
    # `timestamp_col` and return their names -- confirm in utils.
    df = utils.parse_hour_min(df, [timestamp_col])
    minute_cols = utils.minute_cols(timestamp_col)

    by_minute = (
        df.groupby(trip_cols + minute_cols,
                   observed=True, group_keys = False)
        .agg({timestamp_col: "count"})
        .reset_index()
    )

    # 1 if the minute has at least 2 updates, 0 otherwise.
    # Easier to sum and calculate percent.
    # Vectorized comparison instead of a row-wise apply; int64 so
    # summing the flags over a long trip cannot overflow a narrow int.
    by_minute = by_minute.assign(
        atleast2_trip_updates = (
            by_minute[timestamp_col] >= 2).astype("int64")
    )

    # By trip, sum up the minutes with at least 2 updates, and count
    # the minute rows (minutes in which any vp was observed).
    by_trip = (
        by_minute.groupby(trip_cols, observed=True, group_keys=False)
        .agg({
            "atleast2_trip_updates": "sum",
            timestamp_col: "size",
        }).reset_index()
        .rename(columns = {timestamp_col: "trip_min_elapsed"})
    )

    by_trip = by_trip.assign(
        pct_update_complete = by_trip.atleast2_trip_updates.divide(
            by_trip.trip_min_elapsed)
    )

    return by_trip

In [7]:
def update_completeness_metric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Start with RT vehicle_positions that carry the trip identifiers.

    Delegates to atleast2_updates_by_trip, fixing the timestamp
    column and the trip grouping columns used for this metric.
    """
    # Trip grouping: vp dataset key, dataset name, and trip_id.
    # service_date is currently left out of the grouping.
    trip_grouping_cols = [
        "vehicle_positions_gtfs_dataset_key",
        "_gtfs_dataset_name",
        "trip_id",
    ]

    return atleast2_updates_by_trip(
        df,
        trip_grouping_cols,
        "location_timestamp_local",
    )

### Calculate Metric and Quick Descriptives

In [8]:
# Stop time updates for the analysis date
STOP_TIMES_FILE = (
    f"{PREDICTIONS_GCS}rt_sched_stop_times_{analysis_date}.parquet"
)
stop_time_updates = pd.read_parquet(STOP_TIMES_FILE)

# Per trip, the actual arrival at the first and last stop.
# The dataset key is renamed so it is recognizable as the
# trip updates key once merged with vehicle positions.
tu_trip_cols = [
    "gtfs_dataset_key", "_gtfs_dataset_name",
    "service_date", "trip_id",
]
trip_window = first_and_last_actual_stop_arrival_by_trip(
    stop_time_updates,
    trip_cols = tu_trip_cols,
    timestamp_col = "actual_stop_arrival_pacific",
)
trip_window = trip_window.rename(
    columns = {"gtfs_dataset_key": "trip_updates_gtfs_dataset_key"})


# Vehicle positions, with trip_updates_gtfs_dataset_key attached
# through the crosswalk
vp = import_vp_with_crosswalk_to_tu(analysis_date)

In [9]:
# Merge vp with the trip window, and drop vp timestamps
# that are outside of those timestamp boundaries
# Keep only the vp timestamps that fall between each trip's
# first and last actual stop arrival
window_cols = ("first_stop_arrival", "last_stop_arrival")
df = merge_vp_with_trip_window(
    vp,
    trip_window,
    vp_timestamp = "location_timestamp_local",
    arrival_cols = window_cols,
)

In [10]:
# Preview the merged vp / trip window frame. The rendered output shows
# only the column names and dtypes, not the rows.
df

Unnamed: 0_level_0,vehicle_positions_gtfs_dataset_key,trip_id,location_timestamp_local,trip_updates_gtfs_dataset_key,_gtfs_dataset_name,service_date,first_stop_arrival,last_stop_arrival
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,object,object,datetime64[ns],object,object,object,datetime64[ns],datetime64[ns]
,...,...,...,...,...,...,...,...


In [11]:
# Trip-level update completeness; .compute() materializes the result
# so it can be summarized below.
by_trip = update_completeness_metric(df).compute()

In [12]:
# Metric columns to summarize for each dataset
cols = [
    "atleast2_trip_updates",
    "trip_min_elapsed",
    "pct_update_complete",
]

In [13]:
# One descriptive-statistics table per dataset name
for dataset_name in by_trip._gtfs_dataset_name.unique():
    summary = chart_utils.describe_to_df(by_trip, dataset_name, cols)
    display(summary)

Unnamed: 0,Measure,Atleast2 Trip Updates,Trip Min Elapsed,Pct Update Complete
0,Count,148.0,148.0,148.0
1,Mean,9.9,15.4,0.7
2,Std,4.5,7.1,0.2
3,Min,0.0,2.0,0.0
4,25%,7.0,10.0,0.5
5,50%,8.0,13.5,0.7
6,75%,11.0,19.2,0.8
7,Max,23.0,30.0,1.0


Unnamed: 0,Measure,Atleast2 Trip Updates,Trip Min Elapsed,Pct Update Complete
0,Count,56.0,56.0,56.0
1,Mean,48.2,49.4,1.0
2,Std,16.2,16.0,0.0
3,Min,8.0,8.0,0.9
4,25%,37.0,38.0,1.0
5,50%,47.0,49.0,1.0
6,75%,54.5,55.2,1.0
7,Max,85.0,86.0,1.0


Unnamed: 0,Measure,Atleast2 Trip Updates,Trip Min Elapsed,Pct Update Complete
0,Count,258.0,258.0,258.0
1,Mean,24.7,25.4,1.0
2,Std,5.4,5.4,0.0
3,Min,14.0,14.0,0.9
4,25%,21.0,21.0,1.0
5,50%,25.0,26.0,1.0
6,75%,28.0,29.0,1.0
7,Max,56.0,56.0,1.0


Unnamed: 0,Measure,Atleast2 Trip Updates,Trip Min Elapsed,Pct Update Complete
0,Count,571.0,571.0,571.0
1,Mean,15.5,16.2,0.9
2,Std,14.2,14.2,0.1
3,Min,0.0,1.0,0.0
4,25%,9.0,9.0,0.9
5,50%,11.0,12.0,1.0
6,75%,18.0,19.0,1.0
7,Max,151.0,151.0,1.0


Unnamed: 0,Measure,Atleast2 Trip Updates,Trip Min Elapsed,Pct Update Complete
0,Count,41.0,41.0,41.0
1,Mean,34.3,38.5,0.9
2,Std,16.6,17.9,0.1
3,Min,2.0,3.0,0.7
4,25%,21.0,25.0,0.8
5,50%,36.0,40.0,0.9
6,75%,41.0,50.0,0.9
7,Max,70.0,76.0,1.0
