# Metric 1: Vehicle Positions Update Completeness

### Rabbit Hole

In [1]:
# USE_PYGEOS is read by geopandas when it is first imported, so it has to be
# set before any import that may pull geopandas in transitively (the project
# modules below may do so). Setting it at the end of the cell is too late.
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
import geopandas
import pandas as pd

from dask import delayed, compute

import chart_utils
import utils
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS, PREDICTIONS_GCS

# Service date analyzed in this notebook and the pipeline config to read.
analysis_date = "2023-03-15"
CONFIG_PATH = "../rt_segment_speeds/scripts/config.yml"

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
import sys

# Make the pipeline scripts importable. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
SCRIPTS_DIR = '../rt_segment_speeds/scripts/'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

from A2_valid_vehicle_positions import merge_usable_vp_with_sjoin_vpidx

# Parameters looked up under the "stop_segments" key of the config file.
dict_inputs = helpers.get_parameters(CONFIG_PATH, "stop_segments")

Use vehicle positions (vp) that have already been spatially joined (sjoin) to segments. We do not need the segments themselves here, but the vp must be spatially joined to the shape so that we can drop the vp recorded while the bus is traveling back to its origin.

In [3]:
SJOIN_FILE = f'{dict_inputs["stage2"]}_{analysis_date}'
USABLE_FILE = f'{dict_inputs["stage1"]}_{analysis_date}'
GROUPING_COL = dict_inputs["grouping_col"]

# Only the first N_SHAPES shapes are used, to keep this exploration small.
# Raise it to widen the sample.
N_SHAPES = 50

# Columns the metric functions below actually need.
VP_COLS = [
    "gtfs_dataset_key", "_gtfs_dataset_name",
    "trip_id", "location_timestamp_local",
]

all_shapes = (
    pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/{SJOIN_FILE}",
        columns = ["shape_array_key"]
    )
    .shape_array_key
    .unique()
    .tolist()[:N_SHAPES]
)

# Use this function to attach the crosswalk of sjoin results
# back to usable_vp. The call is wrapped in `delayed`, so `ddf` is a lazy
# object: nothing is read until it is computed.
ddf = delayed(merge_usable_vp_with_sjoin_vpidx)(
    all_shapes,
    USABLE_FILE,
    SJOIN_FILE,
    GROUPING_COL,
    columns = ["gtfs_dataset_key", "_gtfs_dataset_name",
               "trip_id", "vp_idx", "location_timestamp_local"]
)[VP_COLS]

### Define Functions for Metrics

In [4]:
def get_trip_duration(
    df: pd.DataFrame,
    trip_cols: list,
    timestamp_col: str
) -> pd.DataFrame:
    """
    Find the elapsed time, in minutes, of each trip in the
    vehicle positions table.

    Args:
        df: vehicle positions containing `trip_cols` and `timestamp_col`.
        trip_cols: columns that together identify a trip.
        timestamp_col: name of the datetime column to measure over.

    Returns:
        One row per trip with `trip_cols`, `earliest`, `latest` and
        `trip_min_elapsed` (latest - earliest, rounded to the nearest
        whole minute; 0 when a trip has a single timestamp).
    """
    # One grouped pass yields both extremes, instead of two identical
    # groupbys that are merged back together.
    trip_times = (
        df.groupby(trip_cols, observed=True, group_keys=False)
        [timestamp_col]
        .agg(["min", "max"])
        .reset_index()
        .rename(columns = {"min": "earliest", "max": "latest"})
    )

    trip_times = trip_times.assign(
        trip_min_elapsed = (
            (trip_times.latest - trip_times.earliest)
            .dt.total_seconds()
            .divide(60)
            .round(0)
        )
    )

    return trip_times

In [5]:
def atleast2_updates_by_trip(
    df: pd.DataFrame,
    trip_cols: list,
    timestamp_col: str = "location_timestamp_local",
) -> pd.DataFrame: 
    """
    For every trip-minute combination, count the non-null
    `timestamp_col` values. A minute with at least 2 is flagged
    as passing.

    Args:
        df: vehicle positions containing `trip_cols` and `timestamp_col`.
        trip_cols: columns that together identify a trip.
        timestamp_col: name of the datetime column holding the update time.

    Returns:
        One row per trip with `trip_cols`, `atleast2_trip_updates`
        (number of passing minutes), `earliest`, `latest`,
        `trip_min_elapsed` and `pct_update_complete`
        (passing minutes / elapsed minutes).

    Notes:
        * `pct_update_complete` is NaN when `trip_min_elapsed` is 0,
          instead of dividing by zero.
        * The ratio can exceed 1, because the numerator counts
          trip-minute groups while the denominator is the rounded
          elapsed time between first and last timestamp.
    """
    # presumably adds hour/minute columns derived from the timestamp, and
    # minute_cols returns their names -- TODO confirm against utils
    df = utils.parse_hour_min(df, [timestamp_col])
    minute_cols = utils.minute_cols(timestamp_col)
    
    trip_duration = get_trip_duration(
        df,
        trip_cols,
        timestamp_col
    )
    
    updates_per_minute = (
        df.groupby(trip_cols + minute_cols, 
                   observed=True, group_keys = False)
        .agg({timestamp_col: "count"})
        .reset_index()
    )    
    
    # 1 if the minute has at least 2 updates, 0 otherwise.
    # Easier to sum and calculate percent.
    # A vectorized comparison replaces the row-wise apply: it needs no
    # dask-only `meta` argument, so it also works on a plain pandas frame.
    # int64 is used so that summing over a long trip cannot overflow.
    updates_per_minute = updates_per_minute.assign(
        atleast2_trip_updates = (
            updates_per_minute[timestamp_col] >= 2
        ).astype("int64")
    )    
    
    # By trip, sum up the number of minutes with at least 2 updates,
    # and merge in trip_duration (minutes)
    passing_minutes = (
        updates_per_minute
        .groupby(trip_cols, observed=True, group_keys=False)
        .agg({"atleast2_trip_updates": "sum"})
        .reset_index()
    )
    
    by_trip = passing_minutes.merge(
        trip_duration,
        on = trip_cols,
        how = "inner"
    )
    
    # Mask zero-minute durations to NaN so the ratio is NaN, not inf.
    by_trip = by_trip.assign(
        pct_update_complete = by_trip.atleast2_trip_updates.divide(
            by_trip.trip_min_elapsed.mask(by_trip.trip_min_elapsed == 0)
        )
    ) 
    
    return by_trip

In [6]:
def update_completeness_metric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the per-trip update completeness metric from
    RT vehicle_positions.

    Thin wrapper: it fixes the trip-identifying columns and the
    timestamp column, then hands off to `atleast2_updates_by_trip`.
    """
    # Columns that identify a trip; the dataset name is carried along
    # for later aggregations. service_date is not part of the key yet.
    trip_keys = [
        "gtfs_dataset_key",
        "_gtfs_dataset_name",
        "trip_id",
    ]

    # Timestamp of each vehicle position update; set here in one place
    # in case a different column should be used.
    vp_timestamp = "location_timestamp_local"

    return atleast2_updates_by_trip(df, trip_keys, vp_timestamp)

### Calculate Metric and Quick Descriptives

In [7]:
# Build the lazy metric from the delayed vehicle positions and persist it,
# in a single chained expression.
by_trip = update_completeness_metric(ddf).persist()

In [8]:
# dask.compute returns a tuple with one result per argument; unpack the single one.
(by_trip_df,) = compute(by_trip)

In [11]:
# Second compute: by_trip_df is presumably still a dask DataFrame (the delayed
# function appears to return one) -- TODO confirm. `final` is the materialized
# result used for the descriptives below.
final = by_trip_df.compute()

In [14]:
# NOTE(review): ratios above 1 are possible, because the numerator counts
# trip-minute groups while the denominator is the rounded elapsed minutes.
final["pct_update_complete"].describe()

count    618.000000
mean       0.953399
std        0.169221
min        0.040972
25%        1.000000
50%        1.006473
75%        1.016129
max        1.083333
Name: pct_update_complete, dtype: float64

In [None]:
# Metric columns to summarize.
cols = ["atleast2_trip_updates", "trip_min_elapsed", "pct_update_complete"]

In [None]:
# Use the materialized result (`final`): `by_trip` is still a lazy dask
# Delayed object, which cannot be iterated over.
for operator_name in final._gtfs_dataset_name.unique():
    display(
        chart_utils.describe_to_df(
            final,
            operator_name,
            cols,
        )
    )

In [None]:
# Chart prep works on the materialized result (`final`), not on the lazy
# `by_trip`, so that `charts_df` is a concrete table the next cell can loop over.
charts_df = chart_utils.prep_df_for_chart(
    df = final,
    percentage_column = "pct_update_complete",
    columns_to_round = ["pct_update_complete"],
    columns_to_keep = [
        "_gtfs_dataset_name",
        "trip_id",
        "pct_update_complete",
    ],
)

In [None]:
# One scatter plot per operator.
# NOTE(review): "Stop Sequence" is not among the columns kept when building
# charts_df (dataset name, trip id, pct update complete) -- confirm the
# intended x-axis column for this trip-level metric.
for operator_name in charts_df['Gtfs Dataset Name'].unique():
    operator_chart = chart_utils.scatter_plot_domain(
        charts_df,
        operator = operator_name,
        x_col="Stop Sequence",
        y_col="Pct Update Complete",
        color_col="Rounded Pct Update Complete",
        dropdown_col="Trip Id",
        dropdown_col_title="Trip ID",
    )
    display(operator_chart)