# Big Blue Bus nearest neighbor comparison

* What would happen if we allowed the full vp to be used to find the nearest neighbor, instead of removing the vp going in the opposite direction?
* The monotonically increasing condition is enforced later; could it catch the errors that way?

In [None]:
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers, neighbor
from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT
from shared_utils import rt_dates

# Single date used for every import in this notebook
analysis_date = rt_dates.DATES["apr2024"]

In [None]:
import numpy as np
import sys
# Add the local `scripts/` folder to the import path
# (presumably where `nearest_vp_to_stop` lives -- confirm)
sys.path.append("scripts/")
import nearest_vp_to_stop

# CRS that the stop and vp geometries are converted to before comparing
WGS84 = "EPSG:4326"

In [None]:
# Scheduled trips whose feed name is "Big Blue Bus Schedule"
bbb_trips = helpers.import_scheduled_trips(
    analysis_date,
    filters = [("name", "==", "Big Blue Bus Schedule")],
    columns = ["gtfs_dataset_key", "trip_instance_key"],
    get_pandas = True
)

# NOTE(review): `columns` asks for `gtfs_dataset_key`, but this reads
# `schedule_gtfs_dataset_key` -- presumably the helper renames it; confirm.
# `bbb_key` is not referenced again in the visible notebook.
bbb_key = bbb_trips.schedule_gtfs_dataset_key.iloc[0]
# Unique trip keys, reused below to filter the other tables to these trips
subset_trips = bbb_trips.trip_instance_key.unique()

## Construct proxy stop times

In [None]:
def construct_stop_times(
    analysis_date: str, 
    subset_trips: list
) -> gpd.GeoDataFrame:
    """
    Stack the two stop times tables for the requested trips.

    RT stop times (all trips) are concatenated with the additional
    segments for speedmaps. Both tables are restricted to rows whose
    `trip_instance_key` is in `subset_trips`, and the combined table
    gets a fresh index.
    """
    # `@subset_trips` in the query string refers to this function's argument,
    # so both queries must be issued from this function's own scope
    keep_subset = 'trip_instance_key in @subset_trips'

    all_trips_stop_times = nearest_vp_to_stop.stop_times_for_all_trips(
        analysis_date
    ).query(keep_subset)

    speedmap_stop_times = nearest_vp_to_stop.stop_times_for_speedmaps(
        analysis_date
    ).query(keep_subset)

    return pd.concat(
        [all_trips_stop_times, speedmap_stop_times],
        axis=0,
        ignore_index=True
    )

bbb_stop_times = construct_stop_times(analysis_date, subset_trips)

## Merge stop and get nearest snap (`neighbor`)

In [None]:
# This is with opposite direction removed
# (the version that the full-vp approach below is compared against)
gdf = neighbor.merge_stop_vp_for_nearest_neighbor(
    bbb_stop_times,
    analysis_date,
    filters = [[("trip_instance_key", "in", subset_trips)]]
)

In [None]:
# Try a version without removing vp points
# and allow nearest neighbor to select from any direction.
# Columns are renamed to `trip_*` so they can sit next to the
# `vp_idx` / `vp_geometry` columns once the two versions are merged.
vp_full = gpd.read_parquet(
    f"{SEGMENT_GCS}condensed/vp_condensed_{analysis_date}.parquet",
    columns = ["trip_instance_key", "vp_idx", 
               "location_timestamp_local", 
               "geometry"],
    filters = [[("trip_instance_key", "in", subset_trips)]]
).rename(columns = {
    "vp_idx": "trip_vp_idx",
    "geometry": "trip_geometry"
}).set_geometry("trip_geometry").to_crs(WGS84)

# Attach each trip's full vp to every stop row for that trip.
# `vp_full` needs no further renaming here: its geometry column is
# already `trip_geometry`, so there is no `geometry` column left to rename.
gdf2 = pd.merge(
    bbb_stop_times.rename(
        columns = {
            "geometry": "stop_geometry"}
    ).set_geometry("stop_geometry").to_crs(WGS84),
    vp_full,
    on = ["trip_instance_key"],
    how = "inner"
)

In [None]:
# this list comes from an adapted version of 
# GTFS_DATA_DICT stop_pair_cols
merge_cols = ["stop_id", "stop_pair", "stop_sequence", "stop_sequence1", 
          "stop_geometry", "stop_primary_direction", 
          "shape_array_key", "trip_instance_key"]

# Put both inputs on the same stop rows:
# `gdf` has the vp with opposite direction removed (vp_idx, vp_geometry),
# `gdf2` has the full vp for the trip (trip_vp_idx, trip_geometry)
gdf_results = pd.merge(
    gdf,
    gdf2,
    on = merge_cols,
    how = "inner"
)

In [None]:
# Check which columns came through the merge
gdf_results.dtypes

In [None]:
# Vectorize the nearest neighbor lookup once and reuse it for both inputs
find_nearest_vp_idx = np.vectorize(neighbor.add_nearest_vp_idx)

# Selection from the vp with opposite direction removed
nearest_vp_idx = find_nearest_vp_idx(
    gdf_results.vp_geometry,
    gdf_results.stop_geometry,
    gdf_results.vp_idx
)

# Selection from the full vp for the trip
nearest_vp_idx2 = find_nearest_vp_idx(
    gdf_results.trip_geometry,
    gdf_results.stop_geometry,
    gdf_results.trip_vp_idx
)

In [None]:
# Keep both selections on the same frame so they can be compared row by row
gdf_results = gdf_results.assign(
    nearest_vp_idx = nearest_vp_idx,
    nearest_vp_idx2 = nearest_vp_idx2
)

In [None]:
# Total rows vs. rows where the two approaches select a different vp_idx
gdf_results.shape, gdf_results[
    gdf_results.nearest_vp_idx != gdf_results.nearest_vp_idx2
].shape

If we allow even opposite directions to show up in the nearest neighbor, about 4.1% of the rows would have different results for which `vp_idx` is selected.

About 2/3 of these differ by -1 or +1, which probably wouldn't change the result that much. The other 1/3 differ by more than 1, which could change the result: the trio of points used to interpolate allows for a difference of 1, but not more.

In [None]:
# Total rows vs. rows where the two selections disagree
# (repeats the earlier check)
gdf_results.shape, gdf_results[
    gdf_results.nearest_vp_idx != gdf_results.nearest_vp_idx2].shape

In [None]:
# Keep only the rows where the two approaches pick a different vp_idx
# and record how far apart the two selections are
is_different = gdf_results.nearest_vp_idx != gdf_results.nearest_vp_idx2

diff_df = gdf_results[is_different].assign(
    vp_idx_diff = lambda d: d.nearest_vp_idx - d.nearest_vp_idx2
)

# Number of differing rows vs. number that differ by more than 1
diff_df["vp_idx_diff"].shape, diff_df[diff_df.vp_idx_diff.abs() > 1].shape

In [None]:
# One bin per integer difference.
# `range` excludes its stop value and values beyond the last bin edge
# are not drawn, so the last edge has to be max + 1 for rows equal to the
# maximum to show up in a bin of their own.
diff_df.vp_idx_diff.hist(bins = range(
    diff_df.vp_idx_diff.min(), 
    diff_df.vp_idx_diff.max() + 2, 
    1)
                        )

In [None]:
# Summary statistics of the vp_idx differences
diff_df.vp_idx_diff.describe()

## Add trio (`neighbor`)

In [None]:
import shapely
def add_trio_cols(
    gdf2: gpd.GeoDataFrame, 
    nearest_vp_col: str,
) -> gpd.GeoDataFrame:
    """
    For each row, call `neighbor.add_trio` with the vp index stored in
    `nearest_vp_col` and keep the three results as new columns.

    Args:
        gdf2: needs the columns `trip_vp_idx`, `location_timestamp_local`,
            `trip_geometry`, and the column named by `nearest_vp_col`.
        nearest_vp_col: name of the column holding the selected vp index.

    Returns:
        Copy of `gdf2` with `vp_idx_trio`, `location_timestamp_local_trio`
        and `vp_coords_trio` (LineStrings, WGS84) added, and with
        `location_timestamp_local`, `trip_vp_idx`, `trip_geometry` dropped.
    """
    vp_trio_series = []
    time_trio_series = []
    coords_trio_series = []
    
    for row in gdf2.itertuples():
        vp_trio, time_trio, coords_trio = neighbor.add_trio(
            getattr(row, nearest_vp_col), 
            np.asarray(row.trip_vp_idx),
            np.asarray(row.location_timestamp_local),
            np.asarray(row.trip_geometry.coords),
        )
        
        vp_trio_series.append(vp_trio)
        time_trio_series.append(time_trio)
        coords_trio_series.append(shapely.LineString(coords_trio))
                
    drop_cols = [
        "location_timestamp_local",
        "trip_vp_idx", "trip_geometry"
    ]
    
    # The lists are assigned by position, but a GeoSeries is aligned on
    # index labels. Reuse gdf2's index so the geometries land on the rows
    # they were built from even when gdf2 does not have a default index.
    vp_coords_trio = gpd.GeoSeries(
        coords_trio_series, index = gdf2.index, crs = WGS84
    )
    
    return gdf2.assign(
        vp_idx_trio = vp_trio_series,
        location_timestamp_local_trio = time_trio_series,
        vp_coords_trio = vp_coords_trio
    ).drop(columns = drop_cols)

In [None]:
# Build the trio columns once per nearest vp selection
gdf_results1 = add_trio_cols(gdf_results, "nearest_vp_idx")
gdf_results2 = add_trio_cols(gdf_results, "nearest_vp_idx2")

In [None]:
# Side-by-side trio results; columns from the second run get a `2` suffix
trio_results = pd.merge(
    gdf_results1,
    gdf_results2.rename(columns = {
        "vp_idx": "vp_idx2",
        
        "vp_idx_trio": "vp_idx_trio2",
        "location_timestamp_local_trio": "location_timestamp_local_trio2",
        "vp_coords_trio": "vp_coords_trio2"
    }),
    on = merge_cols + ["vp_geometry", "nearest_vp_idx", "nearest_vp_idx2"],
    how = "inner",
)

## Interpolate arrival

In [None]:
# Check the columns available after merging the two trio results
trio_results.dtypes

In [None]:
# CRS that the stop and vp trio geometries are converted to in `interpolate_me`
PROJECT_CRS = "EPSG:3310"
import interpolate_stop_arrival

# Columns used to sort stop rows and as merge keys further down
trip_stop_cols = ["trip_instance_key", "stop_sequence", "stop_sequence1"]

def interpolate_me(
    df: gpd.GeoDataFrame,
    analysis_date: str,
):
    """
    Interpolate a stop arrival for each row of `df`.

    Stop points and vp trio lines are converted to PROJECT_CRS, the
    scheduled shape is attached by `shape_array_key`, and each row is
    passed through `interpolate_stop_arrival.project_points_onto_shape`.
    The trimmed, sorted result is then handed to
    `interpolate_stop_arrival.enforce_monotonicity_and_interpolate_across_stops`.
    """
    projected = df.assign(
        stop_geometry = df.stop_geometry.to_crs(PROJECT_CRS),
        vp_coords_trio = df.vp_coords_trio.to_crs(PROJECT_CRS)
    )

    # Shapes without a geometry cannot be projected onto, so drop them
    shapes = (
        helpers.import_scheduled_shapes(
            analysis_date,
            columns = ["shape_array_key", "geometry"],
            crs = PROJECT_CRS
        )
        .dropna(subset="geometry")
        .rename(columns = {"geometry": "shape_geometry"})
    )

    with_shape = pd.merge(
        projected,
        shapes,
        on = "shape_array_key",
        how = "inner"
    )

    del projected, shapes

    # Each call returns (stop_meters, interpolated_arrival)
    projections = [
        interpolate_stop_arrival.project_points_onto_shape(
            row.stop_geometry,
            row.vp_coords_trio,
            row.shape_geometry,
            row.location_timestamp_local_trio
        )
        for row in with_shape.itertuples()
    ]
    stop_meters_series = [meters for meters, _ in projections]
    stop_arrival_series = [arrival for _, arrival in projections]

    keep_cols = trip_stop_cols + [
        "shape_array_key", "stop_id", 
        "stop_meters", "arrival_time"
    ]

    arrivals = with_shape.assign(
        stop_meters = stop_meters_series,
        arrival_time = stop_arrival_series,
    )[keep_cols]

    arrivals = arrivals.sort_values(trip_stop_cols).reset_index(drop=True)

    return interpolate_stop_arrival.enforce_monotonicity_and_interpolate_across_stops(
        arrivals, trip_stop_cols)

In [None]:
# Arrival times from the vp selection with opposite direction removed
gdf_interp1 = interpolate_me(gdf_results1, analysis_date)

In [None]:
gdf_interp1.dtypes, gdf_interp2.dtypes

In [None]:
gdf_interp2 = interpolate_me(gdf_results2, analysis_date)

In [None]:
# Side-by-side arrival results; columns from the second run get a `2` suffix
interp_results = pd.merge(
    gdf_interp1,
    gdf_interp2.rename(columns = {
        "stop_meters": "stop_meters2",
        "arrival_time": "arrival_time2"
    }),
    on = trip_stop_cols + ["shape_array_key", "stop_id"],
    how = "inner",
)

About 4.5% have different interpolated arrival time results. Before, about 4.1% had a different nearest vp selected, and about 1/3 of those had differences greater than 1.

But overall, this results in about 4.5% of the interpolated arrival times being different.

In [None]:
# Total rows vs. rows where the interpolated arrival times differ
interp_results.shape, interp_results[
    interp_results.arrival_time != interp_results.arrival_time2
].shape

## Speeds

In [None]:
from segment_speed_utils import segment_calcs

def speed_calculation(df: pd.DataFrame):
    """
    Derive speeds between consecutive stop rows of each trip.

    Rows are sorted by `trip_stop_cols`. Within each `trip_instance_key`,
    the next row's `arrival_time_sec` and `stop_meters` are pulled up
    alongside the current row, elapsed meters and seconds are computed,
    and the frame is passed to `segment_calcs.derive_speed`.
    """
    trip_cols = ["trip_instance_key"]

    with_seconds = segment_calcs.convert_timestamp_to_seconds(
        df, ["arrival_time"]
    )
    ordered = with_seconds.sort_values(trip_stop_cols).reset_index(drop=True)

    # shift(-1) within a trip: the last row of each trip has no next stop
    by_trip = ordered.groupby(trip_cols, observed=True, group_keys=False)

    with_next_stop = ordered.assign(
        subseq_arrival_time_sec = by_trip.arrival_time_sec.shift(-1),
        subseq_stop_meters = by_trip.stop_meters.shift(-1),
    )

    with_elapsed = with_next_stop.assign(
        meters_elapsed = (
            with_next_stop.subseq_stop_meters - with_next_stop.stop_meters
        ),
        sec_elapsed = (
            with_next_stop.subseq_arrival_time_sec 
            - with_next_stop.arrival_time_sec
        ),
    )

    return segment_calcs.derive_speed(
        with_elapsed,
        ("stop_meters", "subseq_stop_meters"),
        ("arrival_time_sec", "subseq_arrival_time_sec")
    )

In [None]:
# Intermediate columns from `speed_calculation` that are dropped
# before comparing the two runs
drop_me = [
    "arrival_time_sec", "subseq_arrival_time_sec",
    "subseq_stop_meters"
]
speed1 = speed_calculation(gdf_interp1).drop(columns = drop_me)
speed2 = speed_calculation(gdf_interp2).drop(columns = drop_me)

In [None]:
# Side-by-side speed results; columns from the second run get a `2` suffix
speed_results = pd.merge(
    speed1,
    speed2.rename(columns = {
        "stop_meters": "stop_meters2",
        "arrival_time": "arrival_time2",
        "meters_elapsed": "meters_elapsed2",
        "sec_elapsed": "sec_elapsed2",
        "speed_mph": "speed_mph2"
    }),
    on = trip_stop_cols + ["shape_array_key", "stop_id"],
    how = "inner",
)

This magnifies to about 5.5% of rows having different speeds.

In [None]:
# Total rows vs. rows with different speeds,
# ignoring rows where speed_mph is missing or extremely large
speed_results.shape, speed_results[
    (speed_results.speed_mph != speed_results.speed_mph2) & 
    (speed_results.speed_mph.notna()) & 
    (speed_results.speed_mph < 100_000) # remove infinity
].shape

In [None]:
# Positive values mean speed_mph is higher than speed_mph2
speed_results = speed_results.assign(
    speed_diff = speed_results.speed_mph - speed_results.speed_mph2
)

In [None]:
# Summary statistics of the speed differences
speed_results.speed_diff.describe()

In [None]:
# Distribution of the non-zero speed differences;
# bin edges are every 1 from -70 to 69
speed_results[
    speed_results.speed_diff != 0
].speed_diff.hist(bins=range(-70, 70, 1))

In [None]:
# Rows with a non-zero speed difference (speed_mph present and below 100,000),
# ordered by trip and arrival time
speed_results[
    (speed_results.speed_diff != 0) & 
    (speed_results.speed_mph.notna()) & 
    (speed_results.speed_mph < 100_000)
].sort_values(["trip_instance_key", "arrival_time"])[
    ["arrival_time", "arrival_time2", 
     "speed_mph", "speed_mph2", "speed_diff"]]

In [None]:
# Speedmap segments file for the analysis date, filtered to the same trips
SEGMENT_FILE = GTFS_DATA_DICT.speedmap_segments.segments_file
segment_gdf = gpd.read_parquet(
    f"{SEGMENT_GCS}{SEGMENT_FILE}_{analysis_date}.parquet", 
    filters = [[("trip_instance_key", "in", subset_trips)]]
)

In [None]:
from segment_speed_utils import gtfs_schedule_wrangling
# NOTE(review): presumably fills in `stop_sequence1` where it is missing so it
# can be used as a merge key with the segments -- confirm against the helper
speed_results2 = gtfs_schedule_wrangling.fill_missing_stop_sequence1(
    speed_results)

In [None]:
# Number of segment rows, to compare against the merge counts
segment_gdf.shape

In [None]:
# Count how many rows are found in both tables, only in the segments,
# or only in the speed results
pd.merge(
    segment_gdf,
    speed_results2,
    on = trip_stop_cols + ["shape_array_key"],
    how = "outer",
    indicator = True
)._merge.value_counts()

In [None]:
# Attach the speed results to the segments, keeping only matching rows
speed_results_gdf = pd.merge(
    segment_gdf,
    speed_results2,
    on = trip_stop_cols + ["shape_array_key"],
    how = "inner",
)

In [None]:
# Check the columns available for mapping
speed_results_gdf.dtypes

In [None]:
from shared_utils import rt_utils

def make_map(gdf, speed_col):
    """
    Interactive map of `gdf`, colored by the values in `speed_col`.

    The `arrival_time` and `arrival_time2` columns are dropped before
    the map is drawn.
    """
    without_arrival_times = gdf.drop(
        columns = ["arrival_time", "arrival_time2"]
    )

    return without_arrival_times.explore(
        speed_col,
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE
    )

In [None]:
# shape_id for each of the subset trips, used to look up trips by shape
bbb_trips_and_shape = helpers.import_scheduled_trips(
    analysis_date,
    filters = [("trip_instance_key", "in", subset_trips)],
    columns = ["shape_id", "shape_array_key", "trip_instance_key"],
    get_pandas = True
)

In [None]:
# Trips that use shape_id 26714
bbb_trips_and_shape[
    bbb_trips_and_shape.shape_id=="26714"
].trip_instance_key.unique()

In [None]:
# speed_mph (vp with opposite direction removed) for one hard-coded trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "523d9d30ace49b2cc966c2cbaa8e9071"], 
    "speed_mph"
)

In [None]:
# speed_mph2 (full vp, any direction) for the same hard-coded trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "523d9d30ace49b2cc966c2cbaa8e9071"], 
    "speed_mph2"
)

In [None]:
# speed_mph (vp with opposite direction removed) for trip 3dfd9bae...,
# so that it can be compared against the speed_mph2 map of the same trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "3dfd9bae3724d3f62363a8328696cb4e"], 
    "speed_mph"
)

In [None]:
# speed_mph2 (full vp, any direction) for trip 3dfd9bae...
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "3dfd9bae3724d3f62363a8328696cb4e"], 
    "speed_mph2"
)

In [None]:
# Trips that use shape_id 26751
bbb_trips_and_shape[
    bbb_trips_and_shape.shape_id=="26751"].trip_instance_key.unique()

In [None]:
# speed_mph (vp with opposite direction removed) for one hard-coded trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "103ffb4be00deb25a90c82f92d431cb2"], 
    "speed_mph"
)

In [None]:
# speed_mph2 (full vp, any direction) for the same hard-coded trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "103ffb4be00deb25a90c82f92d431cb2"], 
    "speed_mph2"
)

In [None]:
# Trips that use shape_id 26793
bbb_trips_and_shape[
    bbb_trips_and_shape.shape_id=="26793"
].trip_instance_key.unique()

In [None]:
# speed_mph (vp with opposite direction removed) for one hard-coded trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "66c7c7215da8fc97c6e620c694aa689c"], 
    "speed_mph"
)

In [None]:
# speed_mph2 (full vp, any direction) for the same hard-coded trip
make_map(
    speed_results_gdf[
        speed_results_gdf.trip_instance_key == 
        "66c7c7215da8fc97c6e620c694aa689c"], 
    "speed_mph2"
)