In [None]:
#!pip install shapely==2.0.2

In [2]:
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely

from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

In [3]:
# Only the vp identifier and its local timestamp are needed from vp_usable.
vp_usable = dd.read_parquet(
    path=f"{SEGMENT_GCS}vp_usable_{analysis_date}/",
    columns=["vp_idx", "location_timestamp_local"],
)

In [4]:
# Projected vp table: this is the input that carries shape_meters.
vp_projected = pd.read_parquet(
    path=f"{SEGMENT_GCS}projection/vp_projected_roads_{analysis_date}.parquet"
)

In [5]:
# Compare the columns of both inputs before they are merged on vp_idx.
vp_usable.columns, vp_projected.columns

(Index(['vp_idx', 'location_timestamp_local'], dtype='object'),
 Index(['vp_idx', 'trip_instance_key', 'linearid', 'mtfcc', 'shape_meters'], dtype='object'))

In [6]:
# Keep only the vp that appear in both tables, matched on vp_idx.
vp_info = dd.merge(
    left=vp_usable,
    right=vp_projected,
    how="inner",
    on="vp_idx",
)

In [8]:
# Ask dask to persist the merged frame.
# NOTE(review): vp_info is converted to pandas with .compute() further down;
# confirm this persist step is still needed.
vp_info = vp_info.persist()

In [9]:
# Table holding nearest_vp_idx / subseq_vp_idx for each row (see the columns listed below).
nearest_df = pd.read_parquet(
    path=f"{SEGMENT_GCS}nearest_vp_roads_{analysis_date}.parquet"
)

In [10]:
# Materialize the merged dask dataframe as an in-memory pandas dataframe.
vp_df = vp_info.compute()

In [15]:
# Peek at the first two rows of the merged vp table.
vp_df.head(2)

Unnamed: 0,vp_idx,location_timestamp_local,trip_instance_key,linearid,mtfcc,shape_meters
0,68016,2023-10-11 10:09:15,f26f828162f83b19644fdebcabb2439c,11010927740446,S1200,3109.907089
1,68017,2023-10-11 10:09:30,f26f828162f83b19644fdebcabb2439c,11010927740446,S1200,3127.906806


In [17]:
# List nearest_df's columns to see which keys are available for the merges that follow.
nearest_df.columns

Index(['linearid', 'mtfcc', 'segment_sequence', 'primary_direction',
       'road_meters', 'trip_instance_key', 'nearest_vp_idx', 'subseq_vp_idx'],
      dtype='object')

In [21]:
road_id_cols = ["linearid", "mtfcc"]

# Attach the timestamp and shape_meters of the vp referenced by nearest_vp_idx.
# Inner join: rows of nearest_df without a matching vp are dropped.
vp_with_nearest_info = nearest_df.merge(
    vp_df.rename(
        columns={
            "vp_idx": "nearest_vp_idx",
            "location_timestamp_local": "nearest_location_timestamp_local",
            "shape_meters": "nearest_shape_meters",
        }
    ),
    on=[*road_id_cols, "trip_instance_key", "nearest_vp_idx"],
    how="inner",
)

In [22]:
# Attach the timestamp and shape_meters of the vp referenced by subseq_vp_idx.
# Inner join: rows without a matching subsequent vp are dropped.
df = vp_with_nearest_info.merge(
    vp_df.rename(
        columns={
            "vp_idx": "subseq_vp_idx",
            "location_timestamp_local": "subseq_location_timestamp_local",
            "shape_meters": "subseq_shape_meters",
        }
    ),
    on=[*road_id_cols, "trip_instance_key", "subseq_vp_idx"],
    how="inner",
)

In [24]:
def get_stop_arrivals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Interpolate an arrival time at each row's road_meters position.

    Each row of df must carry:
    - road_meters: the position at which to interpolate
    - nearest_shape_meters / subseq_shape_meters: positions of the
      nearest and subsequent vp
    - nearest_location_timestamp_local / subseq_location_timestamp_local:
      timestamps of those same two vp
    - trip_instance_key, linearid, mtfcc, segment_sequence: used to
      sort the result

    The two (position, timestamp) points are ordered by position before
    calling np.interp, because np.interp only gives meaningful results
    when its x-coordinates are increasing. A road_meters value outside
    the range spanned by the two points gets the timestamp of the
    closer endpoint (np.interp's default behavior).

    Returns df with an added arrival_time column (cast to datetime64[s]),
    sorted by trip_instance_key, linearid, mtfcc, segment_sequence,
    with a fresh index.
    """
    x_col = "shape_meters"
    y_col = "location_timestamp_local"

    stop_arrival_series = []
    for row in df.itertuples():

        xp = np.asarray([
            getattr(row, f"nearest_{x_col}"),
            getattr(row, f"subseq_{x_col}")
        ])

        # Timestamps -> seconds since the epoch, as floats, so np.interp
        # can work with them.
        yp = np.asarray([
            getattr(row, f"nearest_{y_col}"),
            getattr(row, f"subseq_{y_col}")
        ]).astype("datetime64[s]").astype("float64")

        # np.interp does not check that xp is increasing. If the nearest
        # vp sits further along than the subsequent vp, flip both arrays
        # together so each position keeps its own timestamp.
        if xp[0] > xp[1]:
            xp, yp = xp[::-1], yp[::-1]

        stop_position = getattr(row, "road_meters")
        interpolated_arrival = np.interp(stop_position, xp, yp)
        stop_arrival_series.append(interpolated_arrival)

    df = df.assign(
        arrival_time = stop_arrival_series,
    ).astype(
        {"arrival_time": "datetime64[s]"}
    ).sort_values(
        ["trip_instance_key",
        "linearid", "mtfcc", "segment_sequence"]
    ).reset_index(drop=True)

    return df

In [26]:
# Interpolate an arrival time for every row of the merged nearest/subseq table.
stop_arrivals = get_stop_arrivals(df)

In [31]:
# Output file stem; the .parquet extension is appended when writing.
STOP_ARRIVALS_FILE = f"stop_arrivals_roads_{analysis_date}"
stop_arrivals.to_parquet(path=f"{SEGMENT_GCS}{STOP_ARRIVALS_FILE}.parquet")

References:
* https://gis.stackexchange.com/questions/416284/splitting-multiline-or-linestring-into-equal-segments-of-particular-length-using
* https://gis.stackexchange.com/questions/250784/splitting-line-shapefile-into-segments-of-equal-length-using-python/414888#414888
* https://gis.stackexchange.com/questions/464336/change-geopandas-geometry-from-geometrycollection-to-multipolygon