In [1]:
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd

from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, PROJECT_CRS)

from segment_speed_utils.project_vars import analysis_date_list
analysis_date = analysis_date_list[0]

road_id_cols = ["linearid", "mtfcc"]
segment_identifier_cols = road_id_cols + ["segment_sequence"]

test_trip = "00139041e36b607c7e10cb7ec023e837"

In [2]:
df = pd.read_parquet(
    f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_wide_{analysis_date}.parquet",
    columns = ["trip_instance_key"]
).drop_duplicates()

In [55]:
test_trips = ['00062c6db9dbef9c80f5ada74b31e257',
       '0009c78b48866a26d664ab00f67d1606',
       '00139041e36b607c7e10cb7ec023e837',
       'ff780650b98209acf69a71a7dab2502c',
       'ff86898d6a8ff5df82912699133bf4b6',
       'ffb44943b394f891d2b2286bb3902305']

In [None]:
def merge_vp_to_crosswalk(
    analysis_date: str, 
    filters: tuple
):
    # vp to road segment crosswalk
    df = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_wide_{analysis_date}.parquet",
        filters = filters
    )
    
    # only keep the road segments that have at least 2 vp
    df = df.assign(
        n_vp = df.apply(lambda x: len(x.vp_idx_arr), axis=1)
    ).query('n_vp > 1').drop(columns = "n_vp").reset_index(drop=True)
    
    df_long = df.explode(
        "vp_idx_arr", 
        ignore_index=True
    ).rename(
        columns = {"vp_idx_arr": "vp_idx"}
    ).astype({"vp_idx": "int64"})
        
    # Turn series of arrays into 1d array
    #subset_vp = np.concatenate(np.asarray(df.vp_idx_arr))
    subset_vp = df_long.vp_idx.tolist()
    
    # Pull the vp info for ones that join to road segments
    vp_usable = dd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[("vp_idx", "in", subset_vp)]],
        columns = ["vp_idx", "x", "y"],
    )
    
    vp_with_roads = dd.merge(
        vp_usable,
        df_long,
        on = "vp_idx",
        how = "inner"
    )
    
    return vp_with_roads

In [None]:
vp = merge_vp_to_crosswalk(
    analysis_date,
    filters = [[("trip_instance_key", "==", test_trip)]]
)

In [None]:
subset_roads = vp.linearid.unique().compute().tolist()

In [None]:
def expand_relevant_road_segments(
    analysis_date: str,
    segment_identifier_cols: list = ["linearid", "mtfcc",
                                     "segment_sequence"],
    filtering = None
):
    sjoin_results = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_{analysis_date}",
        columns = segment_identifier_cols
    ).drop_duplicates()
    
    
    full_road_info = gpd.read_parquet(
        f"{SEGMENT_GCS}segments_staging/"
        f"roads_with_cutpoints_long_{analysis_date}.parquet",
        filters = filtering
    )
    
    road_segments = gpd.read_parquet(
        f"{SEGMENT_GCS}road_segments_{analysis_date}",
        filters = [[("mtfcc", "in", ["S1100", "S1200"])]],
        columns = segment_identifier_cols + [
            "primary_direction", "destination"],
    ).merge(
        sjoin_results,
        on = segment_identifier_cols,
        how = "inner"
    ).merge(
        full_road_info,
        on = segment_identifier_cols,
        how = "inner"
    )
    
    return road_segments

In [None]:
road_segments = expand_relevant_road_segments(
    analysis_date,
    segment_identifier_cols = segment_identifier_cols,
    filtering = [[("linearid", "in", subset_roads)]],
)

In [None]:
vp = vp.repartition(npartitions=3)

In [None]:
road_dtypes = vp[road_id_cols].dtypes.to_dict()

vp_projected = vp.map_partitions(
    wrangle_shapes.project_vp_onto_segment_geometry,
    road_segments,
    grouping_cols = road_id_cols,
    meta = {
        "vp_idx": "int64",
        **road_dtypes,
        "shape_meters": "float"},
    align_dataframes = False
).compute()

In [None]:
vp_projected.head(2)

In [None]:
# Merge vp with road segment info 
# with projected shape meters against the full road 
df_with_projection = dd.merge(
    vp,
    vp_projected,
    on = ["vp_idx"] + road_id_cols,
    how = "inner"
).drop(columns = ["x", "y"]).compute()

In [None]:
df_with_projection.columns

In [None]:
df_with_projection_wide = (df_with_projection
                           .groupby(["trip_instance_key"] + road_id_cols)
                           .agg({
                               "vp_idx": lambda x: list(x),
                               "shape_meters": lambda x: list(x)})
                           .reset_index()
                           .rename(columns = {
                               "vp_idx": "vp_idx_arr",
                               "shape_meters": "shape_meters_arr"
                           })
                  )

In [None]:
# Now merge road segments with each destination acting as the road's stop
# and merge on arrays of projected vp against that road
gdf = pd.merge(
    road_segments,
    df_with_projection_wide,
    on = road_id_cols,
    how = "inner"
)

In [None]:
gdf.head(2)

In [None]:
nearest_vp_idx = []
subseq_vp_idx = []

for row in gdf.itertuples():
    
    this_stop_meters = getattr(row, "road_meters")
    valid_shape_meters_array = getattr(row, "shape_meters_arr")
    valid_vp_idx_array = np.asarray(getattr(row, "vp_idx_arr"))

    idx = np.searchsorted(
        valid_shape_meters_array,
        this_stop_meters,
        side="right" 
        # want our stop_meters value to be < vp_shape_meters,
        # side = "left" would be stop_meters <= vp_shape_meters
    )

    # For the next value, if there's nothing to index into, 
    # just set it to the same position
    # if we set subseq_value = getattr(row, )[idx], 
    # we might not get a consecutive vp
    nearest_value = valid_vp_idx_array[idx-1]
    subseq_value = nearest_value + 1

    nearest_vp_idx.append(nearest_value)
    subseq_vp_idx.append(subseq_value)

In [None]:
gdf.columns

In [None]:
result = gdf[segment_identifier_cols + [
    "primary_direction", "fullname", "road_meters", 
    "trip_instance_key"]]

# Now assign the nearest vp for each trip that's nearest to
# a given stop
# Need to find the one after the stop later
result = result.assign(
    nearest_vp_idx = nearest_vp_idx,
    subseq_vp_idx = subseq_vp_idx,
)

In [None]:
result.head()

once the sjoin is done and we know which trip_instance_keys are linked
to what road segments,
those roads should be expanded from 1 row to many rows.
go back to long for road segments, but it should be filtered down so that only the relevant cutpoints are included.

then, for vp, that entire array that attached onto the linearid, is put in as an input for search. also leave vp primary direction in.
filter that vp array down for just the direction we want, and find the "nearest" vp as that array has been projected against the road.

In [5]:
df = pd.read_parquet(
    f"{SEGMENT_GCS}nearest_vp_roads_{analysis_date}.parquet"
)

In [8]:
subset_vp = np.union1d(
        df.nearest_vp_idx.unique(), 
        df.subseq_vp_idx.unique()
    )

In [11]:
# modify this from interpolate_stop_arrivals
def attach_vp_shape_meters_with_timestamp(
    analysis_date: str, **kwargs
) -> pd.DataFrame:
    """
    """
    # shape_meters is here
    vp_projected = pd.read_parquet(
        f"{SEGMENT_GCS}projection/vp_projected_roads_{analysis_date}.parquet",
        **kwargs
    )
    
    # location_timestamp_local is here, and needs to be converted to seconds
    vp_usable = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}/",
        columns = ["vp_idx",  "location_timestamp_local"],
        **kwargs,
    )

    vp_info = pd.merge(
        vp_projected,
        vp_usable,
        on = "vp_idx",
        how = "inner"
    )
    
    return vp_info

In [12]:
vp_info = attach_vp_shape_meters_with_timestamp(
    analysis_date, 
    filters = [[("vp_idx", "in", subset_vp)]]
)

In [27]:
df.columns

Index(['linearid', 'mtfcc', 'segment_sequence', 'primary_direction',
       'fullname', 'road_meters', 'trip_instance_key', 'nearest_vp_idx',
       'subseq_vp_idx'],
      dtype='object')

In [26]:
vp_info.columns

Index(['vp_idx', 'trip_instance_key', 'linearid', 'mtfcc', 'primary_direction',
       'shape_meters', 'location_timestamp_local'],
      dtype='object')

In [40]:
road_trip_cols = road_id_cols + [
    "primary_direction", "trip_instance_key"]

vp_with_nearest_info = pd.merge(
    df,
    vp_info.rename(columns = {
        "vp_idx": "nearest_vp_idx",
        "shape_meters": "nearest_shape_meters",
        "location_timestamp_local": "nearest_location_timestamp_local"
    }),
    on = ["nearest_vp_idx"] + road_trip_cols,
    how = "inner"
)

In [41]:
df2 = pd.merge(
    vp_with_nearest_info,
    vp_info.rename(columns = {
        "vp_idx": "subseq_vp_idx",
        "shape_meters": "subseq_shape_meters",
        "location_timestamp_local": "subseq_location_timestamp_local"
    }),
    on = ["subseq_vp_idx"] + road_trip_cols,
    how = "inner"
)


In [56]:
sjoin_results = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_wide_{analysis_date}.parquet",
        filters = [[("trip_instance_key", "in", test_trips)]]
    )

In [66]:
one_road_id = "1104475175563"
sjoin_results[sjoin_results.linearid==one_road_id]

Unnamed: 0,trip_instance_key,linearid,mtfcc,primary_direction,vp_idx_arr
7,00139041e36b607c7e10cb7ec023e837,1104475175563,S1200,Northbound,"[6911436, 6911437, 6911438, 6911439, 6911441, ..."
27,00139041e36b607c7e10cb7ec023e837,1104475175563,S1200,Northbound,[6911435]
24,00139041e36b607c7e10cb7ec023e837,1104475175563,S1200,Northbound,[6911442]


In [61]:
sjoin_results.dtypes

trip_instance_key    object
linearid             object
mtfcc                object
primary_direction    object
vp_idx_arr           object
dtype: object

In [69]:
df3[df3.linearid==one_road_id]

Unnamed: 0,linearid,mtfcc,segment_sequence,primary_direction,fullname,road_meters,trip_instance_key,nearest_vp_idx,subseq_vp_idx,nearest_shape_meters,nearest_location_timestamp_local,subseq_shape_meters,subseq_location_timestamp_local,arrival_time
30,1104475175563,S1200,3,Northbound,State Rte 260,4000.0,00139041e36b607c7e10cb7ec023e837,6911443,6911444,3874.573489,2023-10-11 10:52:35,3949.629694,2023-10-11 10:52:58,2023-10-11 10:52:58
31,1104475175563,S1200,4,Northbound,State Rte 260,5000.0,00139041e36b607c7e10cb7ec023e837,6911443,6911444,3874.573489,2023-10-11 10:52:35,3949.629694,2023-10-11 10:52:58,2023-10-11 10:52:58
32,1104475175563,S1200,5,Northbound,State Rte 260,5865.337879,00139041e36b607c7e10cb7ec023e837,6911443,6911444,3874.573489,2023-10-11 10:52:35,3949.629694,2023-10-11 10:52:58,2023-10-11 10:52:58


In [45]:

def get_stop_arrivals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply np.interp to df.
    df must be set up so that a given stop is populated with its
    own stop_meters, as well as columns for nearest and subseq 
    shape_meters / location_timestamp_local_sec.
    """
    x_col = "shape_meters"
    y_col = "location_timestamp_local"
    
    stop_arrival_series = []
    for row in df.itertuples():

        xp = np.asarray([
            getattr(row, f"nearest_{x_col}"), 
            getattr(row, f"subseq_{x_col}")
        ])

        yp = np.asarray([
            getattr(row, f"nearest_{y_col}"), 
            getattr(row, f"subseq_{y_col}")
        ]).astype("datetime64[s]").astype("float64")

        stop_position = getattr(row, "road_meters")
        interpolated_arrival = np.interp(stop_position, xp, yp)
        stop_arrival_series.append(interpolated_arrival)
        
    df = df.assign(
        arrival_time = stop_arrival_series,
    ).astype(
        {"arrival_time": "datetime64[s]"}
    ).sort_values(
        ["trip_instance_key", 
        "linearid", "mtfcc"]
    ).reset_index(drop=True)
    
    return df

In [46]:
df3 = get_stop_arrivals(df2)

In [53]:
df2

Unnamed: 0,linearid,mtfcc,segment_sequence,primary_direction,fullname,road_meters,trip_instance_key,nearest_vp_idx,subseq_vp_idx,nearest_shape_meters,nearest_location_timestamp_local,subseq_shape_meters,subseq_location_timestamp_local
0,1101576648900,S1200,0,Northbound,State Rte 39,1000.0,0009c78b48866a26d664ab00f67d1606,1041809,1041810,72.350034,2023-10-11 09:34:45,252.820371,2023-10-11 09:35:28
1,1101576648900,S1200,1,Northbound,State Rte 39,2000.0,0009c78b48866a26d664ab00f67d1606,1041809,1041810,72.350034,2023-10-11 09:34:45,252.820371,2023-10-11 09:35:28
2,1101576668784,S1200,0,Northbound,N Azusa Ave,1000.0,0009c78b48866a26d664ab00f67d1606,1041809,1041810,72.350034,2023-10-11 09:34:45,252.820371,2023-10-11 09:35:28
3,1101576668784,S1200,1,Northbound,N Azusa Ave,2000.0,0009c78b48866a26d664ab00f67d1606,1041809,1041810,72.350034,2023-10-11 09:34:45,252.820371,2023-10-11 09:35:28
4,1104475157864,S1200,0,Southbound,Webster St,1000.0,00139041e36b607c7e10cb7ec023e837,6911448,6911449,2757.580687,2023-10-11 10:54:06,2757.580687,2023-10-11 10:54:21
5,1104475175563,S1200,3,Northbound,State Rte 260,4000.0,00139041e36b607c7e10cb7ec023e837,6911443,6911444,3874.573489,2023-10-11 10:52:35,3949.629694,2023-10-11 10:52:58
6,1104475175563,S1200,4,Northbound,State Rte 260,5000.0,00139041e36b607c7e10cb7ec023e837,6911443,6911444,3874.573489,2023-10-11 10:52:35,3949.629694,2023-10-11 10:52:58
7,1104475175563,S1200,5,Northbound,State Rte 260,5865.337879,00139041e36b607c7e10cb7ec023e837,6911443,6911444,3874.573489,2023-10-11 10:52:35,3949.629694,2023-10-11 10:52:58
8,1108296283102,S1100,0,Northbound,State Rte 1,1000.0,00062c6db9dbef9c80f5ada74b31e257,6357546,6357547,931.337139,2023-10-11 10:02:00,876.371427,2023-10-11 10:02:30
9,1108296283102,S1100,1,Northbound,State Rte 1,2000.0,00062c6db9dbef9c80f5ada74b31e257,6357546,6357547,931.337139,2023-10-11 10:02:00,876.371427,2023-10-11 10:02:30


In [50]:
df3[["road_meters", "nearest_vp_idx", "subseq_vp_idx", "nearest_shape_meters",
   "subseq_shape_meters",
   "nearest_location_timestamp_local",
   "subseq_location_timestamp_local", "arrival_time"]]

Unnamed: 0,road_meters,nearest_vp_idx,subseq_vp_idx,nearest_shape_meters,subseq_shape_meters,nearest_location_timestamp_local,subseq_location_timestamp_local,arrival_time
0,1000.0,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
1,2000.0,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
2,3000.0,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
3,4000.0,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
4,6000.0,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
5,7000.0,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
6,7469.230087,6357546,6357547,931.337139,876.371427,2023-10-11 10:02:00,2023-10-11 10:02:30,2023-10-11 10:02:30
7,1000.0,6357496,6357497,458.71022,819.715656,2023-10-11 09:45:30,2023-10-11 09:45:50,2023-10-11 09:45:50
8,2000.0,6357496,6357497,458.71022,819.715656,2023-10-11 09:45:30,2023-10-11 09:45:50,2023-10-11 09:45:50
9,3000.0,6357496,6357497,458.71022,819.715656,2023-10-11 09:45:30,2023-10-11 09:45:50,2023-10-11 09:45:50
