In [1]:
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd

from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, PROJECT_CRS)

from segment_speed_utils.project_vars import analysis_date_list
# Use the first entry of the configured analysis dates for this notebook.
analysis_date = analysis_date_list[0]

# Column sets used as merge / grouping keys below: road-level keys, and
# road-level keys plus segment_sequence for individual segments.
road_id_cols = ["linearid", "mtfcc"]
segment_identifier_cols = road_id_cols + ["segment_sequence"]

# trip_instance_key used to filter the crosswalk down to a single trip.
test_trip = "00139041e36b607c7e10cb7ec023e837"

In [18]:
def merge_vp_to_crosswalk(
    analysis_date: str, 
    filters: list
) -> dd.DataFrame:
    """
    Attach vp point coordinates to the vp-to-road-segment crosswalk.

    Reads the wide crosswalk (one row per road with an array of vp_idx
    values), keeps only rows with at least 2 vp, explodes it to one row
    per vp_idx, and inner-merges it with the usable vp table.

    Args:
        analysis_date: date string interpolated into the parquet paths.
        filters: parquet filter expression passed through to
            `pd.read_parquet` for the crosswalk, e.g.
            [[("trip_instance_key", "==", some_key)]].

    Returns:
        Lazy dask DataFrame with vp_idx, x, y plus the crosswalk columns.
    """
    # vp to road segment crosswalk
    crosswalk = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_wide_{analysis_date}.parquet",
        filters = filters
    )
    
    # only keep the road segments that have at least 2 vp.
    # Series.map(len) stays a Series even when the filter returns no rows;
    # a row-wise DataFrame.apply on an empty frame does not, and is slower.
    n_vp = crosswalk.vp_idx_arr.map(len)
    crosswalk = crosswalk[n_vp > 1].reset_index(drop=True)
    
    df_long = crosswalk.explode(
        "vp_idx_arr", 
        ignore_index=True
    ).rename(columns = {"vp_idx_arr": "vp_idx"}).astype({"vp_idx": "int64"})
        
    # De-duplicate so the pushed-down "in" filter carries each vp_idx once,
    # even if the same vp_idx appears on several crosswalk rows.
    subset_vp = df_long.vp_idx.unique().tolist()
    
    # Pull the vp info for ones that join to road segments
    vp_usable = dd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[("vp_idx", "in", subset_vp)]],
        columns = ["vp_idx", "x", "y"],
    )
    
    vp_with_roads = dd.merge(
        vp_usable,
        df_long,
        on = "vp_idx",
        how = "inner"
    )
    
    return vp_with_roads

In [19]:
# Limit the crosswalk to one trip via a parquet filter on trip_instance_key.
vp = merge_vp_to_crosswalk(
    analysis_date,
    filters = [[("trip_instance_key", "==", test_trip)]]
)

Index(['trip_instance_key', 'linearid', 'mtfcc', 'primary_direction',
       'vp_idx'],
      dtype='object')


In [22]:
# Distinct linearid values present in vp, materialized as a plain list
# so it can be used in a parquet "in" filter.
subset_roads = vp.linearid.unique().compute().tolist()

In [23]:
def expand_relevant_road_segments(
    analysis_date: str,
    segment_identifier_cols: list = ["linearid", "mtfcc",
                                     "segment_sequence"],
    filtering = None
):
    """
    Build the road segment table restricted to segments that appear in
    the vp sjoin output, with the long cutpoint info merged on.

    Args:
        analysis_date: date string interpolated into the parquet paths.
        segment_identifier_cols: columns identifying a segment; used as
            the merge keys for both inner merges.
        filtering: optional parquet filter applied when reading the
            roads_with_cutpoints_long file.

    Returns:
        Result of inner-merging road segments (mtfcc S1100 / S1200 only)
        with the distinct sjoin segment keys and the cutpoint info.
    """
    # Distinct segment keys found in the vp sjoin results
    segments_in_sjoin = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_{analysis_date}",
        columns = segment_identifier_cols
    )
    segments_in_sjoin = segments_in_sjoin.drop_duplicates()

    # Long cutpoint table, optionally filtered at read time
    cutpoints_long = gpd.read_parquet(
        f"{SEGMENT_GCS}segments_staging/"
        f"roads_with_cutpoints_long_{analysis_date}.parquet",
        filters = filtering
    )

    # Road segments for the two mtfcc classes of interest
    road_cols = segment_identifier_cols + ["primary_direction", "destination"]
    all_segments = gpd.read_parquet(
        f"{SEGMENT_GCS}road_segments_{analysis_date}",
        filters = [[("mtfcc", "in", ["S1100", "S1200"])]],
        columns = road_cols,
    )

    segments_with_vp = all_segments.merge(
        segments_in_sjoin,
        on = segment_identifier_cols,
        how = "inner"
    )

    return segments_with_vp.merge(
        cutpoints_long,
        on = segment_identifier_cols,
        how = "inner"
    )

In [24]:
# Build road segments; `filtering` is forwarded as the parquet filter on
# the cutpoints file, here restricting it to the linearids in subset_roads.
road_segments = expand_relevant_road_segments(
    analysis_date,
    segment_identifier_cols = segment_identifier_cols,
    filtering = [[("linearid", "in", subset_roads)]],
)

In [25]:
# Split the lazy vp frame into 3 partitions ahead of map_partitions.
vp = vp.repartition(npartitions=3)

In [26]:
# dtypes of the road id columns, reused in `meta` so the declared output
# schema matches the input for those columns.
road_dtypes = vp[road_id_cols].dtypes.to_dict()

# Run the projection helper on each vp partition, passing road_segments
# as an extra positional argument. `meta` declares the output columns and
# dtypes (vp_idx, the road id columns, shape_meters).
res = vp.map_partitions(
    wrangle_shapes.project_vp_onto_segment_geometry,
    road_segments,
    grouping_cols = road_id_cols,
    meta = {
        "vp_idx": "int64",
        **road_dtypes,
        "shape_meters": "float"},
    align_dataframes = False
)

In [27]:
# Execute the lazy map_partitions graph and collect the projected values.
result = res.compute()

In [28]:
result

Unnamed: 0,vp_idx,linearid,mtfcc,shape_meters
0,6911436,1104475175563,S1200,3434.340438
6,6911437,1104475175563,S1200,3586.273915
12,6911438,1104475175563,S1200,3665.615087
18,6911439,1104475175563,S1200,3693.597353
24,6911441,1104475175563,S1200,3838.608471
0,6911442,1104475157864,S1200,2341.209261
4,6911447,1104475157864,S1200,2757.580687
8,6911448,1104475157864,S1200,2757.580687
12,6911443,1104475175563,S1200,3874.573489
18,6911444,1104475175563,S1200,3949.629694


In [None]:
# Join the projected distances back onto vp (same vp_idx and road),
# drop the raw coordinates, and label the projected distance road_meters.
df_with_projection = (
    dd.merge(vp, result, on = ["vp_idx", *road_id_cols], how = "inner")
    .drop(columns = ["x", "y"])
    .rename(columns = {"shape_meters": "road_meters"})
)

df_result = df_with_projection.compute()

In [None]:
# One row per trip + road, with that group's road_meters values
# collected into a list.
df_result2 = (
    df_result
    .groupby(["trip_instance_key", *road_id_cols])
    .agg({"road_meters": list})
    .reset_index()
)


In [None]:
road_segments.head(2)

In [None]:
# Attach projected vp rows to the road segment rows that share the same
# road id columns and primary_direction.
# NOTE(review): the loop in the later cell reads a `road_meters_arr`
# attribute from each row; unless road_segments already carries that
# column, this merge with `df_result` does not create it. Confirm whether
# `df_result2` (the list-aggregated frame) was intended here.
gdf = pd.merge(
    road_segments,
    df_result,
    on = road_id_cols + ["primary_direction"],
    how = "inner"
)

In [None]:
gdf.head(2)

In [None]:
# Scratch loop: for each row, locate where the row's road_meters value
# falls within its road_meters_arr using np.searchsorted.
# NOTE(review): `valid_vp_idx_array` is not defined in any visible cell,
# so the loop raises NameError as written. `this_stop_meters` is assigned
# but never used, and `nearest_value` / `subseq_value` are overwritten on
# every iteration without being stored.
for row in gdf.itertuples():
    
    this_stop_meters = getattr(row, "road_meters")
    valid_shape_meters_array = getattr(row, "road_meters_arr")
    
    # np.searchsorted assumes valid_shape_meters_array is sorted ascending
    # -- TODO confirm the array is monotonic before relying on idx.
    idx = np.searchsorted(
        valid_shape_meters_array,
        getattr(row, "road_meters"),
        side="right" 
        # want our stop_meters value to be < vp_shape_meters,
        # side = "left" would be stop_meters <= vp_shape_meters
    )

    # For the next value, if there's nothing to index into, 
    # just set it to the same position
    # if we set subseq_value = getattr(row, )[idx], 
    # we might not get a consecutive vp
    # NOTE(review): when idx == 0, idx-1 is -1 and indexes the LAST
    # element -- confirm that is acceptable.
    nearest_value = valid_vp_idx_array[idx-1]
    subseq_value = nearest_value + 1

In [None]:
df_result

In [None]:
# Recomputes df_result; identical to the statement in the earlier merge cell.
df_result = df_with_projection.compute()

In [None]:
df_result

In [None]:
# NOTE(review): neither `road_cutpoints_with_vp_sjoin` nor
# `shape_meters_series` is defined in any visible cell; unless they are
# created elsewhere, this fails on a fresh kernel.
road_cutpoints_with_vp_sjoin["shape_meters_arr"] = shape_meters_series

In [None]:
# Display the road id columns and geometry next to the three array columns
# (trip_instance_key is intentionally left out of the view).
road_cutpoints_with_vp_sjoin[["linearid", "mtfcc", "geometry",
                              #"trip_instance_key",
                              "vp_idx_arr",
                             "road_meters_arr",
                             "shape_meters_arr"]]

Once the sjoin is done and we know which trip_instance_keys are linked
to which road segments,
those roads should be expanded from 1 row to many rows.
Go back to the long format for road segments, but filter it down so that only the relevant cutpoints are included.

Then, for vp, the entire array that was attached to the linearid is used as the input for the search. Also leave the vp primary direction in.
Filter that vp array down to just the direction we want, and find the "nearest" vp within that array, which has been projected against the road.

In [None]:
road_cutpoints_with_vp_sjoin.road_meters_arr.iloc[2]

In [None]:
road_cutpoints_with_vp_sjoin.shape_meters_arr.iloc[2]

In [None]:
# modify this from shapely_project_vp
def project_vp_to_road(
    vp: dd.DataFrame, 
    roads: gpd.GeoDataFrame
):
    """
    Project each vp point geometry onto the line geometry it is merged to
    and return the projected distance for every vp_idx.

    Args:
        vp: vp table, converted to a GeoDataFrame with `vp_as_gdf`.
        roads: line geometries; its `geometry` column is renamed to
            `shape_geometry` before merging.

    Returns:
        DataFrame with columns vp_idx and shape_meters.

    NOTE(review): `vp_as_gdf` is not defined in the visible notebook.
    NOTE(review): the merge key is still "shape_array_key" (this was
    adapted from shapely_project_vp); roads elsewhere in this notebook are
    keyed by linearid / mtfcc -- confirm the intended key.
    """
    roads_renamed = roads.rename(columns = {"geometry": "shape_geometry"})

    vp_with_roads = pd.merge(
        vp_as_gdf(vp),
        roads_renamed,
        on = "shape_array_key",
        how = "inner"
    )

    # Distance along each line at which the vp point projects
    projected = vp_with_roads.shape_geometry.project(vp_with_roads.geometry)
    vp_with_roads = vp_with_roads.assign(shape_meters = projected)

    return vp_with_roads[["vp_idx", "shape_meters"]]