In [1]:
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd

from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, PROJECT_CRS)

from segment_speed_utils.project_vars import analysis_date_list
analysis_date = analysis_date_list[0]

road_id_cols = ["linearid", "mtfcc"]
segment_identifier_cols = road_id_cols + ["segment_sequence"]

test_trip = "00139041e36b607c7e10cb7ec023e837"

In [2]:
def merge_vp_to_crosswalk(
    analysis_date: str, 
    filters: tuple
):
    # vp to road segment crosswalk
    df = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_wide_{analysis_date}.parquet",
        filters = filters
    )
    
    # only keep the road segments that have at least 2 vp
    df = df.assign(
        n_vp = df.apply(lambda x: len(x.vp_idx_arr), axis=1)
    ).query('n_vp > 1').drop(columns = "n_vp").reset_index(drop=True)
    
    df_long = df.explode(
        "vp_idx_arr", 
        ignore_index=True
    ).rename(
        columns = {"vp_idx_arr": "vp_idx"}
    ).astype({"vp_idx": "int64"})
        
    # Turn series of arrays into 1d array
    #subset_vp = np.concatenate(np.asarray(df.vp_idx_arr))
    subset_vp = df_long.vp_idx.tolist()
    
    # Pull the vp info for ones that join to road segments
    vp_usable = dd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[("vp_idx", "in", subset_vp)]],
        columns = ["vp_idx", "x", "y"],
    )
    
    vp_with_roads = dd.merge(
        vp_usable,
        df_long,
        on = "vp_idx",
        how = "inner"
    )
    
    return vp_with_roads

In [3]:
vp = merge_vp_to_crosswalk(
    analysis_date,
    filters = [[("trip_instance_key", "==", test_trip)]]
)

In [4]:
subset_roads = vp.linearid.unique().compute().tolist()

In [5]:
def expand_relevant_road_segments(
    analysis_date: str,
    segment_identifier_cols: list = ["linearid", "mtfcc",
                                     "segment_sequence"],
    filtering = None
):
    sjoin_results = pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_{analysis_date}",
        columns = segment_identifier_cols
    ).drop_duplicates()
    
    
    full_road_info = gpd.read_parquet(
        f"{SEGMENT_GCS}segments_staging/"
        f"roads_with_cutpoints_long_{analysis_date}.parquet",
        filters = filtering
    )
    
    road_segments = gpd.read_parquet(
        f"{SEGMENT_GCS}road_segments_{analysis_date}",
        filters = [[("mtfcc", "in", ["S1100", "S1200"])]],
        columns = segment_identifier_cols + [
            "primary_direction", "destination"],
    ).merge(
        sjoin_results,
        on = segment_identifier_cols,
        how = "inner"
    ).merge(
        full_road_info,
        on = segment_identifier_cols,
        how = "inner"
    )
    
    return road_segments

In [6]:
road_segments = expand_relevant_road_segments(
    analysis_date,
    segment_identifier_cols = segment_identifier_cols,
    filtering = [[("linearid", "in", subset_roads)]],
)

In [7]:
vp = vp.repartition(npartitions=3)

In [10]:
road_dtypes = vp[road_id_cols].dtypes.to_dict()

vp_projected = vp.map_partitions(
    wrangle_shapes.project_vp_onto_segment_geometry,
    road_segments,
    grouping_cols = road_id_cols,
    meta = {
        "vp_idx": "int64",
        **road_dtypes,
        "shape_meters": "float"},
    align_dataframes = False
).compute()

In [11]:
vp_projected.head(2)

Unnamed: 0,vp_idx,linearid,mtfcc,shape_meters
0,6911436,1104475175563,S1200,3434.340438
6,6911437,1104475175563,S1200,3586.273915


In [12]:
# Merge vp with road segment info 
# with projected shape meters against the full road 
df_with_projection = dd.merge(
    vp,
    vp_projected,
    on = ["vp_idx"] + road_id_cols,
    how = "inner"
).drop(columns = ["x", "y"]).compute()

In [23]:
df_with_projection.columns

Index(['vp_idx', 'trip_instance_key', 'linearid', 'mtfcc', 'primary_direction',
       'shape_meters'],
      dtype='object')

In [28]:
df_with_projection_wide = (df_with_projection
                           .groupby(["trip_instance_key"] + road_id_cols)
                           .agg({
                               "vp_idx": lambda x: list(x),
                               "shape_meters": lambda x: list(x)})
                           .reset_index()
                           .rename(columns = {
                               "vp_idx": "vp_idx_arr",
                               "shape_meters": "shape_meters_arr"
                           })
                  )

In [29]:
# Now merge road segments with each destination acting as the road's stop
# and merge on arrays of projected vp against that road
gdf = pd.merge(
    road_segments,
    df_with_projection_wide,
    on = road_id_cols,
    how = "inner"
)

In [30]:
gdf.head(2)

Unnamed: 0,linearid,mtfcc,segment_sequence,primary_direction,destination,geometry,fullname,road_meters,trip_instance_key,vp_idx_arr,shape_meters_arr
0,1104475157864,S1200,0,Southbound,POINT (-200250.105 -24388.777),"LINESTRING (-200239.867 -23392.923, -200236.87...",Webster St,1000.0,00139041e36b607c7e10cb7ec023e837,"[6911442, 6911447, 6911448, 6911449]","[2341.209260550117, 2757.5806868317354, 2757.5..."
1,1104475157864,S1200,1,Northbound,POINT (-200236.502 -24271.496),"LINESTRING (-200239.867 -23392.923, -200236.87...",Webster St,2000.0,00139041e36b607c7e10cb7ec023e837,"[6911442, 6911447, 6911448, 6911449]","[2341.209260550117, 2757.5806868317354, 2757.5..."


In [31]:
nearest_vp_idx = []
subseq_vp_idx = []

for row in gdf.itertuples():
    
    this_stop_meters = getattr(row, "road_meters")
    valid_shape_meters_array = getattr(row, "shape_meters_arr")
    valid_vp_idx_array = np.asarray(getattr(row, "vp_idx_arr"))

    idx = np.searchsorted(
        valid_shape_meters_array,
        this_stop_meters,
        side="right" 
        # want our stop_meters value to be < vp_shape_meters,
        # side = "left" would be stop_meters <= vp_shape_meters
    )

    # For the next value, if there's nothing to index into, 
    # just set it to the same position
    # if we set subseq_value = getattr(row, )[idx], 
    # we might not get a consecutive vp
    nearest_value = valid_vp_idx_array[idx-1]
    subseq_value = nearest_value + 1

    nearest_vp_idx.append(nearest_value)
    subseq_vp_idx.append(subseq_value)

In [32]:
gdf.columns

Index(['linearid', 'mtfcc', 'segment_sequence', 'primary_direction',
       'destination', 'geometry', 'fullname', 'road_meters',
       'trip_instance_key', 'vp_idx_arr', 'shape_meters_arr'],
      dtype='object')

In [34]:
result = gdf[segment_identifier_cols + [
    "primary_direction", "fullname", "road_meters", 
    "trip_instance_key"]]

# Now assign the nearest vp for each trip that's nearest to
# a given stop
# Need to find the one after the stop later
result = result.assign(
    nearest_vp_idx = nearest_vp_idx,
    subseq_vp_idx = subseq_vp_idx,
)

In [36]:
result.head()

Unnamed: 0,linearid,mtfcc,segment_sequence,primary_direction,fullname,road_meters,trip_instance_key,nearest_vp_idx,subseq_vp_idx
0,1104475157864,S1200,0,Southbound,Webster St,1000.0,00139041e36b607c7e10cb7ec023e837,6911449,6911450
1,1104475157864,S1200,1,Northbound,Webster St,2000.0,00139041e36b607c7e10cb7ec023e837,6911449,6911450
2,1104475157864,S1200,2,Northbound,Webster St,3000.0,00139041e36b607c7e10cb7ec023e837,6911449,6911450
3,1104475157864,S1200,3,Northbound,Webster St,3303.724958,00139041e36b607c7e10cb7ec023e837,6911449,6911450
4,1104475175526,S1200,0,Northbound,Posey Tube,1000.0,00139041e36b607c7e10cb7ec023e837,6911465,6911466


once the sjoin is done and we know which trip_instance_keys are linked
to what road segments,
those roads should be expanded from 1 row to many rows.
go back to long for road segments, but it should be filtered down so that only the relevant cutpoints are included.

then, for vp, that entire array that attached onto the linearid, is put in as an input for search. also leave vp primary direction in.
filter that vp array down for just the direction we want, and find the "nearest" vp as that array has been projected against the road.