In [1]:
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd

from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, PROJECT_CRS)

from segment_speed_utils.project_vars import analysis_date

# Keys shared by the road-level groupings and merges in this notebook.
road_id_cols = ["linearid", "mtfcc", "primary_direction"]
# Road keys plus the segment's sequence number.
segment_identifier_cols = [*road_id_cols, "segment_sequence"]

# Trip key used to filter the vp data down to a single trip.
test_trip = "00139041e36b607c7e10cb7ec023e837"

In [None]:
# Distinct trip_instance_key values found in the wide vp_sjoin parquet
# for the analysis date.
sjoin_trip_keys_path = (
    f"{SEGMENT_GCS}vp_sjoin/vp_road_segments_wide_{analysis_date}.parquet"
)
df = pd.read_parquet(sjoin_trip_keys_path, columns=["trip_instance_key"])
df = df.drop_duplicates()

In [None]:
# A handful of trip_instance_key values to filter on while testing.
test_trips = [
    "00062c6db9dbef9c80f5ada74b31e257",
    "0009c78b48866a26d664ab00f67d1606",
    "00139041e36b607c7e10cb7ec023e837",
    "ff780650b98209acf69a71a7dab2502c",
    "ff86898d6a8ff5df82912699133bf4b6",
    "ffb44943b394f891d2b2286bb3902305",
]

In [2]:
import nearest_vp_to_road

# Restrict the read to the single test trip.
one_trip_filter = [[("trip_instance_key", "==", test_trip)]]

vp = nearest_vp_to_road.merge_vp_to_crosswalk(
    analysis_date, filters=one_trip_filter
)

In [3]:
# Materialize the distinct linearid values found in vp as a plain list.
subset_roads = vp["linearid"].unique().compute().tolist()

In [4]:
# Only pull segments for the roads that appear in vp.
roads_in_vp_filter = [[("linearid", "in", subset_roads)]]

road_segments = nearest_vp_to_road.expand_relevant_road_segments(
    analysis_date,
    segment_identifier_cols=segment_identifier_cols,
    filters=roads_in_vp_filter,
)

In [6]:
# Re-split vp into 3 partitions before the map_partitions step below.
vp = vp.repartition(npartitions=3)

In [11]:
# Dtypes of the road key columns, reused to describe the output schema.
road_dtypes = vp[road_id_cols].dtypes.to_dict()

# Run the projection helper on every partition of vp against road_segments.
# `meta` declares the columns / dtypes expected back from the helper:
# vp_idx, the road key columns, and shape_meters.
vp_projected = vp.map_partitions(
    wrangle_shapes.project_vp_onto_segment_geometry,
    road_segments,
    grouping_cols = road_id_cols,
    meta = {
        "vp_idx": "int64",
        **road_dtypes,
        "shape_meters": "float"},
    align_dataframes = False
).persist()

# The merge of vp with vp_projected (df_with_projection) is done in the
# next cell. It used to be repeated here verbatim, which triggered the same
# .compute() twice for an identical result.

In [12]:
# Attach the projected shape_meters back onto vp. x / y are dropped
# before the result is computed.
vp_merge_keys = ["vp_idx"] + road_id_cols

df_with_projection = (
    dd.merge(vp, vp_projected, on=vp_merge_keys, how="inner")
    .drop(columns=["x", "y"])
    .compute()
)

In [13]:
# Collapse to one row per trip + road, gathering every vp_idx and its
# projected shape_meters into parallel lists.
trip_road_cols = ["trip_instance_key"] + road_id_cols
array_col_names = {
    "vp_idx": "vp_idx_arr",
    "shape_meters": "shape_meters_arr",
}

grouped_lists = df_with_projection.groupby(trip_road_cols).agg(
    {col: (lambda values: list(values)) for col in array_col_names}
)
df_with_projection_wide = grouped_lists.reset_index().rename(
    columns=array_col_names
)

In [15]:
# Now merge road segments with each destination acting as the road's stop
# and merge on arrays of projected vp against that road
gdf = pd.merge(
    road_segments,
    df_with_projection_wide,
    on = road_id_cols,
    how = "inner"
)

In [29]:
# Row 1: the segment's road_meters next to the projected vp positions.
gdf.road_meters.iloc[1], gdf.shape_meters_arr.iloc[1]

(1000.0000000000005, [0.0, 643.8241590008627, 908.1937923704473])

In [27]:
# Row 3: the segment's road_meters next to the projected vp positions.
gdf.road_meters.iloc[3], gdf.shape_meters_arr.iloc[3]

(475.52993366534616, [55.1403823309705, 319.510015700555])

In [28]:
# Row 0: the segment's road_meters next to the projected vp positions.
# Note that the recorded shape_meters_arr for this row is not ascending.
gdf.road_meters.iloc[0], gdf.shape_meters_arr.iloc[0]

(999.9999999999982,
 [541.1330418199873,
  124.91034836850469,
  124.91034836850469,
  124.91034836850469])

In [31]:
# Row 5: the segment's road_meters next to the projected vp positions.
# Note that the recorded shape_meters_arr for this row is not ascending.
gdf.road_meters.iloc[5], gdf.shape_meters_arr.iloc[5]

(2000.000000000001,
 [434.34043814365566,
  665.6150873104585,
  693.5973532520323,
  874.5734891731864,
  586.2739146177842,
  838.6084712041013,
  949.6296940259685])

[541.1330418199873, 124.91034836850469, 124.91034836850469, 124.91034836850469]

In [24]:
# Display the merged segment + vp-array table.
gdf

Unnamed: 0,linearid,mtfcc,primary_direction,segment_sequence,destination,geometry,road_meters,trip_instance_key,vp_idx_arr,shape_meters_arr
0,1104475157864,S1200,Southbound,0,POINT (-200250.105 -24388.777),"LINESTRING (-200239.867 -23392.923, -200236.87...",1000.0,00139041e36b607c7e10cb7ec023e837,"[6911442, 6911448, 6911449, 6911447]","[541.1330418199873, 124.91034836850469, 124.91..."
1,1104475175526,S1200,Northbound,0,POINT (-199756.672 -22077.028),"MULTILINESTRING ((-200202.160 -22969.494, -200...",1000.0,00139041e36b607c7e10cb7ec023e837,"[6911463, 6911464, 6911465]","[0.0, 643.8241590008627, 908.1937923704473]"
2,1104475175526,S1200,Northbound,1,POINT (-199720.571 -22009.948),"MULTILINESTRING ((-200202.160 -22969.494, -200...",1076.180232,00139041e36b607c7e10cb7ec023e837,"[6911463, 6911464, 6911465]","[0.0, 643.8241590008627, 908.1937923704473]"
3,1104475175553,S1200,Northbound,0,POINT (-199726.016 -22020.604),"LINESTRING (-199955.027 -22436.890, -199897.70...",475.529934,00139041e36b607c7e10cb7ec023e837,"[6911464, 6911465]","[55.1403823309705, 319.510015700555]"
4,1104475175563,S1200,Northbound,3,POINT (-200208.208 -23769.822),"MULTILINESTRING ((-200262.173 -24768.205, -200...",1000.0,00139041e36b607c7e10cb7ec023e837,"[6911436, 6911438, 6911439, 6911443, 6911437, ...","[434.34043814365566, 665.6150873104585, 693.59..."
5,1104475175563,S1200,Northbound,4,POINT (-200114.798 -22792.380),"MULTILINESTRING ((-200262.173 -24768.205, -200...",2000.0,00139041e36b607c7e10cb7ec023e837,"[6911436, 6911438, 6911439, 6911443, 6911437, ...","[434.34043814365566, 665.6150873104585, 693.59..."
6,1104475175563,S1200,Northbound,5,POINT (-199726.016 -22020.604),"MULTILINESTRING ((-200262.173 -24768.205, -200...",2865.337879,00139041e36b607c7e10cb7ec023e837,"[6911436, 6911438, 6911439, 6911443, 6911437, ...","[434.34043814365566, 665.6150873104585, 693.59..."


In [None]:
test_id = "1104475157864"

# Read the segments of one road, restricted to mtfcc S1100 / S1200.
one_road_filter = [[
    ("mtfcc", "in", ["S1100", "S1200"]),
    ("linearid", "==", test_id),
]]

road_segments_all = gpd.read_parquet(
    f"{SEGMENT_GCS}road_segments_{analysis_date}",
    filters=one_road_filter,
    # columns = segment_identifier_cols + [
    #     "primary_direction", "destination"],
)

In [None]:
# Read every road segment with mtfcc S1100 / S1200 (no linearid filter).
mtfcc_filter = [[("mtfcc", "in", ["S1100", "S1200"])]]

road_segments_all = gpd.read_parquet(
    f"{SEGMENT_GCS}road_segments_{analysis_date}",
    filters=mtfcc_filter,
    # columns = segment_identifier_cols + [
    #     "primary_direction", "destination"],
)

In [None]:
# Map the segments whose fullname contains "Webster St",
# colored by primary_direction.
is_webster = road_segments_all.fullname.str.contains("Webster St")
road_segments_all[is_webster].explore(
    "primary_direction", tiles="CartoDB Positron"
)

In [None]:
# Map all loaded segments as a categorical layer on primary_direction;
# the origin / destination columns are dropped before plotting.
road_segments_all.drop(columns = ["origin", "destination"]).explore(
    "primary_direction",
    tiles = "CartoDB Positron", 
    categorical=True
)

In [None]:
# Display the loaded road segments.
road_segments_all

In [None]:
# Map the expanded segments of the test road, colored by segment_sequence.
is_test_road = road_segments.linearid == test_id
road_segments[is_test_road].explore(
    "segment_sequence", tiles="CartoDB Positron"
)

In [None]:
# For every segment row, find the vp whose projected position is the last one
# at or before the segment's cutpoint (road_meters), plus the vp that follows.
nearest_vp_idx = []
subseq_vp_idx = []

for row in gdf.itertuples():

    this_stop_meters = getattr(row, "road_meters")
    valid_shape_meters_array = np.asarray(getattr(row, "shape_meters_arr"))
    valid_vp_idx_array = np.asarray(getattr(row, "vp_idx_arr"))

    # np.searchsorted only gives meaningful answers on ascending input, and
    # the projected positions are not always ascending (see the arrays
    # displayed for rows 0 and 5 above), so search through an argsort
    # permutation instead of the raw array.
    ascending_order = np.argsort(valid_shape_meters_array, kind="stable")

    idx = np.searchsorted(
        valid_shape_meters_array,
        this_stop_meters,
        side="right",
        # want our stop_meters value to be < vp_shape_meters,
        # side = "left" would be stop_meters <= vp_shape_meters
        sorter=ascending_order,
    )

    # idx counts the vp positioned at or before the cutpoint. When it is 0,
    # idx-1 would be -1 and silently pick the *last* vp, so clamp to the
    # first one in ascending order instead.
    nearest_position = ascending_order[max(idx - 1, 0)]
    nearest_value = valid_vp_idx_array[nearest_position]

    # The subsequent vp is taken as the next vp_idx rather than the next
    # array element, because the next array element might not be a
    # consecutive vp.
    subseq_value = nearest_value + 1

    nearest_vp_idx.append(nearest_value)
    subseq_vp_idx.append(subseq_value)

In [None]:
# List the columns available on gdf.
gdf.columns

In [None]:
# segment_identifier_cols already carries primary_direction, so listing it
# again would select that column twice. De-duplicate while keeping order.
wanted_cols = list(dict.fromkeys(
    segment_identifier_cols
    + ["primary_direction", "fullname", "road_meters", "trip_instance_key"]
))

# NOTE(review): the gdf displayed earlier has no "fullname" column, so a
# strict selection would raise KeyError. Keep only the columns gdf actually
# has; confirm whether fullname is supposed to be carried through.
result = gdf[[col for col in wanted_cols if col in gdf.columns]]

# Now assign the nearest vp for each trip that's nearest to
# a given stop
# Need to find the one after the stop later
result = result.assign(
    nearest_vp_idx = nearest_vp_idx,
    subseq_vp_idx = subseq_vp_idx,
)

In [None]:
# Preview the first rows of the nearest / subsequent vp assignment.
result.head()

Once the sjoin is done and we know which trip_instance_keys are linked
to which road segments,
those roads should be expanded from one row to many rows.
Go back to long format for the road segments, but filter it down so that only the relevant cutpoints are included.

Then, for vp, the entire array attached to the linearid is used as the input for the search. Also leave the vp primary direction in.
Filter that vp array down to just the direction we want, and find the "nearest" vp using the array as projected against the road.

In [None]:
# Load the saved nearest_vp_roads parquet for the analysis date.
# Note: this rebinds `df`.
nearest_vp_path = f"{SEGMENT_GCS}nearest_vp_roads_{analysis_date}.parquet"
df = pd.read_parquet(nearest_vp_path)

In [None]:
# Every vp_idx referenced as either a nearest or a subsequent vp.
nearest_ids = df.nearest_vp_idx.unique()
subseq_ids = df.subseq_vp_idx.unique()
subset_vp = np.union1d(nearest_ids, subseq_ids)

In [None]:
# modify this from interpolate_stop_arrivals
def attach_vp_shape_meters_with_timestamp(
    analysis_date: str, **kwargs
) -> pd.DataFrame:
    """
    Combine projected vp positions with their timestamps.

    Reads the vp_projected_roads parquet (which holds shape_meters) and
    the vp_usable dataset (vp_idx and location_timestamp_local only),
    then inner-merges the two on vp_idx.

    Args:
        analysis_date: date string interpolated into both file paths.
        **kwargs: forwarded unchanged to both pd.read_parquet calls
            (e.g. filters).

    Returns:
        The merged frame; a vp_idx missing from either read is dropped
        by the inner merge.
    """
    projected_path = (
        f"{SEGMENT_GCS}projection/vp_projected_roads_{analysis_date}.parquet"
    )
    usable_path = f"{SEGMENT_GCS}vp_usable_{analysis_date}/"

    # shape_meters comes from the projected file
    projected = pd.read_parquet(projected_path, **kwargs)

    # location_timestamp_local comes from vp_usable; it still needs to be
    # converted to seconds downstream
    timestamps = pd.read_parquet(
        usable_path,
        columns=["vp_idx", "location_timestamp_local"],
        **kwargs,
    )

    return projected.merge(timestamps, on="vp_idx", how="inner")

In [None]:
# Only load the vp listed in subset_vp.
subset_vp_filter = [[("vp_idx", "in", subset_vp)]]

vp_info = attach_vp_shape_meters_with_timestamp(
    analysis_date, filters=subset_vp_filter
)

In [None]:
# road_id_cols already includes primary_direction; appending it again put a
# duplicate key in the merge's `on` list. De-duplicate while keeping order.
road_trip_cols = list(dict.fromkeys(
    road_id_cols + ["primary_direction", "trip_instance_key"]
))

# Rename the vp columns so they describe the *nearest* vp, then attach them
# to df on the nearest vp_idx plus the road / trip keys.
nearest_vp_info = vp_info.rename(columns = {
    "vp_idx": "nearest_vp_idx",
    "shape_meters": "nearest_shape_meters",
    "location_timestamp_local": "nearest_location_timestamp_local"
})

vp_with_nearest_info = pd.merge(
    df,
    nearest_vp_info,
    on = ["nearest_vp_idx"] + road_trip_cols,
    how = "inner"
)

In [None]:
# Same attachment for the *subsequent* vp: rename the vp columns with a
# subseq_ prefix and inner-merge on subseq_vp_idx plus the road / trip keys.
subseq_renames = {
    "vp_idx": "subseq_vp_idx",
    "shape_meters": "subseq_shape_meters",
    "location_timestamp_local": "subseq_location_timestamp_local",
}

df2 = vp_with_nearest_info.merge(
    vp_info.rename(columns=subseq_renames),
    on=["subseq_vp_idx"] + road_trip_cols,
    how="inner",
)


In [None]:
# Load the full wide vp_sjoin parquet (the trip filter is left disabled).
sjoin_results_path = (
    f"{SEGMENT_GCS}vp_sjoin/"
    f"vp_road_segments_wide_{analysis_date}.parquet"
)
sjoin_results = pd.read_parquet(
    sjoin_results_path,
    # filters = [[("trip_instance_key", "in", test_trips)]]
)

In [None]:
# Number of distinct trips present in the sjoin results.
sjoin_results.trip_instance_key.nunique()

In [None]:
# Show the sjoin rows for a single road.
one_road_id = "1104475175563"
on_one_road = sjoin_results.linearid == one_road_id
sjoin_results[on_one_road]

In [None]:
# Row / column count after both vp merges.
df2.shape

In [None]:
# Columns available after both vp merges.
df2.columns

In [None]:
def get_stop_arrivals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Interpolate an arrival time at each row's road_meters position.

    For every row, the (shape_meters, location_timestamp_local) pairs of
    the nearest and subsequent vp are the two known points, and np.interp
    estimates the timestamp at road_meters. Positions outside the two
    points get the timestamp of the closer end (np.interp's default).

    Args:
        df: must contain road_meters, nearest_shape_meters,
            subseq_shape_meters, nearest_location_timestamp_local and
            subseq_location_timestamp_local, plus trip_instance_key,
            linearid and mtfcc (used for the final sort).

    Returns:
        df with an added arrival_time column cast to datetime64[s],
        sorted by trip_instance_key, linearid, mtfcc, with a fresh index.
    """
    x_col = "shape_meters"
    y_col = "location_timestamp_local"

    stop_arrival_series = []
    for row in df.itertuples():

        xp = np.asarray([
            getattr(row, f"nearest_{x_col}"),
            getattr(row, f"subseq_{x_col}")
        ])

        # Timestamps are truncated to whole seconds and expressed as floats
        # so they can be interpolated numerically.
        yp = np.asarray([
            getattr(row, f"nearest_{y_col}"),
            getattr(row, f"subseq_{y_col}")
        ]).astype("datetime64[s]").astype("float64")

        # np.interp requires increasing xp and returns meaningless values
        # otherwise. Nothing guarantees nearest_shape_meters is not larger
        # than subseq_shape_meters, so order the two points by position
        # first.
        order = np.argsort(xp, kind="stable")

        stop_position = getattr(row, "road_meters")
        interpolated_arrival = np.interp(stop_position, xp[order], yp[order])
        stop_arrival_series.append(interpolated_arrival)

    df = df.assign(
        arrival_time = stop_arrival_series,
    ).astype(
        {"arrival_time": "datetime64[s]"}
    ).sort_values(
        ["trip_instance_key",
        "linearid", "mtfcc"]
    ).reset_index(drop=True)

    return df

In [None]:
# Interpolate an arrival_time for every row of df2.
df3 = get_stop_arrivals(df2)

In [None]:
# Shape of the rows whose interpolated arrival falls strictly between the
# nearest and subsequent vp timestamps.
arrives_before_subseq = df3.subseq_location_timestamp_local > df3.arrival_time
arrives_after_nearest = df3.nearest_location_timestamp_local < df3.arrival_time
df3[arrives_before_subseq & arrives_after_nearest].shape

In [None]:
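# FIXME: something looks off here. Presumably the row count above should match df3.shape below (every arrival_time between its two vp timestamps) -- confirm and investigate.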

In [None]:
# Total rows / columns in df3, for comparison with the filtered count above.
df3.shape

In [None]:
# Side-by-side view of the interpolation inputs and the resulting arrival_time.
interp_check_cols = [
    "road_meters",
    "nearest_vp_idx",
    "subseq_vp_idx",
    "nearest_shape_meters",
    "subseq_shape_meters",
    "nearest_location_timestamp_local",
    "subseq_location_timestamp_local",
    "arrival_time",
]
df3[interp_check_cols]