In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd

from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, 
                                              PROJECT_CRS
                                             )
from segment_speed_utils import (helpers, wrangle_shapes, 
                                 segment_calcs)
from shared_utils import rt_dates
import test_split

# Analysis date looked up under the "jul2023" key of rt_dates.DATES; it is
# interpolated into the GCS parquet paths read later in this notebook.
analysis_date = rt_dates.DATES["jul2023"]



In [2]:
from importlib import reload

# Fetch the "stop_segments" parameters via CONFIG_PATH; individual entries
# are read out of dict_inputs by key further down.
dict_inputs = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [3]:
# Read every stop_segments setting used by this notebook out of dict_inputs
# in one pass; the names on the left line up with the keys on the right.
(
    USABLE_VP,
    INPUT_FILE,
    SEGMENT_FILE,
    SEGMENT_IDENTIFIER_COLS,
    GROUPING_COL,
    TIMESTAMP_COL,
) = (
    dict_inputs[config_key]
    for config_key in (
        "stage1",
        "stage3",
        "segments_file",
        "segment_identifier_cols",
        "grouping_col",
        "timestamp_col",
    )
)

In [4]:
# Only these columns of the usable vp table are read
usable_vp_columns = ["trip_instance_key", "vp_idx", TIMESTAMP_COL, "x", "y"]
usable_vp_path = f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}"

usable_vp = dd.read_parquet(usable_vp_path, columns = usable_vp_columns)

# Per-trip vp_idx bounds, merged in on trip_instance_key further down
vp_idx_bounds = test_split.get_usable_vp_bounds_by_trip(usable_vp)

In [5]:
from dask import delayed, compute
# Start from the pared down vp, wrapped in `delayed` so the read is deferred
pared_vp_path = f"{SEGMENT_GCS}vp_pare_down/{INPUT_FILE}_all_{analysis_date}"
pared_vp_columns = SEGMENT_IDENTIFIER_COLS + ["trip_instance_key", "vp_idx"]

df = delayed(pd.read_parquet)(pared_vp_path, columns = pared_vp_columns)

In [6]:
# Queue test_split.get_prior_position_on_segment on the deferred frame
prior_position_task = delayed(test_split.get_prior_position_on_segment)
df2 = prior_position_task(df, SEGMENT_IDENTIFIER_COLS, TIMESTAMP_COL)

In [7]:
def _pick_prior_vp_idx(row):
    """
    Choose prior_vp_idx for a single row.

    When the recorded prior_vp_idx is below min_vp_idx and vp_idx + 1 does
    not exceed max_vp_idx, use vp_idx + 1 instead; otherwise keep the
    recorded prior_vp_idx.
    """
    if (row.prior_vp_idx < row.min_vp_idx) and (
        row.vp_idx + 1 <= row.max_vp_idx
    ):
        return row.vp_idx + 1
    return row.prior_vp_idx


# Bring in min_vp_idx / max_vp_idx for each trip_instance_key
vp_with_bounds = delayed(dd.merge)(
    df2,
    vp_idx_bounds,
    on = "trip_instance_key",
    how = "inner"
)

adjusted_prior_vp_idx = vp_with_bounds.apply(_pick_prior_vp_idx, axis=1)

# The bounds and trip key were only needed for the adjustment above
df3 = vp_with_bounds.assign(
    prior_vp_idx = adjusted_prior_vp_idx
).drop(columns = ["trip_instance_key", "min_vp_idx", "max_vp_idx"])
    

In [8]:
# compute() returns a tuple; unpack its single element
(df3,) = compute(df3)

In [10]:
def attach_vp_timestamp_location(
    df: pd.DataFrame,
    usable_vp: dd.DataFrame,
    timestamp_col: str
) -> gpd.GeoDataFrame:
    """
    Attach the timestamp and point geometry of both the current and the
    prior vehicle position (vp) to every row of `df`.

    Points are built from the `x`, `y` columns of `usable_vp` with
    crs=WGS84 and reprojected to PROJECT_CRS. Both names are read from
    the notebook's global namespace, so they must be defined before
    this function is called.

    Args:
        df: rows holding at least `vp_idx` and `prior_vp_idx`.
        usable_vp: vp table holding `vp_idx`, `x`, `y` and
            `trip_instance_key`; `timestamp_col` is expected too.
            A lazy dask DataFrame is filtered first and then computed;
            an in-memory pandas DataFrame is used as is.
        timestamp_col: name of the timestamp column in `usable_vp`.

    Returns:
        The rows of `df` inner-merged on `vp_idx` (adding
        `timestamp_col` and `vp_geometry`) and again on `prior_vp_idx`
        (adding `prior_{timestamp_col}` and `prior_vp_geometry`).
        Rows without a match in `usable_vp` for either index are
        dropped by the inner merges.
    """
    # Only vp referenced as a current or a prior position are needed
    vp_to_keep = np.union1d(df.vp_idx, df.prior_vp_idx).tolist()
    usable_vp2 = usable_vp[usable_vp.vp_idx.isin(vp_to_keep)]

    # geopandas needs in-memory data: materialize if still a dask frame
    if isinstance(usable_vp2, dd.DataFrame):
        usable_vp2 = usable_vp2.compute()

    # Turn the vp we kept into a gdf of points
    usable_gdf = gpd.GeoDataFrame(
        usable_vp2,
        geometry = gpd.points_from_xy(usable_vp2.x, usable_vp2.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"])

    # Merge in the timestamp and point of the current vp
    df_with_xy = pd.merge(
        usable_gdf,
        df,
        on = "vp_idx",
        how = "inner"
    ).rename(columns = {"geometry": "vp_geometry"})

    # Same lookup table, relabeled so it describes the prior vp.
    # trip_instance_key is dropped because df_with_xy already carries it.
    usable_gdf2 = usable_gdf.rename(
        columns = {
            "vp_idx": "prior_vp_idx",
            timestamp_col: f"prior_{timestamp_col}",
            "geometry": "prior_vp_geometry"
        }
    ).drop(columns = "trip_instance_key")

    # Merge again to get the timestamp and point of the prior vp.
    # Both frames are in memory here, so use pd.merge like the merge above.
    df_with_prior_xy = pd.merge(
        df_with_xy,
        usable_gdf2,
        on = "prior_vp_idx",
        how = "inner"
    )

    return df_with_prior_xy

In [11]:
from shared_utils.geography_utils import WGS84

# Queue the timestamp / geometry attachment, then persist the delayed result
attach_location_task = delayed(attach_vp_timestamp_location)
gdf = attach_location_task(df3, usable_vp, TIMESTAMP_COL).persist()

In [12]:
# Rows where n_vp_seg is 2
has_two_vp = gdf.n_vp_seg == 2
part2 = gdf[has_two_vp]

# Largest vp_idx for each trip_instance_key + segment combination
trip_segment_cols = ["trip_instance_key"] + SEGMENT_IDENTIFIER_COLS
part2_keep = part2.groupby(
    trip_segment_cols, observed=True, group_keys=False
)["vp_idx"].max().reset_index()

In [13]:
# Inner merge keeps only the part2 rows whose keys also appear in part2_keep
pare_down_keys = ["trip_instance_key", "vp_idx"] + SEGMENT_IDENTIFIER_COLS
part2_pared = delayed(dd.merge)(
    part2, part2_keep, on = pare_down_keys, how = "inner"
)


In [14]:
# Queue test_split.merge_in_segments for the n_vp_seg == 2 rows
merge_segments_task = delayed(test_split.merge_in_segments)
part2_gdf = merge_segments_task(
    part2_pared, SEGMENT_IDENTIFIER_COLS, GROUPING_COL, n_vp_seg_value=2
)

In [15]:
# compute() returns a tuple; unpack its single element
(p2_gdf,) = compute(part2_gdf)

In [16]:
# Split the in-memory frame into 100 dask-geopandas partitions so the
# projection steps that follow can be applied partition by partition.
gddf = dg.from_geopandas(p2_gdf, npartitions=100)

In [17]:
def _project_onto_segment(point_col: str, series_name: str):
    """
    Lazily run wrangle_shapes.project_point_geom_onto_linestring on each
    partition of gddf, passing the "geometry" column and `point_col`.
    The resulting float series is named `series_name`.
    """
    return gddf.map_partitions(
        wrangle_shapes.project_point_geom_onto_linestring,
        "geometry",
        point_col,
        meta = (series_name, "float")
    )


# Same projection for the current vp and for the prior vp
shape_meters_series = _project_onto_segment("vp_geometry", "shape_meters")
prior_shape_meters_series = _project_onto_segment(
    "prior_vp_geometry", "prior_shape_meters")

In [18]:
# Absolute gap between the two projected positions
projected_gap = (shape_meters_series - prior_shape_meters_series).abs()
gddf["difference_shape_meters"] = projected_gap

# Distance between the current and prior vp points themselves
gddf["straight_distance"] = gddf["vp_geometry"].distance(
    gddf["prior_vp_geometry"])

In [19]:
scaling_factor = 1.75
min_meters_elapsed = 100

# difference_shape_meters is not used when it is at or below
# min_meters_elapsed, or when it is at least scaling_factor times
# straight_distance; in both cases fall back to straight_distance.
# TODO (original note): difference_shape_meters == 0 is not the only wrong
# case; separate these rows out and try again with the full shape.
#
# Vectorized boolean mask instead of a row-wise apply. Comparisons against
# NaN are False in both forms, so such rows still get
# difference_shape_meters.
use_straight_distance = (
    (gddf.difference_shape_meters <= min_meters_elapsed) |
    (gddf.difference_shape_meters >= gddf.straight_distance * scaling_factor)
)

gddf = gddf.assign(
    meters_elapsed = gddf.straight_distance.where(
        use_straight_distance, gddf.difference_shape_meters
    ),
)

# Adds the `{col}_sec` columns used just below
gddf = segment_calcs.convert_timestamp_to_seconds(
    gddf, [TIMESTAMP_COL, f"prior_{TIMESTAMP_COL}"])

# Absolute value, so the order of the current and prior vp does not matter
gddf = gddf.assign(
    sec_elapsed = (gddf[f"{TIMESTAMP_COL}_sec"] - 
                      gddf[f"prior_{TIMESTAMP_COL}_sec"]).abs()
)

In [20]:
# The three geometry columns are dropped before computing
geometry_cols = ["prior_vp_geometry", "vp_geometry", "geometry"]
ddf = gddf.drop(columns = geometry_cols)

In [21]:
# Collapse to 2 partitions ahead of the compute in the next step
ddf = ddf.repartition(npartitions=2)

In [22]:
# Materialize the lazy frame. The repeated
# `return lib.line_locate_point(line, other)` lines are this cell's saved output.
results = ddf.compute()

  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(l

In [23]:
from shared_utils.rt_utils import MPH_PER_MPS

# meters_elapsed / sec_elapsed, scaled by MPH_PER_MPS
results = results.assign(
    speed_mph = lambda frame: (
        frame.meters_elapsed / frame.sec_elapsed * MPH_PER_MPS
    )
)

In [24]:
# (rows, columns) of the results slower than 5 mph
results.loc[results.speed_mph < 5].shape

(17751, 15)

In [25]:
# Results at or above 5 mph
at_least_5_mph = results.speed_mph >= 5
ok_results = results[at_least_5_mph]

In [26]:
# Results with 0 <= speed_mph < 5
is_non_negative = results.speed_mph >= 0
is_below_5_mph = results.speed_mph < 5
bad_results = results[is_non_negative & is_below_5_mph]

In [None]:
# NOTE(review): the saved output for this cell is a three-element tuple, so it
# was produced by an earlier version of the cell; re-run to refresh it.
len(ok_results), len(bad_results)

(39221, 17751, 0)

In [29]:
# Show the low-speed rows for inspection (the saved output is a truncated view)
bad_results

Unnamed: 0,trip_instance_key,vp_idx,location_timestamp_local,shape_array_key,stop_sequence,n_vp_seg,prior_vp_idx,prior_location_timestamp_local,difference_shape_meters,straight_distance,meters_elapsed,location_timestamp_local_sec,prior_location_timestamp_local_sec,sec_elapsed,speed_mph
0,d15b0bea563cd87d86c86aebe5092eec,103533,2023-07-12 07:02:19,bbcffbd3d0f15bb6aa401323d9b4cc16,21,2,103414,2023-07-12 06:09:18,130.776168,149.466617,130.776168,25339,22158,3181,0.091967
1,3be657dfc73c03e07bf64c8d0d1ba5b9,103662,2023-07-12 07:47:59,bbcffbd3d0f15bb6aa401323d9b4cc16,21,2,103587,2023-07-12 07:22:58,30.767878,35.712476,35.712476,28079,26578,1501,0.053224
2,96b1cdcb34a5e140783ecdd704b941a8,103820,2023-07-12 08:37:49,bbcffbd3d0f15bb6aa401323d9b4cc16,21,2,103735,2023-07-12 08:09:32,47.987494,59.847917,59.847917,31069,29372,1697,0.078892
3,2de5b5e96e174df9876577aa32574a37,103951,2023-07-12 09:21:27,bbcffbd3d0f15bb6aa401323d9b4cc16,21,2,103867,2023-07-12 08:53:28,66.836692,82.544468,82.544468,33687,32008,1679,0.109977
4,7272eba6ea9e1a31c821e8baab6ed1f4,104085,2023-07-12 10:06:12,bbcffbd3d0f15bb6aa401323d9b4cc16,21,2,104002,2023-07-12 09:38:34,42.590357,67.569399,67.569399,36372,34714,1658,0.091166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56967,4230577777b1a6eefac68b504efd4107,14525378,2023-07-12 07:44:24,5c07969e700a525860a02fcee6389fbf,2,2,14525377,2023-07-12 07:43:24,0.000000,8.874775,8.874775,27864,27804,60,0.330881
56968,81c7590efa1520143b181577898b3572,14525418,2023-07-12 07:50:51,5c07969e700a525860a02fcee6389fbf,2,2,14525416,2023-07-12 07:48:51,0.769686,9.364080,9.364080,28251,28131,120,0.174562
56969,1189d5633f78463a0ddac2448f7a30b0,14525455,2023-07-12 08:06:39,5c07969e700a525860a02fcee6389fbf,2,2,14525454,2023-07-12 08:06:17,4.003774,14.340286,14.340286,29199,29177,22,1.458146
56970,68785ab8159f0daa7545085b46d5b6d2,14525498,2023-07-12 08:34:15,5c07969e700a525860a02fcee6389fbf,2,2,14525495,2023-07-12 08:31:58,0.000000,10.401350,10.401350,30855,30718,137,0.169838


In [42]:
# Rows where n_vp_seg is 1: materialize them and save a local copy
has_one_vp = gdf.n_vp_seg == 1
part1 = gdf[has_one_vp]

(p1,) = compute(part1)

p1.to_parquet("one_vp_in_seg.parquet")

In [34]:
# Distinct shape_array_key values among the low-speed rows
bad_shape_keys = bad_results["shape_array_key"].unique()
bad_shapes = bad_shape_keys.tolist()

In [40]:
# Save both subsets as local parquet files
for result_frame, parquet_name in (
    (ok_results, "ok_results.parquet"),
    (bad_results, "bad_results.parquet"),
):
    result_frame.to_parquet(parquet_name)

For speeds that are unusually low, it is now confirmed that the problem is not limited to shapes flagged `loop_or_inlining==1`. It also happens on `loop_or_inlining==0` shapes, so it was the right call to move the sjoin postprocessing to all shapes.

In [36]:
# Read only the segments whose GROUPING_COL value is in bad_shapes
segments_path = f"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet"
bad_shape_filter = [[(GROUPING_COL, "in", bad_shapes)]]
segment_columns = SEGMENT_IDENTIFIER_COLS + ["loop_or_inlining", "geometry"]

segments = gpd.read_parquet(
    segments_path,
    filters = bad_shape_filter,
    columns = segment_columns
)

In [37]:
# Count segments per loop_or_inlining value
segments["loop_or_inlining"].value_counts()

0    5040
1    1074
Name: loop_or_inlining, dtype: int64