In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import datetime
import geopandas as gpd
import pandas as pd

from shared_utils import dask_utils, utils
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, analysis_date)


import os
# NOTE(review): geopandas (and dask_geopandas) are already imported earlier in
# this cell, so setting USE_PYGEOS here probably comes too late to take effect;
# the warning shown in this cell's output is consistent with that.
# Consider moving these lines above the first geopandas import -- TODO confirm.
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
# Parameters for the "stop_segments" entry of the config at CONFIG_PATH.
# The next cell indexes this object by string key.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [3]:
# Pull the stop-segment settings used in this notebook into named constants.
dict_inputs = STOP_SEG_DICT

# File name pieces (presumably stage2 is the input read below and stage3 the
# export name -- verify against the config).
INPUT_FILE_PREFIX = dict_inputs["stage2"]
EXPORT_FILE = dict_inputs["stage3"]
SEGMENTS_FILE = dict_inputs["segments_file"]

# Column names.
SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
GROUPING_COL = dict_inputs["grouping_col"]
TIMESTAMP_COL = dict_inputs["timestamp_col"]


In [4]:
# Unique shape_array_key values among rows flagged loop_or_inlining == 1
# in the projected stops table for this analysis date.
shape_cases = pd.read_parquet(
    f"{SEGMENT_GCS}stops_projected_{analysis_date}/",
    columns = ["shape_array_key"],
    filters = [[("loop_or_inlining", "==", 1)]],
)["shape_array_key"].unique().tolist()
    

In [5]:
# Restrict the exploration to the first two flagged shapes.
test_shapes = shape_cases[:2]

In [6]:
# https://docs.dask.org/en/stable/delayed-collections.html
# Vehicle positions already spatially joined to segments, limited to the
# test shapes. Requested as a partitioned (presumably dask) dataframe.
vp_joined_to_segments = helpers.import_vehicle_positions(
    f"{SEGMENT_GCS}vp_sjoin/",
    f"{INPUT_FILE_PREFIX}_{analysis_date}",
    file_type = "df",
    filters = [[("shape_array_key", "in", test_shapes)]],
    partitioned=True
)

# Stop segment geometries for the same shapes, loaded unpartitioned with
# only the key columns and geometry.
stop_segments = helpers.import_segments(
    file_name = f"{SEGMENTS_FILE}_{analysis_date}",
    filters = [[("shape_array_key", "in", test_shapes)]],
    columns = ["shape_array_key", "stop_sequence", "geometry"],
    partitioned = False
)

In [7]:
from segment_speed_utils import segment_calcs

# Later cells read a `location_timestamp_local_sec` column, which this call
# presumably adds from `location_timestamp_local`.
# NOTE(review): the column name is hard-coded here although TIMESTAMP_COL is
# read from the config above -- confirm whether they should be the same.
ddf = segment_calcs.convert_timestamp_to_seconds(
    vp_joined_to_segments, "location_timestamp_local")

In [8]:
def calculate_mean_time(
    df: dd.DataFrame, 
    group_cols: list,
) -> dd.DataFrame:
    """
    Split the rows of each group into an earlier and a later half, using
    the group's mean timestamp as the cut point.

    Args:
        df: rows with a `location_timestamp_local_sec` column plus every
            column named in `group_cols`.
        group_cols: columns that define one group.

    Returns:
        `df` with an added int8 `group` column: 0 when the row's timestamp
        is at or before its group's mean timestamp, 1 otherwise.
    """
    time_col = "location_timestamp_local_sec"
    
    mean_time = (df.groupby(group_cols)
                 .agg({time_col: "mean"})
                 .reset_index()
                 .rename(columns = {time_col: "mean_time"})
                )
    
    df2 = dd.merge(
        df,
        mean_time,
        on = group_cols,
    )
    
    # Vectorized comparison instead of a row-wise apply: faster, and the
    # computed dtype really is int8 (a Python lambda returning 0/1 yields
    # int64 values regardless of the declared meta).
    # Written as `not (time <= mean)` rather than `time > mean` so that a
    # missing timestamp still lands in group 1.
    is_after_mean = ~(df2[time_col] <= df2["mean_time"])
    
    df2 = df2.assign(
        group = is_after_mean.astype("int8")
    ).drop(columns = "mean_time")
    
    return df2
    
# Grouping keys passed to the helper functions below.
group_cols = ["shape_array_key", "stop_sequence", "trip_id"]

In [9]:
# Tag every vehicle position with group 0 / 1 relative to its group's mean time.
df2 = calculate_mean_time(ddf, group_cols)

In [10]:
def get_first_last_position_in_group(
    df: dd.DataFrame, 
    group_cols: list
) -> dd.DataFrame:
    """
    Keep only the earliest and the latest vehicle position within each
    `group_cols` + "group" combination, numbered in time order.

    Combinations with a single distinct timestamp are dropped, because a
    direction vector cannot be calculated from one point.

    Args:
        df: rows with `location_timestamp_local_sec`, `lat`, `lon`,
            `group` and every column named in `group_cols`.
        group_cols: columns that, together with `group`, define one group.

    Returns:
        Rows sorted by group and timestamp, with the grouping columns,
        the timestamp, `lat`, `lon` and an int8 `obs` column that counts
        rows within the group starting at 1.
    """
    time_col = "location_timestamp_local_sec"
    trip_group_cols = group_cols + ["group"]
    keep_cols = trip_group_cols + [time_col, "lat", "lon"]
    
    grouped = df.groupby(trip_group_cols)
    first = grouped.agg({time_col: "min"}).reset_index()
    last = grouped.agg({time_col: "max"}).reset_index()
    
    # dd.concat is the public entry point; dd.multi is an internal module.
    # A group whose min and max coincide collapses to one row here.
    endpoints = (dd.concat([first, last], axis=0)
                 [trip_group_cols + [time_col]]
                 .drop_duplicates()
                 .reset_index(drop=True)
                )
    
    # Groups that still have 2 rows, i.e. two distinct timestamps.
    # size() names its result after time_col, so drop that count column.
    at_least_2_obs = (endpoints
                      .groupby(trip_group_cols)
                      [time_col].size()
                      .loc[lambda x: x > 1]
                      .reset_index()
                      .drop(columns = time_col)
                     )
    
    endpoints_to_keep = dd.merge(
        endpoints,
        at_least_2_obs,
        on = trip_group_cols
    )
    
    # Recover lat/lon for the endpoint timestamps.
    # NOTE(review): if several rows share an endpoint timestamp within a
    # group, all of them are kept and obs can exceed 2 -- confirm whether
    # that can occur in the input.
    df2 = dd.merge(
        df[keep_cols],
        endpoints_to_keep,
        on = trip_group_cols + [time_col]
    )
    
    # Sort first, then number the rows on the sorted frame itself.
    # Numbering a sorted copy and assigning it back to the unsorted frame
    # depends on index alignment, which is not reliable across partitions.
    df2 = (df2.sort_values(trip_group_cols + [time_col])
           .reset_index(drop=True)
          )
    
    df2 = df2.assign(
        obs = (df2.groupby(trip_group_cols)[time_col]
               .cumcount() + 1
              ).astype("int8")
    )
    
    return df2

In [11]:
# First / last vehicle position per trip, stop segment and group.
df3 = get_first_last_position_in_group(df2, group_cols)

In [12]:
def get_stop_segments_direction_vector(
    stop_segments: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Add a `stop_segments_vector` column holding the direction of each
    stop segment as a (delta_x, delta_y) tuple, measured from the first
    coordinate of the segment geometry to its last coordinate.

    The vector is returned as-is (it is not scaled to unit length), so
    only the sign of a dot product against it is comparable across
    segments.

    Args:
        stop_segments: one row per stop segment; each geometry must
            expose `.coords` (e.g. a LineString).

    Returns:
        A copy of `stop_segments` with the added `stop_segments_vector`
        column.
    """
    def _first_to_last_delta(geom) -> tuple:
        # Only the two endpoints matter; interior vertices are ignored.
        coords = list(geom.coords)
        start, end = coords[0], coords[-1]
        return (end[0] - start[0], end[1] - start[1])
    
    shape_vec = [_first_to_last_delta(geom) 
                 for geom in stop_segments.geometry]
    
    stop_segments = stop_segments.assign(
        stop_segments_vector = shape_vec
    )
    
    return stop_segments

In [16]:
# Keep the direction vector and drop the geometry column before merging
# with the vehicle positions.
stop_segments2 = get_stop_segments_direction_vector(
    stop_segments).drop(columns = "geometry")

In [17]:
# Attach each stop segment's direction vector to the first / last vehicle
# positions observed in that segment.
df4 = dd.merge(
    stop_segments2,
    df3,
    on = ["shape_array_key", "stop_sequence"]
)

In [24]:
# Check which columns are available after the merge.
df4.columns

Index(['shape_array_key', 'stop_sequence', 'stop_segments_vector', 'trip_id',
       'group', 'location_timestamp_local_sec', 'lat', 'lon', 'obs'],
      dtype='object')

In [32]:
def get_vp_in_segment_array(df: dd.DataFrame, group_cols: list):
    """
    For every `group_cols` + "group" combination, build the unit vector
    that points from the first vehicle position (obs == 1) to the last
    one (obs == 2).

    Args:
        df: rows with `obs`, `lat`, `lon`, `group`, `stop_segments_vector`
            and every column named in `group_cols`.
        group_cols: columns that, together with `group` and
            `stop_segments_vector`, identify one group.

    Returns:
        An in-memory pandas DataFrame with the grouping columns and a
        `vp_vector` column of (x, y) tuples.
    """
    trip_group_cols = group_cols + ["group", "stop_segments_vector"]
    keep_cols = trip_group_cols + ["lat", "lon"]
    first_position = df[df.obs == 1][keep_cols]
    last_position = df[df.obs == 2][keep_cols]
    
    # Set this up to be wide so we can compare positions and 
    # get a vector
    df_wide = dd.merge(
        first_position,
        last_position,
        on = trip_group_cols,
        suffixes = ('_start', '_end')
    ).sort_values(trip_group_cols).reset_index(drop=True)
    
    # Materialize once, so the grouping columns and the vectors are
    # guaranteed to come from the same rows in the same order.
    wide = df_wide.compute()
    
    # NOTE(review): lon / lat are differenced as-is, with no reprojection.
    # If they are degrees rather than the projected units used for
    # stop_segments_vector, the two vectors are not in the same coordinate
    # system -- confirm.
    vector_normalized = [
        get_normalized_vector((x_end - x_start, y_end - y_start))
        for x_start, y_start, x_end, y_end in zip(
            wide["lon_start"].tolist(), wide["lat_start"].tolist(),
            wide["lon_end"].tolist(), wide["lat_end"].tolist()
        )
    ]
    
    results = wide[trip_group_cols].assign(
        vp_vector = vector_normalized
    )
    
    return results

In [33]:
# One row per trip / stop segment / group with the vehicle's direction vector.
# NOTE(review): get_vp_in_segment_array relies on vector helpers that are
# defined in a cell further down this notebook; on a fresh top-to-bottom run
# that cell has to be executed first.
df5 = get_vp_in_segment_array(df4, group_cols)



In [34]:
# Dot product of the stop segment vector and the vehicle vector, row by row.
# NOTE(review): dot_product is defined in a cell further down this notebook;
# on a fresh top-to-bottom run that cell has to be executed first.
dot_result = [dot_product(vec1, vec2) for vec1, vec2 in 
 zip(df5.stop_segments_vector, df5.vp_vector)]

In [35]:
# Store the dot products alongside the vectors they were computed from.
df5 = df5.assign(
    dot_result = dot_result
)

In [39]:
# A negative dot product means the two vectors point in opposing directions
# (more than 90 degrees apart).
df5[df5.dot_result < 0]

Unnamed: 0,shape_array_key,stop_sequence,trip_id,group,stop_segments_vector,vp_vector,dot_result
6,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t2DA-bC82-sl4,0,"(0.06785342337389011, -0.08433537412201986)","(-0.6267741886036637, 0.7792009474456632)",-0.108243
8,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t320-b837-sl4,0,"(0.06785342337389011, -0.08433537412201986)","(-0.7759583523916229, 0.6307841432325941)",-0.105849
19,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t4B0-bC81-sl4,0,"(0.06785342337389011, -0.08433537412201986)","(-0.7014855506026993, 0.7126836761815356)",-0.107703
23,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t514-b837-sl4,0,"(0.06785342337389011, -0.08433537412201986)","(-0.6218517786671907, 0.7831349598686368)",-0.108241
27,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t578-b835-sl4,0,"(0.06785342337389011, -0.08433537412201986)","(-0.5842625041381195, 0.8115647394128541)",-0.108088
...,...,...,...,...,...,...,...
1590,0255d1f91b47fe9bcedf25d77ddc0fc6,1800,t596-bC81-sl4,0,"(177.5485639910912, -192.36837340403872)","(-0.45787526083212343, 0.8890164483955935)",-252.313743
1595,0255d1f91b47fe9bcedf25d77ddc0fc6,1800,t609-bC82-sl4,1,"(177.5485639910912, -192.36837340403872)","(-0.49521779329177784, 0.8687688629360644)",-255.048861
1596,0255d1f91b47fe9bcedf25d77ddc0fc6,1800,t640-b836-sl4,1,"(177.5485639910912, -192.36837340403872)","(-0.6703916266787903, 0.742007457428135)",-261.765838
1597,0255d1f91b47fe9bcedf25d77ddc0fc6,1800,t64F-b837-sl4,0,"(177.5485639910912, -192.36837340403872)","(-0.6290788807751702, 0.7773414705022879)",-261.227966


In [40]:
# Look at both groups (0 and 1) for one trip on the first stop segment.
df5[(df5.trip_id=="t2DA-bC82-sl4") & (df5.stop_sequence==0)]

Unnamed: 0,shape_array_key,stop_sequence,trip_id,group,stop_segments_vector,vp_vector,dot_result
6,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t2DA-bC82-sl4,0,"(0.06785342337389011, -0.08433537412201986)","(-0.6267741886036637, 0.7792009474456632)",-0.108243
7,0255d1f91b47fe9bcedf25d77ddc0fc6,0,t2DA-bC82-sl4,1,"(0.06785342337389011, -0.08433537412201986)","(0.6117460788633204, -0.7910541921988354)",0.108223


In [14]:
import numpy as np
import pandas as pd
import shapely

def get_direction_vector(
    start: shapely.geometry.Point, 
    end: shapely.geometry.Point
) -> tuple:
    """
    Return the displacement from `start` to `end` as (delta_x, delta_y).

    Both points should be in a projected CRS (not WGS84).

    https://www.varsitytutors.com/precalculus-help/find-a-direction-vector-when-given-two-points
    https://stackoverflow.com/questions/17332759/finding-vectors-with-2-points
    """
    delta_x = end.x - start.x
    delta_y = end.y - start.y

    return (delta_x, delta_y)

def distill_array_into_direction_vector(array: np.ndarray) -> tuple:
    """
    Reduce an array of n points to a single (delta_x, delta_y) vector
    running from its first item to its last item. Everything in between
    is ignored.
    """
    return get_direction_vector(array[0], array[-1])


def get_vector_norm(vector: tuple) -> float:
    """
    Length of a 2D vector by the Pythagorean Theorem: square the x and y
    components, add them, and take the square root.

    This is the value a vector is divided by to get its unit
    (normalized) vector.
    """
    x_component, y_component = vector[0], vector[1]
    sum_of_squares = x_component**2 + y_component**2

    return np.sqrt(sum_of_squares)


def get_normalized_vector(vector: tuple) -> tuple:
    """
    Scale a 2D vector to unit length by dividing each component by the
    vector's length.

    A zero-length vector has no direction; it is returned as (0.0, 0.0)
    instead of dividing by zero, so its dot product with any other
    vector is 0 rather than NaN.

    Args:
        vector: (x, y) components.

    Returns:
        (x, y) components of the unit vector, or (0.0, 0.0) for a
        zero-length input.
    """
    norm = get_vector_norm(vector)

    if norm == 0:
        return (0.0, 0.0)

    return (vector[0] / norm, vector[1] / norm)


def dot_product(vec1: tuple, vec2: tuple) -> float:
    """
    Dot product of two 2D vectors: the product of the x components plus
    the product of the y components.
    """
    x_term = vec1[0] * vec2[0]
    y_term = vec1[1] * vec2[1]

    return x_term + y_term




In [None]:
def find_if_two_arrays_same_direction(
    subset_stop_geom_array: np.ndarray, 
    subset_shape_geom_array: np.ndarray
) -> float:
    """
    Compare the overall direction of two point arrays.

    Each array is reduced to a first-to-last direction vector, both
    vectors are normalized, and their dot product is returned.
    Intermediate values are printed along the way.

    https://stackoverflow.com/questions/21030391/how-to-normalize-a-numpy-array-to-a-unit-vector
    """
    # Direction vectors, stop first and then shape
    stop_vec, shape_vec = [
        distill_array_into_direction_vector(geom_array)
        for geom_array in (subset_stop_geom_array, subset_shape_geom_array)
    ]
    
    print(f"stop vector: {stop_vec}")
    print(f"shape vector: {shape_vec}")
    
    # Unit vectors, in the same order
    stop_norm, shape_norm = [
        get_normalized_vector(direction_vec)
        for direction_vec in (stop_vec, shape_vec)
    ]
    
    print(f"stop_norm: {stop_norm}, shape_norm: {shape_norm}")
    
    dot_result = dot_product(stop_norm, shape_norm)
    
    print(f"dot product: {dot_result}")
    
    return dot_result

In [None]:
# One shape / stop sequence / trip to inspect by hand.
# NOTE(review): the data loaded above is filtered to test_shapes -- confirm
# this shape is among them.
one_shape = "085811097c87489feebe8206770b3cd7"
stop_seq = 600
one_trip = "t14A-b1B59-sl4"

In [None]:
# Interactive map of the loaded stop segments, colored by stop_sequence.
# NOTE(review): the 3600 cutoff is unexplained -- confirm why later stop
# sequences are left off the map.
stop_segments[stop_segments.stop_sequence < 3600].explore("stop_sequence", 
                    tiles="CartoDB Positron", categorical=True)

In [None]:
# Vehicle positions for the single shape / stop sequence / trip chosen above.
# NOTE(review): vp_joined_to_segments_special is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. It may
# be meant to be vp_joined_to_segments -- confirm before running.
df = vp_joined_to_segments_special[
    (vp_joined_to_segments_special.shape_array_key==one_shape) & 
    (vp_joined_to_segments_special.stop_sequence==stop_seq) & 
    (vp_joined_to_segments_special.trip_id==one_trip)
]

In [None]:
# Timestamps for the selected trip, in chronological order.
df[["trip_id", "location_timestamp_local"]].sort_values("location_timestamp_local")