# Grab all the loopy/inlining/odd shapes
Select a couple of examples to see what the pattern is for `stop_sequence` and `shape_meters`.

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely
import sys

from segment_speed_utils import (helpers, gtfs_schedule_wrangling, 
                                 wrangle_shapes)
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

# Temp append so we can import functions from scripts/
sys.path.append("./scripts/")
import prep_stop_segments
import cut_stop_segments
import loop_utils

shape_with_utc = "f765b9d12fcca0173b4e3ddbc0374d18"
another_shape = "007cf76062f5957d4e38ea54e624c0ad"

import folium

In [None]:
# Shapes of interest for this analysis date
# (presumably the loopy / inlining shapes this notebook is about -- see loop_utils).
loop_shapes = loop_utils.grab_loop_shapes(analysis_date)

# Stop times aggregated to shape_array_key for just those shapes
# (going by the function name -- TODO confirm the columns returned).
stop_times_with_geom = prep_stop_segments.stop_times_aggregated_to_shape_array_key(
        analysis_date, loop_shapes)

# .compute() materializes the result in memory
# (presumably a lazy dask collection up to this point -- TODO confirm).
st_loops = stop_times_with_geom.compute()

In [None]:
gdf = (loop_utils.assign_visits_to_stop(st_loops)
       .sort_values(["shape_array_key", "stop_sequence"])
       .reset_index(drop=True)
      )

In [None]:
gdf2 = gdf[gdf.shape_array_key.isin([another_shape])]

In [None]:
# Grab relevant subset based on stop sequence values to get stop geometry subset
#https://stackoverflow.com/questions/5508352/indexing-numpy-array-with-another-numpy-array
#subset_stop_geom = stop_geom_array[subset_seq]

In [None]:
import loopy

shape_geometry = gdf2.geometry.iloc[0]
stop_geometry_array = np.array(gdf2.stop_geometry)
stop_sequence_array = np.array(gdf2.stop_sequence)

current_stop_seq = 17

In [None]:
# Set up: project each of the coordinates in the shape_geometry
# to be shape_meters
shape_dist_array = loopy.project_list_of_coords(
    shape_geometry, [], use_shapely_coords = True)

shape_dist_ordered = np.unique(shape_dist_array)

In [None]:
# (1) Given a stop sequence value, find the stop_sequence values 
# just flanking it (prior and subsequent).
# this is important especially because stop_sequence does not have 
# to be increasing in increments of 1, but it has to be monotonically increasing
subset_seq = loopy.include_prior_and_subsequent(
    stop_sequence_array, current_stop_seq)
print(f"subset of stop sequences: {subset_seq}")

# (2) Grab relevant subset based on stop sequence values to get stop geometry subset
# https://stackoverflow.com/questions/5508352/indexing-numpy-array-with-another-numpy-array
subset_stop_geom = stop_geometry_array[subset_seq]
print(f"subset of stop geometry: {subset_stop_geom}")

In [None]:
# (3) Project this vector of 3 stops
# because we need to know which part to subset
# off of the shape's shape_meters array
subset_stop_proj = loopy.project_list_of_coords(
    shape_geometry, subset_stop_geom)
    
print(f"subset stops projected: {subset_stop_proj}")
    
# (4) Get the subset of shape_path points that
# spans a start distance position and end distance position 
# We have 2 stops, and let's grab the chunk of the shape that spans that
# https://stackoverflow.com/questions/16343752/numpy-where-function-multiple-conditions

# the start_dist and end_dist take the prior/subsequent stop
# to use to check against direction
start_stop = subset_stop_proj[0]
end_stop = subset_stop_proj[-1]

print(f"origin: {start_stop}, destination: {end_stop}")

In [None]:
points_list = [shapely.Point(i) for i in shape_geometry.simplify(0).coords]

In [None]:
gpd.GeoSeries(points_list, crs="EPSG:3310").head()

In [None]:
# Measure the straight-line gap between each shape vertex and the vertex
# just before it.
points_series = gpd.GeoSeries(points_list, crs="EPSG:3310")
points_series_wo_idx0 = gpd.GeoSeries(points_list[1:], crs="EPSG:3310")
points_series_wo_last = gpd.GeoSeries(points_list[:-1], crs="EPSG:3310")

# Both operands carry the same fresh 0..n-2 index, so compare them row by
# row (align=False). Comparing against the full-length `points_series`
# only paired the right vertices through index alignment and left a
# trailing NaN for the unmatched last index.
distance_from_prior = np.array(
    points_series_wo_idx0.distance(points_series_wo_last, align=False)
)

In [None]:
cumulative_distances = np.array(
    [0] + list(np.cumsum(distance_from_prior))
)

In [None]:
subset_stop_geom
# Maybe we calculate distance between point 1 and 2
# then point 2 and 3

In [None]:
origin, destination = (subset_stop_geom[0], subset_stop_geom[1])
origin, destination

In [None]:
distance_between_stops = origin.distance(destination)
distance_between_stops

In [None]:
origin_stop_dist = subset_stop_proj[0]
destin_stop_dist = subset_stop_proj[0] + distance_between_stops

In [None]:
origin_stop_dist, destin_stop_dist

In [None]:
# change this to point
origin_destin_interp = [shape_geometry.interpolate(i) 
                        for i in [origin_stop_dist, destin_stop_dist]]


In [None]:
cumulative_distances[
    np.where(
        (cumulative_distances >= origin_stop_dist) & 
        (cumulative_distances <= destin_stop_dist)
    )
]

In [None]:
indices_grab_from_shape_coords = np.argwhere(
        (cumulative_distances >= origin_stop_dist) & 
        (cumulative_distances <= destin_stop_dist)
    ).flatten()
indices_grab_from_shape_coords

In [None]:
indices_grab_from_shape_coords

In [None]:
points_list[indices_grab_from_shape_coords[0]: 
            indices_grab_from_shape_coords[-1]+1]

In [None]:
grabbed_from_shape_coords = points_list[indices_grab_from_shape_coords[0]: 
            indices_grab_from_shape_coords[-1]+1]
grabbed_from_shape_coords

test_segment = shapely.geometry.LineString(
    [origin_destin_interp[0]] + grabbed_from_shape_coords + 
    [origin_destin_interp[-1]])

In [None]:
import folium

In [None]:
subset_stops = subset_stop_geom[0:2]

In [None]:
subset_stops_series = gpd.GeoSeries(subset_stops, crs="EPSG:3310")
test_segment_series = gpd.GeoSeries(test_segment, crs="EPSG:3310")

In [None]:
m = test_segment_series.explore(tiles="CartoDB Positron", name="segment")
m = subset_stops_series.explore(m=m, name="stops")

folium.LayerControl().add_to(m)
m

In [None]:
m

In [None]:
# find the index in shape_dist_array of closest to stop 1
# condition should be less than or equal to
def find_nearest(array, value):
    """
    Return the position (index) of the element of `array` that is
    closest to `value`. The first such position wins on ties.
    """
    # https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array
    gaps = np.abs(np.asarray(array) - value)
    return gaps.argmin()

In [None]:
nearest_val_0 = find_nearest(shape_dist_array, subset_stop_proj[1]) 
nearest_val_1 = find_nearest(shape_dist_array, subset_stop_proj[2])

In [None]:
subset_stop_proj[1], subset_stop_proj[2]

In [None]:
nearest_val_0, nearest_val_1

In [None]:
shape_dist_array

In [None]:
#shape_dist_cumsum = np.cumsum(shape_dist_array)

In [None]:
# `shape_dist_cumsum` is not assigned anywhere in this notebook (its
# definition is commented out), so compute the running total inline
# to keep this peek at the first 10 values working.
np.cumsum(shape_dist_array)[:10]

In [None]:
# Cut the shape's projected distances down to the span between the
# first and last stop of the subset.
if end_stop > start_stop: 
    # typical, the end_stop should be progressing further
    shape_dist_subset = loopy.cut_shape_projected_by_origin_destination(
        shape_dist_array, 
        (start_stop, end_stop)
    )
    print(f"cut shape by od: {shape_dist_subset}")

elif end_stop < start_stop:
    # reversed: cut from the sorted, de-duplicated distances with the
    # endpoints swapped, then flip the result back around
    shape_dist_subset = loopy.cut_shape_projected_by_origin_destination(
        shape_dist_ordered, 
        (end_stop, start_stop)
    )
    print(f"cut shape by od: {shape_dist_subset}")        

    shape_dist_subset = np.flip(shape_dist_subset)
    print(f"flipped: cut shape by od: {shape_dist_subset}")        

else:
    # Without this branch `shape_dist_subset` would silently keep a value
    # from an earlier run of the cell (or not exist at all).
    raise ValueError(
        f"start and end stops both project to {start_stop}; "
        "cannot tell which way to cut the shape"
    )


In [None]:
# Interpolate again so we change the shape_meters back into coordinate points
subset_shape_geom = loopy.interpolate_projected_points(
    shape_geometry, shape_dist_subset)

# The lines below were indented as if still inside a function body,
# which is an IndentationError at cell level.
print(f"interpolated subset of shape: {subset_shape_geom}")

# If the stop vector and shape vector run in the same direction,
# then we want to cut a segment from prior to current stop.
# NOTE: find_if_two_arrays_same_direction is defined in a later cell of
# this notebook; that cell has to be run before this one.
dot_prod = find_if_two_arrays_same_direction(
    subset_stop_geom,
    subset_shape_geom
)

In [None]:
# Take current stop
# find stop_sequence before and after

## NEED THIS
#stop_geom_array = np.array(gdf2.stop_geometry)
#stop_seq_array = np.array(gdf2.stop_sequence)

def super_project(
    current_stop_seq: int,
    shape_geometry: shapely.geometry.LineString,
    stop_geometry_array: np.ndarray,
    stop_sequence_array: np.ndarray,
):
    """
    Check whether the stops flanking `current_stop_seq` run in the same
    direction as the piece of the shape that lies between them.

    The projection / cutting / interpolation helpers are taken from the
    `loopy` module, the same way the step-by-step cells earlier in this
    notebook call them.

    Args:
        current_stop_seq: stop_sequence value of the stop of interest.
        shape_geometry: line geometry of the shape.
        stop_geometry_array: stop geometries; indexed positionally with
            the result of `loopy.include_prior_and_subsequent`, so pass a
            numpy array rather than a pandas Series.
        stop_sequence_array: stop_sequence values for the same stops.

    Returns:
        Tuple of (dot_prod, subset_stop_geom, subset_shape_geom), where
        dot_prod is whatever find_if_two_arrays_same_direction returns
        for the stop subset and the interpolated shape subset.

    Raises:
        ValueError: if the first and last stop of the subset project to
            the same distance along the shape, so no cut direction can
            be chosen.
    """
    # Set up: project each of the coordinates in the shape_geometry
    # to be shape_meters
    shape_dist_array = loopy.project_list_of_coords(
        shape_geometry, [], use_shapely_coords = True)

    # Sorted, de-duplicated distances; used when the cut runs backwards
    shape_dist_ordered = np.unique(shape_dist_array)

    # (1) Given a stop sequence value, find the stop_sequence values
    # just flanking it (prior and subsequent).
    # this is important especially because stop_sequence does not have
    # to be increasing in increments of 1, but it has to be monotonically increasing
    subset_seq = loopy.include_prior_and_subsequent(
        stop_sequence_array, current_stop_seq)
    print(f"subset of stop sequences: {subset_seq}")

    # (2) Grab relevant subset based on stop sequence values to get stop geometry subset
    # https://stackoverflow.com/questions/5508352/indexing-numpy-array-with-another-numpy-array
    subset_stop_geom = stop_geometry_array[subset_seq]
    print(f"subset of stop geometry: {subset_stop_geom}")

    # (3) Project this subset of stops
    # because we need to know which part to subset
    # off of the shape's shape_meters array
    subset_stop_proj = loopy.project_list_of_coords(
        shape_geometry, subset_stop_geom)

    print(f"subset stops projected: {subset_stop_proj}")

    # (4) Get the subset of shape_path points that
    # spans a start distance position and end distance position
    # https://stackoverflow.com/questions/16343752/numpy-where-function-multiple-conditions

    # the start_dist and end_dist take the prior/subsequent stop
    # to use to check against direction
    start_stop = subset_stop_proj[0]
    end_stop = subset_stop_proj[-1]

    print(f"origin: {start_stop}, destination: {end_stop}")

    # typical, the end_stop should be progressing further
    if end_stop > start_stop:
        shape_dist_subset = loopy.cut_shape_projected_by_origin_destination(
            shape_dist_array,
            (start_stop, end_stop)
        )
        print(f"cut shape by od: {shape_dist_subset}")

    elif end_stop < start_stop:
        # reversed: cut from the ordered distances with the endpoints
        # swapped, then flip the result back around
        shape_dist_subset = loopy.cut_shape_projected_by_origin_destination(
            shape_dist_ordered,
            (end_stop, start_stop)
        )
        print(f"cut shape by od: {shape_dist_subset}")

        shape_dist_subset = np.flip(shape_dist_subset)
        print(f"flipped: cut shape by od: {shape_dist_subset}")

    else:
        # Previously this case left shape_dist_subset unassigned and
        # surfaced as an UnboundLocalError a few lines further down.
        raise ValueError(
            f"start and end stops both project to {start_stop}; "
            "cannot tell which way to cut the shape"
        )

    # Interpolate again so we change the shape_meters back into coordinate points
    subset_shape_geom = loopy.interpolate_projected_points(
        shape_geometry, shape_dist_subset)

    print(f"interpolated subset of shape: {subset_shape_geom}")

    # If the stop vector and shape vector run in the same direction,
    # then we want to cut a segment from prior to current stop.
    dot_prod = find_if_two_arrays_same_direction(
        subset_stop_geom,
        subset_shape_geom
    )

    return dot_prod, subset_stop_geom, subset_shape_geom


In [None]:
def find_if_two_arrays_same_direction(
    subset_stop_geom_array: np.ndarray, 
    subset_shape_geom_array: np.ndarray
) -> float:
    """
    Reduce the stop geometries and the shape geometries to one direction
    vector each, normalize both, and return their dot product.

    Intermediate vectors and the final result are printed along the way.
    """
    # One direction vector per input, stops first and shape second
    stop_vec, shape_vec = (
        distill_array_into_direction_vector(geom_array)
        for geom_array in (subset_stop_geom_array, subset_shape_geom_array)
    )

    print(f"stop vector: {stop_vec}")
    print(f"shape vector: {shape_vec}")

    # Unit vectors (each divided by its own length)
    stop_norm, shape_norm = map(get_normalized_vector, (stop_vec, shape_vec))

    print(f"stop_norm: {stop_norm}, shape_norm: {shape_norm}")

    dot_result = dot_product(stop_norm, shape_norm)

    print(f"dot product: {dot_result}")

    return dot_result

In [None]:
# take the shape's vector and take the start/end of that
# and create vector
# get a vector between the subsequent and prior stops (we draw 
# right through our stop of interest)

In [None]:
#https://stackoverflow.com/questions/17332759/finding-vectors-with-2-points
# normalize to unit vector
# https://stackoverflow.com/questions/21030391/how-to-normalize-a-numpy-array-to-a-unit-vector


In [None]:
from typing import Literal

def array_to_geoseries(
    array: np.ndarray,
    geom_type: Literal["point", "line", "polygon"],
    crs: str = "EPSG:3310"
)-> gpd.GeoSeries: 
    """
    Wrap an array of geometries / coordinates in a GeoSeries.

    Args:
        array: for "point", passed to GeoSeries as-is (one entry per
            element); for "line" / "polygon", used to build a single
            LineString / Polygon.
        geom_type: one of "point", "line", "polygon".
        crs: CRS assigned to the returned GeoSeries.

    Returns:
        GeoSeries with the requested geometry.

    Raises:
        ValueError: if geom_type is not one of the supported values
            (previously this fell through to an UnboundLocalError).
    """
    if geom_type == "point":
        return gpd.GeoSeries(array, crs=crs)

    if geom_type == "line":
        return gpd.GeoSeries(
            shapely.geometry.LineString(array), 
            crs=crs)

    if geom_type == "polygon":
        return gpd.GeoSeries(
            shapely.geometry.Polygon(array),
            crs=crs)

    raise ValueError(
        f"Unsupported geom_type: {geom_type!r}. "
        "Expected 'point', 'line', or 'polygon'."
    )

In [None]:
stop_seq = 2
shape_geom = gdf2.geometry.iloc[0]

# super_project indexes stop_geometry_array positionally
# (stop_geometry_array[subset_seq]), so hand it plain numpy arrays.
# A pandas Series would be indexed by label instead, and gdf2 keeps the
# row labels of gdf rather than starting at 0.
stop_geom_array = np.array(gdf2.stop_geometry)
stop_seq_array = np.array(gdf2.stop_sequence)

dot_prod, subset_stop_geom, subset_shape_geom = super_project(
    stop_seq,
    shape_geom,
    stop_geom_array,
    stop_seq_array
)

In [None]:
stop_series = array_to_geoseries(subset_stop_geom, "point")
shape_series = array_to_geoseries(subset_shape_geom, "line")

m = stop_series.explore(tiles="CartoDBPositron", name="stops")
m = shape_series.explore(m=m, name="segment")
folium.LayerControl().add_to(m)
m

In [None]:
dot_prod

In [None]:
segment_stops = subset_stop_geom[:2]
segment_stops

In [None]:
# Now project the stops kept in `segment_stops` (the first 2 of the subset)
# because we need to know which part to subset
# off of the shape's shape_meters array.
# The helper is taken from `loopy`, as in the earlier cells; the bare
# name is not defined in this notebook.
segment_stops_proj = loopy.project_list_of_coords(shape_geom, segment_stops)
segment_stops_proj

In [None]:
# need this again: distance along the shape of each of its own vertices
shape_dist_array = np.array(
    [shape_geom.project(shapely.geometry.Point(p)) 
     for p in shape_geom.coords]
)

# The helper is taken from `loopy`, as in the earlier cells; the bare
# name is not defined in this notebook.
segment_shape_proj = loopy.cut_shape_projected_by_origin_destination(
    shape_dist_array, segment_stops_proj)

segment_shape_proj

In [None]:
# Turn the cut distances back into points on the shape. The helper is
# taken from `loopy`, as in the earlier cells; the bare name is not
# defined in this notebook.
segment_shape = loopy.interpolate_projected_points(shape_geom, segment_shape_proj)

In [None]:
segment_geom = gpd.GeoSeries(
    shapely.geometry.LineString(segment_shape), 
    crs="EPSG:3310"
)

use_stops_geom = gpd.GeoSeries(segment_stops, crs="EPSG:3310")

In [None]:
import folium

In [None]:
m = segment_geom.explore(tiles="CartoDB Positron", name="segment")
m = use_stops_geom.explore(m=m, name="stops")
folium.LayerControl().add_to(m)
m

In [None]:
 shape_path_dist = np.array(
            [shape_geom.project(shapely.geometry.Point(p)) 
            for p in shape_geom.coords]
        )

In [None]:
shape_path_dist

In [None]:
# Turn the stop_geometry and shape_geometry columns into geoseries
shape_geoseries = gpd.GeoSeries(gdf.geometry)
stop_geoseries = gpd.GeoSeries(gdf.stop_geometry)

# Project each stop onto its shape to get shape_meters.
# get_dask_array=False is passed here, so the result is presumably NOT a
# dask array -- TODO confirm the return type in wrangle_shapes.
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
    shape_geoseries,
    stop_geoseries,
    get_dask_array=False
)

# Attach the projected distances as a column
gdf["shape_meters"] = shape_meters_geoseries

In [None]:
def add_prior_stop_info(
    gdf: gpd.GeoDataFrame, 
    trip_grouping_cols: list = ["shape_array_key"],
    segment_identifier_cols: list = ["shape_array_key", "stop_sequence"]
):
    """
    Add a `prior_stop_location` column holding the `stop_geometry` of the
    previous row within each group.

    Rows are sorted by `segment_identifier_cols`, grouped by
    `trip_grouping_cols`, and `stop_geometry` is shifted down by one
    within each group. The first row of every group has no prior stop and
    is left missing.

    Args:
        gdf: frame with a `stop_geometry` column plus the grouping and
            sorting columns. It is not modified.
        trip_grouping_cols: columns that define one group.
            (The list default is only read, never mutated.)
        segment_identifier_cols: columns that define the row order.
            (The list default is only read, never mutated.)

    Returns:
        Copy of `gdf` with `prior_stop_location` added.
    """
    # prior stop location won't be available if the first/last stop 
    # is the one being visited twice (which is often the case)
    #
    # GroupBy.shift keeps the original row index, so the result lines up
    # with `gdf` in .assign. The earlier .apply(lambda x: x.shift(1)) gets
    # the group keys prepended to its index on pandas >= 2.0 and no longer
    # aligns.
    prior_stop_location = (
        gdf.sort_values(segment_identifier_cols)
        .groupby(trip_grouping_cols)["stop_geometry"]
        .shift(1)
    )

    # A matching `prior_shape_meters` (same shift on "shape_meters") was
    # left disabled in the original and is still not computed here.
    return gdf.assign(prior_stop_location = prior_stop_location)

In [None]:
gdf = add_prior_stop_info(
    gdf,
    trip_grouping_cols = ["shape_array_key"],
    segment_identifier_cols = ["shape_array_key", "stop_sequence"]
)

This is a case where there's not a monotonically increasing `shape_meters` column.

But, where it's not monotonically increasing is not only where stops are visited twice. It's also happening at `stop_sequence==2`. 
* `stop_sequence==1` is where the shape begins, and it's also a portion of a loop.
* `stop_sequence==2` is not visited twice, but since it's occurring during the loop, the `shape_meters` calculated is against the end of the shape, not the beginning.
* For a stop's 2nd visit, we want to look at the prior stop and calculate distance, and overwrite `shape_meters` so it **is** monotonically increasing.
* For a stop's 1st visit, we may not be able to look at a prior stop (there is no stop 0), and even if we look ahead, stop 2 might have the same issue. **But**, if we get the distance between stops 1 and 2, we still might be able to back out the same coordinates from the line geometry, even if the array is not monotonically increasing. The distance values only reflect our ordered perspective; the underlying coordinates may still let us grab the same points.

In [None]:
display_cols = [
    "shape_array_key", "stop_id", "stop_sequence",
    "num_visits", "visit_order", 
    "shape_meters", 
    "prior_stop_location", 
]
another_shape_gdf = gdf[gdf.shape_array_key==another_shape][
    display_cols + ["geometry", "stop_geometry"]]

another_shape_gdf[["stop_id", "stop_sequence", 
                   "shape_meters"]]

In [None]:
shape_geom = another_shape_gdf.geometry.iloc[0]
shape_dist_array = np.array([shape_geom.project(shapely.geometry.Point(p)) 
          for p in shape_geom.coords])

In [None]:
len(shape_dist_array), len(np.unique(shape_dist_array))

In [None]:
shape_dist_array

In [None]:
np.unique(shape_dist_array)

In [None]:
another_shape_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence",
    tiles="CartoDB Positron", 
    categorical=True, legend=False
)

For these shapes, do we want to sort `stop_sequence` instead by `shape_meters`?

Then pick a non-loopy route and see. If we sort and enforce the monotonically increasing rule, what will happen? There was a Sacramento Schedule route that was cut haphazardly.

Look at the array of line geometry coords: those are also not monotonically increasing. So, it looks like we need to keep track of which subset of the coords has already been grabbed vs. which is still eligible to be grabbed.

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([1,2])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters"]])

result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf.shape_meters),
    (test_gdf.shape_meters[0], test_gdf.shape_meters[1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron")

In [None]:
# Straight-line distance from each stop to the stop before it
stop_geom = gpd.GeoSeries(another_shape_gdf.stop_geometry)
prior_stop_geom = gpd.GeoSeries(another_shape_gdf.prior_stop_location)
distance_from_prior = stop_geom.distance(prior_stop_geom, align=True)

# another_shape_gdf was sliced out of gdf, so setting a column on it in
# place is a chained assignment (SettingWithCopyWarning). .assign returns
# a new frame instead.
another_shape_gdf = another_shape_gdf.assign(
    distance_from_prior_stop = distance_from_prior
)

In [None]:
# Add two candidate replacements for shape_meters:
# - cumulative_dist: running total of distance_from_prior_stop within each
#   shape_array_key, accumulated in stop_sequence order
# - shape_meters_adj: row-wise sum of shape_meters and
#   distance_from_prior_stop (a missing distance is skipped by .sum, so
#   that row keeps its shape_meters value)
another_shape_gdf = another_shape_gdf.assign(
    cumulative_dist = (another_shape_gdf.sort_values(
                        ["shape_array_key", "stop_sequence"])
                       .groupby("shape_array_key")
                       .distance_from_prior_stop
                       .cumsum()
                      ),
    shape_meters_adj = (another_shape_gdf[["shape_meters", 
                                           "distance_from_prior_stop"]].sum(axis=1)
                       )
)

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([4,5])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters", 
                  "distance_from_prior_stop",
                  "cumulative_dist", "shape_meters_adj"]])

m = test_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence", tiles='CartoDB Positron'
)

meters_col = "cumulative_dist"

result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    np.unique(shape_dist_array),
    np.array(test_gdf[meters_col]),
    (test_gdf[meters_col][0], test_gdf[meters_col][1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron", m=m)

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([29,30])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters", 
                  "distance_from_prior_stop",
                  "cumulative_dist", "shape_meters_adj"]])

m = test_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence", tiles='CartoDB Positron'
)

meters_col = "shape_meters_adj"
result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf[meters_col]),
    (test_gdf[meters_col][0], test_gdf[meters_col][1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron", m=m)

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([28, 29])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters", 
                 "cumulative_dist", "shape_meters_adj"]])

m = test_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence", tiles='CartoDB Positron'
)

meters_col = "cumulative_dist"

result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf[meters_col]),
    (test_gdf[meters_col][0], test_gdf[meters_col][1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron", m=m)

In [None]:
# One single-condition filter group per shape we want to look at
shapes_of_interest = (shape_with_utc, another_shape)
shape_filters = [
    [("shape_array_key", "==", one_shape)] 
    for one_shape in shapes_of_interest
]

# Import the stop segments for the analysis date, restricted by the filters
stop_segments = helpers.import_segments(
    SEGMENT_GCS,
    f"stop_segments_{analysis_date}", 
    filters=shape_filters
)

In [None]:
stop_segments[stop_segments.geometry.notna()
             ].explore("stop_sequence", tiles="CartoDB Positron")