# Grab all the loopy/inlining/odd shapes
Select a couple of examples to see what the pattern is for `stop_sequence` and `shape_meters`.

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely
import sys

from segment_speed_utils import (helpers, gtfs_schedule_wrangling, 
                                 wrangle_shapes)
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

# Temp append so we can import functions from scripts/
sys.path.append("./scripts/")
import prep_stop_segments
import cut_stop_segments
import loop_utils

shape_with_utc = "f765b9d12fcca0173b4e3ddbc0374d18"
another_shape = "007cf76062f5957d4e38ea54e624c0ad"

In [2]:
loop_shapes = loop_utils.grab_loop_shapes(analysis_date)

stop_times_with_geom = prep_stop_segments.stop_times_aggregated_to_shape_array_key(
        analysis_date, loop_shapes)

st_loops = stop_times_with_geom.compute()

In [3]:
gdf = (loop_utils.assign_visits_to_stop(st_loops)
       .sort_values(["shape_array_key", "stop_sequence"])
       .reset_index(drop=True)
      )

In [4]:
gdf2 = gdf[gdf.shape_array_key.isin([another_shape])]

In [6]:
#https://www.geeksforgeeks.org/how-to-find-the-index-of-value-in-numpy-array/
def get_index(array, item):
    for idx, val in np.ndenumerate(array):
        if val == item:
            return idx[0]
        
        
def include_prior_and_subsequent(array: np.ndarray, value: int):
    idx = get_index(array, value)
    return array[idx-1: idx+2]

In [24]:
shape_geom = gdf2.geometry.iloc[0]
stop_geom_array = np.array(gdf2.stop_geometry)
stop_seq_array = np.array(gdf2.stop_sequence)

In [25]:
stop_seq = 15
subset_seq = include_prior_and_subsequent(stop_seq_array, stop_seq)
subset_seq

array([14, 15, 16])

In [29]:
#https://stackoverflow.com/questions/5508352/indexing-numpy-array-with-another-numpy-array
subset_stop_geom = stop_geom_array[subset_seq]
subset_stop_geom

array([<POINT (262999.929 -582806.103)>, <POINT (262637.538 -582897.777)>,
       <POINT (262509.32 -582899.685)>], dtype=object)

In [48]:
# get a vector between the subsequent and prior stops (we draw 
# right through our stop of interest)
stop_vec_end = (subset_stop_geom[-1].x, subset_stop_geom[-1].y)
stop_vec_start = (subset_stop_geom[0].x, subset_stop_geom[0].y)

stop_vec_end, stop_vec_start

((262509.32000442955, -582899.6846790994),
 (262999.92859736393, -582806.1029309039))

In [49]:
# Now project this vector of 3 stops
# because we need to know which part to subset
# off of the shape's shape_meters array
subset_stop_projected = np.array(
    [shape_geom.project(i) for i in subset_stop_geom]
)
subset_stop_projected



array([5503.02579871, 5877.33959251, 6007.87823823])

In [53]:
shape_path_dist = np.array(
            [shape_geom.project(shapely.geometry.Point(p)) 
            for p in shape_geom.coords]
)

shape_path_dist[:5]

array([  0.        ,  14.50702629, 100.6041139 , 126.5169287 ,
       126.5169287 ])

In [55]:
# Get the subset of shape_path points that
# covers start_dist to end_dist
# https://stackoverflow.com/questions/16343752/numpy-where-function-multiple-conditions

# the start_dist and end_dist take the prior/subsequent stop
# to use to check against direction
shape_path_subset = shape_path_dist[
    (np.where(
        (shape_path_dist >= subset_stop_projected[0]) & 
        (shape_path_dist <= subset_stop_projected[-1]))
    )]

shape_path_subset

array([5503.17692096, 5503.17692096, 5517.81052203, 5533.71996806,
       5549.33373331, 5582.26940738, 5603.59423777, 5664.84485808,
       5848.62988146, 5859.68089117, 5877.38668559, 5877.38668559,
       5907.37737327])

In [59]:
shape_path_interp = [shape_geom.interpolate(i) 
                     for i in shape_path_subset]

shape_path_interp

[<POINT (262997.813 -582813.923)>,
 <POINT (262997.813 -582813.923)>,
 <POINT (262983.621 -582810.354)>,
 <POINT (262967.951 -582807.605)>,
 <POINT (262952.346 -582807.069)>,
 <POINT (262919.819 -582812.24)>,
 <POINT (262899.646 -582819.152)>,
 <POINT (262842.14 -582840.24)>,
 <POINT (262666.016 -582892.75)>,
 <POINT (262655.505 -582896.162)>,
 <POINT (262638.566 -582901.316)>,
 <POINT (262638.566 -582901.316)>,
 <POINT (262609.902 -582910.139)>]

In [60]:
# take the shape's vector and take the start/end of that
# and create vector
# get a vector between the subsequent and prior stops (we draw 
# right through our stop of interest)
shape_vec_start = (shape_path_interp[0].x, shape_path_interp[0].y)
shape_vec_end = (shape_path_interp[-1].x, shape_path_interp[-1].y)

shape_vec_start, shape_vec_end

((262997.81276319915, -582813.9233183321),
 (262609.9021033956, -582910.1389678828))

In [67]:
shape_vec = (shape_vec_end[0] - shape_vec_start[0],
             shape_vec_end[1] - shape_vec_start[1])

shape_vec

(-387.91065980354324, -96.21564955078065)

In [68]:
stop_vec = (stop_vec_end[0] - stop_vec_start[0],
             stop_vec_end[1] - stop_vec_start[1])

stop_vec

(-490.6085929343826, -93.5817481954582)

In [69]:
stop_vec[0]*shape_vec[0] + stop_vec[1]*shape_vec[1]

199316.33167918792

In [64]:
stop_vec_start

(262999.92859736393, -582806.1029309039)

In [None]:
# Take current stop
# find stop_sequence before and after
def super_project(
    current_stop: shapely.geometry.Point, 
    shape_geometry: shapely.geometry.LineString,
    stop_geometry_array: np.ndarray
):
    
    shape_path_array = np.array(
            [shape_geometry.project(shapely.geometry.Point(p)) 
            for p in shape_geometry.coords]
        )
    
    stop_vec = include_prior_and_subsequent(
        stop_geometry_array, current_stop)
    shape_vec = 
    
  

In [None]:
 shape_path_dist = np.array(
            [shape_geom.project(shapely.geometry.Point(p)) 
            for p in shape_geom.coords]
        )

In [None]:
shape_path_dist

In [None]:
# Turn the stop_geometry and shape_geometry columns into geoseries
shape_geoseries = gpd.GeoSeries(gdf.geometry)
stop_geoseries = gpd.GeoSeries(gdf.stop_geometry)

# Get projected shape_meters as dask array
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
    shape_geoseries,
    stop_geoseries,
    get_dask_array=False
)

# Attach dask array as a column
gdf["shape_meters"] = shape_meters_geoseries

In [None]:
def add_prior_stop_info(
    gdf: gpd.GeoDataFrame, 
    trip_grouping_cols: list = ["shape_array_key"],
    segment_identifier_cols: list = ["shape_array_key", "stop_sequence"]
):
    # prior stop location won't be available if the first/last stop 
    # is the one being visited twice (which is often the case)
    gdf = gdf.assign(
        prior_stop_location = (gdf.sort_values(segment_identifier_cols)
                               .groupby(trip_grouping_cols)
                               ["stop_geometry"]
                               .apply(lambda x: x.shift(1))
                              ),
        #prior_shape_meters = (gdf.sort_values(segment_identifier_cols)
        #                       .groupby(trip_grouping_cols)
        #                      ["shape_meters"]
        #                       .apply(lambda x: x.shift(1))
        #                     ),
    )
    
    return gdf

In [None]:
gdf = add_prior_stop_info(
    gdf,
    trip_grouping_cols = ["shape_array_key"],
    segment_identifier_cols = ["shape_array_key", "stop_sequence"]
)

This is a case where there's not a monotonically increasing `shape_meters` column.

But, where it's not monotonically increasing is not only where stops are visited twice. It's also happening at `stop_sequence==2`. 
* `stop_sequence==1` is where the shape begins, and it's also a portion of a loop.
* `stop_sequence==2` is not visited twice, but since it's occurring during the loop, the `shape_meters` calculated is against the end of the shape, not the beginning.
* For a stop's 2nd visit, we want to look at the prior stop and calculate distance, and overwrite `shape_meters` so it **is** monotonically increasing.
* For a stop's 1st visit, we may not be able to look prior (stop 0), and even if we look ahead, stop 2 might have an issue. **But**, if we get the distance between stop 1 and 2, we still might be able to back out the same coordinates from the line geometry, even if the array is not monotonically increasing. The numbers are just our ordered perspective, but maybe for coordinates, it's still able to grab the same ones. 

In [None]:
display_cols = [
    "shape_array_key", "stop_id", "stop_sequence",
    "num_visits", "visit_order", 
    "shape_meters", 
    "prior_stop_location", 
]
another_shape_gdf = gdf[gdf.shape_array_key==another_shape][
    display_cols + ["geometry", "stop_geometry"]]

another_shape_gdf[["stop_id", "stop_sequence", 
                   "shape_meters"]]

In [None]:
shape_geom = another_shape_gdf.geometry.iloc[0]
shape_dist_array = np.array([shape_geom.project(shapely.geometry.Point(p)) 
          for p in shape_geom.coords])

In [None]:
len(shape_dist_array), len(np.unique(shape_dist_array))

In [None]:
shape_dist_array

In [None]:
np.unique(shape_dist_array)

In [None]:
another_shape_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence",
    tiles="CartoDB Positron", 
    categorical=True, legend=False
)

For these shapes, do we want to sort `stop_sequence` instead by `shape_meters`?

Then pick a non-loopy route and see. If we sort and enforce the monotonically increasing rule, what will happen? There was a Sacramento Schedule route that was cut haphazardly.

Look at the array of line geometry coords, and those are also not monotonically increasing. So, it looks like we need to factor in what subset has been grabbed vs not yet (eligible to).

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([1,2])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters"]])

result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf.shape_meters),
    (test_gdf.shape_meters[0], test_gdf.shape_meters[1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron")

In [None]:
stop_geom = gpd.GeoSeries(another_shape_gdf.stop_geometry)
prior_stop_geom = gpd.GeoSeries(another_shape_gdf.prior_stop_location)
distance_from_prior = stop_geom.distance(prior_stop_geom, align=True)

another_shape_gdf["distance_from_prior_stop"] = distance_from_prior

In [None]:
# Get the cumulative distance 
another_shape_gdf = another_shape_gdf.assign(
    cumulative_dist = (another_shape_gdf.sort_values(
                        ["shape_array_key", "stop_sequence"])
                       .groupby("shape_array_key")
                       .distance_from_prior_stop
                       .cumsum()
                      ),
    shape_meters_adj = (another_shape_gdf[["shape_meters", 
                                           "distance_from_prior_stop"]].sum(axis=1)
                       )
)

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([4,5])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters", 
                  "distance_from_prior_stop",
                  "cumulative_dist", "shape_meters_adj"]])

m = test_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence", tiles='CartoDB Positron'
)

meters_col = "cumulative_dist"

result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    np.unique(shape_dist_array),
    np.array(test_gdf[meters_col]),
    (test_gdf[meters_col][0], test_gdf[meters_col][1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron", m=m)

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([29,30])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters", 
                  "distance_from_prior_stop",
                  "cumulative_dist", "shape_meters_adj"]])

m = test_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence", tiles='CartoDB Positron'
)

meters_col = "shape_meters_adj"
result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf[meters_col]),
    (test_gdf[meters_col][0], test_gdf[meters_col][1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron", m=m)

In [None]:
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([28, 29])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters", 
                 "cumulative_dist", "shape_meters_adj"]])

m = test_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence", tiles='CartoDB Positron'
)

meters_col = "cumulative_dist"

result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf[meters_col]),
    (test_gdf[meters_col][0], test_gdf[meters_col][1])
)

gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron", m=m)

In [None]:
stop_segments = helpers.import_segments(
    SEGMENT_GCS,
    f"stop_segments_{analysis_date}", 
    filters=[[("shape_array_key", "==", shape_with_utc)],
             [("shape_array_key", "==", another_shape)]
            ]
)

In [None]:
stop_segments[stop_segments.geometry.notna()
             ].explore("stop_sequence", tiles="CartoDB Positron")