# Grab all the loopy/inlining/odd shapes
Select a couple of examples to see what the pattern is for `stop_sequence` and `shape_meters`.

In [1]:
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely
import sys

from segment_speed_utils import (helpers, gtfs_schedule_wrangling, 
                                 wrangle_shapes)
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

# Temp append so we can import functions from scripts/
sys.path.append("./scripts/")
import prep_stop_segments
import cut_stop_segments

# Example shape_array_key values that later cells filter on
shape_with_utc = "f765b9d12fcca0173b4e3ddbc0374d18"
another_shape = "007cf76062f5957d4e38ea54e624c0ad"



In [2]:
def grab_loop_shapes(analysis_date: str) -> pd.DataFrame:
    """
    Find trips whose scheduled stop_times list the same stop_id 
    more than once, and return those trips with their geometry attached.
    
    A stop counts as revisited when, after dropping duplicate rows, 
    a (feed_key, trip_id, stop_id) group still holds more than one row.
    """
    trip_cols = ["feed_key", "trip_id"]
    
    scheduled_stops = helpers.import_scheduled_stop_times(
        analysis_date, 
        columns = ["feed_key", "trip_id", "stop_id", "stop_sequence"]
    ).drop_duplicates()
    
    # Count rows per trip-stop; "count" is used because 
    # nunique doesn't work in dask
    visits_per_stop = (
        scheduled_stops
        .groupby(trip_cols + ["stop_id"])
        .agg({"stop_sequence": "count"})
        .reset_index()
    )
    
    # Keep one row per trip that has any stop visited more than once
    revisited = visits_per_stop[visits_per_stop.stop_sequence > 1]
    loop_trip_keys = (
        revisited[trip_cols]
        .drop_duplicates()
        .reset_index(drop=True)
        .compute()
    )
    
    all_trips_with_geom = gtfs_schedule_wrangling.get_trips_with_geom(
        analysis_date
    ).compute()
    
    # Inner merge keeps only the trips flagged as loops
    return pd.merge(
        all_trips_with_geom,
        loop_trip_keys,
        on = trip_cols,
        how = "inner"
    )

In [3]:
# Trips (with geometry) that visit at least one stop more than once
loop_shapes = grab_loop_shapes(analysis_date)

# Build the stop_times table for just those loop trips
# (still lazy here; materialized below with .compute())
stop_times_with_geom = prep_stop_segments.stop_times_aggregated_to_shape_array_key(
        analysis_date, loop_shapes)

st_loops = stop_times_with_geom.compute()

In [4]:
def assign_visits_to_stop(df: pd.DataFrame) -> pd.DataFrame:
    """
    Groupby shape and stop_id and count how many times it's being visited
    and which number visit it is.
    
    Adds 2 columns:
    - num_visits: number of distinct stop_sequence values for the 
      stop within the shape
    - visit_order: 1-based rank of this row's stop_sequence among the 
      visits to that stop within the shape
    
    Expects columns shape_array_key, stop_id, stop_sequence.
    Returns a new dataframe; the input is not modified.
    """
    visit_cols = ["shape_array_key", "stop_id"]
    
    df = df.assign(
        num_visits = df.groupby(visit_cols)
                    .stop_sequence.transform("nunique")
    )

    # Group by shape AND stop. Grouping on stop_id alone would keep 
    # counting across different shapes that serve the same stop.
    # cumcount keeps the original index, so assign aligns the values back.
    df = df.assign(
        visit_order = (df.sort_values(visit_cols + ["stop_sequence"])
                      .groupby(visit_cols)
                      .cumcount() + 1)
    )
    
    return df

In [5]:
# Add visit counts, then order rows by shape and stop_sequence
# with a fresh 0..n-1 index
gdf = (assign_visits_to_stop(st_loops)
       .sort_values(["shape_array_key", "stop_sequence"])
       .reset_index(drop=True)
      )

In [6]:
# Turn the stop_geometry and shape geometry columns into geoseries
shape_geoseries = gpd.GeoSeries(gdf.geometry)
stop_geoseries = gpd.GeoSeries(gdf.stop_geometry)

# Get shape_meters by projecting each stop onto its shape.
# NOTE(review): get_dask_array=False is passed, so this presumably returns
# an in-memory result rather than a dask array -- confirm in wrangle_shapes.
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
    shape_geoseries,
    stop_geoseries,
    get_dask_array=False
)

# Attach the projected distances as a column (modifies gdf in place)
gdf["shape_meters"] = shape_meters_geoseries



In [7]:
def add_prior_stop_info(
    gdf: "gpd.GeoDataFrame", 
    trip_grouping_cols: list = ["shape_array_key"],
    segment_identifier_cols: list = ["shape_array_key", "stop_sequence"]
) -> "gpd.GeoDataFrame":
    """
    For each row, attach the previous stop's geometry and shape_meters.
    
    Rows are ordered by segment_identifier_cols, then shifted by 1 
    within each trip_grouping_cols group. The first row of each group 
    has no prior stop, so its 2 new columns are missing.
    
    Adds columns prior_stop_location (from stop_geometry) and 
    prior_shape_meters (from shape_meters). 
    Returns a new dataframe with the original row order and index.
    """
    # prior stop location won't be available if the first/last stop 
    # is the one being visited twice (which is often the case)
    
    # Sort once and reuse the grouping for both columns.
    # GroupBy.shift keeps the original index, so assign can align on it.
    # groupby.apply(lambda x: x.shift(1)) would prepend the group keys 
    # to the index on pandas >= 2.0 and break that alignment.
    grouped = (gdf.sort_values(segment_identifier_cols)
               .groupby(trip_grouping_cols)
              )
    
    gdf = gdf.assign(
        prior_stop_location = grouped["stop_geometry"].shift(1),
        prior_shape_meters = grouped["shape_meters"].shift(1),
    )
    
    return gdf

In [8]:
# Attach each stop's prior stop geometry and prior shape_meters,
# shifting within a shape in stop_sequence order
gdf = add_prior_stop_info(
    gdf,
    trip_grouping_cols = ["shape_array_key"],
    segment_identifier_cols = ["shape_array_key", "stop_sequence"]
)



This is a case where the `shape_meters` column is not monotonically increasing.

However, the breaks in monotonicity do not occur only at stops that are visited twice. One also occurs at `stop_sequence==2`. 
* `stop_sequence==1` is where the shape begins, and it's also a portion of a loop.
* `stop_sequence==2` is not visited twice, but since it's occurring during the loop, the `shape_meters` calculated is against the end of the shape, not the beginning.
* For a stop's 2nd visit, we want to look at the prior stop and calculate distance, and overwrite `shape_meters` so it **is** monotonically increasing.
* For a stop's 1st visit, we may not be able to look at the prior stop (there is no stop 0), and even if we look ahead, stop 2 might have the same issue. **But**, if we get the distance between stops 1 and 2, we might still be able to back out the same coordinates from the line geometry, even if the array is not monotonically increasing. The distances only reflect our ordered view of the shape; the coordinates we grab from the line may still be the same ones. 

In [9]:
# Columns of interest when inspecting a single shape
display_cols = [
    "shape_array_key", "stop_id", "stop_sequence",
    "num_visits", "visit_order", 
    "shape_meters", 
    "prior_stop_location", "prior_shape_meters"
]
# Subset to the one example shape, keeping both geometry columns
another_shape_gdf = gdf[gdf.shape_array_key==another_shape][
    display_cols + ["geometry", "stop_geometry"]]

# Show how shape_meters moves as stop_sequence increases
another_shape_gdf[["stop_id", "stop_sequence", 
                   "shape_meters"]]

Unnamed: 0,stop_id,stop_sequence,shape_meters
0,99437,1,12515.809937
1,99342,2,126.539395
2,13311,3,432.656989
3,13312,4,11625.468119
4,13313,5,11367.805069
5,10839,6,11074.622935
6,10836,7,10948.319411
7,10833,8,1523.071798
8,10828,9,1790.190779
9,99356,10,1986.255879


In [10]:
# Every row in this subset is the same shape, so take its geometry from the first row
shape_geom = another_shape_gdf.geometry.iloc[0]
# Project each vertex of the shape back onto the shape itself,
# giving one distance value per coordinate
shape_dist_array = np.array([shape_geom.project(shapely.geometry.Point(p)) 
          for p in shape_geom.coords])

In [11]:
# Compare number of vertices vs number of distinct projected distances
len(shape_dist_array), len(np.unique(shape_dist_array))

(234, 146)

In [12]:
# Projected distances in vertex order
shape_dist_array

array([    0.        ,    14.50702629,   100.6041139 ,   126.5169287 ,
         126.5169287 ,   186.32333225,   271.48160851,   357.57636353,
         432.68603544,   432.68603544,   442.7324664 ,   528.17003552,
         614.54645823,   688.71952226,   688.71952226,   698.85923085,
         785.51731919,   870.675623  ,   946.35080412,   946.35080412,
         956.49111623,  1043.15152284,  1126.8052718 ,  1210.73957797,
        1239.56284427,  1239.56284427,  1293.45350502,  1365.83982612,
        1365.83982612,  1376.16730479,  1461.32234981,  1523.00556484,
        1523.00556484,  1544.69355345,  1628.90952416,  1725.98808233,
        1790.11243705,  1790.11243705,  1914.324297  ,  1986.19506933,
        1986.19506933,  2162.05160159,  2375.11758177,  2506.0144574 ,
        2506.0144574 ,  2777.55470848,  2829.30874841,  2843.36418094,
        2896.50423046,  2950.0206515 ,  2951.08571982,  2951.08571982,
        2961.7363607 ,  3013.41481329,  3062.16948937,  3115.67448341,
      

In [13]:
# Distinct projected distances (np.unique also sorts them)
np.unique(shape_dist_array)

array([    0.        ,    14.50702629,   100.6041139 ,   126.5169287 ,
         186.32333225,   271.48160851,   357.57636353,   432.68603544,
         442.7324664 ,   528.17003552,   614.54645823,   688.71952226,
         698.85923085,   785.51731919,   870.675623  ,   946.35080412,
         956.49111623,  1043.15152284,  1126.8052718 ,  1210.73957797,
        1239.56284427,  1293.45350502,  1365.83982612,  1376.16730479,
        1461.32234981,  1523.00556484,  1544.69355345,  1628.90952416,
        1725.98808233,  1790.11243705,  1914.324297  ,  1986.19506933,
        2162.05160159,  2375.11758177,  2506.0144574 ,  2777.55470848,
        2829.30874841,  2843.36418094,  2896.50423046,  2950.0206515 ,
        2951.08571982,  2961.7363607 ,  3013.41481329,  3062.16948937,
        3115.67448341,  3168.85026828,  3170.78413909,  3221.07606461,
        3271.61786253,  3321.68034611,  3372.11133806,  3425.47418274,
        3449.84238087,  3523.06479534,  3578.37530356,  3661.61397168,
      

In [14]:
# Map the stops of the example shape, one color per stop_sequence
another_shape_gdf.set_geometry("stop_geometry").explore(
    "stop_sequence",
    tiles="CartoDB Positron", 
    categorical=True, legend=False
)

For these shapes, do we want to sort `stop_sequence` instead by `shape_meters`?

Then pick a non-loopy route and see. If we sort and enforce the monotonically increasing rule, what will happen? There was a Sacramento Schedule route that was cut haphazardly.

Look at the array of distances for the line geometry's coords: those are also not monotonically increasing. So, it looks like we need to keep track of which subset of coords has already been grabbed and which is still eligible to be grabbed.

In [15]:
# Try cutting the segment between stop_sequence 1 and 2
# NOTE(review): this cell is repeated below with other stop_sequence pairs;
# candidate for a small helper taking the pair as an argument.
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([1,2])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters"]])

# Pass the shape, its per-vertex distances, and the two stops' shape_meters
# (once as an array, once as a (first, second) tuple)
result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf.shape_meters),
    (test_gdf.shape_meters[0], test_gdf.shape_meters[1])
)

# Map the returned geometry
gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron")

Unnamed: 0,stop_id,stop_sequence,shape_meters
0,99437,1,12515.809937
1,99342,2,126.539395


In [16]:
# Try cutting the segment between stop_sequence 11 and 12
# NOTE(review): same steps as the stop_sequence 1-2 cell with a different pair;
# candidate for a small helper taking the pair as an argument.
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([11,12])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters"]])

# Pass the shape, its per-vertex distances, and the two stops' shape_meters
# (once as an array, once as a (first, second) tuple)
result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf.shape_meters),
    (test_gdf.shape_meters[0], test_gdf.shape_meters[1])
)

# Map the returned geometry
gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron")

Unnamed: 0,stop_id,stop_sequence,shape_meters
0,99859,11,9808.189194
1,12395,12,2951.08572


In [17]:
# Try cutting the segment between stop_sequence 28 and 29
# NOTE(review): same steps as the stop_sequence 1-2 cell with a different pair;
# candidate for a small helper taking the pair as an argument.
test_gdf = another_shape_gdf[another_shape_gdf.stop_sequence.isin([28, 29])
                             ].reset_index(drop=True)
display(test_gdf[["stop_id", "stop_sequence", "shape_meters"]])

# Pass the shape, its per-vertex distances, and the two stops' shape_meters
# (once as an array, once as a (first, second) tuple)
result = cut_stop_segments.get_shape_coords_up_to_stop(
    shape_geom,
    shape_dist_array,
    np.array(test_gdf.shape_meters),
    (test_gdf.shape_meters[0], test_gdf.shape_meters[1])
)

# Map the returned geometry
gpd.GeoSeries(result, crs="EPSG:3310").explore(
    tiles="CartoDB Positron")

Unnamed: 0,stop_id,stop_sequence,shape_meters
0,10090,28,1148.989841
1,10094,29,11432.430297


In [18]:
# Load the already-cut stop segments for the two example shapes.
# NOTE(review): the nested filter lists presumably follow the parquet
# DNF convention (outer list = OR), i.e. either shape -- confirm.
stop_segments = helpers.import_segments(
    SEGMENT_GCS,
    f"stop_segments_{analysis_date}", 
    filters=[[("shape_array_key", "==", shape_with_utc)],
             [("shape_array_key", "==", another_shape)]
            ]
)

In [19]:
# Map only the segments that have a geometry, colored by stop_sequence
stop_segments[stop_segments.geometry.notna()
             ].explore("stop_sequence", tiles="CartoDB Positron")