# Trips on the same shape can have different stop_ids and stop_sequences, but are the same for the most part

For each shape, if we can select the trip with the most stops, that's the one we should base our segments on. By taking all the trips into account, we're muddying what `stop_sequence` means.

Heuristic: for each shape, pick the trip with the most stops (with trips sorted alphabetically, so the choice is deterministic when several trips tie).

Most of the trips that occur on the same shape have the same number of stops, and have the same exact stops.
Let's check that the trips with the most stops contain all the stops present in the other trips that share the same shape.

In [None]:
import dask.dataframe as dd
import pandas as pd

from shared_utils import rt_dates
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS

In [2]:
# One analysis date per month, looked up in rt_dates.DATES under keys
# of the form "<month>2023" (e.g. "mar2023").
months = ["mar", "apr", "may", "jun", "jul"]

dates = [rt_dates.DATES[month + "2023"] for month in months]
dates

['2023-03-15', '2023-04-12', '2023-05-17', '2023-06-14', '2023-07-12']

In [5]:
# Scheduled stop_times for the last date in `dates`.
stop_times = helpers.import_scheduled_stop_times(
    dates[-1], columns=["feed_key", "trip_id", "stop_sequence", "stop_id"]
)

# Count the non-null stop_id rows for each (feed_key, trip_id) pair and
# store the tally as n_stops. `.compute()` materializes the result
# (stop_times is presumably a lazy dask dataframe -- confirm against helper).
stops_per_trip = (
    stop_times
    .groupby(["feed_key", "trip_id"], observed=True, group_keys=False)
    .agg({"stop_id": "count"})
    .rename(columns={"stop_id": "n_stops"})
    .reset_index()
    .compute()
)

In [4]:
# Scheduled trips for the last date in `dates`, keeping only the columns
# needed to tie each trip to its shape. get_pandas=True presumably returns
# an in-memory pandas dataframe -- confirm against helper.
trips = helpers.import_scheduled_trips(
    dates[-1],
    columns=["feed_key", "trip_id", "shape_array_key"],
    get_pandas=True,
)

In [6]:
# Attach each trip's stop count to its shape. The inner join keeps only
# trips that appear in both tables.
df = trips.merge(stops_per_trip, on=["feed_key", "trip_id"], how="inner")

In [8]:
# For every shape, collect the distinct per-trip stop counts into a list.
df2 = (
    df.groupby("shape_array_key")["n_stops"]
    .agg(lambda counts: list(set(counts)))
    .reset_index()
)

In [14]:
# `multiple` = how many distinct per-trip stop counts a shape has. A value
# above 1 means the trips on that shape do not all have the same number of
# stops.
# Series.map(len) measures each n_stops list directly. It avoids a row-wise
# DataFrame.apply and still yields a Series when df2 has no rows.
df2 = df2.assign(
    multiple = df2.n_stops.map(len)
)

In [15]:
# Distribution of `multiple` across shapes: shares first, then raw counts.
print(df2["multiple"].value_counts(normalize=True))
print(df2["multiple"].value_counts())

1    0.983568
2    0.009683
3    0.004255
5    0.001174
4    0.001027
6    0.000293
Name: multiple, dtype: float64
1    6704
2      66
3      29
5       8
4       7
6       2
Name: multiple, dtype: int64


In [16]:
# Shapes whose trips disagree on the number of stops.
df2.loc[df2["multiple"] > 1]

Unnamed: 0,shape_array_key,n_stops,multiple
8,005514e5fae0dcedd6c2e67a7eae7f18,"[50, 26]",2
91,036b25723fe1851ad77ee8e20509eca9,"[16, 15]",2
93,038f82980fa0cd12294f51089d357342,"[8, 17, 10]",3
153,05880fbcae889a9d2f044edcb349a64c,"[27, 35]",2
172,05fa3f0b05e1130683459c17a44eb4c4,"[13, 14]",2
...,...,...,...
6713,fc95469c45abff1b52a29ae29bd9cfaa,"[8, 13, 14, 15, 17]",5
6732,fd549241a84b8e80d7848999ea915a5e,"[8, 9, 7]",3
6747,fda0474810ee0e700734d41b8219c1b0,"[10, 11]",2
6795,fee87dd62e3e9c7bf69a6e7ee642db1e,"[4, 6, 7]",3
