# Sanity check: `scheduled_stop_times` 
* Check that every row is a unique combination of `trip_instance_key`-`stop_id`-`stop_sequence`.
* Spot check a couple of stops to make sure `stop_primary_direction` is behaving the way we expect.
   * Especially since `dd.map_partitions` uses `align_dataframes = False`.

In [None]:
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS, RT_SCHED_GCS
from shared_utils import rt_dates

# Date being checked; used below to locate the stop_times_direction parquet
# and as the first argument when importing scheduled trips / stop_times.
analysis_date = rt_dates.DATES["sep2023"]

In [None]:
# Read only the columns used in this notebook: the trip-stop identifiers for
# the uniqueness check, plus geometry and direction for the map spot checks.
stop_times_direction_path = (
    f"{RT_SCHED_GCS}stop_times_direction_{analysis_date}.parquet"
)
stop_times_direction_cols = [
    "trip_instance_key",
    "stop_id",
    "stop_sequence",
    "geometry",
    "stop_primary_direction",
]

scheduled_stop_times = gpd.read_parquet(
    stop_times_direction_path,
    columns=stop_times_direction_cols,
)

In [None]:
# Columns that together are expected to identify a single stop_times row.
trip_stop_cols = ["trip_instance_key", "stop_id", "stop_sequence"]

n_rows = len(scheduled_stop_times)
expected_unique_rows = len(scheduled_stop_times[trip_stop_cols].drop_duplicates())

# Include the size of the mismatch in the failure so the traceback alone
# says how many rows need investigating.
assert n_rows == expected_unique_rows, (
    f"{n_rows - expected_unique_rows} of {n_rows} rows share a "
    f"{'-'.join(trip_stop_cols)} combination with an earlier row"
)

## Investigate why rows are not unique

In [None]:
# Number of rows in excess of the unique trip-stop combinations
n_rows - expected_unique_rows

In [None]:
# keep=False flags every row of a duplicated group (not only the repeats),
# so all conflicting rows are displayed together.
in_duplicate_group = scheduled_stop_times.duplicated(
    subset=trip_stop_cols, keep=False
)
scheduled_stop_times[in_duplicate_group]

In [None]:
# Look at the trips table entry for one trip_instance_key
one_trip = "44c988a45bd7a196e569ef66b4a6b552"
one_trip_filter = [[("trip_instance_key", "==", one_trip)]]

trips = helpers.import_scheduled_trips(
    analysis_date,
    filters=one_trip_filter,
    columns=None,  # return all columns, set filtering to None
    get_pandas=True,
)
trips

In [None]:
# The trips entry looks ok, so grab the corresponding
# stop_times rows using feed_key + trip_id
one_trip_id = trips["trip_id"].iloc[0]
one_feed_key = trips["feed_key"].iloc[0]

feed_trip_filter = [[
    ("feed_key", "==", one_feed_key),
    ("trip_id", "==", one_trip_id),
]]

stop_times_raw = helpers.import_scheduled_stop_times(
    analysis_date,
    filters=feed_trip_filter,
    columns=None,
)

# Materialize, then order the stops along the trip with a clean 0..n-1 index
stop_times = (
    stop_times_raw.compute()
    .sort_values("stop_sequence")
    .reset_index(drop=True)
)

In [None]:
# Map the stops for this one trip, colored by stop_sequence
one_trip_stops = scheduled_stop_times[
    scheduled_stop_times["trip_instance_key"] == one_trip
]
one_trip_stops.explore(
    "stop_sequence",
    tiles="CartoDB Positron",
    categorical=True,
)

## Spot checking

In [None]:
# Drop rows that repeat an earlier trip-stop combination
# (the first occurrence of each combination is kept).
is_repeat_row = scheduled_stop_times.duplicated(subset=trip_stop_cols)
ok_stop_times = scheduled_stop_times[~is_repeat_row]

In [None]:
# One row per trip, then draw 10 of them at random
unique_trip_keys = ok_stop_times[["trip_instance_key"]].drop_duplicates()
sample_10_trips = unique_trip_keys.sample(10)

In [None]:
# Display the sampled trip_instance_keys
sample_10_trips.trip_instance_key.unique()

In [None]:
# Fixed list of trip_instance_keys to plot in the spot check below
sample_10_trips_list = [
    "81e873ee8252a8a0877cc983e57a6b29",
    "6189b77fba24e1ecc69f7da11c643434",
    "65d8444657dd5902ca05d7bda31c6922",
    "c41dc1d746e48f2b47dbdce466c0d221",
    "0db09e8871638928aa84611685de44bd",
    "d276f8f018790f8bc378a785063a08ad",
    "10a7d41d663609a4488f946d638281ad",
    "ffee4aee8f3d7693429e7a342296b8fc",
    "18160e8844c2870cd823587a287a8b71",
    "d3f339b7bd23d62ff231a7c1107545f1",
]

In [None]:
def plot_stops_by_direction(gdf: gpd.GeoDataFrame, one_trip: str):
    """
    Display an interactive map of one trip's stops, colored by
    `stop_primary_direction`.

    Filters `gdf` to the rows whose `trip_instance_key` equals `one_trip`,
    prints the key, then renders the map in the notebook output.
    Nothing is returned.
    """
    is_this_trip = gdf["trip_instance_key"] == one_trip
    trip_stops = gdf[is_this_trip].reset_index(drop=True)

    print(f"trip_instance_key: {one_trip}")

    direction_map = trip_stops.explore(
        "stop_primary_direction",
        categorical=True,
        tiles="CartoDB Positron",
    )

    # Called inside a function, so the map must be displayed explicitly
    display(direction_map)

In [None]:
# Map each sampled trip to eyeball whether stop_primary_direction looks right
for trip_key in sample_10_trips_list:
    plot_stops_by_direction(ok_stop_times, trip_key)