# Sanity check: `scheduled_stop_times` 
* Check that every row is a unique combo of `trip_instance_key-stop_id-stop_sequence`
* Spot check a couple of stops to make sure `stop_primary_direction` is behaving the way we expect.
   * Especially since `dd.map_partitions` is called with `align_dataframes = False`

In [1]:
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS, RT_SCHED_GCS
from shared_utils import rt_dates

# Date under review, looked up by its "sep2023" key in rt_dates.DATES.
# Every load below is parameterized by this value.
analysis_date = rt_dates.DATES["sep2023"]

In [2]:
# Read only the columns needed for the uniqueness and direction checks.
stop_times_direction_path = (
    f"{RT_SCHED_GCS}stop_times_direction_{analysis_date}.parquet"
)
stop_times_direction_cols = [
    "trip_instance_key",
    "stop_id",
    "stop_sequence",
    "geometry",
    "stop_primary_direction",
]

scheduled_stop_times = gpd.read_parquet(
    stop_times_direction_path,
    columns=stop_times_direction_cols,
)

In [3]:
# Columns that are expected to jointly identify exactly one row.
trip_stop_cols = ["trip_instance_key", "stop_id", "stop_sequence"]

# Compare the total row count with the number of distinct key combinations.
n_rows = len(scheduled_stop_times)
expected_unique_rows = len(scheduled_stop_times[trip_stop_cols].drop_duplicates())

# Report the counts on failure so the error is self-explanatory
# rather than a bare `AssertionError`.
assert n_rows == expected_unique_rows, (
    f"scheduled_stop_times is not unique on {trip_stop_cols}: "
    f"{n_rows} rows vs. {expected_unique_rows} unique combinations "
    f"({n_rows - expected_unique_rows} extra rows)"
)

AssertionError: 

## Investigate why rows are not unique

In [4]:
# Number of surplus rows, i.e. rows beyond the first for each
# trip_instance_key-stop_id-stop_sequence combination.
n_surplus_rows = n_rows - expected_unique_rows
n_surplus_rows

120

In [5]:
# keep=False flags every member of a duplicated group, so both the first
# occurrence and its repeats are shown side by side.
is_duplicated = scheduled_stop_times.duplicated(
    subset=trip_stop_cols,
    keep=False,
)
scheduled_stop_times.loc[is_duplicated]

Unnamed: 0,trip_instance_key,stop_id,stop_sequence,geometry,stop_primary_direction
864070,44c988a45bd7a196e569ef66b4a6b552,6573716,0,POINT (248215.002 -382920.471),Southbound
864071,44c988a45bd7a196e569ef66b4a6b552,6573716,0,POINT (248215.002 -382920.471),Unknown
864072,44c988a45bd7a196e569ef66b4a6b552,3118875,1,POINT (250743.139 -382426.021),Southbound
864073,44c988a45bd7a196e569ef66b4a6b552,3118875,1,POINT (250743.139 -382426.021),Eastbound
864074,44c988a45bd7a196e569ef66b4a6b552,4028613,1,POINT (246428.935 -422741.402),Southbound
...,...,...,...,...,...
2653401,d2d28bff624c982d19050356737400d0,6573716,2,POINT (248215.002 -382920.471),Northbound
2653402,d2d28bff624c982d19050356737400d0,3123490,2,POINT (247931.788 -431248.536),Southbound
2653403,d2d28bff624c982d19050356737400d0,3123490,2,POINT (247931.788 -431248.536),Unknown
2653404,d2d28bff624c982d19050356737400d0,3123082,3,POINT (249410.572 -431475.496),Southbound


In [6]:
# Pick one trip that appears among the duplicated rows and pull its
# scheduled-trip record.
one_trip = "44c988a45bd7a196e569ef66b4a6b552"
one_trip_filter = [[("trip_instance_key", "==", one_trip)]]

trips = helpers.import_scheduled_trips(
    analysis_date,
    filters=one_trip_filter,
    columns=None,  # return all columns, set filtering to None
    get_pandas=True,
)
trips

Unnamed: 0,feed_key,schedule_gtfs_dataset_key,name,regional_feed_type,service_date,trip_start_date_pacific,trip_id,trip_instance_key,route_key,route_id,...,route_desc,direction_id,shape_array_key,shape_id,trip_first_departure_datetime_pacific,trip_last_arrival_datetime_pacific,service_hours,trip_start_date_local_tz,trip_first_departure_datetime_local_tz,trip_last_arrival_datetime_local_tz
0,8caabd5e9263c45f86e5bb3bbfd36c12,8eecb796518dafd3c1b971a99f8b8252,Victor Valley GMV Schedule,,2023-09-13,2023-09-13,15102,44c988a45bd7a196e569ef66b4a6b552,ca911b413b9186fe599863322ca89b27,3220,...,Barstow - Victorville - San Bernardino,0.0,764e97bc51e230a94dac11b8f05a8b8a,19312,2023-09-13 08:00:00,2023-09-13 09:54:00,1.9,2023-09-13,2023-09-13 08:00:00,2023-09-13 09:54:00


In [7]:
# The trip record looks fine, so fetch the matching stop_times entries,
# which are looked up by feed_key + trip_id.
one_trip_id = trips["trip_id"].iloc[0]
one_feed_key = trips["feed_key"].iloc[0]

feed_trip_filter = [[
    ("feed_key", "==", one_feed_key),
    ("trip_id", "==", one_trip_id),
]]

stop_times_lazy = helpers.import_scheduled_stop_times(
    analysis_date,
    filters=feed_trip_filter,
    columns=None,
)

# Materialize, then order by stop_sequence with a fresh 0..n-1 index.
stop_times = (
    stop_times_lazy
    .compute()
    .sort_values("stop_sequence")
    .reset_index(drop=True)
)

In [8]:
# Plot these on a map
scheduled_stop_times[
    scheduled_stop_times.trip_instance_key==one_trip
].explore("stop_sequence", 
          tiles = "CartoDB Positron",
          categorical = True
         )