# Investigate need for a `stop_id-route_id` grain table

Eric: 
* I can think of a few examples of when this would not be true -- for example a route where one trip in a particular direction deviates to serve a high school around dismissal time, then resumes the route and serves all remaining stops. It seems possible that those remaining stops would have a shifted (or even completely different) `stop_sequence`...
* My first thought is that making a table like this at the `shape_id` level could avoid some of those issues, hopefully capturing complexities like the one above. 
* GTFS spec: stop sequence relationships are only required to be consistent within the individual trip (hence the cumbersome joins).
* Perhaps one middle ground would be to make this table without `stop_sequence`. It could then be used to show things like "here are all the stops generally served by this route in this direction", without claiming that there is a consistent sequence or that all trips on that route in that direction serve exactly those stops.

In [1]:
import geopandas as gpd
import pandas as pd

# Base Google Cloud Storage path for the analysis data.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
# Folder holding the cached parquet tables (trips, st, routelines, stops)
# that the cells below read.
COMPILED_CACHED_VIEWS = f"{GCS_FILE_PATH}rt_delay/compiled_cached_views/"
# Date string embedded in every parquet filename read below.
analysis_date = "2023-01-18"



Pick LA Metro and pick a `route_id` that has a lot of `shape_id` values.

See how `stop_sequence` plays a role here for stops.

In [2]:
# Operator whose feed is used as the test case.
TEST_OPERATOR = "LA Metro Bus Schedule"

# Load only this operator's trips, and only the columns used below.
trips = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}trips_{analysis_date}.parquet",
    filters=[[("name", "==", TEST_OPERATOR)]],
    columns=[
        "feed_key", "name", "trip_id",
        "shape_id", "route_id", "direction_id",
    ],
)

# feed_key of the first trip row; reused to filter the other tables.
TEST_FEED = trips["feed_key"].iloc[0]

# Count distinct trips, directions and shapes per route, then show the
# routes with the most shape_id variations first.
(
    trips.groupby("route_id")[["trip_id", "direction_id", "shape_id"]]
    .nunique()
    .reset_index()
    .sort_values("shape_id", ascending=False)
    .head()
)


Unnamed: 0,route_id,trip_id,direction_id,shape_id
79,51-13167,300,2,23
0,10-13167,132,2,19
4,108-13167,197,2,18
110,92-13167,111,2,17
6,111-13167,197,2,17


In [3]:
# Route with the most distinct shape_ids in the ranking above.
TEST_ROUTE = "51-13167"

def get_subset_trips_shapes(test_route: str) -> tuple[list, list]:
    """
    Look up the trips and shapes that belong to one route_id.

    Reads the module-level `trips` table.

    Args:
        test_route: route_id to select.

    Returns:
        A tuple of 2 lists:
        list1: unique trip_id values for that route_id
        list2: unique shape_id values for that route_id
    """
    # Filter once and reuse the subset for both columns.
    route_trips = trips[trips.route_id == test_route]

    subset_trips = route_trips.trip_id.unique().tolist()
    subset_shapes = route_trips.shape_id.unique().tolist()

    return subset_trips, subset_shapes

# Trip ids and shape ids for the test route, kept as module-level lists
# for the cells below.
TEST_TRIPS, TEST_SHAPES = get_subset_trips_shapes(TEST_ROUTE)

In [4]:
def stop_times_shapes_df(
    test_route: str) -> tuple[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Load the stop_times and shapes rows for a single route_id.

    Both tables are read for TEST_FEED only, then narrowed to the
    trip_ids / shape_ids that get_subset_trips_shapes() returns
    for the route.
    """
    route_trip_ids, route_shape_ids = get_subset_trips_shapes(test_route)
    feed_filter = [[("feed_key", "==", TEST_FEED)]]

    # stop_times for the whole feed, then keep only this route's trips
    feed_stop_times = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}st_{analysis_date}.parquet",
        filters=feed_filter,
        columns=["feed_key", "stop_id", "trip_id", "stop_sequence"],
    )
    route_stop_times = feed_stop_times[
        feed_stop_times.trip_id.isin(route_trip_ids)]

    # shape geometries for the whole feed, then keep only this route's shapes
    feed_shapes = gpd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns=["feed_key", "shape_id", "geometry"],
        filters=feed_filter,
    )
    route_shapes = feed_shapes[feed_shapes.shape_id.isin(route_shape_ids)]

    return route_stop_times, route_shapes

    
# stop_times and shapes narrowed to the test route.
stop_times, shapes = stop_times_shapes_df(TEST_ROUTE)

# Stop names and geometry for every stop in the test feed.
stops = gpd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}stops_{analysis_date}.parquet",
    filters=[[("feed_key", "==", TEST_FEED)]],
    columns=["feed_key", "stop_id", "stop_name", "geometry"],
)

In [5]:
# Attach shape_id to each stop_times row through its trip.
stop_times_with_shape = stop_times.merge(
    trips[["feed_key", "trip_id", "shape_id"]].drop_duplicates(),
    how="inner",
    on=["feed_key", "trip_id"],
)

In [6]:
# Collapse trips: keep one row per feed_key / stop_id / stop_sequence /
# shape_id combination, ordered by stop_sequence then stop_id.
stop_times_unique = stop_times_with_shape.drop(columns="trip_id")
stop_times_unique = stop_times_unique.drop_duplicates(
    subset=["feed_key", "stop_id", "stop_sequence", "shape_id"]
)
stop_times_unique = (
    stop_times_unique
    .sort_values(["stop_sequence", "stop_id"])
    .reset_index(drop=True)
)

In [7]:
# For every shape on the test route, print the shape_id followed by its
# row count, distinct stop_id count and distinct stop_sequence count.
# Equal numbers mean each stop / stop_sequence appears once in that shape.
for i in TEST_SHAPES:
    print(i)
    subset_df = stop_times_unique.loc[stop_times_unique["shape_id"] == i]

    n_rows = subset_df.shape[0]
    n_stops = subset_df["stop_id"].nunique()
    n_stop_seq = subset_df["stop_sequence"].nunique()

    print(n_rows, n_stops, n_stop_seq)

510323_DEC22
54 54 54
510327_DEC22
61 61 61
510330_DEC22
53 53 53
510328_DEC22
80 80 80
510331_DEC22
72 72 72
510326_DEC22
62 62 62
510329_DEC22
75 75 75
510333_DEC22
52 52 52
510325_DEC22
83 83 83
510339_DEC22
74 74 74
510335_DEC22
53 53 53
510336_DEC22
42 42 42
510334_DEC22
53 53 53
510321_DEC22
40 40 40
510324_DEC22
61 61 61
510340_DEC22
52 52 52
510301_DEC22
33 33 33
510332_DEC22
60 60 60
510320_DEC22
33 33 33
510315_DEC22
32 32 32
510337_DEC22
31 31 31
510338_DEC22
0 0 0
510341_DEC22
71 71 71


In [8]:
# shape_ids used by the test route, split by direction_id.
shapes0 = trips.loc[
    (trips.route_id == TEST_ROUTE) & (trips.direction_id == 0),
    "shape_id",
].unique()
shapes1 = trips.loc[
    (trips.route_id == TEST_ROUTE) & (trips.direction_id == 1),
    "shape_id",
].unique()

Now that it's established that `stop_sequence` is unique within a `shape_id`, 
let's take a batch of those `shape_ids` that travel in the same direction.

If `stop_sequence` is unique within `route-direction`, that would be really useful... but it's unlikely.

In [9]:
# Keep only stop_times on direction-0 shapes, then de-duplicate on
# feed_key / stop_id / stop_sequence. shape_id is not part of that key,
# and drop_duplicates keeps the first row per key, so the shape_id left
# on a row is only the first shape seen for that stop / sequence pair.
st0 = stop_times_with_shape[stop_times_with_shape.shape_id.isin(shapes0)]
st0 = (
    st0.drop(columns="trip_id")
    .drop_duplicates(subset=["feed_key", "stop_id", "stop_sequence"])
    .sort_values(["stop_sequence", "stop_id"])
    .reset_index(drop=True)
)

# Merge in point geom (stops goes on the left side of the merge)
st0 = pd.merge(stops, st0, how="inner", on=["feed_key", "stop_id"])


In [10]:
# Shape that showed 83 stops in the per-shape counts above.
very_long_shape = "510325_DEC22"
st0.loc[st0.shape_id == very_long_shape, "stop_id"].nunique()

83

In [11]:
# Shape that showed 61 stops in the per-shape counts above.
short_shape = "510324_DEC22"
st0.loc[st0.shape_id == short_shape, "stop_id"].nunique()

61

In [12]:
# Stops of the longer shape in stop_sequence order, with geometry kept.
longer = st0[st0.shape_id == very_long_shape]
longer = (
    longer[["stop_id", "stop_name", "stop_sequence", "geometry"]]
    .sort_values("stop_sequence")
    .reset_index(drop=True)
)

In [13]:
# Stops of the shorter shape in stop_sequence order (no geometry column).
shorter = st0[st0.shape_id == short_shape]
shorter = (
    shorter[["stop_id", "stop_name", "stop_sequence"]]
    .sort_values("stop_sequence")
    .reset_index(drop=True)
)

In [14]:
# Compare the stops of the two shapes. `_merge` marks each stop as
# both / left_only (longer shape only) / right_only (shorter shape only).
# validate="1:1" raises if either side lists the same
# stop_id / stop_name pair more than once.
m1 = pd.merge(
    longer.drop(columns="geometry"),
    shorter,
    on=["stop_id", "stop_name"],
    how="outer",
    validate="1:1",
    indicator=True,
)

# `shorter` carries no geometry, so taking geometry from `longer` alone
# would leave right_only stops with null geometry and they could not be
# drawn on the map. Attach geometry from `stops` so every compared stop
# has a point; `stops` is on the left to keep the geometry column active.
m1 = pd.merge(
    stops[["stop_id", "geometry"]],
    m1,
    on="stop_id",
    how="inner",
)

In [15]:
# Interactive map of the compared stops, coloured by the merge indicator.
m1.explore(column="_merge", tiles="Carto DB Positron")