# Speedmap segments 
* The 20th, 50th, and 80th percentile speeds look extremely tight. Why?
* Is this also happening in the trip files?

In [None]:
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT
from shared_utils import rt_dates

# Service dates used in this notebook, looked up in rt_dates.DATES.
# `nov_date` is the earlier date; `analysis_date` drives the parquet reads.
nov_date = rt_dates.DATES["nov2023"]
analysis_date = rt_dates.DATES["apr2024"]

In [None]:
# Entries from the speedmap_segments section of GTFS_DATA_DICT.
# TRIP_FILE is later joined with SEGMENT_GCS and a date to form a parquet path.
SHAPE_FILE = GTFS_DATA_DICT.speedmap_segments.shape_stop_single_segment
TRIP_FILE = GTFS_DATA_DICT.speedmap_segments.stage4

In [None]:
# Operator under investigation; matched against the "name" column below.
operator_name = "Big Blue Bus Schedule"

# Scheduled trips on the analysis date, restricted to this operator and
# returned as a pandas DataFrame.
operator_route_df = helpers.import_scheduled_trips(
    analysis_date,
    filters=[[("name", "==", operator_name)]],
    columns=[
        "gtfs_dataset_key",
        "name",
        "route_short_name",
        "route_long_name",
        "route_id",
    ],
    get_pandas=True,
)

In [None]:
# Same operator on the November date; shape_id is included here so a
# shape can be traced back to its route name.
nov_trips = helpers.import_scheduled_trips(
    nov_date,
    filters=[[("name", "==", operator_name)]],
    columns=[
        "gtfs_dataset_key",
        "name",
        "shape_id",
        "route_id",
        "route_long_name",
        "route_short_name",
    ],
    get_pandas=True,
)

In [None]:
# The schedule feed key must be the same on both dates before it is reused
# as the operator filter in later cells. Only the first row of each frame is
# compared -- assumes one key per operator per date; TODO confirm.
if (
    nov_trips.schedule_gtfs_dataset_key.iloc[0]
    == operator_route_df.schedule_gtfs_dataset_key.iloc[0]
):
    bbb_key = nov_trips.schedule_gtfs_dataset_key.iloc[0]
else:
    # Fail here with a clear message rather than leaving `bbb_key` undefined
    # (or stale from an earlier run) and hitting a NameError further down.
    raise ValueError(
        f"schedule_gtfs_dataset_key for {operator_name!r} differs between "
        f"{nov_date} and {analysis_date}; cannot choose a single operator key."
    )

In [None]:
def nov_shape_to_apr_route(
    nov_trips: pd.DataFrame,
    apr_route_df: pd.DataFrame,
    operator_key: str = bbb_key,
    one_shape: str = ""
):
    """
    Translate a shape_id found in `nov_trips` into a route_id from
    `apr_route_df`, matching the two frames on route_short_name.

    Args:
        nov_trips: frame with `shape_id` and `route_short_name` columns.
        apr_route_df: frame with `route_short_name` and `route_id` columns.
        operator_key: not used by the lookup; kept so existing positional
            calls keep working. Callers in this notebook pass frames that
            were already filtered by operator name.
        one_shape: the shape_id to look up in `nov_trips`.

    Returns:
        The `route_id` of the first row in `apr_route_df` whose
        route_short_name equals that of the first `nov_trips` row
        with `shape_id == one_shape`.

    Raises:
        IndexError: if `one_shape` is not present in `nov_trips`, or if its
            route_short_name has no match in `apr_route_df`. (Same exception
            type as the bare `.iloc[0]` failure, but with a message that
            names the missing value.)
    """
    shape_rows = nov_trips[nov_trips.shape_id == one_shape]
    if shape_rows.empty:
        raise IndexError(f"shape_id {one_shape!r} not found in nov_trips")

    # Only the first match is used; rows for one shape are assumed to share
    # a route_short_name -- TODO confirm.
    nov_route_name = shape_rows.route_short_name.iloc[0]

    route_rows = apr_route_df[apr_route_df.route_short_name == nov_route_name]
    if route_rows.empty:
        raise IndexError(
            f"route_short_name {nov_route_name!r} (from shape_id "
            f"{one_shape!r}) not found in apr_route_df"
        )

    return route_rows.route_id.iloc[0]

## Trip

In [None]:
# Read the trip-level file for the analysis date, keeping only this
# operator's rows, and round speed_mph to 2 decimals in the same chain.
trip_df = pd.read_parquet(
    f"{SEGMENT_GCS}{TRIP_FILE}_{analysis_date}.parquet",
    filters=[[("schedule_gtfs_dataset_key", "==", bbb_key)]],
).assign(speed_mph=lambda df: df.speed_mph.round(2))

In [None]:
# For each shape_id of interest, resolve the route_id that carries the same
# route_short_name on the analysis date.

# Olympic
olympic_shape1 = "26450"
olympic_route1 = nov_shape_to_apr_route(
    nov_trips, operator_route_df, operator_key=bbb_key, one_shape=olympic_shape1
)

# Santa Monica (two shapes)
santa_monica_shape1 = "26437"
santa_monica_route1 = nov_shape_to_apr_route(
    nov_trips, operator_route_df, operator_key=bbb_key, one_shape=santa_monica_shape1
)

santa_monica_shape2 = "26509"
santa_monica_route2 = nov_shape_to_apr_route(
    nov_trips, operator_route_df, operator_key=bbb_key, one_shape=santa_monica_shape2
)

# 4th
fourth_shape1 = "26464"
fourth_route1 = nov_shape_to_apr_route(
    nov_trips, operator_route_df, operator_key=bbb_key, one_shape=fourth_shape1
)

In [None]:
def filter_to_route(trip_df, operator_key, one_route, one_stop):
    """
    Return the rows of `trip_df` for one operator, route and stop,
    reduced to the speed-related columns and ordered by arrival_time.
    """
    keep_cols = [
        "stop_pair_name", "time_of_day", "arrival_time",
        "speed_mph", "meters_elapsed", "sec_elapsed",
    ]

    is_operator = trip_df.schedule_gtfs_dataset_key == operator_key
    is_route = trip_df.route_id == one_route
    is_stop = trip_df.stop_id == one_stop

    selected = trip_df.loc[is_operator & is_route & is_stop, keep_cols]
    return selected.sort_values(by="arrival_time")

In [None]:
# Lookup tables mapping a key (presumably a stop_id, stored as a string) to
# an intersection label, grouped by corridor.
green_olympic_speeds = {
    "721": "Olympic & Prosser",
    "688": "Olympic & Veteran",
    "716": "Olympic & Colby",
    "800": "Olympic & Purdue",
    # NOTE(review): same label as "716" -- confirm this is intended.
    "801": "Olympic & Colby",
    "700": "Olympic & 3030",
}

green_santa_monica_blvd_speeds = {
    "370": "Santa Monica & 14th",
    "117": "Santa Monica & 14th, under",
    "1234": "Santa Monica & 17th",
}

green_fourth_speeds = {
    "668": "4th & San Vincente",
    "666": "4th & Marguerita",
    "665": "4th & Alta",
    "664": "4th & Montana",
    "505": "4th & Washington",
    "504": "4th & California",
    "502": "4th & Washington",
    # TODO: the label for "503" was left blank, which made this cell a
    # SyntaxError. An empty placeholder keeps the key; fill in the real name.
    "503": "",
}

In [None]:
# Several trips share exactly the same speed. Take a look at the
# interpolated stop arrivals: what are the chances this happens by
# coincidence? Is it actually interpolating between different vp_idx values?

In [None]:
# AM Peak rows at stop "502" on the 4th route, ordered by arrival time.
(
    filter_to_route(
        trip_df=trip_df,
        operator_key=bbb_key,
        one_route=fourth_route1,
        one_stop="502",
    )
    .query('time_of_day=="AM Peak"')
)
# Alternative view (not run): group by time_of_day and collect the sorted
# list of speed_mph values per group with
# .groupby("time_of_day").agg({"speed_mph": lambda x: sorted(list(x))})

In [None]:
# Why do several trips show the same speed here?
# List the distinct trips at stop "502" in the AM Peak whose speed falls
# strictly between 15 and 16 mph.
trip_df.loc[
    trip_df.schedule_gtfs_dataset_key.eq(bbb_key)
    & trip_df.route_id.eq(fourth_route1)
    & trip_df.stop_id.eq("502")
    & trip_df.time_of_day.eq("AM Peak")
    & trip_df.speed_mph.gt(15)
    & trip_df.speed_mph.lt(16),
    "trip_instance_key",
].unique()

In [None]:
# Hand-picked trip_instance_key values to inspect (presumably copied from
# the output of the 15-16 mph filter -- TODO confirm).
subset_trips = [
    "0d448c743a91bc96271d36ba4450ebc9",
    "1fbea8d720efd0dd513e98eef5383dbf",
    "3a2e5c9e7304d091406cb5bbdfcc27e4",
    "a0f65344cb59c750934aff210b325f7e",
    "b6fc33a3b002b0bc63b07b6f39d80cb0",
]

# NOTE(review): INTERP_FILE comes from speedmap_segments while
# NEAREST_VP_FILE comes from rt_stop_times -- confirm the mix is intended.
INTERP_FILE = GTFS_DATA_DICT.speedmap_segments.stage3b
NEAREST_VP_FILE = GTFS_DATA_DICT.rt_stop_times.stage2

# Rows of the INTERP_FILE parquet for those trips at stop "502".
stop_arrivals = pd.read_parquet(
    path=f"{SEGMENT_GCS}{INTERP_FILE}_{analysis_date}.parquet",
    filters=[
        [
            ("trip_instance_key", "in", subset_trips),
            ("stop_id", "==", "502"),
        ]
    ],
)

In [None]:
# Rows of the NEAREST_VP_FILE parquet for the same trips and stop "502",
# read through geopandas.
nearest = gpd.read_parquet(
    path=f"{SEGMENT_GCS}{NEAREST_VP_FILE}_{analysis_date}.parquet",
    filters=[
        [
            ("trip_instance_key", "in", subset_trips),
            ("stop_id", "==", "502"),
        ]
    ],
)

In [None]:
# For every row label, print the label and then that row's
# location_timestamp_local_trio value on the next line.
for row_label in nearest.index:
    timestamp_trio = nearest.loc[row_label]["location_timestamp_local_trio"]
    print(row_label, timestamp_trio, sep="\n")
    # The "vp_coords_trio" column can be printed the same way when the
    # coordinates are needed as well.

In [None]:
# Show the column labels available on `nearest`.
nearest.columns