# Sanity check: `test_speedmap_pipe` results

* Check to see we can merge speeds back to segments
* Need to dive into which grouping cols should be used when taking the average

In [None]:
import geopandas as gpd
import pandas as pd

from shared_utils import rt_dates
from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT

# Service date for this check: it is interpolated into the dated parquet
# paths read below and later stamped onto each row as `service_date`.
analysis_date = rt_dates.DATES["mar2024"]

In [None]:
# File-name stems from the `speedmap_segments` entry of the data catalog;
# both are combined with SEGMENT_GCS and the analysis date when reading.
speedmap_catalog = GTFS_DATA_DICT.speedmap_segments
file = speedmap_catalog.stage4
segments_file = speedmap_catalog.segments_file

In [None]:
# Load the stage4 speeds table for the analysis date.
speeds_path = f"{SEGMENT_GCS}{file}_{analysis_date}.parquet"
speeds = pd.read_parquet(speeds_path)

In [None]:
# Read only the columns needed later: the merge keys, the segment
# identifier, and the geometry.
segment_columns = [
    "trip_instance_key",
    "stop_id1",
    "stop_sequence",
    "stop_sequence1",
    "segment_id",
    "geometry",
]
segments_path = f"{SEGMENT_GCS}{segments_file}_{analysis_date}.parquet"
segments_gdf = gpd.read_parquet(segments_path, columns=segment_columns)

In [None]:
# The speeds table names the stop column `stop_id`, while the segments
# table uses `stop_id1`; align the names so both can be used as a key.
merge_keys = ["trip_instance_key", "stop_id1", "stop_sequence", "stop_sequence1"]
speeds_renamed = speeds.rename(columns={"stop_id": "stop_id1"})

# No `how` is given, so this is pandas' default inner join: rows without
# a match on either side are dropped.
gdf = pd.merge(segments_gdf, speeds_renamed, on=merge_keys)

In [None]:
# Gut check on the join: compare the merged row count against the raw
# speeds row count. Roughly, most rows join up (0.8% missing).
#
# Open question: should the join switch to stop_pair?
# While trip_instance_key is part of the key it doesn't matter, but any
# grouping above the trip level should use stop_pair instead.
(gdf.shape, speeds.shape)

In [None]:
from segment_speed_utils import gtfs_schedule_wrangling, metrics

# Stamp every row with the service date, then run the two
# gtfs_schedule_wrangling helpers (peak/offpeak first, weekday/weekend
# second) in the same order as before.
gdf_dated = gdf.assign(service_date=pd.to_datetime(analysis_date))
gdf_peak = gtfs_schedule_wrangling.add_peak_offpeak_column(gdf_dated)
gdf2 = gtfs_schedule_wrangling.add_weekday_weekend_column(gdf_peak)

In [None]:
# Summary statistics for all joined speeds, before any cap is applied.
gdf2["speed_mph"].describe()

In [None]:
# Histogram of speeds in 5 mph bins from 0 up to the 80 mph cap.
# The upper bound must be 85 so the last bin edge is 80: range(0, 80, 5)
# stops at 75 and left speeds above 75 mph off the plot.
gdf2["speed_mph"].hist(bins=range(0, 85, 5))

In [None]:
# Summary statistics restricted to rows at or below the cap
# (we set max_speed at 80).
gdf2[gdf2["speed_mph"].le(80)].describe()

Need to figure out the right grouping columns to use

In [None]:
# Candidate grouping columns for the averages (still under evaluation).
speed_group_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "segment_id",
]

# Drop rows above the 80 mph cap before averaging.
speeds_within_cap = gdf2.loc[gdf2["speed_mph"] <= 80]

avg_speeds = metrics.concatenate_peak_offpeak_allday_averages(
    speeds_within_cap,
    group_cols=speed_group_cols,
    metric_type="segment_speeds",
)

In [None]:
# The segments table is read with a trip_instance_key column, so collapse
# it to distinct (segment_id, geometry) pairs before attaching geometry
# to the averaged speeds.
segment_geometries = segments_gdf[["segment_id", "geometry"]].drop_duplicates()

avg_speeds_with_geom = pd.merge(
    segment_geometries,
    avg_speeds,
    how="inner",
    on="segment_id",
)

In [None]:
# `rt_utils` was referenced below without ever being imported, which
# raised NameError on a fresh kernel; import it here so the cell runs.
from shared_utils import rt_utils

# Note: a branca RdYlGn_10 step colormap scaled 0-50 was tried earlier;
# the shared rt_utils colorscale is used instead.

# Interactive map of the `p20_mph` column for offpeak rows only.
offpeak_speeds = avg_speeds_with_geom[
    avg_speeds_with_geom.time_period == "offpeak"
]

offpeak_speeds.explore(
    "p20_mph",
    cmap=rt_utils.ZERO_THIRTY_COLORSCALE,
    tiles="CartoDB Positron",
)