# Debug issues in `07_plot_stop_metrics` 

In [1]:
import altair as alt
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date



In [None]:
def stop_avg_by_peak_off_peak(gdf): 
    """
    Roll hourly stop speeds up to one row per stop and peak category.

    The average speed is weighted by trip count: sum(speed_mph * n_trips)
    divided by sum(n_trips) within each group.

    Expects the columns speed_mph, n_trips, stop_sequence and every
    grouping column listed below. Returns a new frame holding the grouping
    columns plus n_trips (summed), stop_sequence (max) and avg_speed_mph.
    The input frame is not modified.
    """
    group_cols = [
        "gtfs_dataset_key", "_gtfs_dataset_name",
        "route_id", "direction_id", "route_short_name",
        "stop_id", "stop_name", "peak",
    ]

    # Numerator of the weighted average, computed row by row.
    weighted = gdf.assign(
        speed_multiplied_trips = gdf["speed_mph"] * gdf["n_trips"]
    )

    # NOTE(review): groupby keeps pandas' default handling of null keys,
    # so rows with a null in any grouping column do not form a group.
    summed = weighted.groupby(group_cols).agg({
        "speed_multiplied_trips": "sum",
        "n_trips": "sum",
        # a stop can appear with different sequences; keep the largest
        "stop_sequence": "max",
    }).reset_index()

    summed["avg_speed_mph"] = (
        summed["speed_multiplied_trips"] / summed["n_trips"]
    )

    return summed.drop(columns = ["speed_multiplied_trips"])

In [None]:
def subset_for_operator(operator_name: str):
    """
    Load hourly stop metrics for one operator and derive the peak rollup.

    Reads the local parquet of hourly stop metrics for `analysis_date`,
    keeping only rows whose `_gtfs_dataset_name` equals `operator_name`.
    Adds a `peak` column ("peak" for the AM Peak / PM Peak time-of-day
    bins, "off peak" for everything else) and rounds `speed_mph` to one
    decimal before aggregating.

    Returns a tuple `(gdf, peak_df)`:
    * gdf: the hourly rows with the rounded speed and the `peak` column
    * peak_df: output of `stop_avg_by_peak_off_peak(gdf)` with
      `avg_speed_mph` rounded to one decimal
    """
    gdf = gpd.read_parquet(
        f"./scripts/data/stop_metrics_by_hour_{analysis_date}.parquet",
        filters = [[("_gtfs_dataset_name", "==", operator_name)]]
    )
    
    # Vectorized membership test instead of a row-wise apply; missing
    # time_of_day values fall into "off peak", as they did before.
    is_peak = gdf.time_of_day.isin(["AM Peak", "PM Peak"])
    
    gdf = gdf.assign(
        speed_mph = gdf.speed_mph.round(1),
        peak = is_peak.map({True: "peak", False: "off peak"}),
    )
    
    peak_df = stop_avg_by_peak_off_peak(gdf)
    peak_df = peak_df.assign(
        avg_speed_mph = peak_df.avg_speed_mph.round(1)
    )
    
    return gdf, peak_df

## Low Average Speeds

* Does the problem come from the more granular trip-stop level data?
* Or is it introduced by the weighted average?

In [None]:
name = "San Diego Vehicle Positions"

In [None]:
sd, sd_peak = subset_for_operator(name)
rt_key = sd.gtfs_dataset_key.iloc[0]

In [None]:
test_route = "201"
test_stop = "UTC Transit Center"

In [None]:
sd[(sd.route_short_name==test_route) & 
   (sd.stop_name==test_stop)
  ].head()

In [None]:
sd_peak[(sd_peak.route_short_name==test_route) & 
        (sd_peak.stop_name==test_stop)
       ].avg_speed_mph.describe()

In [None]:
trips = helpers.import_scheduled_trips(
    analysis_date,
    columns = ["name", "feed_key", 
               "trip_id", "shape_array_key",
               "route_id"],
    filters = [[("feed_key", "==", "a7ba6f075198e9bf9152fab6c7faf0f6")]]
).compute()

In [None]:
sd_201_trips = trips[trips.route_id=="201"].trip_id.unique()

In [None]:
speeds.columns

In [None]:
speeds = pd.read_parquet(
    f"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}/",
)

trips = helpers.import_scheduled_trips(
        analysis_date, 
        columns = ["feed_key", "name", "trip_id", "shape_array_key"]
    ).compute()
             
trips = gtfs_schedule_wrangling.exclude_scheduled_operators(trips)

stop_times = helpers.import_scheduled_stop_times(
    analysis_date, 
    columns = [
        "feed_key", "trip_id", 
        "stop_id", "stop_sequence",
        "arrival_sec"
    ]
)

scheduled_stop_times = gtfs_schedule_wrangling.merge_shapes_to_stop_times(
    trips, stop_times) 

In [None]:
df = dd.merge(
    speeds,
    scheduled_stop_times,
    on = ["shape_array_key", "stop_sequence"] + ["trip_id"],
    how = "inner"
)

In [None]:
stops = helpers.import_scheduled_stops(
        analysis_date, 
        columns = ["feed_key", "stop_id", "stop_name", "geometry"],
)

stop_metrics_with_geom = gtfs_schedule_wrangling.attach_stop_geometry(
    df[df.gtfs_dataset_key==rt_key],
    stops,
).compute()

In [None]:
stop_metrics_utc = stop_metrics_with_geom[
    stop_metrics_with_geom.shape_array_key==shape_with_utc]

In [None]:
for i in sorted(stop_metrics_utc.stop_sequence.unique().tolist()):
    print(i)
    subset = stop_metrics_utc[stop_metrics_utc.stop_sequence==i]
    print(subset.stop_name.value_counts())

In [None]:
stop_metrics_utc[stop_metrics_utc.stop_sequence==14].speed_mph.value_counts()

In [None]:
# Pared-down vehicle positions for the shape of interest.
vp_pared = pd.read_parquet(
    f"{SEGMENT_GCS}vp_pared_stops_{analysis_date}/",
    filters = [[("shape_array_key", "==", shape_with_utc)]],
)

# Keep only stop sequences 1 and 14 and the columns needed for mapping.
vp_pared2 = vp_pared.loc[
    vp_pared.stop_sequence.isin([1, 14]),
    ["shape_array_key", "feed_key", "stop_sequence",
     "location_timestamp_local", "lon", "lat"],
]

In [None]:
geom = gpd.points_from_xy(x=vp_pared2.lon, y=vp_pared2.lat, crs = "EPSG:3310")

In [None]:
vp_pared2_gdf = gpd.GeoDataFrame(vp_pared2, 
                                 geometry=geom, crs="EPSG:3310")

In [None]:
stop_segments = helpers.import_segments(
    SEGMENT_GCS,
    file_name = f"stop_segments_{analysis_date}",
    filters = [[("shape_array_key", "==", shape_with_utc)]],
    partitioned=False
).drop(columns = "geometry_arrowized")

In [None]:
stop_segments[stop_segments.geometry.isna()]

In [None]:
stop_segments[(stop_segments.geometry.notna())].sort_values("stop_sequence")

In [None]:
stop_segments[(stop_segments.geometry.notna()) & 
              (stop_segments.stop_sequence==1)].explore(
    "stop_sequence",
    tiles="CartoDB Positron")

In [None]:
stop_segments[(stop_segments.geometry.notna()) & 
              (stop_segments.stop_sequence==14)].explore(
    "stop_sequence",
    tiles="CartoDB Positron")

In [None]:
vp_pared2_gdf[vp_pared2_gdf.location_timestamp_local=="2023-02-15 19:03:54"]

In [None]:
vp_pared2_gdf.location_timestamp_local.value_counts()

In [None]:
vp_pared2_gdf[vp_pared2_gdf.stop_sequence==1].drop(columns="location_timestamp_local"
                  ).explore("stop_sequence", tiles='CartoDB Positron')

In [None]:
vp_pared2_gdf[vp_pared2_gdf.stop_sequence==14].drop(columns="location_timestamp_local"
                  ).explore("stop_sequence", tiles='CartoDB Positron')

In [None]:
stop_metrics_utc[stop_metrics_utc.stop_sequence==14].stop_name.value_counts()

In [None]:
stop_metrics_utc[stop_metrics_utc.stop_name=="UTC Transit Center"
                ].trip_id.value_counts()

In [None]:
# The two shape keys compared below; by their names, one includes
# UTC Transit Center and one does not.
shape_with_utc = "f765b9d12fcca0173b4e3ddbc0374d18"
shape_wo_utc = "eb78a03b26e653cc678f3ef6c8d7ebf7"

# Disaggregated stop metrics for the operator's RT feed, read from the local
# export. Alternative source, currently unused:
#   f"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}/"
sd_speed = pd.read_parquet(
    f"./scripts/data/stop_metrics_disaggregated_{analysis_date}.parquet",
    filters = [[("gtfs_dataset_key", "==", rt_key)]],
)

# Rows belonging to route 201 trips only.
sd_speed_201 = sd_speed.loc[sd_speed.trip_id.isin(sd_201_trips)]

In [None]:
sd_speed[
    sd_speed.shape_array_key==shape_with_utc
].stop_name.value_counts()

In [None]:
sd_speed[
    sd_speed.shape_array_key==shape_wo_utc
].stop_sequence.value_counts()

In [None]:
sd_speed_201.shape_array_key.value_counts()

In [None]:
sd_speed_dropped = sd_speed[
    sd_speed.shape_array_key=="eb78a03b26e653cc678f3ef6c8d7ebf7"]

In [None]:
sd_speed_kept = sd_speed[
    sd_speed.shape_array_key=="f765b9d12fcca0173b4e3ddbc0374d18"]

In [None]:
sd_speed_kept[sd_speed_kept.stop_name=="UTC Transit Center"]

In [None]:
sd_speed_201[
    sd_speed_201.stop_name == "UTC Transit Center"
].stop_sequence.value_counts()

In [None]:
sd_speed_201[
    sd_speed_201.stop_name == "UTC Transit Center"
].shape_array_key.value_counts()

In [None]:
sd_speed_201[
    (sd_speed_201.stop_name == "UTC Transit Center")
]