# Plot stop-to-stop segments with speed and delay

In [1]:
import altair as alt
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

from shared_utils import calitp_color_palette as cp
from shared_utils import rt_utils

# Config file passed to helpers.get_parameters to look up analysis parameters
CONFIG_PATH = "./scripts/config.yml"



In [2]:
# Parameters stored under "stop_segments" in the config file
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

# File name stem of the stop delay diagnostics parquet; the analysis date
# and ".parquet" suffix are appended wherever the file is read
FILE_NAME = STOP_SEG_DICT['stop_delay_diagnostics']

In [24]:
def get_rt_trip_departure_hour(analysis_date: str) -> pd.DataFrame:
    """
    Derive each trip's departure hour and time-of-day category from the
    stop delay diagnostics file.

    The departure hour is the hour of the earliest `max_time` recorded
    for each (gtfs_dataset_key, trip_id) pair.

    In the future, we would want to get trip departure hour based
    off of scheduled trips, not RT.

    Args:
        analysis_date: date string used to build the parquet file name.

    Returns:
        pandas DataFrame with one row per (gtfs_dataset_key, trip_id)
        and the columns gtfs_dataset_key, trip_id, departure_hour,
        time_of_day.
    """
    FILE_NAME = STOP_SEG_DICT['stop_delay_diagnostics']

    trip_cols = ["gtfs_dataset_key", "trip_id"]
    timestamp_col = "max_time"

    # Only the trip identifiers and the timestamp are used below,
    # so no other columns are read.
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{FILE_NAME}_{analysis_date}.parquet",
        columns = trip_cols + [timestamp_col]
    ).drop_duplicates()

    trip_df = (df.groupby(trip_cols)
               [timestamp_col].min().dt.hour
               .reset_index()
              ).rename(columns = {timestamp_col: "departure_hour"})

    # Map over the single column instead of applying row by row:
    # it avoids building a Series per row and still returns a Series
    # when trip_df is empty.
    trip_df = trip_df.assign(
        time_of_day = trip_df.departure_hour.map(
            rt_utils.categorize_time_of_day)
    )

    return trip_df

In [25]:
# Departure hour and time-of-day category for each RT trip on analysis_date
trip_start = get_rt_trip_departure_hour(analysis_date)

In [39]:
# Load the stop delay diagnostics for analysis_date, keeping the delay and
# speed columns plus the trip / shape / feed / stop keys used in later merges
df = pd.read_parquet(
    f"{SEGMENT_GCS}{FILE_NAME}_{analysis_date}.parquet", 
    columns = ["gtfs_dataset_key", "_gtfs_dataset_name", "trip_id", 
               "shape_array_key", "feed_key",
               "stop_id", "stop_sequence", 
               "actual_minus_scheduled_sec", "speed_mph",
              ]
)

In [40]:
def get_distribution(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize the delay distribution of each trip.

    Groups by gtfs_dataset_key and trip_id and computes the minimum,
    maximum and mean of actual_minus_scheduled_sec in one aggregation
    pass.

    Args:
        df: table with the columns gtfs_dataset_key, trip_id and
            actual_minus_scheduled_sec.

    Returns:
        One row per (gtfs_dataset_key, trip_id) with the columns
        gtfs_dataset_key, trip_id, min_delay, max_delay, mean_delay.
    """
    group_cols = ["gtfs_dataset_key", "trip_id"]
    col = "actual_minus_scheduled_sec"

    # A single groupby yields all three statistics already aligned on
    # the group keys, so no merging of separate results is needed.
    stats = (df.groupby(group_cols)[col]
             .agg(["min", "max", "mean"])
             .rename(columns = {
                 "min": "min_delay",
                 "max": "max_delay",
                 "mean": "mean_delay",
             })
             .reset_index()
            )

    return stats

In [41]:
# Min / max / mean delay for each (gtfs_dataset_key, trip_id)
delay = get_distribution(df)

In [42]:
# (rows, columns): one row per (gtfs_dataset_key, trip_id)
delay.shape

(70582, 5)

In [43]:
# Count the trips whose largest delay reaches each whole-hour threshold
# (1 hr, 2 hr, ... 11 hr); max_delay is in seconds
for i in range(1, 12):
    threshold_sec = 60*60*i
    n_trips = (delay.max_delay >= threshold_sec).sum()
    print(f"max_delay is over {i} hr: {n_trips}")

max_delay is over 1 hr: 1434
max_delay is over 2 hr: 383
max_delay is over 3 hr: 242
max_delay is over 4 hr: 149
max_delay is over 5 hr: 72
max_delay is over 6 hr: 32
max_delay is over 7 hr: 20
max_delay is over 8 hr: 13
max_delay is over 9 hr: 6
max_delay is over 10 hr: 2
max_delay is over 11 hr: 2


In [44]:
# Look up route_id and route_type for every shape_array_key
scheduled_trips = (
    helpers.import_scheduled_trips(
        analysis_date,
        columns = ["shape_array_key", "route_id", "route_type"]
    )
    .drop_duplicates()
    .compute()
)

# First attach each trip's departure hour / time of day ...
df2 = dd.merge(
    df,
    trip_start,
    on = ["gtfs_dataset_key", "trip_id"],
    how = "inner"
)

# ... then bring in the route columns through the shape key
df2 = df2.merge(
    scheduled_trips,
    on = "shape_array_key",
    how = "inner"
)

# Keep only rows whose speed lies between 0 and 70 mph, both ends included
df2 = df2[df2.speed_mph.between(0, 70)].reset_index(drop=True)

In [46]:
def aggregate_to_stop_hour_attach_geom(
    df: pd.DataFrame
) -> gpd.GeoDataFrame:
    """
    Average delay and speed for every stop and departure hour, then
    attach the stop geometry.

    Rows are grouped by operator, stop, feed, route, departure hour and
    time of day; delay (seconds) and speed are averaged within each
    group, and the mean delay is also expressed in minutes rounded to
    one decimal. Stops are imported for the module-level analysis_date.
    """
    grouping = [
        "gtfs_dataset_key", "_gtfs_dataset_name",
        "stop_id", "feed_key",
        "route_id", "route_type",
        "departure_hour", 'time_of_day'
    ]
    mean_of = {
        "actual_minus_scheduled_sec": "mean",
        "speed_mph": "mean",
    }

    hourly = df.groupby(grouping).agg(mean_of).reset_index()

    # Same delay, in minutes instead of seconds
    delay_min = hourly.actual_minus_scheduled_sec.divide(60).round(1)
    hourly = hourly.assign(actual_minus_scheduled_min = delay_min)

    stop_geom = helpers.import_scheduled_stops(
        analysis_date,
        columns = ["feed_key", "stop_id", "geometry"],
    )

    with_geom = gtfs_schedule_wrangling.attach_stop_geometry(
        hourly,
        stop_geom,
    ).compute()

    return gpd.GeoDataFrame(with_geom)

In [47]:
# Mean delay and speed per stop and departure hour, with stop geometry
gdf = aggregate_to_stop_hour_attach_geom(df2)

# Operator name used by the optional filter below
one_operator = "Big Blue Bus VehiclePositions"

# Uncomment to restrict gdf to that single operator
#gdf = gdf[gdf._gtfs_dataset_name==one_operator]

In [48]:
# Number of aggregated rows in each time-of-day category
gdf.time_of_day.value_counts()

Midday      307002
PM Peak     305087
AM Peak     190415
Early AM    117693
Evening     104347
Owl          21604
Name: time_of_day, dtype: int64

In [None]:
# Map of mean delay in minutes; kept inside a string so it does not run.
# Remove the surrounding triple quotes to enable it.
'''
gdf.explore(
    "actual_minus_scheduled_min",
    tiles = "CartoDB Positron"
)
'''

In [51]:
def make_chart(df: pd.DataFrame, 
               stop: str) -> alt.VConcatChart:
    """
    Build delay-by-hour and speed-by-hour bar charts for one stop.

    Args:
        df: stop-hour table with the columns stop_id, departure_hour,
            time_of_day, _gtfs_dataset_name, actual_minus_scheduled_min,
            speed_mph, geometry and actual_minus_scheduled_sec.
        stop: stop_id to chart.

    Returns:
        The delay chart stacked vertically on top of the speed chart.

    NOTE(review): a stop with several rows for the same hour (for
    example, more than one route) contributes several bars at that
    hour -- confirm this is the intended display.
    """
    # Filter to the stop first so that only its rows are copied when
    # the columns are dropped (this function is called once per stop).
    stop_df = df[df.stop_id==stop].drop(
        columns = ["geometry", "actual_minus_scheduled_sec"]
    )
    
    base = (alt.Chart(stop_df)
            .mark_bar(width=10)
            .encode(
                x=alt.X(
                    "departure_hour:Q", title="Hour",
                    # Hours of the day run 0-23; starting the domain at 1
                    # would push hour-0 trips off the axis.
                    scale = alt.Scale(domain=[0,23])
                ),
                color = alt.Color(
                    "time_of_day:N", title="Time of Day",
                    scale = alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
                )
            ).interactive()
           )
    
    tooltip_cols = ["_gtfs_dataset_name", "stop_id", "departure_hour"]
    
    delay_chart = base.encode(
        y=alt.Y("actual_minus_scheduled_min:Q", title="Minutes Delayed"),
        tooltip=tooltip_cols + ["actual_minus_scheduled_min"]
    ).properties(title=f"Stop {stop}: Delay by Hour", 
                 width=400, height=200
                )
            
    speed_chart = base.encode(
        y=alt.Y("speed_mph:Q", title="Speed (mph)"),
        tooltip=tooltip_cols + ["speed_mph"]
    ).properties(title=f"Stop {stop}: Speed by Hour", 
                 width=400, height=200
                )
    
    chart = alt.vconcat(delay_chart, speed_chart)
    
    return chart

In [52]:
# Use the route of the first row as a test case and display the
# delay / speed charts for every stop on it
test_route = gdf.route_id.iloc[0]


for i in gdf.loc[gdf.route_id==test_route, "stop_id"].unique():
    display(make_chart(gdf, i))