# {name} stop-to-stop segments with speed and delay

* To explore: could altair-catplot simplify the strip plots below? See https://github.com/justinbois/altair-catplot

In [None]:
%%capture
import warnings
warnings.filterwarnings('ignore')

import altair as alt
import calitp_data_analysis.magics
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd

from IPython.display import HTML, Markdown

from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date
from shared_utils import calitp_color_palette as cp

# Use the default data transformer with no row cap (max_rows=None), so
# charts built from large dataframes are not rejected for exceeding a limit.
alt.data_transformers.enable('default', max_rows=None)

In [None]:
#parameters cell
# Operator to report on; passed to `subset_for_operator(name)` at the end of
# the notebook.
# NOTE(review): presumably this default is overridden when the notebook is
# run in parameterized form for other operators -- confirm.
name = "Big Blue Bus VehiclePositions"

In [None]:
%%capture_parameters
name

In [None]:
'''
def get_distribution(df):
    group_cols = ["gtfs_dataset_key", "trip_id"]
    col = "actual_minus_scheduled_sec"
    
    minimum = (df.groupby(group_cols)[col]
               .min()
               .reset_index()
               .rename(columns = {col: "min_delay"})
              )
    
    maximum = (df.groupby(group_cols)[col]
               .max()
               .reset_index()
               .rename(columns = {col: "max_delay"})
              )
    
    mean = (df.groupby(group_cols)[col]
               .mean()
               .reset_index()
               .rename(columns = {col: "mean_delay"})
              )
    
    stats = dd.merge(
        minimum,
        maximum,
        on = group_cols
    ).merge(
        mean, 
        on = group_cols
    )
    
    return stats

delay = get_distribution(df)

# over 1 hr delayed
for i in range(1, 12):
    # find how many trips have over 1, 2, 3 hr delay
    subset = delay[delay.max_delay >= 60*60*i]
    print(f"max_delay is over {i} hr: {len(subset)}")
'''

In [None]:

#operators = pd.read_parquet(
#    f"./scripts/data/stop_metrics_by_hour_{analysis_date}.parquet",
#    columns = ["_gtfs_dataset_name"]
#).sort_values("_gtfs_dataset_name"
#             ).drop_duplicates()._gtfs_dataset_name.tolist()


#one_operator = "Big Blue Bus VehiclePositions"

In [None]:
def stop_avg_by_peak_off_peak(gdf: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate stop-level speeds to peak / off peak.

    Rows whose `time_of_day` is "AM Peak" or "PM Peak" are labelled "peak";
    every other row (including missing values) is labelled "off peak".
    Speeds are then averaged per operator / route / direction / stop sequence
    / peak category, weighting each row's `speed_mph` by its `n_trips`.

    Parameters
    ----------
    gdf : dataframe with at least the columns gtfs_dataset_key,
        _gtfs_dataset_name, route_id, direction_id, stop_sequence,
        time_of_day, speed_mph and n_trips.

    Returns
    -------
    Dataframe with one row per group, holding the grouping columns,
    the summed `n_trips` and the trip-weighted `avg_speed_mph`.
    """
    peak_periods = ["AM Peak", "PM Peak"]
    group_cols = [
        "gtfs_dataset_key", "_gtfs_dataset_name",
        "route_id", "direction_id", "stop_sequence", "peak",
    ]

    # Vectorized labelling: avoids a Python-level row-wise apply, and still
    # yields a Series (not a DataFrame) when gdf has no rows.
    is_peak = gdf.time_of_day.isin(peak_periods)

    gdf = gdf.assign(
        peak = is_peak.map({True: "peak", False: "off peak"}),
        # Numerator of the weighted average: speed * number of trips
        speed_multiplied_trips = gdf.speed_mph * gdf.n_trips,
    )

    agg_df = (
        gdf.groupby(group_cols)
        .agg({"speed_multiplied_trips": "sum",
              "n_trips": "sum",
             })
        .reset_index()
    )

    # Weighted average = sum(speed * trips) / sum(trips)
    agg_df = agg_df.assign(
        avg_speed_mph = agg_df.speed_multiplied_trips.divide(agg_df.n_trips)
    ).drop(columns = "speed_multiplied_trips")

    return agg_df

In [None]:
def subset_for_operator(operator_name: str):
    """
    Load one operator's hourly stop metrics and its peak / off peak summary.

    Reads the stop metrics parquet for `analysis_date`, keeping only rows
    whose `_gtfs_dataset_name` equals `operator_name`, and rounds speeds
    to 1 decimal place.

    Returns a tuple of (hourly geodataframe, peak / off peak dataframe
    produced by `stop_avg_by_peak_off_peak`).
    """
    parquet_path = f"./scripts/data/stop_metrics_by_hour_{analysis_date}.parquet"
    operator_filter = [[("_gtfs_dataset_name", "==", operator_name)]]

    hourly_gdf = gpd.read_parquet(parquet_path, filters = operator_filter)
    hourly_gdf = hourly_gdf.assign(speed_mph = hourly_gdf.speed_mph.round(1))

    # The summary is computed from the already-rounded hourly speeds
    peak_summary = stop_avg_by_peak_off_peak(hourly_gdf)
    rounded_avg = peak_summary.avg_speed_mph.round(1)
    peak_summary = peak_summary.assign(avg_speed_mph = rounded_avg)

    return hourly_gdf, peak_summary

In [None]:
#operator_name = "Big Blue Bus VehiclePositions"
#operator_df = df[df._gtfs_dataset_name==operator_name
#                ].reset_index(drop=True)

def get_operator_route_dropdown(df):
    """
    Build a single-value Altair selection bound to a route dropdown.

    The dropdown options are the unique `route_id` values in `df`,
    in order of first appearance.
    """
    route_options = df.route_id.unique().tolist()
    route_dropdown = alt.binding_select(options=route_options, name='Route ')

    return alt.selection_single(
        name="Route",
        fields=['route_id'],
        bind=route_dropdown,
    )

In [None]:
def stripplot_base(df: pd.DataFrame) -> alt.Chart:
    """
    Base chart for strip plots: no mark yet, only a jittered y position.

    Each row gets a calculated `jitter` field, which is encoded on a y-axis
    that shows a single tick at 0 and no grid, labels or title.
    The returned chart is interactive.
    """
    # Generate Gaussian jitter with a Box-Muller transform
    jitter_expr = 'sqrt(-0.5*log(random()))*cos(2*PI*random())'

    bare_axis = alt.Axis(values=[0], ticks=True, grid=False, labels=False)
    jitter_y = alt.Y(
        'jitter:Q',
        title=None,
        axis=bare_axis,
        scale=alt.Scale(),
    )

    jittered = alt.Chart(df).encode(y=jitter_y)
    jittered = jittered.transform_calculate(jitter=jitter_expr)

    return jittered.interactive()


def stripplot_by_time_of_day(
    df: pd.DataFrame,
    x_col: str,
    grouping_col: str
) -> alt.Chart:
    """
    Strip plot of `x_col`, colored by time of day, one row per `grouping_col`.

    `df` must contain the columns `actual_minus_scheduled_sec` and
    `geometry`; both are dropped before charting. Rows of the facet are
    ordered by ascending `stop_sequence`.
    """
    # altair chart can't take geometry
    plot_df = df.drop(columns = ["actual_minus_scheduled_sec", "geometry"])

    time_of_day_color = alt.Color(
        "time_of_day:N",
        title="Time of Day",
        scale = alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    )

    stop_order = alt.EncodingSortField(field="stop_sequence", order="ascending")
    facet_rows = alt.Row(
        f"{grouping_col}:O",
        sort=stop_order,
        header=alt.Header(labelAngle=0),
    )

    points = stripplot_base(plot_df).mark_point(
        size=10, opacity=0.9, strokeWidth=1.1
    )

    return points.encode(
        x=alt.X(f"{x_col}:Q"),
        color=time_of_day_color,
        row=facet_rows,
    )

In [None]:
def _speed_stripplot_for_direction(operator_df, direction_id, select_route):
    """
    Speed strip plot for one `direction_id`, with the route selection attached.
    """
    direction_df = operator_df[operator_df.direction_id == direction_id]

    return (
        stripplot_by_time_of_day(
            direction_df,
            x_col = "speed_mph",
            grouping_col = "stop_sequence",
        )
        .encode(tooltip=["_gtfs_dataset_name",
                         "stop_name", "speed_mph",
                         "route_id", "direction_id"])
        .add_selection(select_route)
        .properties(
            title=f"Speed Variation Direction {direction_id} ",
            width=150, height=30)
    )


def speed_stripplot_for_operator(operator_df):
    """
    Display side-by-side speed strip plots (direction 0 and direction 1)
    for one operator, filtered by a shared route dropdown.

    `operator_df` is the hourly stop metrics dataframe for a single operator.
    Nothing is returned; the CSS for the dropdown and the chart are
    displayed in the notebook.
    """
    select_operator_route = get_operator_route_dropdown(operator_df)

    direction_charts = [
        _speed_stripplot_for_direction(
            operator_df, direction_id, select_operator_route
        ).transform_filter(select_operator_route)
        for direction_id in (0, 1)
    ]

    # `.configure(...)` replaces the whole config object, so it must come
    # BEFORE the `.configure_*` calls; otherwise the facet / view / axis
    # settings are silently discarded.
    chart = (alt.hconcat(*direction_charts)
             .configure(padding={'top': 30})
             #https://github.com/altair-viz/altair/issues/1993
             .configure_facet(spacing=0)
             .configure_view(stroke=None)
             .configure_axis(labelFontSize=12, titleFontSize=12)
            )

    # Pin the dropdown widget to the top-left corner, above the chart
    display(
        HTML("""
            <style>
            form.vega-bindings {
              position: absolute;
              left: 10px;
              top: 4px;
            }
            </style>
            """
            )
    )
    display(chart)


In [None]:
def avg_peak_off_peak_by_operator(operator_peak_df):
    """
    Display peak vs off peak average speed by stop sequence for one operator,
    faceted by direction and filtered by a route dropdown.

    `operator_peak_df` is the peak / off peak summary for a single operator
    (columns used: route_id, direction_id, stop_sequence, peak,
    avg_speed_mph). Nothing is returned; the CSS for the dropdown and the
    chart are displayed in the notebook.
    """
    # Build the dropdown from the dataframe being charted, rather than
    # relying on a notebook-global `operator_df` existing.
    select_operator_route = get_operator_route_dropdown(operator_peak_df)

    # NOTE(review): the title ends in "for  " -- it looks like an operator
    # name was meant to follow; left unchanged pending confirmation.
    avg_chart = (
        alt.Chart(operator_peak_df)
        .mark_point(size=10, opacity=0.9, strokeWidth=1.1)
        .encode(
            x=alt.X('avg_speed_mph:Q',
                    scale=alt.Scale(domain=[0,40], zero=True)
                   ),
            y=alt.Y('stop_sequence:O', title="Stop Sequence"),
            color=alt.Color('peak:N',
                            scale=alt.Scale(
                                range=cp.CALITP_CATEGORY_BOLD_COLORS[2:])
                           ),
            facet=alt.Facet('direction_id:O', columns=2),
            tooltip=["avg_speed_mph", "route_id",
                     "peak", "direction_id"],
        ).add_selection(select_operator_route)
        .interactive()
        # `.configure(...)` replaces the whole config object, so it must come
        # BEFORE `.configure_facet(...)` or the facet spacing is discarded.
        .configure(padding={'top': 10}) #https://github.com/altair-viz/altair/issues/1993
        .configure_facet(spacing=0)
        .properties(title="Peak vs Offpeak Avg Speed for  ",
                    width=100,height=450)
    )

    # Position the dropdown widget near the top of the chart
    display(HTML("""
    <style>
    form.vega-bindings {
      position: absolute;
      left: 195px;
      top: 4px;
    }
    </style>
    """))

    display(avg_chart.transform_filter(select_operator_route))

In [None]:
'''
import branca
ZERO_THIRTY_COLORSCALE = branca.colormap.step.RdYlGn_10.scale(vmin=0, vmax=30)

def make_map(gdf, y_col):
    if y_col == "speed_mph":
        cmap= ZERO_THIRTY_COLORSCALE
    elif y_col == "actual_minus_scheduled_min":
        cmap = "viridis"
    
    m = gdf.explore(
        f"{y_col}",
        tiles = "CartoDB Positron",
        cmap = cmap
    )
    
    return m
'''

In [None]:
#for operator in operators:
    #display(Markdown(f"## {operator}"))
    #test_operator = operators[28] # BBB
    #operator_df, operator_peak_df = subset_for_operator(operator)

    #speed_stripplot_for_operator(operator_df)
    #avg_peak_off_peak_by_operator(operator_peak_df)

In [None]:
# Load the hourly stop metrics and the peak / off peak summary for the
# operator named in the parameters cell.
operator_df, operator_peak_df = subset_for_operator(name)

# Display the speed strip plots, then the peak vs off peak averages.
speed_stripplot_for_operator(operator_df)
avg_peak_off_peak_by_operator(operator_peak_df)

In [None]:
#make_map(operator_df, "actual_minus_scheduled_min")

In [None]:
#make_map(operator_df, "speed_mph")