In [None]:
import _threshold_utils as threshold_utils
import altair as alt

import pandas as pd
import geopandas as gpd
from segment_speed_utils.project_vars import analysis_date
from shared_utils import calitp_color_palette as cp

In [None]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
GCS_PATH = "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/"

analysis_date

### Open the files

In [None]:
speed_stops_subset = ['gtfs_dataset_key', '_gtfs_dataset_name', 'shape_array_key',
       'stop_sequence', 'trip_id','speed_mph']

In [None]:
speed_stops = pd.read_parquet(f"{GCS_PATH}speeds_stop_segments_{analysis_date}")

In [None]:
speed_stops.sample()

In [None]:
speed_stops2 = speed_stops[speed_stops_subset]

In [None]:
# Do I need geometry? Doesn't seem like it..
avg_speeds = gpd.read_parquet(f"{GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet")

In [None]:
# Columns removed from avg_speeds before merging. Despite its name, this list
# holds the columns to drop, not the columns to keep.
avg_speeds_subset = [
    "geometry",
    "geometry_arrowized",
    "district",
    "district_name",
]
avg_speeds2 = avg_speeds.drop(columns=avg_speeds_subset)

In [None]:
# Inner-join the averaged speeds onto the per-trip speeds, matching on
# operator, shape and stop sequence; rows without a match on both sides are dropped.
m1 = pd.merge(
    avg_speeds2,
    speed_stops2,
    how="inner",
    on=["gtfs_dataset_key", "shape_array_key", "stop_sequence"],
)

In [None]:
# Lots of dups??
len(m1) - len(m1.drop_duplicates())  

In [None]:
m2 = m1.drop_duplicates().reset_index(drop = True)

### See which routes have a lot of stops
* every shape has maybe 20, 30, 50 stops

In [None]:
# Stops are not labelled sequentially necessarily
m2.stop_sequence.describe()

In [None]:
# Renumber stop sequences since some of them are out of order
m2 = m2.sort_values(['shape_array_key', 'gtfs_dataset_key', 'trip_id', 'stop_sequence']).reset_index(drop = True)

In [None]:
# Renumber stops 1..N within each operator/shape/trip, following the order of
# the original stop_sequence values.
# A dense rank is used instead of cumcount(): cumcount() numbers every *row*,
# so rows that share a stop_sequence within a trip would each get a different
# number and the renumbered count could exceed the number of distinct stops.
# A rank also does not depend on the frame having been sorted beforehand.
m2['Test Stop Sequence'] = (
    m2.groupby(['gtfs_dataset_key', 'shape_array_key', 'trip_id'])['stop_sequence']
    .rank(method='dense')
    .astype(int)
)

In [None]:
# Check out routes with a ton of unique stops in the sequence.
m2.groupby(['gtfs_dataset_key','shape_array_key','trip_id']).agg({'Test Stop Sequence':'nunique'}).sort_values(['Test Stop Sequence']).sample(10)

In [None]:
# Look at one trip
foothill_og = speed_stops2[speed_stops2.trip_id == "t604-b2791-sl5"]

In [None]:
len(foothill_og)

In [None]:
foothill_og.stop_sequence.nunique()

In [None]:
foothill_og.sort_values('stop_sequence').head()

In [None]:
foothill_renumbered_stop_seq = m2[m2.trip_id == "t604-b2791-sl5"] 

In [None]:
foothill_renumbered_stop_seq['Test Stop Sequence'].nunique(), foothill_og.stop_sequence.nunique()

In [None]:
# Count the distinct stop_sequence values for every operator/shape/trip and
# list the trips with the most stops first.
troubleshoot = (
    m2.groupby(['gtfs_dataset_key', '_gtfs_dataset_name', 'shape_array_key', 'trip_id'])
    .agg(stop_sequence=('stop_sequence', 'nunique'))
    .sort_values('stop_sequence', ascending=False)
    .reset_index()
)

In [None]:
# len(troubleshoot)

In [None]:
# Number of test stops should match stop sequence...
# troubleshoot['sequences_are_equal'] = troubleshoot['Test Stop Sequence'] - troubleshoot['stop_sequence']

In [None]:
# troubleshoot['sequences_are_equal'].value_counts()

In [None]:
# Look at this trip id in the original df
og_trip = speed_stops2[speed_stops2.trip_id == "t640-b15FF1-sl5"]

In [None]:
# Look at this trip id in the manipulated df
new_trip = m2[m2.trip_id == "t640-b15FF1-sl5"]

In [None]:
og_trip.shape, og_trip.stop_sequence.nunique()

In [None]:
new_trip.shape, new_trip.stop_sequence.nunique()

#### Function

In [None]:
troubleshoot = troubleshoot.rename(columns = {'stop_sequence':'total_stops'})

In [None]:
troubleshoot.total_stops.describe()

In [None]:
p25 = troubleshoot.total_stops.quantile(0.25).astype(float)

In [None]:
# Remaining percentile cut-offs of the per-trip stop counts
# (50th, 75th, 95th and 99th), computed the same way as p25.
p50, p75, p95, p99 = (
    troubleshoot.total_stops.quantile(q).astype(float)
    for q in (0.50, 0.75, 0.95, 0.99)
)

In [None]:
def stop_categories(row):
    """
    Label a trip by where its stop count falls among the percentile cut-offs.

    Reads ``row.total_stops`` and compares it against the notebook-level
    thresholds ``p25``, ``p75``, ``p95`` and ``p99`` (``p50`` is not used).

    The stop counts written inside the labels (17, 30, 50, 85, 203) are fixed
    text, not recomputed from the thresholds. The label strings are kept
    verbatim because a later cell filters on them.

    Returns
    -------
    str
        One of the five bucket labels, or "other" when no bucket applies
        (for example a non-positive or NaN stop count).
    """
    stops = row.total_stops

    if 0 < stops <= p25:
        return "25th  <= 17 stops"
    if p25 < stops <= p75:
        return "50th <= 30 stops"
    if p75 < stops <= p95:
        return "75th <= 50 stops"
    if p95 < stops <= p99:
        return "95th <= 85 stops"
    # This bucket is everything above the 99th percentile. The previous test
    # (`>= p95`) only gave the right answer because earlier branches had already
    # consumed p95..p99, and it mislabelled non-positive counts as "99th" when
    # the cut-offs collapsed to 0.
    if stops > p99:
        return "99th >= 203 stops"
    return "other"

In [None]:
# Tag every trip with its stop-count bucket (row-wise, since stop_categories reads one row at a time).
troubleshoot["stop_percentiles"] = troubleshoot.apply(stop_categories, axis=1)

In [None]:
troubleshoot["stop_percentiles"].unique()

In [None]:
routes_with_many_stops = troubleshoot[troubleshoot.stop_percentiles.isin(['99th >= 203 stops', '95th <= 85 stops'])]

In [None]:
routes_with_many_stops['shape_array_key'].nunique()

In [None]:
routes_with_many_stops.head(2)

In [None]:
routes_with_many_stops.groupby(['_gtfs_dataset_name', 'stop_percentiles']).agg({'shape_array_key':'nunique', 'trip_id':'nunique', 
                                                                               'total_stops':'max'})

In [None]:
routes_summary = (routes_with_many_stops
                  .groupby(['_gtfs_dataset_name','shape_array_key', 'stop_percentiles'])
                  .agg({'trip_id':'nunique','total_stops':'median'})
                  .reset_index()
                  .rename(columns = {'total_stops':'total_stops_median'})
                 )

In [None]:
routes_summary.shape_array_key.nunique(), routes_summary.shape

In [None]:
routes_summary.sort_values(['total_stops_median'], ascending = False).head()

In [None]:
# Just checking Foothill
speed_stops2[speed_stops2.trip_id == "t523-b27FD-sl5"].shape

In [None]:
# Bay Area 511 Santa Clara Transit VehiclePositions
speed_stops2[speed_stops2.trip_id == "3278210"].shape

In [None]:
speed_stops2[speed_stops2.shape_array_key == "1c3af44d68821dbd42638b3e76566466"].head()

### Charts  
Test with a few routes first
* Create a new column that rounds each speed up to the next 5 mph bucket (capped at 35), for plotting purposes only.

#### Manipulate DF for charts

In [None]:
# Reshape to long form: each of the four speed measures becomes its own row,
# with the measure name in `variable` and its number in `value`.
test1 = m2.melt(
    id_vars=[
        '_gtfs_dataset_name',
        'shape_array_key',
        'trip_id',
        'Test Stop Sequence',
        'gtfs_dataset_key',
        'loop_or_inlining',
        'n_trips',
    ],
    value_vars=['avg_speed_mph', 'speed_mph', 'p20_speed_mph', 'p80_speed_mph'],
)

In [None]:
test1.shape

In [None]:
# test1[test1.shape_array_key == "29d2bbdbeaec1d6888800f85bebf6e33"]

In [None]:
# Average / p20 / p80 speeds only need to appear once per operator, shape and
# stop sequence, so keep a single row per measure-value combination
# (trip_id is deliberately left out of the duplicate check).
test2 = (
    test1
    .drop_duplicates(
        subset=[
            '_gtfs_dataset_name',
            'shape_array_key',
            'Test Stop Sequence',
            'gtfs_dataset_key',
            'variable',
            'value',
        ]
    )
    .reset_index(drop=True)
)

In [None]:
test2.shape

In [None]:
subset = test2[test2.shape_array_key.isin(["29d2bbdbeaec1d6888800f85bebf6e33",'754c5b012195800c38dc58e72e4f482e',
       'e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e', '27ea352ade9af00e21aa2f2968810107'])]

In [None]:
subset = threshold_utils.pre_clean(subset)

In [None]:
subset['Route'] = subset['Gtfs Dataset Name'] +' '  + subset['Shape Array Key']

In [None]:
subset = subset.rename(columns = {'Value':'Speed'})

In [None]:
subset['Speed_Int'] = subset.Speed.fillna(0).astype(int)

In [None]:
# subset['Rounded Speed'].unique()

In [None]:
def speed(row):
    """
    Bucket ``row.Speed_Int`` into a 5 mph step for plotting.

    A value of exactly 0 stays 0. Values 1-5 map to 5, 6-10 to 10, and so on
    up to 26-30 mapping to 30. Anything that matches no bucket (values above
    30, and also negative values) returns 35.
    """
    mph = row.Speed_Int
    if mph == 0:
        return 0

    # Walk the bucket ceilings in ascending order; the first one that fits wins.
    for ceiling in (5, 10, 15, 20, 25, 30):
        if ceiling - 5 < mph < ceiling + 1:
            return ceiling

    return 35

In [None]:
# Apply the function
subset["Rounded Speed"] = subset.apply(speed, axis=1)

In [None]:
# subset[['Rounded Speed', 'Speed', 'Speed_Int']]

In [None]:
subset.Variable = subset.Variable.str.title().str.replace("_"," ")

In [None]:
# One df for the actual speeds
subset_speedmph = subset[subset.Variable == 'Speed Mph'].reset_index(drop = True)

In [None]:
# One df for the percentiles
subset_other= subset[subset.Variable != 'Speed Mph'].reset_index(drop = True)

In [None]:
def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):
    """
    Build an Altair single-selection bound to a dropdown menu.

    Parameters
    ----------
    df : DataFrame holding the column to list in the dropdown.
    col_for_dropdown : column whose unique values become the menu options.
    dropdown_menu_title : used both as the selection name and the menu label.

    Returns
    -------
    The selection object, initialised to the first option in sorted order.
    Rows whose value is the literal string "None" are left out of the menu.
    """
    values = df[col_for_dropdown]
    options = sorted(values[values != "None"].unique().tolist())

    # Show only the first option by default
    default_option = options[0]
    menu = alt.binding_select(options=options, name=dropdown_menu_title)

    return alt.selection_single(
        name=dropdown_menu_title,
        fields=[col_for_dropdown],
        bind=menu,
        init={col_for_dropdown: default_option},
    )

In [None]:
selection_test = alt_dropdown(subset, "Route", "Operator/Shape Array")

In [None]:
def create_jitter_plot(df):
    """
    Build a jitter plot of "Rounded Speed", faceted into one column per
    "Test Stop Sequence" value and coloured by "Variable".

    Every column of ``df`` is shown in the tooltip. The finished chart is
    passed through ``threshold_utils.chart_size(chart, 40, 200)`` before being
    returned (presumably width and height — confirm against that helper).
    """
    # The x position is random noise only, so the axis itself is hidden.
    x_jitter = alt.X(
        "jitter:Q",
        title=None,
        axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
        scale=alt.Scale(),
    )
    y_speed = alt.Y(
        "Rounded Speed:Q",
        scale=alt.Scale(domain=[0, 50]),
        title="Speed (MPH)",
        axis=alt.Axis(labelAngle=360, grid=False),
    )
    point_color = alt.Color(
        "Variable:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    )
    stop_facet = alt.Column(
        "Test Stop Sequence:N",
        header=alt.Header(
            labelAngle=360,
            titleOrient="top",
            labelOrient="top",
            labelAlign="right",
            labelPadding=2,
        ),
    )

    jitter_chart = (
        alt.Chart(df, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=x_jitter,
            y=y_speed,
            color=point_color,
            tooltip=df.columns.tolist(),
            column=stop_facet,
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .properties(title="Speeds by Operator-Shape Array")
    )

    return threshold_utils.chart_size(jitter_chart, 40, 200)

In [None]:
chart1 = create_jitter_plot(subset_speedmph).add_selection(selection_test).transform_filter(selection_test)

In [None]:
# Companion chart for the rows in subset_other (every Variable other than
# 'Speed Mph'). It uses the same jitter layout as create_jitter_plot, but with
# larger marks (size=200), hidden y-axis labels and ticks, no facet header
# title and no chart title.
chart2 = (
        alt.Chart(subset_other, width=0.5)
        .mark_circle(size=200)
        .encode(
            # x carries only the random jitter, so its axis is hidden.
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("Rounded Speed:Q", 
                    title = "Speed (MPH)",

                    scale=alt.Scale(domain=[0, 50]),
                    axis=alt.Axis(labels=False, ticks = False, grid= False)),
            color=alt.Color(
                "Variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=subset_other.columns.tolist(),
            # One facet column per renumbered stop.
            column=alt.Column(
                "Test Stop Sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    title = None,
                    titleOrient="top",
                    labelOrient="top",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
    )
    

In [None]:
chart2 = threshold_utils.chart_size(chart2, 40, 200)

In [None]:
chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)

In [None]:
subset['Route Type'] = "Loop or Inlining: " + subset["Loop Or Inlining"].astype(str)

In [None]:
# https://github.com/altair-viz/altair/issues/1168
title = alt.Chart(subset).mark_text(dy=-40, size=15, fontWeight='normal').encode(
    text='Route Type:N',
)

In [None]:
title = threshold_utils.chart_size(title, 20, 20)

In [None]:
title = title.add_selection(selection_test).transform_filter(selection_test)

In [None]:
title & (chart1.interactive() & chart2.interactive())

### Draft

In [None]:
# NOTE(review): this redefines create_jitter_plot, which is already defined and
# used earlier in the notebook. Once this cell runs, re-running the earlier
# chart cells will pick up this draft version instead.
def create_jitter_plot(df):
    """
    Draft jitter plot of "Rounded Speed", faceted by "Stop Sequence".

    Differs from the earlier definition of the same name: it facets on
    "Stop Sequence:N" rather than "Test Stop Sequence:N", puts the facet
    labels at the bottom, sets no y-axis domain, and builds the chart title
    from the first row of ``df`` ("Gtfs Dataset Name" with the text
    'VehiclePositions' removed, plus "Loop Or Inlining").
    """
    
    # Title pieces come from the first row only.
    title_op = df['Gtfs Dataset Name'].iloc[0].replace('VehiclePositions','').strip()
    inline = df['Loop Or Inlining'].iloc[0]
    
    chart1 = (
        alt.Chart(df, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("Rounded Speed:Q", axis=alt.Axis(labelAngle=360)),
            color=alt.Color(
                "Variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=df.columns.tolist(),
            column=alt.Column(
                "Stop Sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .properties(title=f"{title_op} - Route Type {inline}")
    )
    
    chart1 = threshold_utils.chart_size(chart1, 40, 250)
    
    return chart1

In [None]:
# Draft: jitter plot of rounded_speed, one facet column per stop_sequence,
# coloured by variable.
# NOTE(review): anaheim_test is not defined in any cell visible in this
# notebook — confirm where it comes from before running.
# NOTE(review): the title "Trip Duration by RT Category" does not mention
# speed, which is what is plotted — looks like a copy-paste leftover; confirm.
chart2 = (
        alt.Chart(anaheim_test, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        # Remove the gap between facets and the border around each view.
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title="Trip Duration by RT Category")
    )

In [None]:
chart2 = threshold_utils.chart_size(chart2,80,300)

In [None]:
chart2

In [None]:
# Draft: jitter plot of rounded_speed, faceted AND coloured by stop_sequence;
# the title is built from the first row's loop_or_inlining value.
# NOTE(review): anaheim_test_speedmph and anaheim_test are not defined in any
# cell visible in this notebook — confirm where they come from before running.
# NOTE(review): the tooltip lists the columns of anaheim_test while the chart
# data is anaheim_test_speedmph — confirm both frames share the same columns.
chart1 = (
        alt.Chart(anaheim_test_speedmph, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "stop_sequence:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        # Remove the gap between facets and the border around each view.
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title=f"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}")
    )

In [None]:
chart1 = threshold_utils.chart_size(chart1,80,300)

In [None]:
chart1

In [None]:
def create_dot_plot2(df, col_for_dots: str,
                     x_axis_col: str, y_axis_col: str,
                     tooltip_cols: list, chart_title: str):
    """
    Build a dot plot with one filled circle per row of ``df``.

    Parameters
    ----------
    df : data to plot.
    col_for_dots : column that colours the dots (nominal, no legend); also
        the grouping used for the window transform.
    x_axis_col : column for the x axis, treated as ordinal and sorted descending.
    y_axis_col : column for the y axis, treated as nominal.
    tooltip_cols : columns shown in the tooltip.
    chart_title : title of the chart.
    """
    x_encoding = alt.X(
        f'{x_axis_col}:O',
        sort='descending',
        axis=alt.Axis(ticks=False, grid=True),
    )
    y_encoding = alt.Y(f'{y_axis_col}:N')
    dot_color = alt.Color(
        f"{col_for_dots}:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )

    dots = alt.Chart(df).mark_circle(opacity=1, size=100)
    # Adds an `id` field holding each row's rank within its col_for_dots group.
    ranked = dots.transform_window(id='rank()', groupby=[col_for_dots])
    encoded = ranked.encode(
        x_encoding,
        y_encoding,
        color=dot_color,
        tooltip=tooltip_cols,
    )

    return encoded.properties(title=chart_title)

In [None]:
# Dot plot of the percentile/average rows.
# NOTE(review): create_dot_plot1 is not defined in any cell visible in this
# notebook (only create_dot_plot2 is) — confirm whether this should call
# create_dot_plot2 or whether the helper's definition was deleted.
chart3 = create_dot_plot1(anaheim_test_other, 'variable', 'stop_sequence', 'rounded_speed', anaheim_test_other.columns.tolist(),  'Percentile/Average')

In [None]:
chart3 = threshold_utils.chart_size(chart3,650,300)

In [None]:
chart4 = create_dot_plot2(anaheim_test_speedmph, 'variable', 'stop_sequence', 'rounded_speed', anaheim_test_speedmph.columns.tolist(), 'Speed per Trip')

In [None]:
chart4 = threshold_utils.chart_size(chart4,650,300)

In [None]:
chart4

In [None]:
chart3 + chart4

In [None]:
chart7 = (
        alt.Chart(anaheim_test_other, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=-90,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title="Trip Duration by RT Category")
    )

In [None]:
chart7 = threshold_utils.chart_size(chart7,80,300)

In [None]:
chart8 = (
        alt.Chart(anaheim_test_other, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=-90,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .properties(title="Trip Duration by RT Category")
    )

In [None]:
chart9 = (
        alt.Chart(anaheim_test_speedmph, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .properties(title=f"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}")
    )

In [None]:
chart8

In [None]:
chart9 | chart8

In [None]:
# pip install altair==5.0.0rc3
# This cell relies on the Altair 5 API: the yOffset encoding channel and the
# method-style `.legend(None)` call on alt.Color are not available in Altair 4.
# Unlike the faceted charts above, stop_sequence is placed directly on the x
# axis and the jitter is applied as an offset instead of a separate x field.
# NOTE(review): anaheim_test_speedmph is not defined in any cell visible in
# this notebook — confirm where it comes from before running.
chart5 = alt.Chart(anaheim_test_speedmph, title='Normally distributed jitter').mark_circle(size=50).encode(
    y="rounded_speed:Q",
    x="stop_sequence:N",
    yOffset="jitter:Q",
    color=alt.Color('stop_sequence:Q').legend(None)
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
)


In [None]:
chart5 = threshold_utils.chart_size(chart5,650,300)

In [None]:
chart5