In [None]:
import _threshold_utils as threshold_utils
import _speed_utils as speed_utils
import altair as alt

import pandas as pd
import geopandas as gpd
from segment_speed_utils.project_vars import analysis_date
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs
from shared_utils import calitp_color_palette as cp

import dask.dataframe as dd

In [None]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Flag
# routes_many_stops_df, routes_many_stops_list = speed_utils.find_shapes_with_many_stops(analysis_date)

### Merging

In [None]:
avg_speeds = (pd.read_parquet(f"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet")
              .drop(columns = ['geometry','geometry_arrowized','district','district_name'])
             )

In [None]:
speeds = pd.read_parquet(f"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}")

In [None]:
avg_speeds.sample()

In [None]:
speeds.sample()

In [None]:
merge_cols = ['gtfs_dataset_key','shape_array_key', 'stop_sequence']
merge1 = pd.merge(avg_speeds, speeds, on = merge_cols, how = 'inner')

In [None]:
merge1.sample()

In [None]:
segments_file =  "stop_segments"

In [None]:
stop_segments = (pd.read_parquet(
        f"{speed_utils.GCS_PATH}{segments_file}_{analysis_date}.parquet")
        .drop(columns = ['geometry','geometry_arrowized'])
                )

In [None]:
stop_segments.sample()

In [None]:
# pd.merge(stop_segments, merge1, on = ['gtfs_dataset_key','shape_array_key','stop_sequence','loop_or_inlining'], how = "inner", indicator = True)[['_merge']].value_counts()

In [None]:
merge1.shape

In [None]:
m1 =  speed_utils.merge_all_speeds(analysis_date)

In [None]:
m1.sample()

In [None]:
# m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).sort_values(['trip_id'], ascending = False).head(30)

#### A few routes

In [None]:
test_shapes = ["6388c0be232f0c745df85d66689a6db0", "d8b0826e923620f7b7cd74be090de936", "e7012e8847c179f713daee0f158233e4"]

In [None]:
few_routes = merge1.loc[merge1.shape_array_key.isin(test_shapes)].reset_index(drop = True)

In [None]:
few_routes.shape

In [None]:
few_routes.trip_id.nunique()

In [None]:
def categorize_by_percentile(df, column_percentile:str, column_str:str):
    """
    Label every row of `df` by where its `column_percentile` value falls
    within the distribution of that column for the same
    (shape_array_key, stop_sequence) group.

    Parameters
    ----------
    df : DataFrame with `shape_array_key`, `stop_sequence` and
        `column_percentile` columns.
    column_percentile : name of the numeric column to categorize.
    column_str : prefix applied to every summary column and used to build
        the labels and the output column name (`f"{column_str}cat"`).

    Returns
    -------
    A new DataFrame: `df` inner-joined with the per-group `describe()`
    statistics (all prefixed with `column_str`, including prefixed copies
    of the two grouping keys) plus the `f"{column_str}cat"` label column.
    """
    group_keys = ['shape_array_key', 'stop_sequence']
    prefixed_keys = [f'{column_str}{key}' for key in group_keys]

    # Per-group count/mean/std/min/quartiles/max, with every column
    # (grouping keys included) renamed to carry the prefix.
    group_stats = df.groupby(group_keys)[column_percentile].describe()
    group_stats = group_stats.reset_index().add_prefix(column_str)

    with_stats = pd.merge(
        df,
        group_stats,
        how="inner",
        left_on=group_keys,
        right_on=prefixed_keys,
    )

    mean_col = f"{column_str}mean"
    median_col = f"{column_str}50%"
    q1_col = f"{column_str}25%"
    q3_col = f"{column_str}75%"
    average_label = f"{column_str} elapsed is average"

    def label_row(row):
        value = row[column_percentile]
        q1 = row[q1_col]
        q3 = row[q3_col]

        # Equal to the mean or median, or inside the interquartile range.
        if value == row[mean_col] or value == row[median_col] or q1 <= value <= q3:
            return average_label
        # Zero is only singled out when it is not already "average".
        if value == 0:
            return f"{column_str} elapsed is 0"
        if value <= q1:
            return f"{column_str} elapsed on lower end"
        if value >= q3:
            return f"{column_str} elapsed on higher end"
        # Reached when the comparisons are all False (e.g. NaN values).
        return "other"

    with_stats[f"{column_str}cat"] = with_stats.apply(label_row, axis=1)

    return with_stats

In [None]:
def mph_categories(row):
    """
    Label one row's `speed_mph` relative to its `p20_speed_mph` /
    `p80_speed_mph` band.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with `speed_mph`,
        `p20_speed_mph` and `p80_speed_mph`.

    Returns
    -------
    One of "speed is 0", "mph low", "mph average", "mph high".
    """
    speed = row["speed_mph"]

    # Zero has to be tested before the percentile comparisons: otherwise a
    # zero speed is always absorbed by "mph low" (or "mph average" when
    # p20 is 0) and the "speed is 0" label never describes a real zero.
    if speed == 0:
        return "speed is 0"
    if speed < row["p20_speed_mph"]:
        return "mph low"
    if row["p20_speed_mph"] <= speed <= row["p80_speed_mph"]:
        return "mph average"
    if speed > row["p80_speed_mph"]:
        return "mph high"
    # Every comparison was False (e.g. NaN speed or NaN percentiles);
    # the original fallback label is kept for backward compatibility.
    return "speed is 0"

In [None]:
def categorize_meters_speeds(df):
    """
    Add the three category columns used for flagging.

    Runs `categorize_by_percentile` for `meters_elapsed` (prefix
    "meters_") and then for `sec_elapsed` (prefix "seconds_"), and
    finally labels each row with `mph_categories` in a `mph_cat` column.

    Returns the resulting DataFrame.
    """
    with_meters = categorize_by_percentile(df, "meters_elapsed", "meters_")
    with_seconds = categorize_by_percentile(with_meters, "sec_elapsed", "seconds_")
    with_seconds['mph_cat'] = with_seconds.apply(mph_categories, axis=1)
    return with_seconds

In [None]:
few_routes_cat = categorize_meters_speeds(few_routes)

In [None]:
few_routes_cat.shape

In [None]:
few_routes_cat.mph_cat.value_counts()/len(few_routes_cat)*100

In [None]:
# `mph_categories` never returns 'other'; its fallback label is "speed is 0",
# so that is the value to filter on when inspecting the fallback rows.
other_mph = few_routes_cat[few_routes_cat.mph_cat == 'speed is 0']

In [None]:
other_mph.speed_mph.value_counts()

In [None]:
few_routes_cat.seconds_cat.value_counts()/len(few_routes_cat)*100

In [None]:
few_routes_cat.meters_cat.value_counts()/len(few_routes_cat)*100

In [None]:
few_routes_cat.columns

In [None]:
subset = ['stop_sequence','mph_cat','speed_mph','avg_speed_mph', 'p20_speed_mph',
       'p80_speed_mph','meters_cat',
          'meters_elapsed', 'meters_mean','meters_25%', 'meters_50%', 'meters_75%','seconds_cat',
          'sec_elapsed', 'seconds_mean','seconds_25%',
       'seconds_50%', 'seconds_75%','gtfs_dataset_key',]
          

In [None]:
few_routes_cat.groupby(['mph_cat','meters_cat', 'seconds_cat',]).size()

In [None]:
# few_routes_cat[(few_routes_cat.mph_cat == "speed is 0") & (few_routes_cat.meters_cat == "meters_ elapsed is 0") & (few_routes_cat.seconds_cat  == "seconds_ elapsed is 0")][subset].head(200)

In [None]:
def flag(row):
    """
    Combine the meters / seconds / mph categories of one row into a single
    flag.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with `meters_cat`,
        `seconds_cat` and `mph_cat`.

    Returns
    -------
    "ok"     when distance and time agree (both average, both on the
             higher end, or both on the lower end) or the speed itself is
             average;
    "high" / "low" / "zeroes"
             otherwise, according to `mph_cat` being "mph high",
             "mph low" or "speed is 0";
    "other"  for any other `mph_cat` value.
    """
    meters_cat = row["meters_cat"]
    seconds_cat = row["seconds_cat"]
    mph_cat = row["mph_cat"]

    both_average = (
        meters_cat == "meters_ elapsed is average"
        and seconds_cat == "seconds_ elapsed is average"
    )
    both_higher = (
        meters_cat == "meters_ elapsed on higher end"
        and seconds_cat == "seconds_ elapsed on higher end"
    )
    both_lower = (
        meters_cat == "meters_ elapsed on lower end"
        and seconds_cat == "seconds_ elapsed on lower end"
    )

    if both_average or mph_cat == "mph average" or both_higher or both_lower:
        return "ok"

    # Once the "ok" cases are excluded the outcome depends on mph_cat alone;
    # the extra meters/seconds conditions in the earlier version were
    # subsumed by these plain mph_cat checks.
    if mph_cat == "mph high":
        return "high"
    if mph_cat == "mph low":
        return "low"
    if mph_cat == "speed is 0":
        return "zeroes"
    return "other"

In [None]:
# few_routes_cat[(few_routes_cat.shape_array_key == "d8b0826e923620f7b7cd74be090de936") & (few_routes_cat.stop_sequence == 1)][subset]

In [None]:
few_routes_cat["unusual_flag"] = few_routes_cat.apply(lambda x: flag(x), axis=1)

In [None]:
few_routes_cat.unusual_flag.value_counts()/len(few_routes_cat)*100

In [None]:
 few_routes_cat.unusual_flag.value_counts()

In [None]:
few_routes_cat[few_routes_cat.unusual_flag == "ok"][subset].sample(5)

In [None]:
few_routes_cat.columns

In [None]:
251.99-117.96

In [None]:
subset2 = ['unusual_flag','speed_mph','mph_cat', 'p20_speed_mph','avg_speed_mph' ,
       'p80_speed_mph',]

In [None]:
stop_16 = few_routes_cat[(few_routes_cat.stop_sequence == 16) & (few_routes_cat._gtfs_dataset_name == "Bay Area 511 Muni VehiclePositions")]

In [None]:
stop_16.speed_mph.describe()

In [None]:
4.1+3.71

In [None]:
4.1-3.71

In [None]:
# stop_16[subset2].sort_values(['speed_mph'], ascending = False)

In [None]:
# few_routes_cat[few_routes_cat.unusual_flag == "high"][subset].sample(40)

In [None]:
# `flag` labels zero-speed rows "zeroes" (never "speed is 0"), so that is the
# value to include when measuring the share of flagged rows.
len(few_routes_cat[few_routes_cat.unusual_flag.isin(['high','low', 'zeroes'])])/len(few_routes_cat) *100

#### Should filter even further.

In [None]:
# `flag` labels zero-speed rows "zeroes" (never "speed is 0"); filtering on
# the latter silently dropped every zero row from this frame.
high_low_zero = few_routes_cat[few_routes_cat.unusual_flag.isin(['high','low', 'zeroes'])].reset_index()

In [None]:
high_low_zero.sample()

In [None]:
# To plot
#all_trips = one_route3.melt(id_vars=[ '_gtfs_dataset_name','shape_array_key','trip_id', 'stop_sequence','gtfs_dataset_key','loop_or_inlining',
#'n_trips'], value_vars=['avg_speed_mph','speed_mph','p20_speed_mph', 'p80_speed_mph'])

In [None]:

#all_trips = all_trips.drop_duplicates(subset = [ '_gtfs_dataset_name','shape_array_key','stop_sequence','gtfs_dataset_key','variable','value']).reset_index(drop = True)

#### Other ideas
* Show which stops are excluded from flags
* Show how many stops are dropped
* Show % of stops that were flagged compared to total stops.

In [None]:
high_low_zero.shape

In [None]:
high_low_zero.shape_array_key.value_counts()

In [None]:
high_low_zero2 = high_low_zero.melt(id_vars=[ '_gtfs_dataset_name','shape_array_key','trip_id', 'stop_sequence','gtfs_dataset_key','loop_or_inlining',
'n_trips', 'meters_cat', 'seconds_cat','unusual_flag','mph_cat'], value_vars=['avg_speed_mph','speed_mph','p20_speed_mph', 'p80_speed_mph'])

In [None]:

high_low_zero2 = high_low_zero2.drop_duplicates(subset = [ '_gtfs_dataset_name','shape_array_key','stop_sequence','gtfs_dataset_key','variable','value']).reset_index(drop = True)

In [None]:
def alt_dropdown(df, col_for_dropdown:str, dropdown_menu_title:str):
    """
    Build an Altair single-selection bound to a dropdown menu.

    Parameters
    ----------
    df : DataFrame containing `col_for_dropdown`.
    col_for_dropdown : column whose unique values become the menu options.
    dropdown_menu_title : used both as the selection name and as the
        label of the dropdown.

    Returns
    -------
    The `alt.selection_single` object, initialised to the first option in
    sorted order. Raises IndexError when no option is left after filtering.
    """
    # Rows whose value is the string "None" are left out of the menu
    # (per the original note: operators that only have scheduled data).
    keep_row = df[col_for_dropdown] != "None"
    menu_options = sorted(df.loc[keep_row, col_for_dropdown].unique().tolist())

    # The first option in sorted order is shown by default.
    default_option = menu_options[0]

    dropdown_binding = alt.binding_select(options=menu_options, name=dropdown_menu_title)

    return alt.selection_single(
        name=dropdown_menu_title,
        fields=[col_for_dropdown],
        bind=dropdown_binding,
        init={col_for_dropdown: default_option},
    )

In [None]:
selection_test = alt_dropdown(high_low_zero2, "shape_array_key", "Route")

In [None]:
threshold_utils.chart_size(alt.Chart(high_low_zero2).mark_tick(size=30,thickness=5,).encode(
    x='stop_sequence:N',
    y='value:Q',
    color='variable',
    tooltip=high_low_zero2.columns.tolist()
).interactive(), 1700, 400).add_selection(selection_test).transform_filter(selection_test)


### Charts  
Test with a few routes first
* Create a new column that rounds speed up, for plotting purposes only.

#### Manipulate DF for charts

In [None]:
# The assignment had no right-hand side (a syntax error). Rebuild `m1` with
# the same call used earlier in this notebook.
# NOTE(review): the cells below expect a `sorted_stop_seq` column - confirm
# that `merge_all_speeds` provides it.
m1 = speed_utils.merge_all_speeds(analysis_date)

In [None]:
test1 = m1.melt(id_vars=[ '_gtfs_dataset_name','shape_array_key','trip_id', 'sorted_stop_seq','gtfs_dataset_key','loop_or_inlining',
                       'n_trips'], value_vars=[ 'avg_speed_mph','speed_mph','p20_speed_mph', 'p80_speed_mph'])

In [None]:
test1.shape

In [None]:
# test1[test1.shape_array_key == "29d2bbdbeaec1d6888800f85bebf6e33"]

In [None]:
# Only need average speed/p20 speed/p80 to show up once for each stop sequence-operator-shape array
test2 = test1.drop_duplicates(subset = [ '_gtfs_dataset_name','shape_array_key','sorted_stop_seq','gtfs_dataset_key','variable','value']).reset_index(drop = True)

In [None]:
# test2.to_csv("./speeds.csv")

In [None]:
test2.shape

In [None]:
other = ['cf688717cf0cd8dac0e6d1f12f9c7333',
       '6f39f818c9a0c5496cd1c8bd1aa11e67',
       '3de4482ec32ba0f2edb451d3528b5a5e']

In [None]:
# Take out routes that have over 85 stops
# subset = test2[~test2.shape_array_key.isin(routes_many_stops_list)].reset_index(drop = True)

In [None]:
subset = test2[test2.shape_array_key.isin(["29d2bbdbeaec1d6888800f85bebf6e33",'754c5b012195800c38dc58e72e4f482e',
      'e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e', ])]

In [None]:
subset = threshold_utils.pre_clean(subset)

In [None]:
subset.columns

In [None]:
subset.sample()

In [None]:
subset['Route'] = subset['Gtfs Dataset Name'] +' '  + subset['Shape Array Key']

In [None]:
subset = subset.rename(columns = {'Value':'Speed'})

In [None]:
subset['Speed_Int'] = subset.Speed.fillna(0).astype(int)

In [None]:
subset['Route Type'] = "Loop or Inlining: " + subset["Loop Or Inlining"].astype(str)

In [None]:
# subset['Rounded Speed'].unique()

In [None]:
def speed(row):
    """
    Bucket `row.Speed_Int` into a 5-mph step for plotting.

    Returns 0 for a speed of exactly 0; for positive speeds below 31 the
    upper edge of the matching 5-mph bucket (5, 10, 15, 20, 25 or 30);
    and 35 for everything else (31 and above, negative values, NaN).
    """
    speed_value = row.Speed_Int

    if speed_value == 0:
        return 0

    if speed_value > 0:
        # Each exclusive bound is one above the bucket value it maps to.
        for exclusive_upper_bound in (6, 11, 16, 21, 26, 31):
            if speed_value < exclusive_upper_bound:
                return exclusive_upper_bound - 1

    return 35

In [None]:
# Apply the function
subset["Rounded Speed"] = subset.apply(speed, axis=1)

In [None]:
# subset[['Rounded Speed', 'Speed', 'Speed_Int']]

In [None]:
subset.Variable = subset.Variable.str.title().str.replace("_"," ")

In [None]:
# One df for the actual speeds
subset_speedmph = subset[subset.Variable == 'Speed Mph'].reset_index(drop = True)

In [None]:
# One df for the percentiles
subset_other= subset[subset.Variable != 'Speed Mph'].reset_index(drop = True)

In [None]:
selection_test = alt_dropdown(subset, "Route", "Operator/Shape Array")

In [None]:
# https://github.com/altair-viz/altair/issues/1168
title = alt.Chart(subset).mark_text(dy=-40, size=15, fontWeight='normal').encode(
    text='Route Type:N',
)

In [None]:
title = title.add_selection(selection_test).transform_filter(selection_test)

#### Scatterplot

#### Jitter

In [None]:
def create_jitter_plot(df):
    """
    Build a jittered strip plot of "Rounded Speed", with one facet column
    per "Sorted Stop Seq" value and points coloured by "Variable".

    Parameters
    ----------
    df : DataFrame with "Rounded Speed", "Variable" and "Sorted Stop Seq"
        columns; every column of `df` is shown in the tooltip.

    Returns
    -------
    The Altair chart after `threshold_utils.chart_size(chart, 75, 200)`.
    """
    # Horizontal position is pure random jitter, so the axis is hidden.
    jitter_axis = alt.X(
        "jitter:Q",
        title=None,
        axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
        scale=alt.Scale(),
    )
    speed_axis = alt.Y(
        "Rounded Speed:Q",
        scale=alt.Scale(domain=[0,50]),
        title = "Speed (MPH)",
        axis=alt.Axis(labelAngle=360, grid=False,),
    )
    variable_color = alt.Color(
        "Variable:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    )
    stop_facet = alt.Column(
        "Sorted Stop Seq:N",
        header=alt.Header(
            labelAngle=360,
            titleOrient="top",
            labelOrient="top",
            labelAlign="right",
            labelPadding=2,
        ),
    )

    points = alt.Chart(df, width=0.5).mark_circle(size=100)
    encoded = points.encode(
        x=jitter_axis,
        y=speed_axis,
        color=variable_color,
        tooltip=df.columns.tolist(),
        column=stop_facet,
    )
    # Generate Gaussian jitter with a Box-Muller transform
    jittered = encoded.transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
    titled = jittered.properties(title="Speeds by Operator-Shape Array")

    return threshold_utils.chart_size(titled, 75, 200)

In [None]:
chart1 = create_jitter_plot(subset_speedmph).add_selection(selection_test).transform_filter(selection_test)

In [None]:
chart2 = (
        alt.Chart(subset_other, width=0.5)
        .mark_circle(size=200)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("Rounded Speed:Q", 
                    title = "Speed (MPH)",
                    scale=alt.Scale(domain=[0, 50]),
                   axis = alt.Axis(grid = False)),
            color=alt.Color(
                "Variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=subset_other.columns.tolist(),
            column=alt.Column(
                "Sorted Stop Seq:N",
                header=alt.Header(
                    labelAngle=360,
                    title = None,
                    titleOrient="top",
                    labelOrient="top",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
    )
    

In [None]:
chart2 = threshold_utils.chart_size(chart2, 75, 200)

In [None]:
chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)

In [None]:
title = threshold_utils.chart_size(title, 20, 20)

In [None]:
alt.data_transformers.enable('default', max_rows=None)

In [None]:
title & (chart1.interactive() & chart2.interactive())

### Draft

In [None]:
def _categorize_against_std_band(value, mean, std,
                                 average_label, lower_label, higher_label):
    """
    Classify `value` against the band [mean - std, mean + std].

    Returns `average_label` when the value equals the mean or lies strictly
    inside the band, `lower_label` when it is at or below the lower edge,
    `higher_label` when it is at or above the upper edge, and "other" when
    no comparison holds (e.g. NaN inputs).
    """
    lower_end = mean - std
    higher_end = mean + std

    # The equality check comes first so that a zero-width band (std == 0)
    # still reports a value sitting on the mean as average.
    if value == mean:
        return average_label
    if value <= lower_end:
        return lower_label
    if value >= higher_end:
        return higher_label
    if lower_end < value < higher_end:
        return average_label
    return "other"


def meter_elapsed_categories(row):
    """
    Label `row["meters_elapsed"]` relative to `meters_mean` +/- `meters_std`.
    """
    return _categorize_against_std_band(
        row["meters_elapsed"],
        row["meters_mean"],
        row["meters_std"],
        average_label="distance elapsed is average",
        lower_label="distance lapsed on lower end",
        higher_label="distance lapsed on higher end",
    )


def seconds_elapsed_categories(row):
    """
    Label `row["sec_elapsed"]` relative to `secs_mean` +/- `secs_std`.
    """
    return _categorize_against_std_band(
        row["sec_elapsed"],
        row["secs_mean"],
        row["secs_std"],
        average_label="secs elapsed is average",
        lower_label="secs lapsed on lower end",
        higher_label="secs lapsed on higher end",
    )

In [None]:
# Draft: label values as low / average / high using the quartiles of the
# frame that is passed in.
# NOTE(review): this two-argument function re-uses the name of the
# three-argument `categorize_by_percentile` defined earlier in the notebook
# and shadows it once this cell runs - re-running the earlier cells
# afterwards will fail until that definition is executed again.
def categorize_by_percentile(df, column_percentile:str):
    """
    Add a `f"{column_percentile}_cat"` column to `df` (in place) and return
    `df`.

    A value is "low" when it is at or below the 25th percentile of the
    column, "average" when it is above the 25th and at or below the 75th,
    "high" when it is above the 75th, and "other" when no comparison
    holds (e.g. NaN).
    """
    p25 = float(df[column_percentile].quantile(0.25))
    p75 = float(df[column_percentile].quantile(0.75))

    def percentile(value):
        if value <= p25:
            return f"{column_percentile}: low"
        elif (p25 < value) and (value <= p75):
            return f"{column_percentile}: average"
        elif value > p75:
            return f"{column_percentile}: high"
        else:
            return "other"

    # Only one column is needed, so map over it instead of applying a
    # function to every full row.
    df[f"{column_percentile}_cat"] = df[column_percentile].map(percentile)

    return df


def categorize_all(df):
    """
    Categorize `meters_elapsed` and `sec_elapsed` separately within every
    (shape_array_key, stop_sequence) group of `df`.

    Returns one DataFrame that stacks, for each of the two columns, the
    categorized copy of every group (so each input row appears once per
    column). Returns an empty DataFrame when `df` has no groups.
    """
    group_cols = ["shape_array_key", "stop_sequence"]

    # Hold results; concatenated once at the end.
    categorized_frames = []

    for column in ['meters_elapsed', 'sec_elapsed']:
        # Iterate over each distinct group exactly once. Looping over the
        # raw `tolist()` values revisited every group once per row and
        # duplicated its rows in the result.
        for (_, stop), group in df.groupby(group_cols, sort=False):
            filtered = group.reset_index()
            categorized_frames.append(categorize_by_percentile(filtered, column))
            print(f'done for {column}/{stop}')

    if not categorized_frames:
        return pd.DataFrame()

    return pd.concat(categorized_frames, axis=0)

In [None]:
"""
p25 = troubleshoot.total_stops.quantile(0.25).astype(float)
p50 =  troubleshoot.total_stops.quantile(0.50).astype(float)
p75 =  troubleshoot.total_stops.quantile(0.75).astype(float)
p95 =  troubleshoot.total_stops.quantile(0.95).astype(float)
p99 =  troubleshoot.total_stops.quantile(0.99).astype(float)
"""

In [None]:
def stop_categories1(row):
    """
    Bucket `row.total_stops` by percentile thresholds.

    Relies on the module-level names `p25`, `p75`, `p95` and `p99` being
    defined before it is called.
    NOTE(review): in this notebook they only appear inside a string
    literal in the preceding cell - define them before using this.
    The stop counts quoted in the labels are hard-coded text, not derived
    from the thresholds.
    """
    total_stops = row.total_stops

    if 0 < total_stops <= p25:
        return "25th  <= 17 stops"
    elif p25 < total_stops <= p75:
        return "50th <= 30 stops"
    elif p75 < total_stops <= p95:
        return "75th <= 50 stops"
    elif p95 < total_stops <= p99:
        return "95th <= 85 stops"
    # Top bucket: strictly above the 99th percentile. The earlier
    # `>= p95` test only worked because of the branches above it and
    # mislabelled the degenerate case where every threshold is 0.
    elif total_stops > p99:
        return "99th >= 203 stops"
    else:
        return "other"

In [None]:
def create_jitter_plot(df):
    """
    Draft variant of the jittered strip plot: "Rounded Speed" per
    "Stop Sequence" facet, coloured by "Variable", titled from the first
    row's "Gtfs Dataset Name" and "Loop Or Inlining" values.

    NOTE: this re-uses the name of the `create_jitter_plot` defined
    earlier in the notebook and replaces it once this cell runs.

    Returns the Altair chart after
    `threshold_utils.chart_size(chart, 40, 250)`.
    """
    # Title pieces come from the first row only.
    title_op = df['Gtfs Dataset Name'].iloc[0].replace('VehiclePositions','').strip()
    inline = df['Loop Or Inlining'].iloc[0]

    # Horizontal position is pure random jitter, so the axis is hidden.
    jitter_axis = alt.X(
        "jitter:Q",
        title=None,
        axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
        scale=alt.Scale(),
    )
    speed_axis = alt.Y("Rounded Speed:Q", axis=alt.Axis(labelAngle=360))
    variable_color = alt.Color(
        "Variable:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    )
    stop_facet = alt.Column(
        "Stop Sequence:N",
        header=alt.Header(
            labelAngle=360,
            titleOrient="top",
            labelOrient="bottom",
            labelAlign="right",
            labelPadding=2,
        ),
    )

    points = alt.Chart(df, width=0.5).mark_circle(size=100)
    encoded = points.encode(
        x=jitter_axis,
        y=speed_axis,
        color=variable_color,
        tooltip=df.columns.tolist(),
        column=stop_facet,
    )
    # Generate Gaussian jitter with a Box-Muller transform
    jittered = encoded.transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
    titled = jittered.properties(title=f"{title_op} - Route Type {inline}")

    return threshold_utils.chart_size(titled, 40, 250)

In [None]:
chart2 = (
        alt.Chart(anaheim_test, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title="Trip Duration by RT Category")
    )

In [None]:
chart2 = threshold_utils.chart_size(chart2,80,300)

In [None]:
chart2

In [None]:
chart1 = (
        alt.Chart(anaheim_test_speedmph, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "stop_sequence:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title=f"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}")
    )

In [None]:
chart1 = threshold_utils.chart_size(chart1,80,300)

In [None]:
chart1

In [None]:
def create_dot_plot2(df, col_for_dots: str,
                    x_axis_col:str, y_axis_col:str,
                   tooltip_cols:list, chart_title:str):
    """
    Build a dot plot of `y_axis_col` (nominal) against `x_axis_col`
    (ordinal, sorted descending), with dots coloured by `col_for_dots`.

    Parameters
    ----------
    df : data for the chart.
    col_for_dots : column used for the colour encoding (legend hidden) and
        as the grouping of the `rank()` window transform.
    x_axis_col, y_axis_col : columns for the two axes.
    tooltip_cols : columns to show in the tooltip.
    chart_title : chart title.

    Returns
    -------
    The Altair chart.
    """
    x_encoding = alt.X(
        f'{x_axis_col}:O',
        sort='descending',
        axis=alt.Axis(ticks=False, grid=True),
    )
    y_encoding = alt.Y(f'{y_axis_col}:N')
    dot_color = alt.Color(
        f"{col_for_dots}:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )

    # Adds an `id` field holding each row's rank within its group.
    ranked_dots = (
        alt.Chart(df)
        .mark_circle(opacity=1, size = 100)
        .transform_window(id='rank()', groupby=[col_for_dots])
    )

    encoded = ranked_dots.encode(
        x_encoding,
        y_encoding,
        color=dot_color,
        tooltip = tooltip_cols,
    )

    return encoded.properties(title = chart_title)

In [None]:
# `create_dot_plot1` is not defined in this notebook; `create_dot_plot2`
# takes the same arguments.
# NOTE(review): confirm it is the intended replacement.
chart3 = create_dot_plot2(anaheim_test_other, 'variable', 'stop_sequence', 'rounded_speed', anaheim_test_other.columns.tolist(),  'Percentile/Average')

In [None]:
chart3 = threshold_utils.chart_size(chart3,650,300)

In [None]:
chart4 = create_dot_plot2(anaheim_test_speedmph, 'variable', 'stop_sequence', 'rounded_speed', anaheim_test_speedmph.columns.tolist(), 'Speed per Trip')

In [None]:
chart4 = threshold_utils.chart_size(chart4,650,300)

In [None]:
chart4

In [None]:
chart3 + chart4

In [None]:
chart7 = (
        alt.Chart(anaheim_test_other, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=-90,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title="Trip Duration by RT Category")
    )

In [None]:
chart7 = threshold_utils.chart_size(chart7,80,300)

In [None]:
chart8 = (
        alt.Chart(anaheim_test_other, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=-90,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .properties(title="Trip Duration by RT Category")
    )

In [None]:
chart9 = (
        alt.Chart(anaheim_test_speedmph, width=0.5)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("rounded_speed:Q", axis=alt.Axis(labelAngle=-90)),
            color=alt.Color(
                "variable:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=anaheim_test.columns.tolist(),
            column=alt.Column(
                "stop_sequence:N",
                header=alt.Header(
                    labelAngle=360,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .properties(title=f"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}")
    )

In [None]:
chart8

In [None]:
chart9 | chart8

In [None]:
# pip install altair==5.0.0rc3
chart5 = alt.Chart(anaheim_test_speedmph, title='Normally distributed jitter').mark_circle(size=50).encode(
    y="rounded_speed:Q",
    x="stop_sequence:N",
    yOffset="jitter:Q",
    color=alt.Color('stop_sequence:Q').legend(None)
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
)


In [None]:
chart5 = threshold_utils.chart_size(chart5,650,300)

In [None]:
chart5

#### Look at one trip

In [None]:
#foothill_og = speed_stops2[speed_stops2.trip_id == "t604-b2791-sl5"]

In [None]:
#len(foothill_og)

In [None]:
#foothill_og.stop_sequence.nunique()

In [None]:
#foothill_og.stop_sequence.describe()

In [None]:
#foothill_og.sort_values('stop_sequence').head()

In [None]:
#foothill_renumbered_stop_seq = m2[m2.trip_id == "t604-b2791-sl5"] 

In [None]:
#foothill_renumbered_stop_seq['Test Stop Sequence'].describe()

In [None]:
#foothill_renumbered_stop_seq.sort_values('stop_sequence').head()

In [None]:
# len(troubleshoot)

In [None]:
# Number of test stops should match stop sequence...
# troubleshoot['sequences_are_equal'] = troubleshoot['Test Stop Sequence'] - troubleshoot['stop_sequence']

In [None]:
# troubleshoot['sequences_are_equal'].value_counts()

In [None]:
# Look at this trip id in the original df
#og_trip = speed_stops2[speed_stops2.trip_id == "t640-b15FF1-sl5"]

In [None]:
# Look at this trip id in the manipulated df
#new_trip = m2[m2.trip_id == "t640-b15FF1-sl5"]

In [None]:
#og_trip.shape, og_trip.stop_sequence.nunique()

In [None]:
#new_trip.shape, new_trip.stop_sequence.nunique()