In [1]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, format floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## % of Meters
* Start with the `speeds_stop_segments_{analysis_date}` parquet (produced in `B1_speeds_by_segment_trip`).
* Read in `stop_segments_{analysis_date}` (already in CRS 3310) to get each segment's length.
* Merge the segments with the speeds by segment-trip, which contain the `meters_elapsed` column.
* Calculate the percent of the segment covered: `meters_elapsed / segment_length`.
* Show some charts around this, plus a couple of descriptives, to make sure the values all look ok.
* Give rough descriptives of how many rows we're dropping if we keep only rows that cover at least 30%, 40%, or 50% of the segment.

### Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [3]:
# dict_inputs = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [4]:
# dict_inputs

In [5]:
# Reuse the "stop_segments" parameters that the setup cell already loaded into
# STOP_SEG_DICT (same config path and key) instead of parsing the file again.
dictionary = STOP_SEG_DICT

In [6]:
def import_speeds_segs(analysis_date:str, max_speed_cutoff: int, dict_inputs:dict):
    """
    Read the stage4 speeds table for one analysis date.

    The file name comes from dict_inputs['stage4']. Only rows whose
    speed_mph is at or below max_speed_cutoff are read (the filter is
    passed to pd.read_parquet).
    """
    stage4_file = dict_inputs['stage4']
    speed_filter = [[("speed_mph", "<=", max_speed_cutoff)]]
    parquet_path = f"{SEGMENT_GCS}{stage4_file}_{analysis_date}"
    
    return pd.read_parquet(parquet_path, filters = speed_filter)

In [7]:
def calculate_segment_length(analysis_date:str, dict_inputs:dict) -> pd.DataFrame:
    """
    Load all segments for one analysis date and compute each segment's length.

    The file name comes from dict_inputs['segments_file']. The length is
    stored in a new `meters_length` column. It is taken from
    `geometry.length`, so it is expressed in the units of the file's CRS.
    NOTE(review): the notebook text says the file is already in CRS 3310
    (meters); this function does not reproject - confirm.

    Returns a table without the geometry and district columns, which is why
    the return type is annotated as a DataFrame rather than a GeoDataFrame.
    """
    # Load in ALL segments, find the length
    FILE = dict_inputs['segments_file']
    gdf = gpd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}.parquet")
    
    gdf = gdf.assign(
        meters_length=(gdf.geometry.length)
    )
    
    # Geometry is only needed to measure the length; drop it (and the
    # district columns) so the result is a plain attribute table.
    df = gdf.drop(columns = ['geometry','geometry_arrowized','district','district_name'])
    
    return df

In [8]:
def merge_segments_speeds(analysis_date:str, max_speed_cutoff:int, dict_inputs:dict):
    """
    Join segment lengths to segment-trip speeds and compute segment coverage.

    Adds two columns:
    * percent: meters_elapsed as a percentage of the segment's meters_length.
      Missing values, and rows whose segment length is not positive, get 0.
    * rounded_percent: percent truncated down to a multiple of 10.

    Only segment-trips present in both tables are kept (inner merge).
    """
    speeds = import_speeds_segs(analysis_date,max_speed_cutoff, dict_inputs)
    segments = calculate_segment_length(analysis_date, dict_inputs)
    merge1 = pd.merge(
        segments, 
        speeds, 
        on = ['shape_array_key','gtfs_dataset_key','stop_sequence','schedule_gtfs_dataset_key'], 
        how = "inner"
    )
    
    # Dividing by a zero length yields inf, which fillna(0) does not catch and
    # which cannot be cast to int below. Mask non-positive lengths to NaN so
    # those rows fall into the same "0 percent" bucket as missing values.
    valid_length = merge1.meters_length.where(merge1.meters_length > 0)
    percent = (merge1.meters_elapsed / valid_length * 100).fillna(0)
    
    merge1 = merge1.assign(
        percent = percent,
        rounded_percent = ((percent/100)*10).astype(int)*10,
    )
    return merge1

In [10]:
# Join segment lengths to segment-trip speeds, reading only speeds <= 70 mph.
merge1= merge_segments_speeds(analysis_date,70, dictionary)

In [13]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Summarize how much data survives different segment-coverage cutoffs.

    Each entry of `percentages` is a quantile (a fraction between 0 and 1) of
    df.percent. The value of df.percent at that quantile becomes the cutoff,
    and rows with percent >= cutoff are kept.

    Returns one row per cutoff, indexed 0..n-1, with:
    * percentile: label showing the cutoff value (truncated to an integer)
    * n_rows: count of non-null gtfs_dataset_name among kept rows
    * mean_speed_mph: mean speed_mph of kept rows
    * n_kept_routes / n_kept_trips / n_kept_operators: number of unique
      shape_array_key / trip_id / gtfs_dataset_key among kept rows
    * percentage_kept_rows / percentage_kept_trips / percentage_kept_routes:
      the kept counts as a percentage of the totals in `df`

    If no cutoff can be summarized (empty `percentages` or empty `df`), an
    empty frame with these columns is returned.
    """
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    aggregations = {
        'gtfs_dataset_name':'count',
        'speed_mph':'mean',
        'shape_array_key':'nunique',
        'trip_id':'nunique',
        'gtfs_dataset_key':'nunique',
    }
    renamed_cols = {
        'gtfs_dataset_name':'n_rows',
        'speed_mph':'mean_speed_mph',
        'shape_array_key':'n_kept_routes',
        'trip_id':'n_kept_trips',
        'gtfs_dataset_key':'n_kept_operators',
    }
    
    summaries = []
    for quantile in percentages:
        cutoff = df.percent.quantile(quantile)
        
        # An empty percent column has no quantiles, so nothing can be kept.
        if pd.isna(cutoff):
            continue
        cutoff = float(cutoff)
        
        kept = df[df.percent >= cutoff].assign(
            percentile = f"Min. of {int(cutoff)}% of seg length covered"
        )
        
        summaries.append(
            kept
            .groupby(['percentile'])
            .agg(aggregations)
            .reset_index()
            .rename(columns = renamed_cols)
        )
    
    percentage_cols = [
        'percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes'
    ]
    if not summaries:
        return pd.DataFrame(
            columns = ['percentile'] + list(renamed_cols.values()) + percentage_cols
        )
    
    # Concatenate once, outside the loop, and renumber the rows so the result
    # does not carry a duplicated index of zeros.
    final = pd.concat(summaries, axis=0, ignore_index=True)
    
    final = final.assign(
        percentage_kept_rows = final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key) * 100,
    )
    
    return final

In [14]:
# Summarize what is kept when the cutoff is set at the 10th through 80th
# percentile of segment coverage (df.percent).
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7,.8])

In [101]:
test

Unnamed: 0,Percentile,N Rows,Mean Speed Mph,N Kept Routes,N Kept Trips,N Kept Operators,Percentage Kept Rows,Percentage Kept Trips,Percentage Kept Routes
0,Min. of 13% of seg length covered,2002522,11.9,4079,65931,74,90.0,99.6,99.8
0,Min. of 43% of seg length covered,1780020,12.4,4065,65696,74,80.0,99.2,99.5
0,Min. of 60% of seg length covered,1557517,12.2,4055,65420,74,70.0,98.8,99.2
0,Min. of 73% of seg length covered,1335015,11.9,4041,65010,74,60.0,98.2,98.9
0,Min. of 83% of seg length covered,1112513,11.6,4026,64250,74,50.0,97.0,98.5
0,Min. of 90% of seg length covered,890010,11.2,3977,62802,74,40.0,94.8,97.3
0,Min. of 95% of seg length covered,667508,10.8,3902,60798,74,30.0,91.8,95.5
0,Min. of 98% of seg length covered,445005,10.1,3735,57281,74,20.0,86.5,91.4


In [105]:
valid_trips_by_cutoff(merge1, [.15,0.30,.45,0.6,0.75])

Unnamed: 0,percentile,n_rows,mean_speed_mph,n_kept_routes,n_kept_trips,n_kept_operators,percentage_kept_rows,percentage_kept_trips,percentage_kept_routes
0,Min. of 30% of seg length covered,1891271,12.26,4070,65795,74,85.0,99.36,99.58
0,Min. of 60% of seg length covered,1557517,12.22,4055,65420,74,70.0,98.79,99.22
0,Min. of 78% of seg length covered,1223764,11.79,4033,64695,74,55.0,97.69,98.68
0,Min. of 90% of seg length covered,890010,11.25,3977,62802,74,40.0,94.84,97.31
0,Min. of 97% of seg length covered,556257,10.45,3840,59431,74,25.0,89.75,93.96


## Sample segments and routes for Big Blue Bus

In [15]:
# Identifiers for the sample operator used in the checks below.
test_operator = "Big Blue Bus VehiclePositions"
# Agency name used to filter the exported average-speed table.
test_org = "City of Santa Monica"
# NOTE(review): presumably this operator's gtfs_dataset_key - it matches the
# gtfs_dataset_key shown in the comparison tables further down. Confirm.
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [16]:
# Load the exported tabular average speeds, reading only the sample agency's rows.
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [17]:
# Sample segment 1: shape_id 26375, stop sequence 7.
# Dark orange
# NOTE(review): the color presumably refers to how this segment appears on the
# speed map - confirm.
shape_id1  = "26375"
stop_seq1 = 7
# Look up the shape_array_key of the first row matching this shape_id.
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [18]:
# Sample segment 2: shape_id 26342, stop sequence 23.
# Light yellow 
# NOTE(review): the color presumably refers to how this segment appears on the
# speed map - confirm.
shape_id2 = "26342"
stop_seq2 = 23
# Look up the shape_array_key of the first row matching this shape_id.
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [19]:
# Sample segment 3: shape_id 26393, stop sequence 32.
# Dark Red
# NOTE(review): the color presumably refers to how this segment appears on the
# speed map - confirm.
shape_id3 = "26393"
stop_seq3 = 32
# Look up the shape_array_key of the first row matching this shape_id.
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [20]:
# Sample segment 4: shape_id 26372, stop sequence 14.
# Light orange
# NOTE(review): the color presumably refers to how this segment appears on the
# speed map - confirm.
shape_id4 = "26372"
stop_seq4 = 14
# Look up the shape_array_key of the first row matching this shape_id.
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [21]:
# Sample segment 5: shape_id 26400, stop sequence 8.
# Green
# NOTE(review): the color presumably refers to how this segment appears on the
# speed map - confirm.
shape_id5 = "26400"
stop_seq5= 8
# Look up the shape_array_key of the first row matching this shape_id.
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [22]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    """
    Display an interactive map of one shape's segments, colored by p50_mph.
    """
    one_shape = avg_speeds[avg_speeds.shape_array_key==shape_array_key]
    
    speed_map = one_shape.explore(
        "p50_mph", 
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
        style_kwds = {'weight':5},
    )
    
    display(speed_map)
    

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [23]:
# NOTE(review): this overwrites `test` with its cleaned version, so re-running
# the cell applies pre_clean to its own output. pre_clean presumably converts
# column names to title case (the chart cells reference 'Percentile',
# 'Percentage Kept Rows', ...) - confirm against _threshold_utils.
test = threshold_utils.pre_clean(test)

In [24]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    """
    Build a bar chart of x_column against y_column.

    Bars are colored by y_column using the Cal-ITP bright category palette
    (no legend), and the tooltip lists every column of df. The chart is
    passed through threshold_utils.chart_size with 400 and 300.
    """
    bar_colors = alt.Color(
        y_column, 
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    
    bars = alt.Chart(df).mark_bar().encode(
        x=x_column, 
        y=y_column, 
        color=bar_colors,
        tooltip=df.columns.tolist(),
    )
    
    titled_bars = bars.properties(title = title)
    
    return threshold_utils.chart_size(titled_bars, 400, 300)
    

In [25]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

In [99]:
bar_chart(test, 'Percentage Kept Trips','Percentile', '% of Trips Kept After Segment Cutoff')

In [100]:
bar_chart(test, 'Percentage Kept Routes','Percentile', '% of Routes Kept After Segment Cutoff')

In [28]:
bar_chart(test, 'Mean Speed Mph', 'Percentile', 'Mean MPH by % Segment Length')

### Only keep speeds that meet a certain threshold
* Put it in `speeds_with_segment_geom` in `B2`

In [69]:
def speeds_length_filter(analysis_date:str, threshold: float, max_speed_cut_off: int, dict_inputs:dict):
    """
    Keep segment-trip speeds whose segment coverage meets a quantile cutoff.

    `threshold` is a quantile of the `percent` column (for example 0.2), not
    a percentage itself: the value of `percent` at that quantile is the
    cutoff, and rows with percent >= cutoff are kept.

    Returns only the speed columns plus `percent`.
    """
    speeds = merge_segments_speeds(analysis_date, max_speed_cut_off,dict_inputs)
    
    cutoff_percent = speeds.percent.quantile(threshold).astype(float)
    meets_cutoff = speeds.percent >= cutoff_percent
    
    output_cols = [
        'shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
        'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
        'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
        'trip_instance_key', 'schedule_gtfs_dataset_key','percent',
    ]
    
    return speeds.loc[meets_cutoff, output_cols]

In [70]:
# Keep speeds <= 70 mph whose segment coverage is at or above the
# 20th-percentile value of `percent`.
percentile_df  = speeds_length_filter(analysis_date, 0.2, 70, dictionary)

In [71]:
def speeds_with_segment_geom(
    analysis_date: str, 
    threshold:float, 
    dict_inputs:dict,
    max_speed_cutoff: int = 70
    ) -> gpd.GeoDataFrame: 
    """
    Average the length-filtered segment-trip speeds and attach geometry.

    Steps:
    1. Filter the segment-trip speeds with speeds_length_filter.
    2. Attach each trip's time-of-day bucket (inner merge on
       gtfs_dataset_key and trip_id).
    3. Compute average speeds per segment for all trips ("all_day") and for
       trips in the AM Peak / PM Peak buckets ("peak").
    4. Merge the stacked stats onto the segment geometry (reprojected to
       WGS84), keeping only segments that have stats.
    """
    segment_cols = dict_inputs['segment_identifier_cols']
    segments_file = dict_inputs["segments_file"]
    
    filtered_speeds = speeds_length_filter(
        analysis_date, threshold, max_speed_cutoff, dict_inputs
    )
    trip_time_buckets = sched_rt_utils.get_trip_time_buckets(analysis_date)
    
    speeds_with_time = pd.merge(
        filtered_speeds, 
        trip_time_buckets, 
        on = ["gtfs_dataset_key", "trip_id"], 
        how = "inner"
    )
    
    is_peak = speeds_with_time.time_of_day.isin(["AM Peak", "PM Peak"])
    
    all_day_stats = B2_avg_speeds_by_segment.calculate_avg_speeds(
        speeds_with_time, 
        segment_cols
    )
    peak_stats = B2_avg_speeds_by_segment.calculate_avg_speeds(
        speeds_with_time[is_peak], 
        segment_cols
    )
    
    # Stack both summaries; time_of_day says which one each row belongs to.
    stats = pd.concat(
        [
            all_day_stats.assign(time_of_day = "all_day"),
            peak_stats.assign(time_of_day = "peak"),
        ], 
        axis=0
    )
    
    # Merge in segment geometry
    geometry_cols = segment_cols + [
        "gtfs_dataset_key", 
        "stop_id",
        "loop_or_inlining",
        "geometry", 
        "district", "district_name"
    ]
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{segments_file}_{analysis_date}",
        columns = geometry_cols
    ).to_crs(geography_utils.WGS84)
    
    return pd.merge(
        segments,
        stats,
        on = segment_cols,
        how = "inner"
    )

### Check out speeds

In [72]:
# Recompute average speeds with the 20th-percentile coverage cutoff
# (max_speed_cutoff is left at its default of 70).
avg_test = speeds_with_segment_geom(analysis_date, 0.2, dictionary)

In [73]:
# Load the existing stage5 averages as the baseline for comparison and drop
# the district columns.
STG5_FILE = dictionary['stage5']
og_avg = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")
og_avg = og_avg.drop(columns=["district", "district_name"])

In [74]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    """
    Show the original and new average speeds for one segment, one after the other.

    Prints 'Original' followed by the matching rows of og_avg, then 'New'
    followed by the matching rows of avg_test. The geometry column is
    dropped from both displays.
    """
    labelled_averages = (('Original', og_avg), ('New', avg_test))
    
    for label, averages in labelled_averages:
        print(label)
        is_segment = (
            (averages.shape_array_key == shape_array_key) 
            & (averages.stop_sequence == stop_sequence)
        )
        display(averages[is_segment].drop(columns = ['geometry']))

In [75]:
compare_average(avg_test, og_avg, shape_array2, stop_seq2)

Original


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
85604,5d34851ee46adb62216152f8a16fe7d0,23,6c2d7daaf979779fa2089c6395baf98b,149,0,13.72,19,8.43,15.66,all_day
85605,5d34851ee46adb62216152f8a16fe7d0,23,6c2d7daaf979779fa2089c6395baf98b,149,0,13.17,8,5.74,14.9,peak


New


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
151306,5d34851ee46adb62216152f8a16fe7d0,23,6c2d7daaf979779fa2089c6395baf98b,149,0,7,District 7 - Los Angeles,13.72,18,8.7,15.74,all_day
151307,5d34851ee46adb62216152f8a16fe7d0,23,6c2d7daaf979779fa2089c6395baf98b,149,0,7,District 7 - Los Angeles,13.72,7,9.27,15.2,peak


In [76]:
compare_average(avg_test, og_avg, shape_array3, stop_seq3)

Original


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
133752,94e02a46331c8b449aedb4469f49764a,32,6c2d7daaf979779fa2089c6395baf98b,1648,0,1.9,23,0.91,9.85,all_day
133753,94e02a46331c8b449aedb4469f49764a,32,6c2d7daaf979779fa2089c6395baf98b,1648,0,1.58,12,0.62,6.55,peak


New


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
152168,94e02a46331c8b449aedb4469f49764a,32,6c2d7daaf979779fa2089c6395baf98b,1648,0,7,District 7 - Los Angeles,1.27,16,0.88,7.27,all_day
152169,94e02a46331c8b449aedb4469f49764a,32,6c2d7daaf979779fa2089c6395baf98b,1648,0,7,District 7 - Los Angeles,1.27,9,0.86,7.09,peak


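* The new averages have no rows for this segment (shape 26372, stop sequence 14), not sure why? The original average is based on a single trip (`n_trips` = 1), so the likely cause is that this trip's segment coverage fell below the cutoff and it was filtered out. TODO: confirm.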

In [77]:

compare_average(avg_test, og_avg, shape_array4, stop_seq4)

Original


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
75183,50d9dd7ba11f08a8c86130778d6cabc4,14,6c2d7daaf979779fa2089c6395baf98b,916,0,11.34,1,11.34,11.34,all_day


New


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day


In [78]:
compare_average(avg_test, og_avg, shape_array1, stop_seq1)

Original


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
83381,5a788bd9c9aa5c5465875689a626baa9,7,6c2d7daaf979779fa2089c6395baf98b,894,0,8.94,1,8.94,8.94,all_day
83382,5a788bd9c9aa5c5465875689a626baa9,7,6c2d7daaf979779fa2089c6395baf98b,894,0,8.94,1,8.94,8.94,peak


New


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
151223,5a788bd9c9aa5c5465875689a626baa9,7,6c2d7daaf979779fa2089c6395baf98b,894,0,7,District 7 - Los Angeles,8.94,1,8.94,8.94,all_day
151224,5a788bd9c9aa5c5465875689a626baa9,7,6c2d7daaf979779fa2089c6395baf98b,894,0,7,District 7 - Los Angeles,8.94,1,8.94,8.94,peak


In [79]:
compare_average(avg_test, og_avg, shape_array5, stop_seq5)

Original


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
29718,21a802700dadbc2e249b595773a50363,8,6c2d7daaf979779fa2089c6395baf98b,579,0,22.24,6,20.76,25.12,all_day
29719,21a802700dadbc2e249b595773a50363,8,6c2d7daaf979779fa2089c6395baf98b,579,0,22.94,4,18.22,27.1,peak


New


Unnamed: 0,shape_array_key,stop_sequence,gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
150742,21a802700dadbc2e249b595773a50363,8,6c2d7daaf979779fa2089c6395baf98b,579,0,7,District 7 - Los Angeles,22.75,5,20.26,26.11,all_day
150743,21a802700dadbc2e249b595773a50363,8,6c2d7daaf979779fa2089c6395baf98b,579,0,7,District 7 - Los Angeles,25.12,3,18.69,28.09,peak


### Speeds for 1 Sequence

In [80]:
def speeds_one_trip_seq(shape_array_key:str, stop_sequence:int, dict_inputs:dict = None):
    """
    Read the stage4 speeds for a single shape and stop sequence.

    dict_inputs supplies the 'stage4' file name. When it is omitted, the
    notebook-level STOP_SEG_DICT is used. (The previous body read from a
    name, `DICT`, that is not defined anywhere in this notebook.)
    """
    if dict_inputs is None:
        dict_inputs = STOP_SEG_DICT
    
    STG4_FILE = dict_inputs['stage4']
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{STG4_FILE}_{analysis_date}", 
        filters = [[("shape_array_key", "==", shape_array_key),
                    ("stop_sequence", "==", stop_sequence)]]
    )
    return df

In [81]:
def avg_speeds_one_trip_seq(shape_array_key:str, stop_sequence:int, dict_inputs:dict = None):
    """
    Read the stage5 "all_day" average speeds for a single shape and stop sequence.

    dict_inputs supplies the 'stage5' file name. When it is omitted, the
    notebook-level STOP_SEG_DICT is used. (The previous body read from a
    name, `DICT`, that is not defined anywhere in this notebook.)

    The district and geometry columns are dropped from the result.
    """
    if dict_inputs is None:
        dict_inputs = STOP_SEG_DICT
    
    STG5_FILE = dict_inputs['stage5']
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet", 
        filters = [[("shape_array_key", "==", shape_array_key),
                    ("stop_sequence", "==", stop_sequence),
                    ("time_of_day", "==", "all_day")]]
    )
    df = df.drop(columns=["district", "district_name", "geometry"])
    return df

In [82]:
def merge_avg_all_speeds(shape_array_key:str, stop_sequence:int):
    """
    Attach the segment's average speeds to each of its trip-level speeds.

    Inner merge of the single-segment speeds with the single-segment
    averages on shape_array_key, stop_sequence and gtfs_dataset_key.
    """
    trip_speeds = speeds_one_trip_seq(shape_array_key, stop_sequence)
    segment_avg = avg_speeds_one_trip_seq(shape_array_key, stop_sequence)
    
    join_cols = ['shape_array_key','stop_sequence','gtfs_dataset_key']
    
    return pd.merge(trip_speeds, segment_avg, on = join_cols, how = "inner")

In [83]:
def display_speeds(shape_array_key:str, stop_sequence:int):
    """
    Display a bar chart of speed_mph per trip for one segment.

    Dashed horizontal rules mark the segment's p20 (red), p80 (blue) and
    p50 (green) speeds. Missing values are filled with 0 before charting.
    """
    speeds = merge_avg_all_speeds(shape_array_key,stop_sequence).fillna(0)
    
    # Title is built from the first row's sequence and shape key.
    first_seq = speeds.stop_sequence.iloc[0]
    first_shape = speeds.shape_array_key.iloc[0]
    chart_title = f"Speed Distribution for Seq {first_seq}/Shape {first_shape}"
    
    # Bars: one per trip
    layered = (
        alt.Chart(speeds)
        .mark_bar()
        .encode(x='trip_id', y='speed_mph')
        .properties(title = chart_title)
    )
    
    # Layer the percentile rules on top, in the order p20, p80, p50
    percentile_rules = (
        ('red', 'p20_mph'),
        ('blue', 'p80_mph'),
        ('green', 'p50_mph'),
    )
    for rule_color, percentile_col in percentile_rules:
        rule = (
            alt.Chart(speeds)
            .mark_rule(color=rule_color, strokeDash=[10, 7])
            .encode(y=percentile_col)
        )
        layered = layered + rule
    
    sized = threshold_utils.chart_size(layered, 600, 300)
    display(sized.interactive())

In [84]:
# display_speeds(shape_array2, stop_seq2)

In [85]:
# display_speeds(shape_array3, stop_seq3)

### Speeds for 1 Shape

In [86]:
def box_whisker(df:pd.DataFrame, route:str):
    """
    Box plot of speeds for one shape, with one box per stop sequence.

    `route` is a shape_array_key. The title names the operator
    (gtfs_dataset_name of the first matching row) and the shape key.
    Raises IndexError if `df` has no rows for `route`.
    """
    one_shape = df[df.shape_array_key == route]
    
    # gtfs_dataset_name is the operator, so it is not labelled "Seq" in the title.
    chart_title = f"Speed Distribution for {one_shape.gtfs_dataset_name.iloc[0]}/Shape {one_shape.shape_array_key.iloc[0]}"
    
    # NOTE(review): pre_clean presumably converts column names to title case,
    # which is why the encodings below use 'Stop Sequence', 'Speed Mph' and
    # 'Shape Array Key' - confirm against _threshold_utils.
    one_shape = threshold_utils.pre_clean(one_shape)
    
    chart = (alt.Chart(one_shape)
    .mark_boxplot(extent='min-max').encode(
    x='Stop Sequence:O',
    y='Speed Mph:Q', color=alt.Color('Shape Array Key',
    scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    legend=None))
    .properties(title = chart_title)
            )
    
    chart = threshold_utils.chart_size((chart), 800,300)
    
    return chart

In [87]:
box_whisker(merge1, shape_array5)

In [88]:
box_whisker(percentile_df, shape_array5)

In [103]:
box_whisker(merge1, shape_array4)

In [102]:
box_whisker(percentile_df, shape_array4)

In [89]:
def dotplot_trip_time_rt_coverage(df:pd.DataFrame, route:str):
    """
    Create a jittered dot plot of speeds for one shape, with one
    column of dots per stop sequence.

    `route` is a shape_array_key. Each dot is one row of `df`; the tooltip
    shows its stop sequence, speed and percent of segment length covered.
    The title names the operator (gtfs_dataset_name of the first matching
    row) and the shape key. Raises IndexError if `df` has no rows for `route`.
    """
    one_shape = df[df.shape_array_key == route]
    
    # gtfs_dataset_name is the operator, so it is not labelled "Seq" in the title.
    chart_title = f"Speed Distribution for {one_shape.gtfs_dataset_name.iloc[0]}/Shape {one_shape.shape_array_key.iloc[0]}"
    
    # NOTE(review): pre_clean presumably converts column names to title case,
    # which is why the encodings below use 'Stop Sequence', 'Speed Mph' and
    # 'Percent' - confirm against _threshold_utils.
    one_shape = threshold_utils.pre_clean(one_shape)
    
    dot_chart = (
        alt.Chart(one_shape, width=0.5)
        .mark_circle(size=50)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("Speed Mph:Q", axis=alt.Axis(labelAngle=90)),
            color=alt.Color(
                "Stop Sequence:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
                legend=None,
            ),
            tooltip=['Stop Sequence','Speed Mph','Percent'],
            column=alt.Column(
                "Stop Sequence:N",
                header=alt.Header(
                    labelAngle=45,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title=chart_title)
    )
    
    dot_chart = threshold_utils.chart_size(dot_chart, 25, 150).interactive()
    
    return dot_chart

In [90]:
dotplot_trip_time_rt_coverage(merge1, shape_array1)

In [91]:
dotplot_trip_time_rt_coverage(percentile_df, shape_array1)

In [104]:
stop_seq3

32

In [92]:
dotplot_trip_time_rt_coverage(merge1, shape_array3)

In [93]:
dotplot_trip_time_rt_coverage(percentile_df, shape_array3)

In [94]:
def preview(df:pd.DataFrame, shape_array_key:str, stop_sequence: int):
    """
    Display the elapsed time, distance, coverage and speed of every row in
    `df` that belongs to one segment (shape_array_key + stop_sequence).

    stop_sequence is compared directly with df.stop_sequence, so it should be
    the same type as that column (an integer in the calls in this notebook).
    """
    is_segment = (df.shape_array_key == shape_array_key) & (df.stop_sequence == stop_sequence)
    
    display(df.loc[is_segment, ['stop_sequence','sec_elapsed','meters_elapsed','percent', 'speed_mph']])

In [95]:
preview(percentile_df, shape_array5, 14)

Unnamed: 0,stop_sequence,sec_elapsed,meters_elapsed,percent,speed_mph
1796745,14,44.0,372.68,67.31,18.95


In [96]:
preview(merge1, shape_array5, 14)

Unnamed: 0,stop_sequence,sec_elapsed,meters_elapsed,percent,speed_mph
1796744,14,26.0,0.0,0.0,0.0
1796745,14,44.0,372.68,67.31,18.95
1796746,14,89.0,200.33,36.18,5.04
1796747,14,24.0,207.6,37.49,19.35


In [97]:
preview(merge1, shape_array5, 24)

Unnamed: 0,stop_sequence,sec_elapsed,meters_elapsed,percent,speed_mph
1796777,24,124.0,42.99,22.45,0.78
1796778,24,11.0,176.26,92.05,35.85
1796779,24,123.0,3.86,2.02,0.07
1796780,24,138.0,130.55,68.18,2.12
1796781,24,26.0,0.5,0.26,0.04


In [98]:
preview(percentile_df, shape_array5, 24)

Unnamed: 0,stop_sequence,sec_elapsed,meters_elapsed,percent,speed_mph
1796778,24,11.0,176.26,92.05,35.85
1796780,24,138.0,130.55,68.18,2.12
