In [1]:
# Opt out of PyGEOS *before* geopandas is first imported (directly, or
# indirectly through the project modules below). Setting the variable after
# the import has no effect, which is why the GeoPandas/PyGEOS warning was
# being emitted by this cell.
import os
os.environ['USE_PYGEOS'] = '0'

import datetime
import dask.dataframe as dd
import numpy as np
import geopandas
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils

# Config file and its "stop_segments" section, used throughout the notebook.
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, print floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip).
* read in stop_segments_{analysis_date} (in CRS 3310 already), from which you can get each segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct as meters_elapsed / segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [3]:
# dict_inputs = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [4]:
# dict_inputs

In [5]:
# "stop_segments" section of the config file; passed as `dict_inputs` to the functions below.
# NOTE(review): this is the same call that builds STOP_SEG_DICT in the imports cell; the two could be consolidated.
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [6]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    """
    Read the segment-trip speeds for one analysis date.

    The path is built from SEGMENT_GCS, the `stage4` entry of `dict_inputs`
    and `analysis_date`. Only rows with `speed_mph` at or below
    `max_speed_cutoff` are read (the filter is applied by `pd.read_parquet`).
    """
    speeds_path = f"{SEGMENT_GCS}{dict_inputs['stage4']}_{analysis_date}"
    at_or_below_cutoff = [[("speed_mph", "<=", max_speed_cutoff)]]
    
    return pd.read_parquet(speeds_path, filters = at_or_below_cutoff)

In [7]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    """
    Read every segment for one analysis date.

    The parquet path is built from SEGMENT_GCS, the `segments_file` entry of
    `dict_inputs` and `analysis_date`. No columns are added here; segment
    length is computed separately by `calculate_segment_length`.
    """
    segments_path = f"{SEGMENT_GCS}{dict_inputs['segments_file']}_{analysis_date}.parquet"
    
    return gpd.read_parquet(segments_path)

In [8]:
segments_gdf = load_segments(analysis_date, dictionary)

In [9]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Add each segment's geometric length and drop the columns not needed
    for the merge with speeds.

    Parameters
    ----------
    segments : already-loaded segments with `geometry`, `district` and
        `district_name` columns (a KeyError is raised if any is missing).

    Returns
    -------
    Table with a new `meters_length` column and without `geometry`,
    `district` and `district_name`. The geometry column is gone, so the
    result is annotated as a plain DataFrame rather than a GeoDataFrame.

    `meters_length` is `geometry.length` in the units of the segments' CRS.
    NOTE(review): assumes a meter-based CRS (the notebook says EPSG:3310) - confirm.
    """
    with_length = segments.assign(
        meters_length=(segments.geometry.length)
    )
    
    return with_length.drop(columns = ['geometry','district','district_name'])

In [10]:
def merge_segments_speeds(segments:gpd.GeoDataFrame, 
                          analysis_date:str, 
                          max_speed_cutoff:int, 
                          dict_inputs:dict) -> pd.DataFrame:
    """
    Join segment lengths to segment-trip speeds and compute how much of each
    segment a trip covered.

    Speeds are read with `import_speeds_segs` (rows above `max_speed_cutoff`
    are excluded) and inner-joined to the output of
    `calculate_segment_length`. The added `percent` column is
    `meters_elapsed / meters_length * 100`, with missing values set to 0.
    """
    join_keys = ['shape_array_key','gtfs_dataset_key','stop_sequence','schedule_gtfs_dataset_key']
    
    trip_speeds = import_speeds_segs(analysis_date,max_speed_cutoff, dict_inputs)
    segment_lengths = calculate_segment_length(segments)
    
    merged = pd.merge(segment_lengths, trip_speeds, on = join_keys, how = "inner")
    
    # Share of the segment length covered by the trip; NaN results become 0.
    pct_covered = (merged.meters_elapsed/merged.meters_length * 100).fillna(0)
    
    return merged.assign(percent = pct_covered)

In [11]:
merge1= merge_segments_speeds(segments_gdf, analysis_date, 70, dictionary)

In [12]:
def myround(x, base=5):
    """
    Round `x` to the nearest multiple of `base` (ex: 43 becomes 45).
    Ties are resolved by Python's built-in `round`.
    """
    n_multiples = round(x/base)
    return base * n_multiples

In [13]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Summarize how much data survives different "minimum % of segment
    length covered" cutoffs.

    Parameters
    ----------
    df : segment-trip speeds with `percent`, `speed_mph`, `trip_id`,
        `shape_array_key`, `gtfs_dataset_key` and `gtfs_dataset_name` columns.
    percentages : quantiles of `df.percent` (ex: [.1, .2]). Each quantile is
        rounded to the nearest 5 with `myround` and used as a cutoff.

    Returns
    -------
    One row per cutoff with the number of rows, mean speed and distinct
    shapes / trips / operators among rows whose `percent` is at or above the
    cutoff, plus the share of the original rows, trips and shapes kept
    (rounded to whole percents). An empty `percentages` list returns an empty
    frame with the same columns instead of raising.
    """
    summary_cols = ['percentile', 'n_rows', 'mean_speed_mph', 'n_kept_routes',
                    'n_kept_trips', 'n_kept_operators']
    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    # Collect one summary frame per cutoff and concatenate once at the end,
    # rather than growing a DataFrame inside the loop.
    summaries = []
    for quantile in percentages:
        # Round the quantile to the nearest 5. Ex: 43 becomes 45.
        cutoff = myround(df.percent.quantile(quantile).astype(float))
        
        kept = df[df.percent >= cutoff].assign(
            percentile = f"Min. of {cutoff}% of seg length covered")
        
        summaries.append(
            kept
            .groupby(['percentile'])
            .agg({'gtfs_dataset_name':'count',
                  'speed_mph':'mean',
                  'shape_array_key':'nunique',
                  'trip_id':'nunique',
                  'gtfs_dataset_key':'nunique'})
            .reset_index()
            .rename(columns = {'gtfs_dataset_name':'n_rows',
                               'speed_mph':'mean_speed_mph',
                               'shape_array_key':'n_kept_routes',
                               'trip_id':'n_kept_trips',
                               'gtfs_dataset_key':'n_kept_operators'})
        )
    
    # pd.concat raises on an empty list, so handle "no cutoffs" explicitly.
    if not summaries:
        return pd.DataFrame(columns = summary_cols + round_cols)
    
    final = pd.concat(summaries, axis=0)
    
    final = final.assign(
        percentage_kept_rows = final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key) * 100)
    
    final[round_cols] = final[round_cols].round(0)
    return final

In [14]:
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7])

In [15]:
#test

## Sample segments and routes for Big Blue Bus

In [16]:
# Sample operator used for the spot checks below.
# NOTE(review): only `test_org` is referenced elsewhere in this notebook;
# `test_operator` and `test_key` are not used by any visible cell.
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [17]:
# Exported (tabular) average speeds by stop segment for the analysis date,
# restricted at read time to rows whose `agency` equals the sample organization.
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [18]:
# Sample segment 1, labelled "Dark orange" by the author
# (NOTE(review): presumably its color on the published speed map - confirm).
# `.iloc[0]` takes the first exported row for this shape_id and raises IndexError if there is none.
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [19]:
# Sample segment 2, labelled "Light yellow" by the author
# (NOTE(review): presumably its color on the published speed map - confirm).
# `.iloc[0]` takes the first exported row for this shape_id and raises IndexError if there is none.
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [20]:
# Sample segment 3, labelled "Dark Red" by the author
# (NOTE(review): presumably its color on the published speed map - confirm).
# `.iloc[0]` takes the first exported row for this shape_id and raises IndexError if there is none.
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [21]:
# Sample segment 4, labelled "Light orange" by the author
# (NOTE(review): presumably its color on the published speed map - confirm).
# `.iloc[0]` takes the first exported row for this shape_id and raises IndexError if there is none.
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [22]:
# Sample segment 5, labelled "Green" by the author
# (NOTE(review): presumably its color on the published speed map - confirm).
# `.iloc[0]` takes the first exported row for this shape_id and raises IndexError if there is none.
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [23]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    """
    Display an interactive map of one shape's segments, colored by `p50_mph`.

    Rows of `avg_speeds` are filtered to `shape_array_key`, then drawn with
    `explore` on CartoDB Positron tiles using rt_utils.ZERO_THIRTY_COLORSCALE.
    Nothing is returned; the map is shown with `display`.
    """
    one_shape = avg_speeds[avg_speeds.shape_array_key==shape_array_key]
    speed_map = one_shape.explore(
        "p50_mph", 
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
        style_kwds = {'weight':5},
    )
    display(speed_map)
    

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [24]:
test = threshold_utils.pre_clean(test)

In [25]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    """
    Build an Altair bar chart of `x_column` against `y_column`.

    Bars are colored by `y_column` with the Cal-ITP bright category palette
    (no legend), every column of `df` is shown in the tooltip, and the chart
    is passed through `threshold_utils.chart_size(chart, 400, 300)`.
    """
    bar_colors = alt.Color(
        y_column, 
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    bars = alt.Chart(df).mark_bar().encode(
        x=x_column,
        y=y_column,
        color=bar_colors,
        tooltip=df.columns.tolist(),
    )
    titled_bars = bars.properties(title = title)
    
    return threshold_utils.chart_size((titled_bars), 400,300)
    

In [26]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

In [27]:
#bar_chart(test, 'Percentage Kept Trips','Percentile', '% of Trips Kept After Segment Cutoff')

In [28]:
#bar_chart(test, 'Percentage Kept Routes','Percentile', '% of Routes Kept After Segment Cutoff')

In [29]:
#bar_chart(test, 'Mean Speed Mph', 'Percentile', 'Mean MPH by % Segment Length')

## Only keep speeds that meet a certain threshold
* Put it in `speeds_with_segment_geom` in `B2`

In [30]:
def speeds_length_filter(segments: gpd.GeoDataFrame, 
                         analysis_date:str, 
                         threshold: float, 
                         max_speed_cut_off: int,
                         dict_inputs:dict) -> pd.DataFrame:
    """
    Keep only segment-trip speeds that cover enough of their segment.

    Speeds are merged with segment lengths by `merge_segments_speeds`. The
    cutoff is the `threshold` quantile of the resulting `percent` column
    (ex: 0.2 drops roughly the bottom 20% of rows by share of segment length
    covered). Rows at or above the cutoff are returned with the speed
    columns only.
    """
    columns_to_keep = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
       'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
       'trip_instance_key', 'schedule_gtfs_dataset_key',]
    
    merged = merge_segments_speeds(segments, analysis_date, max_speed_cut_off,dict_inputs)
    
    min_percent = merged.percent.quantile(threshold).astype(float)
    covers_enough = merged.percent >= min_percent
    
    return merged.loc[covers_enough, columns_to_keep]

In [31]:
# Segment-trip speeds after dropping the bottom 20% of rows by share of segment length covered.
# This assignment must run: later cells (box_whisker, dotplot_trip_time_rt_coverage, preview)
# read `percentile_df`, and with the line commented out they fail on a fresh kernel.
percentile_df  = speeds_length_filter(segments_gdf, analysis_date, 0.2, 70, dictionary)

In [32]:
# percentile_df.columns

In [33]:
# time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date)

In [34]:
# time_of_day_df.columns

### Edited B2 function

In [35]:
def speeds_with_segment_geom(
    analysis_date: str, 
    dict_inputs:dict,
    max_speed_cutoff: int = 70,
    threshold:float = .20) -> gpd.GeoDataFrame: 
    """
    Average segment speeds (all day and peak) joined to segment geometry.

    Steps:
    1. Import segments with their identifier columns and geometry.
    2. Filter segment-trip speeds with `speeds_length_filter`: speeds above
       `max_speed_cutoff` are excluded, and rows below the `threshold`
       quantile of segment length covered are dropped (defaults to the
       bottom 20%).
    3. Attach each trip's time-of-day bucket and average speeds per segment,
       once across all trips ("all_day") and once for AM/PM peak ("peak").
    4. Merge the stats onto the segments, reprojected to WGS84.
    """
    segment_cols = dict_inputs['segment_identifier_cols']
    segments_file = dict_inputs["segments_file"]
    
    output_cols = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key', 'stop_id',
       'loop_or_inlining', 'geometry', 'p50_mph', 'n_trips', 'p20_mph',
       'p80_mph', 'time_of_day']
    peak_periods = ["AM Peak", "PM Peak"]
    
    # Segment identifiers plus the extra columns needed downstream
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{segments_file}_{analysis_date}",
        columns = segment_cols + [
            "gtfs_dataset_key", 
            "stop_id",
            "loop_or_inlining",
            "geometry", 
            "district", "district_name",
            "schedule_gtfs_dataset_key",
        ])
    
    # Drop speeds above the cutoff and rows covering too little of the segment
    filtered_speeds = speeds_length_filter(
        segments, analysis_date, threshold, max_speed_cutoff, dict_inputs)
    
    trip_time_buckets = sched_rt_utils.get_trip_time_buckets(analysis_date)
    
    speeds_by_period = pd.merge(
        filtered_speeds, 
        trip_time_buckets, 
        on = ["trip_instance_key"], 
        how = "inner"
    )
    
    all_day_stats = B2_avg_speeds_by_segment.calculate_avg_speeds(
        speeds_by_period, 
        segment_cols
    ).assign(time_of_day = "all_day")
    
    peak_stats = B2_avg_speeds_by_segment.calculate_avg_speeds(
        speeds_by_period[speeds_by_period.time_of_day.isin(peak_periods)], 
        segment_cols
    ).assign(time_of_day = "peak")
    
    stats = pd.concat([all_day_stats, peak_stats], axis=0)
    
    # Attach the stats to the segment geometry in WGS84
    segments_wgs84 = segments.to_crs(geography_utils.WGS84)
    
    with_geometry = pd.merge(
        segments_wgs84,
        stats,
        on = segment_cols,
        how = "inner"
    )
    
    return with_geometry[output_cols]

### Check out speeds

In [36]:
# Takes around 7 minutes
avg_test = speeds_with_segment_geom(analysis_date, dictionary, 70, 0.2)

In [37]:
# Previously computed average speeds (the `stage5` file), loaded without the
# district columns so they can be compared with `avg_test`.
STG5_FILE = dictionary['stage5']
og_avg = (
    gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")
    .drop(columns=["district", "district_name"])
)

In [38]:
# to_keep = og_avg.columns.tolist()

In [39]:
# type(to_keep)

In [40]:
# avg_test = avg_test[to_keep]

In [41]:
# Columns of `merge1` shown when inspecting the individual segment-trip rows in the checks below.
merge1_preview_cols = ['meters_length', 
       'trip_id', 'min_time', 'min_dist', 'max_time', 'max_dist',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 
       'percent']

## Checks

In [42]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    """
    Show the original and the new average speeds for one segment.

    Prints the shape / stop sequence being compared, then displays the
    matching rows of `og_avg` ("Original") followed by those of `avg_test`
    ("New"), without the identifier and geometry columns.
    """
    hidden_cols = ['shape_array_key','geometry','stop_sequence']
    
    def one_segment(avg_speeds):
        # Rows for the requested shape and stop sequence only.
        is_match = (
            (avg_speeds.shape_array_key == shape_array_key)
            & (avg_speeds.stop_sequence == stop_sequence)
        )
        return avg_speeds[is_match].drop(columns = hidden_cols)
    
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    for label, avg_speeds in (('Original', og_avg), ('New', avg_test)):
        print(label)
        display(one_segment(avg_speeds))

In [43]:
compare_average(avg_test, og_avg, shape_array1, stop_seq1)

route 5a788bd9c9aa5c5465875689a626baa9, sequence 7
Original


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
83381,6c2d7daaf979779fa2089c6395baf98b,894,0,8.94,1,8.94,8.94,all_day
83382,6c2d7daaf979779fa2089c6395baf98b,894,0,8.94,1,8.94,8.94,peak


New


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
151223,6c2d7daaf979779fa2089c6395baf98b,894,0,8.94,1,8.94,8.94,all_day
151224,6c2d7daaf979779fa2089c6395baf98b,894,0,8.94,1,8.94,8.94,peak


In [44]:
compare_average(avg_test, og_avg, shape_array2, stop_seq2)

route 5d34851ee46adb62216152f8a16fe7d0, sequence 23
Original


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
85604,6c2d7daaf979779fa2089c6395baf98b,149,0,13.72,19,8.43,15.66,all_day
85605,6c2d7daaf979779fa2089c6395baf98b,149,0,13.17,8,5.74,14.9,peak


New


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
151306,6c2d7daaf979779fa2089c6395baf98b,149,0,13.72,18,8.7,15.74,all_day
151307,6c2d7daaf979779fa2089c6395baf98b,149,0,13.72,7,9.27,15.2,peak


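* Strangely enough, in the next comparison (sequence 32) the 50th percentile speed became lower after filtering (all day: 1.90 → 1.27 mph)
* Sometimes the shorter the distance covered within a segment, the higher the speed?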

In [45]:
compare_average(avg_test, og_avg, shape_array3, stop_seq3)

route 94e02a46331c8b449aedb4469f49764a, sequence 32
Original


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
133752,6c2d7daaf979779fa2089c6395baf98b,1648,0,1.9,23,0.91,9.85,all_day
133753,6c2d7daaf979779fa2089c6395baf98b,1648,0,1.58,12,0.62,6.55,peak


New


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
152168,6c2d7daaf979779fa2089c6395baf98b,1648,0,1.27,16,0.88,7.27,all_day
152169,6c2d7daaf979779fa2089c6395baf98b,1648,0,1.27,9,0.86,7.09,peak


In [46]:
merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

Unnamed: 0,meters_length,trip_id,min_time,min_dist,max_time,max_dist,meters_elapsed,sec_elapsed,speed_mph,percent
1806859,583.74,905067,2023-07-12 18:59:27,430.11,2023-07-12 19:00:12,468.28,38.17,45.0,1.9,6.54
1806861,583.74,905083,2023-07-12 11:30:31,249.29,2023-07-12 11:31:13,434.91,185.61,42.0,9.89,31.8
1806863,583.74,905092,2023-07-12 07:43:06,267.36,2023-07-12 07:44:35,459.47,192.11,89.0,4.83,32.91
1806868,583.74,905076,2023-07-12 14:25:17,0.0,2023-07-12 14:26:46,207.87,207.87,89.0,5.22,35.61
1806875,583.74,905084,2023-07-12 11:15:31,103.57,2023-07-12 11:16:16,323.28,219.72,45.0,10.92,37.64
1806857,583.74,905079,2023-07-12 13:11:31,88.57,2023-07-12 13:12:16,314.28,225.72,45.0,11.22,38.67
1806871,583.74,905070,2023-07-12 17:28:31,352.35,2023-07-12 17:51:28,582.05,229.7,1377.0,0.37,39.35
1806860,583.74,905069,2023-07-12 18:03:58,328.54,2023-07-12 18:23:12,583.74,255.2,1154.0,0.49,43.72
1806873,583.74,905073,2023-07-12 15:52:45,306.14,2023-07-12 16:15:08,580.88,274.74,1343.0,0.46,47.06
1806869,583.74,905087,2023-07-12 09:47:58,83.85,2023-07-12 09:49:27,361.36,277.52,89.0,6.98,47.54


* In the next comparison (sequence 14) the new average is missing because the only row with a non-zero speed covered only about 26% of the segment length
* Maybe up the threshold?

In [47]:

compare_average(avg_test, og_avg, shape_array4, stop_seq4)

route 50d9dd7ba11f08a8c86130778d6cabc4, sequence 14
Original


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
75183,6c2d7daaf979779fa2089c6395baf98b,916,0,11.34,1,11.34,11.34,all_day


New


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day


In [48]:
merge1.loc[(merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4)][merge1_preview_cols]

Unnamed: 0,meters_length,trip_id,min_time,min_dist,max_time,max_dist,meters_elapsed,sec_elapsed,speed_mph,percent
1801245,386.01,904430,2023-07-12 07:38:20,386.01,2023-07-12 07:38:30,386.01,0.0,10.0,0.0,0.0
1801246,386.01,904452,2023-07-12 20:54:10,0.0,2023-07-12 20:55:39,0.0,0.0,89.0,0.0,0.0
1801247,386.01,904427,2023-07-12 14:57:04,284.59,2023-07-12 14:57:24,386.01,101.42,20.0,11.34,26.27


In [49]:
compare_average(avg_test, og_avg, shape_array5, stop_seq5)

route 21a802700dadbc2e249b595773a50363, sequence 8
Original


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
29718,6c2d7daaf979779fa2089c6395baf98b,579,0,22.24,6,20.76,25.12,all_day
29719,6c2d7daaf979779fa2089c6395baf98b,579,0,22.94,4,18.22,27.1,peak


New


Unnamed: 0,gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
150742,6c2d7daaf979779fa2089c6395baf98b,579,0,22.75,5,20.26,26.11,all_day
150743,6c2d7daaf979779fa2089c6395baf98b,579,0,25.12,3,18.69,28.09,peak


### Speeds for 1 Sequence

In [None]:
def speeds_one_trip_seq(shape_array_key:str, stop_sequence:int):
    """
    Read the segment-trip speeds (the `stage4` file) for a single
    shape / stop sequence, filtered at read time.
    """
    # Was `DICT`, which is never defined in this notebook (NameError on call).
    # STOP_SEG_DICT is the "stop_segments" config loaded in the imports cell.
    STG4_FILE = STOP_SEG_DICT['stage4']
    df = pd.read_parquet(f"{SEGMENT_GCS}{STG4_FILE}_{analysis_date}", 
         filters = [[("shape_array_key", "==", shape_array_key),
                    ("stop_sequence", "==", stop_sequence)]]) 
    return df

In [None]:
def avg_speeds_one_trip_seq(shape_array_key:str, stop_sequence:int):
    """
    Read the all-day average speeds (the `stage5` file) for a single
    shape / stop sequence, without the district and geometry columns.
    """
    # Was `DICT`, which is never defined in this notebook (NameError on call).
    # STOP_SEG_DICT is the "stop_segments" config loaded in the imports cell.
    STG5_FILE = STOP_SEG_DICT['stage5']
    df = pd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet", 
         filters = [[("shape_array_key", "==", shape_array_key),
                    ("stop_sequence", "==", stop_sequence),
                    ("time_of_day", "==", "all_day")]])
    df = df.drop(columns=["district", "district_name", "geometry"])
    return df

In [None]:
def merge_avg_all_speeds(shape_array_key:str, stop_sequence:int):
    """
    Attach the all-day average speeds of one shape / stop sequence to each
    of its segment-trip speed rows (inner join on the segment identifiers).
    """
    join_keys = ['shape_array_key','stop_sequence','gtfs_dataset_key']
    
    trip_speeds = speeds_one_trip_seq(shape_array_key, stop_sequence)
    avg_speeds = avg_speeds_one_trip_seq(shape_array_key, stop_sequence)
    
    return pd.merge(trip_speeds, avg_speeds, on = join_keys, how = "inner")

In [None]:
def display_speeds(shape_array_key:str, stop_sequence:int):
    """
    Display a bar chart of per-trip speeds for one shape / stop sequence,
    overlaid with dashed rules at the segment's p20 (red), p80 (blue) and
    p50 (green) speeds.

    Missing values are filled with 0 before charting. The title is built from
    the first row, so an empty merge result raises IndexError.
    """
    speeds = merge_avg_all_speeds(shape_array_key,stop_sequence).fillna(0)
    
    chart_title = f"Speed Distribution for Seq {speeds.stop_sequence.iloc[0]}/Shape {speeds.shape_array_key.iloc[0]}"
    
    # One bar per trip
    bars = (alt.Chart(speeds)
         .mark_bar()
         .encode(x='trip_id', y='speed_mph')
         .properties(title = chart_title))
    
    # Layer one dashed horizontal rule per percentile column on top of the bars
    percentile_rules = (('red', 'p20_mph'), ('blue', 'p80_mph'), ('green', 'p50_mph'))
    layered = bars
    for rule_color, percentile_col in percentile_rules:
        layered = layered + (
            alt.Chart(speeds)
            .mark_rule(color=rule_color, strokeDash=[10, 7])
            .encode(y=percentile_col)
        )
    
    sized = threshold_utils.chart_size((layered), 600,300)
    display(sized.interactive())

In [None]:
# display_speeds(shape_array2, stop_seq2)

In [None]:
# display_speeds(shape_array3, stop_seq3)

### Speeds for 1 Shape

In [None]:
def box_whisker(df:pd.DataFrame, route:str):
    """
    Box plot (whiskers at min / max) of speeds by stop sequence for one shape.

    `route` is matched against `shape_array_key`. Column names are cleaned
    with `threshold_utils.pre_clean` before charting, and the chart is sized
    with `threshold_utils.chart_size(chart, 800, 300)`. An unknown `route`
    raises IndexError when the title is built from the first row.

    NOTE(review): the title says "Seq" but interpolates `gtfs_dataset_name`
    (the operator), not a stop sequence.
    """
    shape_rows = df[df.shape_array_key == route]
    
    operator_name = shape_rows.gtfs_dataset_name.iloc[0]
    shape_key = shape_rows.shape_array_key.iloc[0]
    chart_title = f"Speed Distribution for Seq {operator_name}/Shape {shape_key}"
    
    cleaned = threshold_utils.pre_clean(shape_rows)
    
    box_colors = alt.Color(
        'Shape Array Key',
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    boxes = (
        alt.Chart(cleaned)
        .mark_boxplot(extent='min-max')
        .encode(x='Stop Sequence:O', y='Speed Mph:Q', color=box_colors)
        .properties(title = chart_title)
    )
    
    return threshold_utils.chart_size((boxes), 800,300)

In [None]:
# box_whisker(merge1, shape_array5)

In [None]:
# box_whisker(percentile_df, shape_array5)

In [None]:
box_whisker(merge1, shape_array4)

In [None]:
box_whisker(percentile_df, shape_array4)

In [None]:
def dotplot_trip_time_rt_coverage(df:pd.DataFrame, route:str):
    """
    Jittered dot plot of segment-trip speeds for one shape, with one narrow
    facet column per stop sequence.

    `route` is matched against `shape_array_key`. Each dot is one row of
    `df`: y is `Speed Mph`, x is random jitter used only to spread the dots,
    and the tooltip shows stop sequence, speed and percent.

    NOTE(review): despite the function name, trip duration and RT coverage
    are not plotted; only `Percent` appears, in the tooltip.
    """
    one_shape = df[df.shape_array_key == route]
    # Built from the first row: raises IndexError if `route` has no rows.
    chart_title = f"Speed Distribution for Seq {one_shape.gtfs_dataset_name.iloc[0]}/Shape {one_shape.shape_array_key.iloc[0]}"
    # The title-cased column names used below ('Stop Sequence', 'Speed Mph', 'Percent')
    # presumably come from pre_clean - TODO confirm.
    one_shape = threshold_utils.pre_clean(one_shape)
    
    dot_chart = (
        alt.Chart(one_shape, width=0.5)
        .mark_circle(size=50)
        .encode(
            # x carries no data meaning: axis ticks, grid and labels are hidden.
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("Speed Mph:Q", axis=alt.Axis(labelAngle=90)),
            color=alt.Color(
                "Stop Sequence:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
                legend=None,
            ),
            tooltip=['Stop Sequence','Speed Mph','Percent'],
            # One facet column per stop sequence.
            column=alt.Column(
                "Stop Sequence:N",
                header=alt.Header(
                    labelAngle=45,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title=chart_title)
    )
    
    dot_chart = threshold_utils.chart_size(dot_chart, 25, 150).interactive()
    
    return dot_chart

In [None]:
dotplot_trip_time_rt_coverage(merge1, shape_array1)

In [None]:
dotplot_trip_time_rt_coverage(percentile_df, shape_array1)

In [None]:
stop_seq3

In [None]:
dotplot_trip_time_rt_coverage(merge1, shape_array3)

In [None]:
dotplot_trip_time_rt_coverage(percentile_df, shape_array3)

In [None]:
def preview(df:pd.DataFrame, shape_array_key:str, stop_sequence: int):
    """
    Display the elapsed time / distance, percent of segment covered and
    speed of every row for one shape / stop sequence.

    `stop_sequence` is compared directly with the `stop_sequence` column and
    the notebook calls this with integers, so it is annotated as `int`
    (it was previously annotated as `str`).
    """
    preview_cols = ['stop_sequence','sec_elapsed','meters_elapsed','percent', 'speed_mph']
    is_match = (df.shape_array_key ==shape_array_key) & (df.stop_sequence == stop_sequence)
    
    display(df.loc[is_match, preview_cols])

In [None]:
preview(percentile_df, shape_array5, 14)

In [None]:
preview(merge1, shape_array5, 14)

In [None]:
preview(merge1, shape_array5, 24)

In [None]:
preview(percentile_df, shape_array5, 24)