# Distribution of speeds

Spot-checking Big Blue Bus speeds of existing speed maps and these `p20_mph` speeds.

Average speeds only throws away too-high speeds (above 70 mph), but not anything too low.

It looks like we're keeping way too many observations going into the averages, and we need to be more aggressive in excluding unstable speed calculations (speeds that are derived over a too-short-distance or too-short-time). Too-short-time means our denominator is approaching zero, and the calculations could be wildly unstable as we approach the asymptote. The same is probably happening for too-short-distances. 

In [1]:
import os
os.environ['USE_PYGEOS']='0'

import altair as alt
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd

from shared_utils import rt_dates, rt_utils
from segment_speed_utils.project_vars import SEGMENT_GCS

In [2]:
months = ["mar", "apr", "may", "jun", "jul"]

dates = [
    rt_dates.DATES[f"{m}2023"] for m in months
]
analysis_date = dates[-1]
analysis_date

'2023-07-12'

In [3]:
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"

pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

test_operator_key = pub_df.gtfs_dataset_key.iloc[0]

In [4]:
def import_avg_speeds(date: str, **kwargs) -> gpd.GeoDataFrame: 
    avg_speeds = gpd.read_parquet(
        f"{SEGMENT_GCS}avg_speeds_stop_segments_{date}.parquet", 
        **kwargs
    )
    
    return avg_speeds


def import_trip_speeds(date: str, **kwargs) -> pd.DataFrame:
    trips = dd.read_parquet(
        f"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}",
        **kwargs
    ).compute()
    
    return trips

In [5]:
pub_df.columns

Index(['shape_array_key', 'stop_sequence', 'gtfs_dataset_key', 'stop_id',
       'loop_or_inlining', 'district', 'district_name', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day', 'shape_id', 'base64_url', 'uri',
       'org_id', 'agency'],
      dtype='object')

In [6]:
pub_df[pub_df.shape_id=="26347"].shape_array_key.unique()

array(['de70089f186a809de6685c056377f892'], dtype=object)

In [7]:
pub_df[pub_df.shape_id=="26348"].shape_array_key.unique()

array(['080f585295228f8c8f52cb373b1685cc'], dtype=object)

In [8]:
pub_df[pub_df.shape_id=="26342"].shape_array_key.unique()

array(['5d34851ee46adb62216152f8a16fe7d0'], dtype=object)

In [9]:
test_shape1 = "de70089f186a809de6685c056377f892"
test_shape2 = "080f585295228f8c8f52cb373b1685cc"
test_shape3 = "5d34851ee46adb62216152f8a16fe7d0"

In [10]:
avg_speeds = import_avg_speeds(
    analysis_date, 
    filters = [[("gtfs_dataset_key", "==", test_operator_key)]],
    columns = ["shape_array_key", "stop_sequence",
               "p20_mph", "p50_mph", "p80_mph", 
               "n_trips", "time_of_day",
               "geometry"]
)

In [11]:
trip_speeds = import_trip_speeds(
    analysis_date,
    filters = [[("gtfs_dataset_key", "==", test_operator_key)]],
)

In [12]:
avg_speeds[(avg_speeds.shape_array_key==test_shape1) & 
           (avg_speeds.stop_sequence==29)]

Unnamed: 0,shape_array_key,stop_sequence,p20_mph,p50_mph,p80_mph,n_trips,time_of_day,geometry
201857,de70089f186a809de6685c056377f892,29,1.68,3.77,9.89,3,all_day,"LINESTRING (-118.48871 34.02165, -118.48951 34..."


In [13]:
trip_speeds[(trip_speeds.shape_array_key==test_shape1) & 
            (trip_speeds.stop_sequence==29)].speed_mph.unique()

array([        nan,  3.76987431, 13.96246222,  0.29230316])

In [14]:
avg_speeds[avg_speeds.shape_array_key==test_shape1].explore(
    "p50_mph", 
    tiles = "CartoDB Positron",
    cmap = rt_utils.ZERO_THIRTY_COLORSCALE
)

In [15]:
avg_speeds[avg_speeds.shape_array_key==test_shape3].explore(
    "p50_mph", 
    tiles = "CartoDB Positron",
    cmap = rt_utils.ZERO_THIRTY_COLORSCALE
)

In [16]:
trip_speeds[(trip_speeds.shape_array_key==test_shape3) & 
           (trip_speeds.stop_sequence==6)].speed_mph.unique()

array([        nan, 11.85504701,  8.30953561,  8.11102264,  3.30187334,
       13.55831359,  3.12661394,  0.        , 12.39887132,  9.04728661,
       12.95855883, 11.64274665, 13.80783397,  8.7487871 ,  8.11962361,
        1.13056783,  8.76647662,  1.91638667, 12.15852066,  9.74048341,
        6.32444451, 10.52354078, 12.24562967,  6.69858223,  4.280623  ,
       13.21839885])

In [17]:
one_segment = trip_speeds[
    (trip_speeds.shape_array_key==test_shape3) & 
    (trip_speeds.stop_sequence==6)]

np.sort(one_segment.speed_mph.unique())

array([ 0.        ,  1.13056783,  1.91638667,  3.12661394,  3.30187334,
        4.280623  ,  6.32444451,  6.69858223,  8.11102264,  8.11962361,
        8.30953561,  8.7487871 ,  8.76647662,  9.04728661,  9.74048341,
       10.52354078, 11.64274665, 11.85504701, 12.15852066, 12.24562967,
       12.39887132, 12.95855883, 13.21839885, 13.55831359, 13.80783397,
               nan])

In [18]:
one_segment_filtered = one_segment[(one_segment.sec_elapsed > 0) & 
            (one_segment.meters_elapsed > 0) & 
            (one_segment.speed_mph.notna())
           ]

np.sort(one_segment_filtered.speed_mph.unique())

array([ 1.13056783,  1.91638667,  3.12661394,  3.30187334,  4.280623  ,
        6.32444451,  6.69858223,  8.11102264,  8.11962361,  8.30953561,
        8.7487871 ,  8.76647662,  9.04728661,  9.74048341, 10.52354078,
       11.64274665, 11.85504701, 12.15852066, 12.24562967, 12.39887132,
       12.95855883, 13.21839885, 13.55831359, 13.80783397])

In [19]:
one_segment.meters_elapsed.describe()

count     53.000000
mean     102.879312
std      121.899773
min        0.000000
25%        0.000000
50%        0.000000
75%      254.725516
max      271.589045
Name: meters_elapsed, dtype: float64

In [22]:
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40]
meter_bins = [0, 50, 100, 150, 200, 250, 300]

one_segment['speed_binned'] = pd.cut(
    one_segment.speed_mph, bins).apply(lambda x: x.left)

one_segment["meters_binned"] = pd.cut(
    one_segment.meters_elapsed, meter_bins).apply(lambda x: x.left)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
(alt.Chart(one_segment)
 .mark_tick()
 .encode(
     x="meters_binned:O",
     y="speed_mph:Q"
 ).interactive()
)

In [24]:
one_segment_filtered.sort_values("speed_mph")[
    ["trip_id", "meters_elapsed", "sec_elapsed", "speed_mph"]]

Unnamed: 0,trip_id,meters_elapsed,sec_elapsed,speed_mph
5953,903007,22.237365,44.0,1.130568
5955,902944,268.139932,313.0,1.916387
5936,902985,271.150248,194.0,3.126614
5932,902987,271.589045,184.0,3.301873
5971,903002,86.109984,45.0,4.280623
5963,902978,271.411119,96.0,6.324445
5968,902943,266.505954,89.0,6.698582
5930,902971,177.666566,49.0,8.111023
5952,902997,192.373738,53.0,8.119624
5928,902960,167.156505,45.0,8.309536


In [25]:
speed_distribution = np.sort(one_segment_filtered.speed_mph.unique())

In [26]:
np.quantile(speed_distribution, 0.5)

8.906881612224176

In [27]:
np.quantile(speed_distribution, 0.2)

5.506915906639923

In [28]:
operator_segments = gpd.read_parquet(
    f"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet",
    filters = [[("gtfs_dataset_key", "==", test_operator_key)]]
)

In [30]:
operator_segments = operator_segments.assign(
    segment_meters = operator_segments.geometry.length
)

In [31]:
operator_segments.segment_meters.describe()

count     2211.000000
mean       431.703997
std        635.401191
min          0.000000
25%        265.627183
50%        343.305085
75%        442.045878
max      17762.088740
Name: segment_meters, dtype: float64