In [1]:
import os
# The environment variables are set before the imports below, presumably so
# that libraries reading them at import time pick these values up.
# NOTE(review): CALITP_BQ_MAX_BYTES presumably raises the BigQuery bytes cap
# used by the Cal-ITP tooling -- confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(12_000_000_000_000)
# NOTE(review): USE_PYGEOS='0' presumably tells geopandas to use shapely
# rather than pygeos -- confirm.
os.environ['USE_PYGEOS'] = '0'

import altair as alt
import pandas as pd

# The star import supplies the siuba names used in the cells below:
# `_`, `filter`, `head`, `group_by`, `summarize` and `arrange`.
# Note that this shadows Python's builtin `filter`.
from siuba import *

# NOTE(review): `shared_utils` and `analysis_date` are not referenced in the
# cells visible here -- confirm whether they are still needed.
import shared_utils
from utils import analysis_date

In [2]:
from segment_speed_utils.project_vars import PREDICTIONS_GCS

# About

Trip updates is really big! Start by sampling narrow time windows across all operators. Use 0800-0830, 1230-1300, and 2300-2330 to capture various times of day. Use those time windows to filter `mart_ad_hoc.fct_stop_time_updates_20230315_to_20230321` on `arrival_time_pacific`. Even with narrow window, can't query all operators at once. Chunk out based on scheduled service hours. `sample_query_materialized.py` --> GCS.

Since we're interested in trip starts, query GTFS schedule warehouse for trips starting _within_ each time window. Use those trips to subset trip updates data; the combination of filtering trip updates on arrival time and schedule on trip start time catches relevant updates to trip starts. Filter to first stop sequence (using the stop sequence specified in trip updates for all operators but Caltrain, which doesn't provide one, so use the value from `dim_stop_times` instead).

Summarize by trip/organization/route_type/sample_period:

* `max_advance_min`: maximum minutes before trip start for which an update for this trip was provided
* `updates_per_min`: density of updates in that period, can use as a rough filter for update consistency during that period

`summarize_sampled_updates.py` --> GCS (all chunks from last step combined into single summary df)

# Summary Data Interpretation

In [3]:
# Load the per-trip summary of sampled trip updates from GCS
# (presumably the output of summarize_sampled_updates.py described above).
summary_path = f"{PREDICTIONS_GCS}st_advance_samples_summarized_2023-03-15.parquet"
summarized_df = pd.read_parquet(summary_path)

In [4]:
# Preview the first three rows of the per-trip summary.
summarized_df.head(3)

Unnamed: 0,trip_id,organization_name,route_type,max_advance_min,updates_per_min,sample_period
0,10002011240802-DEC22,Los Angeles County Metropolitan Transportation...,3,44.0,3.0,am
1,10002011240812-DEC22,Los Angeles County Metropolitan Transportation...,3,45.0,2.9,am
2,10002011240822-DEC22,Los Angeles County Metropolitan Transportation...,3,44.8,2.9,am


In [5]:
# (rows, columns) of the per-trip summary as loaded, before any filtering.
summarized_df.shape

(3629, 6)

## Filtering

* drop top 5% of far-advance updating trips (outliers, make charting easier). We're unlikely to set our standard this high.
* also drop trips with less than 2 updates per min (consistent with other metrics, also avoid making judgements based on potentially poor quality data)
* _2961_ trips across all sample periods after filter

In [6]:
# Thresholds for the two filters described above.
MIN_UPDATES_PER_MIN = 2      # keep trips with at least this update density
MAX_ADVANCE_QUANTILE = .95   # drop trips at or above this quantile of lead time

# The filters are applied one after another on purpose: the quantile cutoff is
# computed on the rows that already passed the update-density filter.
has_dense_updates = _.updates_per_min >= MIN_UPDATES_PER_MIN
below_outlier_cutoff = (
    _.max_advance_min < _.max_advance_min.quantile(MAX_ADVANCE_QUANTILE)
)

# NOTE: this reassigns `summarized_df`, so re-running the cell without first
# reloading the data trims the already-trimmed frame a second time.
summarized_df = (
    summarized_df
    >> filter(has_dense_updates)
    >> filter(below_outlier_cutoff)
)

In [7]:
# (rows, columns) remaining after the update-density and outlier filters.
summarized_df.shape

(2961, 6)

## Summary stats by trip

* `updates_per_min` can go to inf if the first update is provided within 6 seconds of trip start (since `max_advance_min` is rounded to one decimal place)!
* OK for now since we're filtering >= 2. Also, if the first update is provided within 6 seconds of trip start, update density is not the problem!
* Median trip starts providing updates _59 minutes_ in advance

In [8]:
# Summary statistics for the numeric per-trip columns. `updates_per_min` can
# contain inf, which is why its mean shows as inf and its std as NaN.
summarized_df.describe()

Unnamed: 0,max_advance_min,updates_per_min
count,2961.0,2961.0
mean,62.877271,inf
std,31.852706,
min,0.0,2.0
25%,44.7,2.9
50%,59.0,3.0
75%,69.8,3.1
max,159.2,inf


## Charts by trip

In [9]:
# Histogram of each trip's maximum update lead time (binned), stacked and
# coloured by organization; the tooltip lists every column of the summary.
(
    alt.Chart(summarized_df)
    .mark_bar()
    .encode(
        x=alt.X('max_advance_min', bin=True),
        y=alt.Y('count()'),
        color=alt.Color('organization_name'),
        tooltip=summarized_df.columns.to_list(),
    )
    .properties(width=800, height=500)
    .interactive()
)

Organizations tend to have fairly consistent practices here, when looking at 20 minute bins

In [10]:
# Same binned histogram of per-trip lead time, coloured by sample period
# instead of organization.
trips_by_period_hist = alt.Chart(summarized_df).mark_bar().encode(
    x=alt.X('max_advance_min', bin=True),
    y=alt.Y('count()'),
    color=alt.Color('sample_period'),
    tooltip=summarized_df.columns.to_list(),
)
trips_by_period_hist.properties(width=800, height=500).interactive()

Similar distributions across our AM Peak, Midday, and Late Evening sample periods

In [11]:
# Scatter of each trip's maximum update lead time per organization, coloured
# by GTFS route type; the tooltip lists every column of the summary.
# `route_type` is a categorical code, so it is declared nominal (`:N`).
# If the column is stored as a number, Altair would otherwise infer a
# quantitative type and draw a continuous gradient instead of one colour per
# route type. If it is already a string, the annotation changes nothing.
alt.Chart(summarized_df).mark_point().encode(
    x='organization_name',
    y='max_advance_min',
    color='route_type:N',
    tooltip=summarized_df.columns.to_list()
).properties(width=800, height=500).interactive()

No obvious pattern by route type, for example Muni has rail/bus/cable car fairly mixed

## Summary stats by organization

* take the median time in advance provided by each organization (split by route type and sample period) and look at that
* the median _organization_ provides updates _61 minutes_ in advance for their _median trip_

In [12]:
# Collapse trips to one row per organization / route type / sample period,
# keeping the median lead time and the median update density of each group.
trips_by_org_route_period = summarized_df >> group_by(
    _.organization_name, _.route_type, _.sample_period
)
org_summarized = trips_by_org_route_period >> summarize(
    median_max_advance=_.max_advance_min.median(),
    median_updates_per_min=_.updates_per_min.median(),
)

In [13]:
# Distribution of the group medians, where each row is one
# organization / route type / sample period combination.
org_summarized.describe()

Unnamed: 0,median_max_advance,median_updates_per_min
count,137.0,137.0
mean,69.860949,4.070803
std,34.226148,5.065413
min,0.6,2.0
25%,44.8,2.9
50%,61.25,3.0
75%,89.8,3.1
max,155.8,37.6


## Charts by organization

In [14]:
# Histogram of the group-level median lead time (binned), coloured by
# organization; the tooltip lists every column of the grouped summary.
(
    alt.Chart(org_summarized)
    .mark_bar()
    .encode(
        x=alt.X('median_max_advance', bin=True),
        y=alt.Y('count()'),
        color=alt.Color('organization_name'),
        tooltip=org_summarized.columns.to_list(),
    )
    .properties(width=800, height=500)
    .interactive()
)

In [15]:
# Histogram of the group-level median lead time (binned), coloured by GTFS
# route type; the tooltip lists every column of the grouped summary.
# `route_type` is a categorical code, so it is declared nominal (`:N`).
# If the column is stored as a number, Altair would otherwise infer a
# quantitative type and draw a continuous gradient instead of one colour per
# route type. If it is already a string, the annotation changes nothing.
alt.Chart(org_summarized).mark_bar().encode(
    alt.X('median_max_advance', bin=True),
    alt.Y('count()'),
    alt.Color('route_type:N'),
    tooltip = org_summarized.columns.to_list()
).properties(width=800, height=500).interactive()

In [16]:
# Histogram of the group-level median lead time (binned), coloured by sample
# period; the tooltip lists every column of the grouped summary.
orgs_by_period_hist = alt.Chart(org_summarized).mark_bar().encode(
    x=alt.X('median_max_advance', bin=True),
    y=alt.Y('count()'),
    color=alt.Color('sample_period'),
    tooltip=org_summarized.columns.to_list(),
)
orgs_by_period_hist.properties(width=800, height=500).interactive()

Similar distributions across our AM Peak, Midday, and Late Evening sample periods

# Potential Standards

As an exercise, evaluate 60 minute and 40 minute cutoffs for evaluating pre-trip updates, corresponding to the 50%ile and 25%ile respectively. (These percentiles are similar whether computed over all trips or over organization-median trips.)

## 60 minute standard

* Approximate median. Good news: half of trips already exceed. Bad news: half of trips don't yet reach.
* Large number of organizations falling below, 29 including some big ones

In [17]:
# Pool all sample periods: one row per organization / route type, with the
# median lead time and the median update density across that group's trips.
trips_by_org_route = summarized_df >> group_by(_.organization_name, _.route_type)
orgs_without_sample_period = trips_by_org_route >> summarize(
    median_max_advance=_.max_advance_min.median(),
    median_updates_per_min=_.updates_per_min.median(),
)

In [18]:
# Organization / route type groups whose median lead time is under the
# 60-minute candidate standard.
median_under_60 = _.median_max_advance < 60
orgs_median_below_60 = orgs_without_sample_period >> filter(median_under_60)

In [19]:
# Show the groups below the 60-minute cutoff, largest median lead time first.
orgs_median_below_60 >> arrange(-_.median_max_advance)

Unnamed: 0,organization_name,route_type,median_max_advance,median_updates_per_min
22,City of Turlock,3,59.8,3.1
49,San Joaquin Regional Transit District,3,59.8,3.0
19,City of Santa Monica,3,59.55,3.0
51,Santa Barbara Metropolitan Transit District,3,59.4,3.0
0,Alameda-Contra Costa Transit District,3,59.3,3.0
46,Riverside Transit Agency,3,59.1,3.05
21,City of Torrance,3,58.8,3.2
3,Central Contra Costa Transit Authority,3,58.7,3.0
9,City of Culver City,3,48.25,3.0
23,City of Visalia,3,47.9,2.7


In [20]:
# (rows, columns): number of organization / route type groups below 60 minutes.
orgs_median_below_60.shape

(29, 4)

## 40 minute standard

* At under 25%ile, many more trips and organizations already reaching
* Only 10 organizations not yet attaining for median trip

In [21]:
# Organization / route type groups whose median lead time is under the
# 40-minute candidate standard.
median_under_40 = _.median_max_advance < 40
orgs_median_below_40 = orgs_without_sample_period >> filter(median_under_40)

In [22]:
# Show the groups below the 40-minute cutoff, largest median lead time first.
orgs_median_below_40 >> arrange(-_.median_max_advance)

Unnamed: 0,organization_name,route_type,median_max_advance,median_updates_per_min
48,San Francisco Bay Area Water Emergency Transit...,4,39.2,2.0
25,Cloverdale Transit,3,32.4,3.7
28,Emeryville Transportation Management Agency,3,32.3,2.3
14,City of Los Angeles,3,30.85,2.8
54,Santa Cruz Metropolitan Transit District,3,27.4,3.0
38,Monterey-Salinas Transit,3,25.1,3.3
62,"University of California, Los Angeles",3,11.05,17.75
50,San Mateo County Transit District,3,5.6,3.2
18,City of Santa Maria,3,3.95,35.3
32,Long Beach Transit,3,0.85,24.1


## "Think Fast" Organizations

* San Mateo County Transit District (median trip 5.6 minutes)
* City of Santa Maria (median trip 3.95 minutes)
* Long Beach Transit (median trip _0.85 minutes_)

These organizations provide updates essentially only as the trip is starting! Their trip updates aren't really useful to riders intending to board at or near the start of a trip. In other words, there's no way to know if the trip will be late to get going... until it gets going.

Reach out to understand ops better/assist as we roll out metrics?