## Transit Bunching
* I tried turning `stop_times` to actual dates but it seems like seconds is easier to manipulate.
* 10_transit_bunching.ipynb contains timestamps attempts
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
import merge_data

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
may_date = "2024-05-22"

In [5]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get high frequency routes
* Group by mean frequency minutes for the operator-route-direction grain.
* Use mean?

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [8]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [9]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [10]:
route_dir.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency,frequency_in_minutes
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,0.92,65.22


In [11]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [12]:
high_frequency_routes = (
    route_dir.groupby(route_freq_groupby)
    .agg({"frequency_in_minutes": "mean"})
    .reset_index()
)

#### Grab routes in the 5th percentile of frequency for now.

In [13]:
high_frequency_routes["frequency_in_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3417.00
mean     234.64
std      312.42
min        4.00
5%        17.65
10%       23.40
50%       97.71
90%      750.00
95%     1000.00
max     1250.00
Name: frequency_in_minutes, dtype: float64

In [14]:
high_frequency_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3417 entries, 0 to 3416
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  3417 non-null   object 
 1   route_id                   3417 non-null   object 
 2   direction_id               3417 non-null   float64
 3   route_primary_direction    3417 non-null   object 
 4   frequency_in_minutes       3417 non-null   float64
dtypes: float64(2), object(3)
memory usage: 133.6+ KB


In [15]:
high_frequency_routes2 = high_frequency_routes.loc[
    high_frequency_routes.frequency_in_minutes <= 17.65
]

In [16]:
high_frequency_routes2.route_id.nunique()

93

### Get trips of high frequency routes

In [17]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [18]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [19]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
]

In [20]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [21]:
trips.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id
0,1770249a5a2e770ca90628434d4934b1,3408,c256553e28c4bba693e3136240b35419,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0
1,1770249a5a2e770ca90628434d4934b1,3408,488e9e227288606249d0508961c0fa15,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0


In [22]:
high_frequency_routes2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,frequency_in_minutes
34,0666caf3ec1ecc96b74f4477ee4bc939,105-13172,0.0,Westbound,14.59
35,0666caf3ec1ecc96b74f4477ee4bc939,105-13172,1.0,Eastbound,14.43


In [23]:
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [25]:
trips_freq_routes.shape

(20090, 9)

In [26]:
trips_freq_routes.trip_instance_key.nunique()

20090

In [27]:
trips.shape

(96398, 7)

### `rt_stop_times2`: Get Stop Times of High Frequency Routes/Trips
* What's the difference btwn `trip_id` and `trip_instance_key`?

In [28]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [29]:
rt_stop_times.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434
1,1d105244-776c-4b3f-af78-9c7ad78c2103,cd5650b0-9a18-4e78-aedc-385f3094fa0f,9,61179.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61616


In [30]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,cd1d4fc457d3a3fff6e77e47336bbc98,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67
1,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,180a069ab3aefcf8f3317a788b32c288,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67


In [31]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [32]:
len(rt_stop_times) - len(rt_stop_times2)

1903905

In [33]:
rt_stop_times2.shape

(697357, 14)

In [34]:
rt_stop_times2.trip_id.nunique(), rt_stop_times2.trip_instance_key.nunique()

(17213, 17213)

###  `rt_stop_times3`: Some scheduled arrival seconds span longer than a day: filter them out
* There are 86,400 seconds in a day

In [35]:
rt_stop_times2.scheduled_arrival_sec.describe()

count   697357.00
mean     50526.22
std      19329.72
min       9420.00
25%      34320.00
50%      49740.00
75%      64380.00
max     108431.00
Name: scheduled_arrival_sec, dtype: float64

In [37]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec < 86400
].reset_index(drop=True)

In [39]:
len(rt_stop_times2) - len(rt_stop_times3)

27611

In [43]:
len(rt_stop_times) - len(rt_stop_times3)

1931516

In [40]:
rt_stop_times3.scheduled_arrival_sec.describe()

count   669746.00
mean     48760.53
std      17580.67
min       9420.00
25%      33720.00
50%      48540.00
75%      62640.00
max      86399.00
Name: scheduled_arrival_sec, dtype: float64

In [41]:
rt_stop_times3.rt_arrival_sec.describe()

count   669746.00
mean     48828.92
std      17647.56
min          0.00
25%      33814.00
50%      48581.00
75%      62840.00
max      86399.00
Name: rt_arrival_sec, dtype: float64

### `rt_stop_times4`: Sort so `stop sequence` for the `operator-stop_id-route-id_direction_id` will be in order.
* Comparing bunching by STOP, so we have to look at the `stop sequence-stop_id.`

In [44]:
rt_stop_times3.head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,922552,258,2,61249.0,efbbd5293be71f7a5de0cf82b59febe1,50617e0d3c1bbedd9803836728767a69,61995,3730,e10d20177f6b29f7d2de52645301f18f,0e75eaae4dc791180f05782fa8825254,Main St & Santa Monica Blvd/UCLA,1.0,Southbound,16.08


In [45]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "shape_array_key",
        "direction_id",
        "stop_sequence",
        "scheduled_arrival_sec",
    ]
).reset_index(drop=True)

In [46]:
fillmore_stop_seq_13 = rt_stop_times4.loc[
    (rt_stop_times4.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times4.stop_id == "13086")
    & (rt_stop_times4.direction_id == 0)
]

### Calculate the difference btwn actual vs scheduled arrival.

In [49]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes): {min_delay_min}, {max_delay_min}")
    print(f"5th / 95th delay (minutes): {p5_delay_min}, {p95_delay_min}")

    return df

In [50]:
rt_stop_times4 = check_delay(rt_stop_times4)

count   669746.00
mean        68.40
std       2841.12
min     -86381.00
5%        -167.00
10%       -107.00
50%         89.00
90%        515.00
95%        719.00
max      57878.00
Name: delay, dtype: float64
min / max delay (minutes): -1439.6833333333334, 964.6333333333333
5th / 95th delay (minutes): -2.783333333333333, 11.983333333333333


#### `rt_stop_times5`: Filter out values in `delay` that are in the 1 hour zone
* Actual times should not exceed more than an hour or less than hour.

In [51]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"] <= 60 * 60].reset_index(
    drop=True
)

In [52]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [53]:
len(rt_stop_times4) - len(rt_stop_times5)

1100

In [54]:
len(rt_stop_times) - len(rt_stop_times5)

1932616

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain
* Do I need to include feed key and shape array key?

In [60]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    
    "route_id",
    "shape_array_key",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [57]:
# Subtract rt_arrival_sec from the previous row to the target row
# using groupby columns
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

### Calculate scheduled headway
* Using the same grain.

In [58]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [61]:
rt_stop_times5.loc[rt_stop_times5.actual_headway.isna()].head(10).drop(
    columns=drop_for_preview
)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
0,14261,2,14400.0,14421,105-13172,Metro Local Line,1.0,Eastbound,14.43,21.0,,
10,14289,3,14460.0,14455,105-13172,Metro Local Line,1.0,Eastbound,14.43,-5.0,,
20,10997,4,14520.0,14492,105-13172,Metro Local Line,1.0,Eastbound,14.43,-28.0,,
30,10998,5,14580.0,14531,105-13172,Metro Local Line,1.0,Eastbound,14.43,-49.0,,
40,12150,6,14640.0,14581,105-13172,Metro Local Line,1.0,Eastbound,14.43,-59.0,,
50,12151,7,14700.0,14606,105-13172,Metro Local Line,1.0,Eastbound,14.43,-94.0,,
60,12148,8,14760.0,14650,105-13172,Metro Local Line,1.0,Eastbound,14.43,-110.0,,
70,12149,9,14760.0,14666,105-13172,Metro Local Line,1.0,Eastbound,14.43,-94.0,,
80,12146,10,14820.0,14699,105-13172,Metro Local Line,1.0,Eastbound,14.43,-121.0,,
90,70500002,11,14880.0,14742,105-13172,Metro Local Line,1.0,Eastbound,14.43,-138.0,,


### Delete out rows that are `nan`??
* I am not sure if `nans` impact calculations of the mean scheduled headway and whatnot?
* These `nans` are becuase the first `operator-route-stop_id-stop_sequence` combo won't have anything to compare it to.

In [64]:
rt_stop_times5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668646 entries, 0 to 668645
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    668646 non-null  object 
 1   stop_id                    668646 non-null  object 
 2   stop_sequence              668646 non-null  int64  
 3   scheduled_arrival_sec      668646 non-null  float64
 4   schedule_gtfs_dataset_key  668646 non-null  object 
 5   trip_instance_key          668646 non-null  object 
 6   rt_arrival_sec             668646 non-null  int64  
 7   route_id                   668646 non-null  object 
 8   shape_array_key            668646 non-null  object 
 9   feed_key                   668646 non-null  object 
 10  route_long_name            668646 non-null  object 
 11  direction_id               668646 non-null  float64
 12  route_primary_direction    668646 non-null  object 
 13  frequency_in_minutes       66

### `rt_stop_times6`: Delete out the rows in which `actual_headway` and `schd_headway` are `nan`: this is basically the first row of each grain

In [65]:
rt_stop_times6 = rt_stop_times5.loc[~rt_stop_times5.actual_headway.isna()]

In [66]:
rt_stop_times6 = rt_stop_times6.loc[~rt_stop_times5.schd_headway.isna()].reset_index(
    drop=True
)

In [67]:
len(rt_stop_times5) - len(rt_stop_times6)

21439

In [70]:
len(rt_stop_times) - len(rt_stop_times6)

1954055

In [68]:
rt_stop_times6.shape

(647207, 17)

In [69]:
rt_stop_times.shape

(2601262, 7)

### Use Coefficient
* Find the mean scheduled headway for the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [71]:
agg1 = (
    rt_stop_times6.groupby(groupby_cols)
    .agg({"schd_headway": "mean"})
    .reset_index()
    .rename(columns={"schd_headway": "avg_schd_headway_sec"})
)

In [72]:
agg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   route_id                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   direction_id               19260 non-null  float64
 4   route_primary_direction    19260 non-null  object 
 5   stop_sequence              19260 non-null  int64  
 6   stop_id                    19260 non-null  object 
 7   avg_schd_headway_sec       19260 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 1.2+ MB


#### Merge 

In [73]:
m1 = pd.merge(
    rt_stop_times6,
    agg1,
    on=groupby_cols,
)

In [74]:
# Make sure the lengths make sense
len(rt_stop_times6) - len(m1)

0

#### Find standard deviation: how far the actual headway is from the mean scheduled headway for the same grain above.


In [75]:
# Takes 1 minute
std_dev = (
    m1.groupby(groupby_cols)
    .apply(lambda x: (x["actual_headway"] - x["avg_schd_headway_sec"]).std())
    .reset_index(name="std_dev_headway")
)

##### Investigate missing rows
* This one seems to have some very unrealistic time stamps, like the time between scheduled versus actual arrival times are more than an hour.

In [76]:
std_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   route_id                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   direction_id               19260 non-null  float64
 4   route_primary_direction    19260 non-null  object 
 5   stop_sequence              19260 non-null  int64  
 6   stop_id                    19260 non-null  object 
 7   std_dev_headway            17781 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 1.2+ MB


In [81]:
m1.loc[
    (m1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (m1.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (m1.route_id == "217-13172")
    & (m1.direction_id == 1)
    & (m1.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
157700,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0,1800.0


#### `scheduled_arrival_sec` is 86,220 but `rt_arrival_sec` is 303.

In [84]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times2.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times2.route_id == "217-13172")
    & (rt_stop_times2.direction_id == 1)
    & (rt_stop_times2.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
413908,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
419236,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
425286,10217003302432-DEC23,15434,3,88500.0,0666caf3ec1ecc96b74f4477ee4bc939,6699f5297ef2d670988b29937f33b56e,2031,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
479447,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92


In [85]:
rt_stop_times3.loc[
    (rt_stop_times3.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times3.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times3.route_id == "217-13172")
    & (rt_stop_times3.direction_id == 1)
    & (rt_stop_times3.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
413484,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
417843,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
462159,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92


In [86]:
rt_stop_times4.loc[
    (rt_stop_times4.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times4.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times4.route_id == "217-13172")
    & (rt_stop_times4.direction_id == 1)
    & (rt_stop_times4.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
164067,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,30.0
164068,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0
164069,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,-85917.0


In [87]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times5.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times5.route_id == "217-13172")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
163937,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,30.0,,
163938,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0


In [88]:
rt_stop_times6.loc[
    (rt_stop_times6.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times6.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times6.route_id == "217-13172")
    & (rt_stop_times6.direction_id == 1)
    & (rt_stop_times6.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
157700,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0


#### Missing rows Case 2
* There are only 2 rows, I guess there needs to be at least 3 rows to calculate the standard deviation since the first row of a combo won't have anything.

In [89]:
m1.loc[
    (m1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (m1.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (m1.route_id == "234-13172")
    & (m1.direction_id == 1)
    & (m1.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
179345,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0,1841.0,1800.0,1800.0


In [90]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times2.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times2.route_id == "234-13172")
    & (rt_stop_times2.direction_id == 1)
    & (rt_stop_times2.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
256950,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42
482097,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42


In [91]:
rt_stop_times3.loc[
    (rt_stop_times3.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times3.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times3.route_id == "234-13172")
    & (rt_stop_times3.direction_id == 1)
    & (rt_stop_times3.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
256533,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42
462564,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42


In [92]:
rt_stop_times4.loc[
    (rt_stop_times4.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times4.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times4.route_id == "234-13172")
    & (rt_stop_times4.direction_id == 1)
    & (rt_stop_times4.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
186728,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,-3.0
186729,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0


In [93]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times5.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times5.route_id == "234-13172")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
186588,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,-3.0,,
186589,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0,1841.0,1800.0


#### Merge

In [95]:
std_dev.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,shape_array_key,direction_id,route_primary_direction,stop_sequence,stop_id,std_dev_headway
0,0666caf3ec1ecc96b74f4477ee4bc939,105-13172,46445899dfaef107a695f7a6af906c18,1.0,Eastbound,2,14261,9784.55


In [96]:
std_dev2 = std_dev.loc[~std_dev.std_dev_headway.isna()].reset_index(drop = True)

In [97]:
m2 = pd.merge(
    m1,
    std_dev2,
    on=groupby_cols,
)

### Bunching coefficient is for the entire grain, rather than each row

In [98]:
m2["bunching_coefficient"] = m2.std_dev_headway / m2.avg_schd_headway_sec

In [99]:
m2.bunching_coefficient.describe()

count   645728.00
mean         0.72
std          0.43
min          0.00
25%          0.47
50%          0.63
75%          0.79
max          5.28
Name: bunching_coefficient, dtype: float64

In [100]:
m2["avg_schd_headway_min"] = m2.avg_schd_headway_sec / 60

In [101]:
m2["actual_headway_min"] = m2.actual_headway / 60

In [102]:
m2["sched_headway_min"] = m2.schd_headway / 60

### Retain only one row for the grain

In [104]:
bunching_by_stops = (
    m2.drop_duplicates(subset=groupby_cols)
    .reset_index(drop=True)
    .drop(columns=["actual_headway", "actual_headway_min", "sched_headway_min"])
)

In [106]:
len(bunching_by_stops)

17781

In [107]:
bunching_by_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17781 entries, 0 to 17780
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    17781 non-null  object 
 1   stop_id                    17781 non-null  object 
 2   stop_sequence              17781 non-null  int64  
 3   scheduled_arrival_sec      17781 non-null  float64
 4   schedule_gtfs_dataset_key  17781 non-null  object 
 5   trip_instance_key          17781 non-null  object 
 6   rt_arrival_sec             17781 non-null  int64  
 7   route_id                   17781 non-null  object 
 8   shape_array_key            17781 non-null  object 
 9   feed_key                   17781 non-null  object 
 10  route_long_name            17781 non-null  object 
 11  direction_id               17781 non-null  float64
 12  route_primary_direction    17781 non-null  object 
 13  frequency_in_minutes       17781 non-null  flo

### Bunching Coefficient

In [108]:
def coefficient_frequency(row):
    if row.bunching_coefficient <= 0.21:
        return "Service provided like clockwork"
    elif 0.21 < row.bunching_coefficient <= 0.3:
        return "Vehicles slightly off headway"
    elif 0.3 < row.bunching_coefficient <= 0.39:
        return "Vehicles often off headway"
    elif 0.39 < row.bunching_coefficient <= 0.52:
        return "Irregular headways, with some bunching"
    elif 0.52 < row.bunching_coefficient <= 0.74:
        return "Frequent bunching"
    else:
        return "Most vehicles bunched"


bunching_by_stops["passenger_op_perspective"] = bunching_by_stops.apply(
    lambda x: coefficient_frequency(x), axis=1
)

In [109]:
bunching_by_stops.passenger_op_perspective.value_counts() / len(bunching_by_stops)

Most vehicles bunched                    0.35
Frequent bunching                        0.25
Irregular headways, with some bunching   0.18
Vehicles often off headway               0.09
Service provided like clockwork          0.09
Vehicles slightly off headway            0.04
Name: passenger_op_perspective, dtype: float64

### Missing Routes
* These routes and trips weren't found in the `stop_times` dataset.

In [110]:
high_freq_routes = set(high_frequency_routes2.route_id.unique().tolist())
remaining_routes = set(bunching_by_stops.route_id.unique().tolist())
high_freq_routes - remaining_routes

{'Beige-N',
 'Beige-S',
 'Blue Line',
 'Lot D',
 'Red Line',
 'West Field Garage',
 'eastvalley',
 'mposa'}

In [111]:
high_frequency_routes2.loc[
    high_frequency_routes2.route_id == "West Field Garage"
].head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,frequency_in_minutes
597,2ff70dd1151d7532db40436f8228cd33,West Field Garage,0.0,Eastbound,10.0


#### Check out one route

In [112]:
rt_stop_times2.loc[rt_stop_times2.route_id == "West Field Garage"].head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [113]:
rt_stop_times2.loc[rt_stop_times2.route_id == "eastvalley"].head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [114]:
trips_freq_routes.loc[trips_freq_routes.route_id == "eastvalley"].head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
19915,31152914d10e2d0977b8b2fabb167922,eastvalley,77a45a5bbd6e7b87ef670d90e16fdbc3,aebd18679bd2170ee61d7522bcfc11ab,7900b8b29688af30a699822f84ae2753,East Valley Shuttle,1.0,Eastbound,16.05


In [116]:
rt_stop_times.loc[
    (rt_stop_times.trip_instance_key == "77a45a5bbd6e7b87ef670d90e16fdbc3")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec


In [115]:
rt_stop_times2.loc[
    (rt_stop_times2.trip_instance_key == "77a45a5bbd6e7b87ef670d90e16fdbc3")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


#### Second Route

In [117]:
trips_freq_routes.loc[trips_freq_routes.route_id == "Beige-S"].sample(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
2415,8a1405af8da1379acc062e346187ac98,Beige-S,bd9b3aed0f38a75885179e6eaeba1f4a,93312729ad2928439c75cbbd97191b66,3e22f1090d0d12096ee943c621298225,Coliseum to Oakland Airport,1.0,Southbound,7.9
2297,8a1405af8da1379acc062e346187ac98,Beige-S,e958e7d45ee5388470df5b80d0291cba,93312729ad2928439c75cbbd97191b66,3e22f1090d0d12096ee943c621298225,Coliseum to Oakland Airport,1.0,Southbound,7.9


In [118]:
rt_stop_times2.loc[
    (rt_stop_times2.trip_instance_key == "96e2844c6ce2bcc5ede547c32c220efb")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [119]:
rt_stop_times2.loc[
    (rt_stop_times2.trip_instance_key == "4f9e575f015b6bf05e69af9da50d9579")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


### SF Muni Fillmore Test

In [120]:
more_values = [
    "scheduled_arrival_sec",
    "std_dev_headway",
    "avg_schd_headway_sec",
    "schd_headway",
    "rt_arrival_sec",
]

In [121]:
drop_for_preview = drop_for_preview + more_values

In [122]:
fillmore = bunching_by_stops.loc[bunching_by_stops.route_long_name == "FILLMORE"]

In [124]:
fillmore.passenger_op_perspective.value_counts()/len(fillmore)

Most vehicles bunched                    0.82
Frequent bunching                        0.16
Irregular headways, with some bunching   0.01
Vehicles slightly off headway            0.01
Service provided like clockwork          0.01
Name: passenger_op_perspective, dtype: float64

In [133]:
fillmore.tail(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,passenger_op_perspective
13583,11490047_M31,17768,6,67722.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,67988,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,266.0,25969.0,10029.5,11270.5,1.12,167.16,Most vehicles bunched
13584,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,25957.0,10023.5,11333.78,1.13,167.06,Most vehicles bunched


In [134]:
m2.loc[
    (m2.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (m2.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (m2.route_id == "22")
    & (m2.direction_id == 1)
    & (m2.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,actual_headway_min,sched_headway_min
443917,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,26178.0,25957.0,10023.5,11333.78,1.13,167.06,436.3,432.62
443918,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0,3602.0,4320.0,10023.5,11333.78,1.13,167.06,60.03,72.0
443919,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0,1001.0,1800.0,10023.5,11333.78,1.13,167.06,16.68,30.0
443920,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0,8659.0,8017.0,10023.5,11333.78,1.13,167.06,144.32,133.62


In [131]:
fillmore.loc[
    (fillmore.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (fillmore.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (fillmore.direction_id == 0)
    & (fillmore.stop_id == "14609")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,passenger_op_perspective
13439,11489815_M31,14609,7,69120.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69067,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-53.0,360.0,1356.0,987.75,0.73,22.6,Frequent bunching


In [132]:
m2.loc[
    (m2.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (m2.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (m2.route_id == "22")
    & (m2.direction_id == 0)
    & (m2.stop_id == "14609")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,actual_headway_min,sched_headway_min
434653,11489815_M31,14609,7,69120.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69067,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-53.0,384.0,360.0,1356.0,987.75,0.73,22.6,6.4,6.0
434654,11489816_M31,14609,7,69660.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,69723,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,63.0,656.0,540.0,1356.0,987.75,0.73,22.6,10.93,9.0
434655,11489817_M31,14609,7,72660.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,72638,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-22.0,2915.0,3000.0,1356.0,987.75,0.73,22.6,48.58,50.0
434656,11489861_M31,14609,7,74100.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,74147,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,47.0,1509.0,1440.0,1356.0,987.75,0.73,22.6,25.15,24.0
434657,11489818_M31,14609,7,75540.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75647,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,107.0,1500.0,1440.0,1356.0,987.75,0.73,22.6,25.0,24.0


### [Transit Matters](https://transitmatters.org/blog/reveal-mbtas-slowest-most-bunched-bus)
* The following charts show bunching events as a pecentage of total trips. Here,
bunching is defined as headways < 25% of the scheduled_headway.
* Grain: operator-route_id-stop_id. 

In [None]:
# Using rt_stop_times5 since this already has the actual and scheduled headways

In [135]:
len(rt_stop_times5)

668646

In [136]:
transit_matters_df1 = rt_stop_times5.copy()

In [137]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_headway / transit_matters_df1.schd_headway
)

In [138]:
import numpy as np

transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

#### There are some very extreme values: how to deal with this?


In [139]:
transit_matters_df1.pct_actual_schd_headway.describe()

count   647207.00
mean         1.00
std          0.52
min         -9.95
25%          0.81
50%          1.00
75%          1.19
max         12.38
Name: pct_actual_schd_headway, dtype: float64

In [140]:
transit_matters_df1.bunched_y_n.value_counts(dropna=True)

not bunched    636013
bunched         32633
Name: bunched_y_n, dtype: int64

#### Groupby grain and see how many trips for that grain are considered "bunched" or not.

In [141]:
transit_matters_df2 = (
    transit_matters_df1.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [142]:
#Filter out only rows that are bunched.
bunched_only = transit_matters_df2.loc[
    transit_matters_df2.bunched_y_n == "bunched"
].reset_index(drop=True)

In [143]:
transit_matters_agg = [
    "schedule_gtfs_dataset_key",
    "route_long_name",
    "shape_array_key",
    "route_id",
    "stop_id",
    "direction_id",
    "route_primary_direction",
]

In [144]:
# Aggregate all trips on the grain
transit_matters_all_trips = (
    transit_matters_df1.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
    .rename(columns={"trip_instance_key": "all_trips"})
)

In [145]:
# Merge back, using left merge to keep bunching
bunched_only = pd.merge(
    bunched_only, transit_matters_all_trips, on=transit_matters_agg, how="left"
)

In [146]:
bunched_only["pct_trips_bunched"] = (
    bunched_only.trip_instance_key / bunched_only.all_trips * 100
)

In [147]:
bunched_only = bunched_only.drop(columns=["all_trips"])

In [148]:
# Merge back all rows that don't have bunching trips.
transit_matters_m1 = pd.merge(
    transit_matters_all_trips,
    bunched_only,
    on=transit_matters_agg,
    how="left",
)

In [149]:
transit_matters_m1.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_y_n,trip_instance_key,pct_trips_bunched
0,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15313,0.0,Eastbound,87,bunched,3.0,3.45
1,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15416,0.0,Eastbound,81,bunched,5.0,6.17


In [154]:
transit_matters_m1 = transit_matters_m1.drop(
    columns=["bunched_y_n", "trip_instance_key"]
)

In [150]:
transit_matters_m1.pct_trips_bunched = transit_matters_m1.pct_trips_bunched.fillna(0)

In [155]:
transit_matters_m1.pct_trips_bunched.describe()

count   21130.00
mean        2.51
std         4.98
min         0.00
25%         0.00
50%         0.00
75%         3.45
max        50.00
Name: pct_trips_bunched, dtype: float64

In [156]:
transit_matters_m1.loc[transit_matters_m1.pct_trips_bunched >= 10].shape

(1673, 9)

### Fillmore Test
* Compare coefficient results w/ this transit matters one for one stop-direction.

In [157]:
fillmore.loc[
    (fillmore.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (fillmore.stop_id == "13086")
    & (fillmore.direction_id == 0)
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,passenger_op_perspective
13445,11489975_M31,13086,13,69106.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69048,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-58.0,1907.0,1445.5,960.66,0.66,24.09,Frequent bunching


In [158]:
transit_matters_fillmore_test = transit_matters_df1.loc[
    (transit_matters_df1.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (transit_matters_df1.stop_id == "13086")
    & (transit_matters_df1.direction_id == 0)
]

In [159]:
# Manually calculate
transit_matters_fillmore_test["actual_headway_min"] = (
    transit_matters_fillmore_test.actual_headway / 60
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transit_matters_fillmore_test["actual_headway_min"] = (


In [160]:
transit_matters_fillmore_test["schd_headway_min"] = (
    transit_matters_fillmore_test.schd_headway / 60
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transit_matters_fillmore_test["schd_headway_min"] = (


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

In [161]:
two_minutess_df = rt_stop_times6.copy()

In [162]:
two_minutess_df["actual_headway_min"] = two_minutess_df.actual_headway / 60

In [163]:
two_minutess_df["bunched_y_n"] = np.where(
    two_minutess_df["actual_headway_min"] <= 2, "bunched", "not bunched"
)

In [164]:
two_minutess_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647207 entries, 0 to 647206
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    647207 non-null  object 
 1   stop_id                    647207 non-null  object 
 2   stop_sequence              647207 non-null  int64  
 3   scheduled_arrival_sec      647207 non-null  float64
 4   schedule_gtfs_dataset_key  647207 non-null  object 
 5   trip_instance_key          647207 non-null  object 
 6   rt_arrival_sec             647207 non-null  int64  
 7   route_id                   647207 non-null  object 
 8   shape_array_key            647207 non-null  object 
 9   feed_key                   647207 non-null  object 
 10  route_long_name            647207 non-null  object 
 11  direction_id               647207 non-null  float64
 12  route_primary_direction    647207 non-null  object 
 13  frequency_in_minutes       64

In [165]:
two_minutess_df.bunched_y_n.value_counts()

not bunched    618075
bunched         29132
Name: bunched_y_n, dtype: int64

#### Same code as Transit Matters Approach

In [166]:
two_minutes_agg1 = (
    two_minutess_df.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [167]:
bunched_only_two_min = (
    two_minutes_agg1.loc[two_minutes_agg1.bunched_y_n == "bunched"]
    .reset_index(drop=True)
    .rename(columns={"trip_instance_key": "bunched_trips"})
)

In [168]:
# I want to do a left merge because I'm only interested in trips that bunched.
bunched_only_two_min = pd.merge(
    bunched_only_two_min,
    transit_matters_all_trips,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [169]:
bunched_only_two_min["pct_trips_bunched"] = (
    bunched_only_two_min.bunched_trips / bunched_only_two_min.all_trips * 100
)

In [170]:
bunched_only_two_min = bunched_only_two_min.drop(columns=["all_trips"])

In [171]:
# Need to do a left merge on all trips for the stops that don't have bunching.
final_two_minute = pd.merge(
    transit_matters_all_trips,
    bunched_only_two_min,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [181]:
final_two_minute.shape

(21130, 11)

In [182]:
final_two_minute = final_two_minute.drop(columns=["bunched_y_n", "bunched_trips"])

### Checkout all 3 using a stop_sequence/direction_id for Fillmore again
* Very different results between the 3 approaches. 
* The coefficient one says frequent bunching lol, but the other methods say there isn't any bunching...

In [172]:
transit_matters_m2.shape

(21130, 11)

In [173]:
bunching_by_stops.shape

(17781, 21)

In [180]:
fillmore.loc[
    (fillmore.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (fillmore.stop_id == "13086")
    & (fillmore.direction_id == 0)
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,passenger_op_perspective
13445,11489975_M31,13086,13,69106.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69048,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-58.0,1907.0,1445.5,960.66,0.66,24.09,Frequent bunching


In [176]:
transit_matters_m2.loc[
    (transit_matters_m2.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (transit_matters_m2.stop_id == "13086")
    & (transit_matters_m2.direction_id == 0)
]

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_y_n,trip_instance_key,pct_trips_bunched
16137,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13086,0.0,Southbound,7,,,0.0


In [177]:
final_two_minute.loc[
    (final_two_minute.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (final_two_minute.stop_id == "13086")
    & (final_two_minute.direction_id == 0)
]

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_y_n,bunched_trips,pct_trips_bunched
16137,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13086,0.0,Southbound,7,,,


In [178]:
# convert seconds to timestamp
transit_matters_fillmore_test["rt_arrival_time"] = pd.to_timedelta(
    transit_matters_fillmore_test["rt_arrival_sec"], unit="s"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transit_matters_fillmore_test["rt_arrival_time"] = pd.to_timedelta(


In [179]:
transit_matters_fillmore_test

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,pct_actual_schd_headway,bunched_y_n,actual_headway_min,schd_headway_min,rt_arrival_time
452304,11489969_M31,13086,13,67199.0,7cc0cb1871dfd558f11a2885c145d144,b73ff68241fdcb9ff5a3f3be424b2268,67051,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-148.0,,,,not bunched,,,0 days 18:37:31
452305,11489975_M31,13086,13,69106.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69048,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-58.0,1997.0,1907.0,1.05,not bunched,33.28,31.78,0 days 19:10:48
452306,11489815_M31,13086,13,69466.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69723,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,257.0,675.0,360.0,1.88,not bunched,11.25,6.0,0 days 19:22:03
452307,11489816_M31,13086,13,70006.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70421,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,415.0,698.0,540.0,1.29,not bunched,11.63,9.0,0 days 19:33:41
452308,11489817_M31,13086,13,72992.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73064,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,72.0,2643.0,2986.0,0.89,not bunched,44.05,49.77,0 days 20:17:44
452309,11489861_M31,13086,13,74432.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75434,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,1002.0,2370.0,1440.0,1.65,not bunched,39.5,24.0,0 days 20:57:14
452310,11489818_M31,13086,13,75872.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75943,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,71.0,509.0,1440.0,0.35,not bunched,8.48,24.0,0 days 21:05:43
