## I tried turning `stop_times` to actual dates but it seems like seconds is easier to manipulate.
* 10_transit_bunching.ipynb contains timestamps attempts
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
import merge_data

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
may_date = "2024-05-22"

In [5]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get high frequency routes
* Group by mean frequency minutes for the operator-route-direction grain.

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [8]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [9]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [10]:
high_frequency_routes = (
    route_dir.groupby(route_freq_groupby)
    .agg({"frequency_in_minutes": "mean"})
    .reset_index()
)

#### Grab routes in the 5th percentile of frequency for now.

In [11]:
high_frequency_routes["frequency_in_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3417.00
mean     234.64
std      312.42
min        4.00
5%        17.65
10%       23.40
50%       97.71
90%      750.00
95%     1000.00
max     1250.00
Name: frequency_in_minutes, dtype: float64

In [12]:
high_frequency_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3417 entries, 0 to 3416
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  3417 non-null   object 
 1   route_id                   3417 non-null   object 
 2   direction_id               3417 non-null   float64
 3   route_primary_direction    3417 non-null   object 
 4   frequency_in_minutes       3417 non-null   float64
dtypes: float64(2), object(3)
memory usage: 133.6+ KB


In [13]:
high_frequency_routes2 = high_frequency_routes.loc[
    high_frequency_routes.frequency_in_minutes <= 17.65
]

In [14]:
high_frequency_routes2.route_id.nunique()

93

### Get trips of high frequency routes

In [15]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [16]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [17]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
]

In [18]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [19]:
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [20]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,cd1d4fc457d3a3fff6e77e47336bbc98,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67
1,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,180a069ab3aefcf8f3317a788b32c288,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67


In [21]:
trips_freq_routes.trip_instance_key.nunique()

20090

### `rt_stop_times2`: Get Stop Times of High Frequency Routes/Trips
* What's the difference btwn `trip_id` and `trip_instance_key`?

In [22]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [23]:
rt_stop_times.shape

(2601262, 7)

In [24]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [25]:
len(rt_stop_times) - len(rt_stop_times2)

1903905

In [26]:
rt_stop_times2.shape

(697357, 14)

In [27]:
rt_stop_times2.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,922552,258,2,61249.0,efbbd5293be71f7a5de0cf82b59febe1,50617e0d3c1bbedd9803836728767a69,61995,3730,e10d20177f6b29f7d2de52645301f18f,0e75eaae4dc791180f05782fa8825254,Main St & Santa Monica Blvd/UCLA,1.0,Southbound,16.08
1,922552,310,9,62012.0,efbbd5293be71f7a5de0cf82b59febe1,50617e0d3c1bbedd9803836728767a69,62826,3730,e10d20177f6b29f7d2de52645301f18f,0e75eaae4dc791180f05782fa8825254,Main St & Santa Monica Blvd/UCLA,1.0,Southbound,16.08


In [28]:
rt_stop_times2.trip_id.nunique(), rt_stop_times2.trip_instance_key.nunique()

(17213, 17213)

###  `rt_stop_times3`: Some scheduled arrival seconds span longer than a day: filter them out
* There are 86,400 seconds in a day

In [29]:
rt_stop_times2.scheduled_arrival_sec.describe()

count   697357.00
mean     50526.22
std      19329.72
min       9420.00
25%      34320.00
50%      49740.00
75%      64380.00
max     108431.00
Name: scheduled_arrival_sec, dtype: float64

In [30]:
len(rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec > 86400])

27472

In [31]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec < 86400
].reset_index(drop=True)

In [32]:
len(rt_stop_times3)

669746

In [33]:
rt_stop_times3.scheduled_arrival_sec.describe()

count   669746.00
mean     48760.53
std      17580.67
min       9420.00
25%      33720.00
50%      48540.00
75%      62640.00
max      86399.00
Name: scheduled_arrival_sec, dtype: float64

In [34]:
rt_stop_times3.rt_arrival_sec.describe()

count   669746.00
mean     48828.92
std      17647.56
min          0.00
25%      33814.00
50%      48581.00
75%      62840.00
max      86399.00
Name: rt_arrival_sec, dtype: float64

### `rt_stop_times4`: Sort so stop sequence for the `operator-stop_id-route-id_direction_id` will be in order.
Help, which columns should I use to sort? Should I keep `feed_key` and `shape_array_key`?

In [35]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "feed_key",
        "shape_array_key",
        "stop_sequence",
        "scheduled_arrival_sec",
    ]
).reset_index(drop=True)

### Calculate the difference btwn actual vs scheduled arrival.

In [36]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes): {min_delay_min}, {max_delay_min}")
    print(f"5th / 95th delay (minutes): {p5_delay_min}, {p95_delay_min}")

    return df

In [37]:
rt_stop_times4 = check_delay(rt_stop_times4)

count   669746.00
mean        68.40
std       2841.12
min     -86381.00
5%        -167.00
10%       -107.00
50%         89.00
90%        515.00
95%        719.00
max      57878.00
Name: delay, dtype: float64
min / max delay (minutes): -1439.6833333333334, 964.6333333333333
5th / 95th delay (minutes): -2.783333333333333, 11.983333333333333


In [38]:
1439 / 60

23.983333333333334

#### `rt_stop_times5`: Filter out values in `delay` that ~1 hour zone

In [39]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"] <= 60 * 60].reset_index(
    drop=True
)

In [40]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [41]:
len(rt_stop_times4) - len(rt_stop_times5)

1100

In [42]:
len(rt_stop_times) - len(rt_stop_times5)

1932616

In [43]:
len(rt_stop_times2) - len(rt_stop_times5)

28711

In [44]:
rt_stop_times5.delay.describe()

count   668646.00
mean       158.11
std        306.54
min      -3559.00
25%        -24.00
50%         89.00
75%        269.00
max       3592.00
Name: delay, dtype: float64

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain
* Do I need to include feed key and shape array key?

In [45]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "feed_key",
    "shape_array_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [46]:
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

### Calculate scheduled headway

In [47]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [48]:
rt_stop_times5.loc[rt_stop_times5.actual_headway.isna()].head(10).drop(
    columns=drop_for_preview
)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
0,4201,2,56100.0,56133,105-13172,Metro Local Line,0.0,Westbound,14.59,33.0,,
9,4198,3,56160.0,56211,105-13172,Metro Local Line,0.0,Westbound,14.59,51.0,,
19,10256,4,56280.0,56262,105-13172,Metro Local Line,0.0,Westbound,14.59,-18.0,,
29,10249,5,56340.0,56337,105-13172,Metro Local Line,0.0,Westbound,14.59,-3.0,,
39,36572,6,56460.0,56478,105-13172,Metro Local Line,0.0,Westbound,14.59,18.0,,
49,10243,7,56580.0,56589,105-13172,Metro Local Line,0.0,Westbound,14.59,9.0,,
59,10244,8,56700.0,56695,105-13172,Metro Local Line,0.0,Westbound,14.59,-5.0,,
69,10251,9,56820.0,56743,105-13172,Metro Local Line,0.0,Westbound,14.59,-77.0,,
79,10247,10,56940.0,56832,105-13172,Metro Local Line,0.0,Westbound,14.59,-108.0,,
89,10250,11,57060.0,56955,105-13172,Metro Local Line,0.0,Westbound,14.59,-105.0,,


### Delete out rows that are `nan`??
* I am not sure if `nans` impact calculations of the mean scheduled headway and whatnot?
* These `nans` are becuase the first `operator-route-stop_id-stop_sequence` combo won't have anything to compare it to.

In [49]:
rt_stop_times5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668646 entries, 0 to 668645
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    668646 non-null  object 
 1   stop_id                    668646 non-null  object 
 2   stop_sequence              668646 non-null  int64  
 3   scheduled_arrival_sec      668646 non-null  float64
 4   schedule_gtfs_dataset_key  668646 non-null  object 
 5   trip_instance_key          668646 non-null  object 
 6   rt_arrival_sec             668646 non-null  int64  
 7   route_id                   668646 non-null  object 
 8   shape_array_key            668646 non-null  object 
 9   feed_key                   668646 non-null  object 
 10  route_long_name            668646 non-null  object 
 11  direction_id               668646 non-null  float64
 12  route_primary_direction    668646 non-null  object 
 13  frequency_in_minutes       66

### `rt_stop_times6`: Delete out the rows in which `actual_headway` and `schd_headway` are `nan`: this is basically teh first row of each grain

In [50]:
rt_stop_times6 = rt_stop_times5.loc[~rt_stop_times5.actual_headway.isna()]

In [51]:
rt_stop_times6 = rt_stop_times6.loc[~rt_stop_times5.schd_headway.isna()].reset_index(
    drop=True
)

In [52]:
len(rt_stop_times5) - len(rt_stop_times6)

21439

###Find the mean scheduled headway for the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [53]:
agg1 = (
    rt_stop_times6.groupby(groupby_cols)
    .agg({"schd_headway": "mean"})
    .reset_index()
    .rename(columns={"schd_headway": "avg_schd_headway_sec"})
)

In [54]:
agg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   feed_key                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   route_id                   19260 non-null  object 
 4   direction_id               19260 non-null  float64
 5   route_primary_direction    19260 non-null  object 
 6   stop_sequence              19260 non-null  int64  
 7   stop_id                    19260 non-null  object 
 8   avg_schd_headway_sec       19260 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.3+ MB


#### Merge

In [55]:
m1 = pd.merge(
    rt_stop_times6,
    agg1,
    on=groupby_cols,
)

In [56]:
len(rt_stop_times6) - len(m1)

0

#### Find standard deviation: how far the actual headway is from the mean scheduled headway for the same grain above.


In [57]:
# Takes 1 minute
std_dev = (
    m1.groupby(groupby_cols)
    .apply(lambda x: (x["actual_headway"] - x["avg_schd_headway_sec"]).std())
    .reset_index(name="std_dev_headway")
)

##### Why are so many missing? Case 1
Why are some Groupby and transform: When using groupby and transform, if a group has only one element, the standard deviation is NaN (Not a Number). This is because standard deviation requires at least two data points to calculate.

In [58]:
std_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   feed_key                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   route_id                   19260 non-null  object 
 4   direction_id               19260 non-null  float64
 5   route_primary_direction    19260 non-null  object 
 6   stop_sequence              19260 non-null  int64  
 7   stop_id                    19260 non-null  object 
 8   std_dev_headway            17781 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.3+ MB


In [59]:
std_dev.loc[std_dev.std_dev_headway.isna()].sample(1)

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,shape_array_key,route_id,direction_id,route_primary_direction,stop_sequence,stop_id,std_dev_headway
1349,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,13d1f6a199bb8055089a2f96cea449f0,108-13172,1.0,Westbound,43,16283,


In [60]:
std_dev2 = std_dev.loc[~std_dev.std_dev_headway.isna()].reset_index(drop=True)

##### Investigate missing rows
* This one seems to have some very unrealistic time stamps, like the time between scheduled versus actual arrival times are more than an hour.

In [61]:
m1.loc[
    (m1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (m1.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (m1.route_id == "217-13172")
    & (m1.direction_id == 1)
    & (m1.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
161685,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0,1800.0


#### `scheduled_arrival_sec` is 86,220 but `rt_arrival_sec` is 303.

In [62]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times2.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times2.route_id == "217-13172")
    & (rt_stop_times2.direction_id == 1)
    & (rt_stop_times2.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
413908,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
419236,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
425286,10217003302432-DEC23,15434,3,88500.0,0666caf3ec1ecc96b74f4477ee4bc939,6699f5297ef2d670988b29937f33b56e,2031,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
479447,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92


In [63]:
rt_stop_times3.loc[
    (rt_stop_times3.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times3.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times3.route_id == "217-13172")
    & (rt_stop_times3.direction_id == 1)
    & (rt_stop_times3.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
413484,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
417843,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
462159,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92


In [64]:
rt_stop_times4.loc[
    (rt_stop_times4.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times4.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times4.route_id == "217-13172")
    & (rt_stop_times4.direction_id == 1)
    & (rt_stop_times4.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
168196,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,30.0
168197,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0
168198,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,-85917.0


In [65]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times5.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times5.route_id == "217-13172")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
168062,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,30.0,,
168063,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0


In [66]:
rt_stop_times6.loc[
    (rt_stop_times6.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times6.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741")
    & (rt_stop_times6.route_id == "217-13172")
    & (rt_stop_times6.direction_id == 1)
    & (rt_stop_times6.stop_id == "15434")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
161685,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0


#### Missing rows Case 2
* There are only 2 rows, I guess there needs to be at least 3 rows to calculate the standard deviation since the first row of a combo won't have anything.

In [67]:
m1.loc[
    (m1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (m1.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (m1.route_id == "234-13172")
    & (m1.direction_id == 1)
    & (m1.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
179405,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0,1841.0,1800.0,1800.0


In [68]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times2.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times2.route_id == "234-13172")
    & (rt_stop_times2.direction_id == 1)
    & (rt_stop_times2.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
256950,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42
482097,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42


In [69]:
rt_stop_times3.loc[
    (rt_stop_times3.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times3.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times3.route_id == "234-13172")
    & (rt_stop_times3.direction_id == 1)
    & (rt_stop_times3.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
256533,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42
462564,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42


In [70]:
rt_stop_times4.loc[
    (rt_stop_times4.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times4.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times4.route_id == "234-13172")
    & (rt_stop_times4.direction_id == 1)
    & (rt_stop_times4.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
186848,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,-3.0
186849,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0


In [71]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times5.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3")
    & (rt_stop_times5.route_id == "234-13172")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "15383")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
186708,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,-3.0,,
186709,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0,1841.0,1800.0


#### Merge

In [72]:
m2 = pd.merge(
    m1,
    std_dev2,
    on=groupby_cols,
)

### Bunching coefficient is for the entire grain, rather than each row

In [73]:
m2["bunching_coefficient"] = m2.std_dev_headway / m2.avg_schd_headway_sec

In [74]:
m2.bunching_coefficient.describe()

count   645728.00
mean         0.72
std          0.43
min          0.00
25%          0.47
50%          0.63
75%          0.79
max          5.28
Name: bunching_coefficient, dtype: float64

In [75]:
m2["avg_schd_headway_min"] = m2.avg_schd_headway_sec / 60

In [76]:
m2["actual_headway_min"] = m2.actual_headway / 60

In [77]:
m2["sched_headway_min"] = m2.schd_headway / 60

### Retain only one row f or the grain

In [78]:
bunching_by_stops = (
    m2.drop_duplicates(subset=groupby_cols)
    .reset_index(drop=True)
    .drop(columns=["actual_headway", "actual_headway_min", "sched_headway_min"])
)

In [79]:
len(m2) - len(bunching_by_stops)

627947

In [80]:
len(bunching_by_stops)

17781

In [81]:
bunching_by_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17781 entries, 0 to 17780
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    17781 non-null  object 
 1   stop_id                    17781 non-null  object 
 2   stop_sequence              17781 non-null  int64  
 3   scheduled_arrival_sec      17781 non-null  float64
 4   schedule_gtfs_dataset_key  17781 non-null  object 
 5   trip_instance_key          17781 non-null  object 
 6   rt_arrival_sec             17781 non-null  int64  
 7   route_id                   17781 non-null  object 
 8   shape_array_key            17781 non-null  object 
 9   feed_key                   17781 non-null  object 
 10  route_long_name            17781 non-null  object 
 11  direction_id               17781 non-null  float64
 12  route_primary_direction    17781 non-null  object 
 13  frequency_in_minutes       17781 non-null  flo

### Bunching Coefficient

In [82]:
def coefficient_frequency(row):
    if row.bunching_coefficient <= 0.21:
        return "Service provided like clockwork"
    elif 0.21 < row.bunching_coefficient <= 0.3:
        return "Vehicles slightly off headway"
    elif 0.3 < row.bunching_coefficient <= 0.39:
        return "Vehicles often off headway"
    elif 0.39 < row.bunching_coefficient <= 0.52:
        return "Irregular headways, with some bunching"
    elif 0.52 < row.bunching_coefficient <= 0.74:
        return "Frequent bunching"
    else:
        return "Most vehicles bunched"


bunching_by_stops["passenger_op_perspective"] = bunching_by_stops.apply(
    lambda x: coefficient_frequency(x), axis=1
)

In [83]:
bunching_by_stops.passenger_op_perspective.value_counts() / len(bunching_by_stops)

Most vehicles bunched                    0.35
Frequent bunching                        0.25
Irregular headways, with some bunching   0.18
Vehicles often off headway               0.09
Service provided like clockwork          0.09
Vehicles slightly off headway            0.04
Name: passenger_op_perspective, dtype: float64

### Missing Routes
* These routes and trips weren't found in the `stop_times` dataset.

In [84]:
high_freq_routes = set(high_frequency_routes2.route_id.unique().tolist())
remaining_routes = set(bunching_by_stops.route_id.unique().tolist())
high_freq_routes - remaining_routes

{'Beige-N',
 'Beige-S',
 'Blue Line',
 'Lot D',
 'Red Line',
 'West Field Garage',
 'eastvalley',
 'mposa'}

In [85]:
high_frequency_routes2.loc[
    high_frequency_routes2.route_id == "West Field Garage"
].head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,frequency_in_minutes
597,2ff70dd1151d7532db40436f8228cd33,West Field Garage,0.0,Eastbound,10.0


#### Check out one route

In [86]:
rt_stop_times2.loc[rt_stop_times2.route_id == "West Field Garage"].head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [87]:
rt_stop_times2.loc[rt_stop_times2.route_id == "eastvalley"].head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [88]:
trips_freq_routes.loc[trips_freq_routes.route_id == "eastvalley"].head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
19915,31152914d10e2d0977b8b2fabb167922,eastvalley,77a45a5bbd6e7b87ef670d90e16fdbc3,aebd18679bd2170ee61d7522bcfc11ab,7900b8b29688af30a699822f84ae2753,East Valley Shuttle,1.0,Eastbound,16.05


In [89]:
rt_stop_times2.loc[
    (rt_stop_times2.trip_instance_key == "77a45a5bbd6e7b87ef670d90e16fdbc3")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [90]:
rt_stop_times.loc[
    (rt_stop_times.trip_instance_key == "77a45a5bbd6e7b87ef670d90e16fdbc3")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec


#### Second Route

In [91]:
trips_freq_routes.loc[trips_freq_routes.route_id == "Beige-S"].sample(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
2390,8a1405af8da1379acc062e346187ac98,Beige-S,fcfb52a1d6d40797a52539554c71f63e,93312729ad2928439c75cbbd97191b66,3e22f1090d0d12096ee943c621298225,Coliseum to Oakland Airport,1.0,Southbound,7.9
2322,8a1405af8da1379acc062e346187ac98,Beige-S,2fb4b4d008ce4dff34b4d553119d1847,93312729ad2928439c75cbbd97191b66,3e22f1090d0d12096ee943c621298225,Coliseum to Oakland Airport,1.0,Southbound,7.9


In [92]:
rt_stop_times2.loc[
    (rt_stop_times2.trip_instance_key == "96e2844c6ce2bcc5ede547c32c220efb")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


In [93]:
rt_stop_times2.loc[
    (rt_stop_times2.trip_instance_key == "4f9e575f015b6bf05e69af9da50d9579")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes


### Fillmore Test

In [94]:
more_values = [
    "scheduled_arrival_sec",
    "std_dev_headway",
    "avg_schd_headway_sec",
    "schd_headway",
    "rt_arrival_sec",
]

In [95]:
drop_for_preview = drop_for_preview + more_values

In [96]:
fillmore = bunching_by_stops.loc[bunching_by_stops.route_long_name == "FILLMORE"]

In [97]:
fillmore.passenger_op_perspective.value_counts()

Most vehicles bunched                     120
Frequent bunching                          24
Irregular headways, with some bunching      1
Vehicles slightly off headway               1
Service provided like clockwork             1
Name: passenger_op_perspective, dtype: int64

In [98]:
fillmore

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,passenger_op_perspective
13438,11489815_M31,14630,2,68903.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69064,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,161.0,2280.0,1750.8,859.58,0.49,29.18,"Irregular headways, with some bunching"
13439,11489815_M31,14609,7,69120.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69067,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-53.0,360.0,1356.0,987.75,0.73,22.6,Frequent bunching
13440,11489815_M31,14627,8,69175.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69077,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-98.0,360.0,1356.0,1338.73,0.99,22.6,Most vehicles bunched
13441,11489975_M31,16754,9,68926.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69041,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,115.0,1920.0,1450.0,1193.62,0.82,24.17,Most vehicles bunched
13442,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,43.0,1920.0,1450.0,1055.87,0.73,24.17,Frequent bunching
13443,11489975_M31,16488,11,69023.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69044,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,21.0,1917.0,1449.0,984.31,0.68,24.15,Frequent bunching
13444,11489975_M31,16493,12,69066.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69046,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-20.0,1912.0,1447.33,969.72,0.67,24.12,Frequent bunching
13445,11489975_M31,13086,13,69106.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69048,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-58.0,1907.0,1445.5,960.66,0.66,24.09,Frequent bunching
13446,11489975_M31,14605,14,69164.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69051,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-113.0,1900.0,1443.17,1062.23,0.74,24.05,Frequent bunching
13447,11489975_M31,14624,15,69217.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69133,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-84.0,1893.0,1441.0,967.76,0.67,24.02,Frequent bunching


#### Test 1: What do you do with scheduled headways that are pretty extreme compared to the other rows?
* One headway is scheduled for 7 hours?

In [99]:
436 / 60

7.266666666666667

In [100]:
m2.loc[
    (m2.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (m2.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (m2.route_id == "22")
    & (m2.direction_id == 1)
    & (m2.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,actual_headway_min,sched_headway_min
443917,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,26178.0,25957.0,10023.5,11333.78,1.13,167.06,436.3,432.62
443918,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0,3602.0,4320.0,10023.5,11333.78,1.13,167.06,60.03,72.0
443919,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0,1001.0,1800.0,10023.5,11333.78,1.13,167.06,16.68,30.0
443920,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0,8659.0,8017.0,10023.5,11333.78,1.13,167.06,144.32,133.62


In [101]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times2.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (rt_stop_times2.route_id == "22")
    & (rt_stop_times2.direction_id == 1)
    & (rt_stop_times2.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
528875,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
529753,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
556189,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
688955,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
689073,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6


In [102]:
rt_stop_times3.loc[
    (rt_stop_times3.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times3.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (rt_stop_times3.route_id == "22")
    & (rt_stop_times3.direction_id == 1)
    & (rt_stop_times3.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
508536,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
509366,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
534538,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
661545,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
661663,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6


In [103]:
rt_stop_times4.loc[
    (rt_stop_times4.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times4.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (rt_stop_times4.route_id == "22")
    & (rt_stop_times4.direction_id == 1)
    & (rt_stop_times4.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
462155,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,94.0
462156,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0
462157,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0
462158,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0
462159,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0


In [104]:
41815 / 60 / 60

11.615277777777777

In [105]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times5.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (rt_stop_times5.route_id == "22")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
461677,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,94.0,,
461678,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,26178.0,25957.0
461679,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0,3602.0,4320.0
461680,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0,1001.0,1800.0
461681,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0,8659.0,8017.0


In [106]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times5.shape_array_key == "fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & (rt_stop_times5.route_id == "22")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
461677,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,94.0,,
461678,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,26178.0,25957.0
461679,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0,3602.0,4320.0
461680,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0,1001.0,1800.0
461681,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0,8659.0,8017.0


#### Test 2

In [107]:
m2.loc[
    (m2.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (m2.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (m2.route_id == "22")
    & (m2.direction_id == 0)
    & (m2.stop_id == "16491")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,actual_headway_min,sched_headway_min
434669,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,43.0,2134.0,1920.0,1450.0,1055.87,0.73,24.17,35.57,32.0
434670,11489815_M31,16491,10,69360.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69454,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,94.0,411.0,360.0,1450.0,1055.87,0.73,24.17,6.85,6.0
434671,11489816_M31,16491,10,69900.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70287,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,387.0,833.0,540.0,1450.0,1055.87,0.73,24.17,13.88,9.0
434672,11489817_M31,16491,10,72900.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73061,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,161.0,2774.0,3000.0,1450.0,1055.87,0.73,24.17,46.23,50.0
434673,11489861_M31,16491,10,74340.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75431,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,1091.0,2370.0,1440.0,1450.0,1055.87,0.73,24.17,39.5,24.0
434674,11489818_M31,16491,10,75780.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75853,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,73.0,422.0,1440.0,1450.0,1055.87,0.73,24.17,7.03,24.0


In [108]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times2.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times2.route_id == "22")
    & (rt_stop_times2.direction_id == 0)
    & (rt_stop_times2.stop_id == "16491")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
565068,11489815_M31,16491,10,69360.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69454,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
566542,11489969_M31,16491,10,67080.0,7cc0cb1871dfd558f11a2885c145d144,b73ff68241fdcb9ff5a3f3be424b2268,66909,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
605147,11489817_M31,16491,10,72900.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73061,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
606713,11489861_M31,16491,10,74340.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75431,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
616129,11489818_M31,16491,10,75780.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75853,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
627185,11489820_M31,16491,10,88620.0,7cc0cb1871dfd558f11a2885c145d144,df2fe2bd0d1a3490250578dd0cf8f069,2272,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
643093,11489816_M31,16491,10,69900.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70287,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
648095,11489821_M31,16491,10,89220.0,7cc0cb1871dfd558f11a2885c145d144,515ade67689a882a95148e68b58ab913,2581,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
680459,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61


In [109]:
rt_stop_times3.loc[
    (rt_stop_times3.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times3.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times3.route_id == "22")
    & (rt_stop_times3.direction_id == 0)
    & (rt_stop_times3.stop_id == "16491")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
543061,11489815_M31,16491,10,69360.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69454,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
544473,11489969_M31,16491,10,67080.0,7cc0cb1871dfd558f11a2885c145d144,b73ff68241fdcb9ff5a3f3be424b2268,66909,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
581283,11489817_M31,16491,10,72900.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73061,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
582710,11489861_M31,16491,10,74340.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75431,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
591738,11489818_M31,16491,10,75780.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75853,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
617343,11489816_M31,16491,10,69900.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70287,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61
653265,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61


In [110]:
rt_stop_times4.loc[
    (rt_stop_times4.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times4.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times4.route_id == "22")
    & (rt_stop_times4.direction_id == 0)
    & (rt_stop_times4.stop_id == "16491")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
452724,11489969_M31,16491,10,67080.0,7cc0cb1871dfd558f11a2885c145d144,b73ff68241fdcb9ff5a3f3be424b2268,66909,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-171.0
452725,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,43.0
452726,11489815_M31,16491,10,69360.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69454,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,94.0
452727,11489816_M31,16491,10,69900.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70287,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,387.0
452728,11489817_M31,16491,10,72900.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73061,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,161.0
452729,11489861_M31,16491,10,74340.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75431,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,1091.0
452730,11489818_M31,16491,10,75780.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75853,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,73.0


In [111]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times5.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times5.route_id == "22")
    & (rt_stop_times5.direction_id == 0)
    & (rt_stop_times5.stop_id == "16491")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
452283,11489969_M31,16491,10,67080.0,7cc0cb1871dfd558f11a2885c145d144,b73ff68241fdcb9ff5a3f3be424b2268,66909,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-171.0,,
452284,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,43.0,2134.0,1920.0
452285,11489815_M31,16491,10,69360.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69454,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,94.0,411.0,360.0
452286,11489816_M31,16491,10,69900.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70287,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,387.0,833.0,540.0
452287,11489817_M31,16491,10,72900.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73061,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,161.0,2774.0,3000.0
452288,11489861_M31,16491,10,74340.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75431,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,1091.0,2370.0,1440.0
452289,11489818_M31,16491,10,75780.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75853,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,73.0,422.0,1440.0


In [112]:
rt_stop_times6.loc[
    (rt_stop_times6.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times6.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times6.route_id == "22")
    & (rt_stop_times6.direction_id == 0)
    & (rt_stop_times6.stop_id == "16491")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
435913,11489975_M31,16491,10,69000.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69043,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,43.0,2134.0,1920.0
435914,11489815_M31,16491,10,69360.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69454,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,94.0,411.0,360.0
435915,11489816_M31,16491,10,69900.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70287,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,387.0,833.0,540.0
435916,11489817_M31,16491,10,72900.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73061,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,161.0,2774.0,3000.0
435917,11489861_M31,16491,10,74340.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75431,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,1091.0,2370.0,1440.0
435918,11489818_M31,16491,10,75780.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75853,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,73.0,422.0,1440.0


### [Transit Matters](https://transitmatters.org/blog/reveal-mbtas-slowest-most-bunched-bus)
* The following charts show bunching events as a pecentage of total trips. Here,
bunching is defined as headways < 25% of the scheduled_headway.
* Grain: operator-route_id-stop_id. 

In [113]:
# Using rt_stop_times5 since this already has the actual and scheduled headways

In [114]:
len(rt_stop_times5)

668646

In [115]:
transit_matters_df1 = rt_stop_times5.copy()

In [116]:
transit_matters_df1.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
0,10105002681535-DEC23,4201,2,56100.0,0666caf3ec1ecc96b74f4477ee4bc939,657acf4c2ae320466a4fc09f1bbbba26,56133,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,33.0,,
1,10105002681555-DEC23,4201,2,57300.0,0666caf3ec1ecc96b74f4477ee4bc939,1c45265e242009e9589b98d1ec995907,57279,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,-21.0,1146.0,1200.0


In [117]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_headway / transit_matters_df1.schd_headway
)

In [118]:
import numpy as np

transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [119]:
transit_matters_df1.pct_actual_schd_headway.describe()

count   647207.00
mean         1.00
std          0.52
min         -9.95
25%          0.81
50%          1.00
75%          1.19
max         12.38
Name: pct_actual_schd_headway, dtype: float64

In [120]:
transit_matters_df1.bunched_y_n.value_counts(dropna=False)

not bunched    636013
bunched         32633
Name: bunched_y_n, dtype: int64

In [121]:
transit_matters_df1.bunched_y_n.value_counts(dropna=True)

not bunched    636013
bunched         32633
Name: bunched_y_n, dtype: int64

In [122]:
transit_matters_df2 = (
    transit_matters_df1.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [123]:
transit_matters_df2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,bunched_y_n,trip_instance_key
0,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15313,0.0,Eastbound,bunched,3
1,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15313,0.0,Eastbound,not bunched,84


In [124]:
bunched_only = transit_matters_df2.loc[transit_matters_df2.bunched_y_n == "bunched"].reset_index(drop = True)

In [127]:
transit_matters_agg = [
    "schedule_gtfs_dataset_key",
    "route_long_name",
    "shape_array_key",
    "route_id",
    "stop_id",
    "direction_id",
    "route_primary_direction",
]

In [128]:
transit_matters_all_trips = (
    transit_matters_df1.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
    .rename(columns={"trip_instance_key": "all_trips"})
)

In [129]:
bunched_only = pd.merge(
    bunched_only,transit_matters_all_trips,  on=transit_matters_agg, how = "left"
)

In [133]:
bunched_only["pct_trips_bunched"] = (
    bunched_only.trip_instance_key / bunched_only.all_trips * 100
)

In [135]:
transit_matters_m1 = pd.merge(
    transit_matters_all_trips, bunched_only, on=transit_matters_agg.append('all_trips'), how = "left"
)

In [136]:
transit_matters_m1.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_y_n,trip_instance_key,pct_trips_bunched
0,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15313,0.0,Eastbound,87,bunched,3.0,3.45
1,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15416,0.0,Eastbound,81,bunched,5.0,6.17


In [138]:
transit_matters_m1 = transit_matters_m1.drop(columns = ['bunched_y_n','trip_instance_key'])

In [139]:
transit_matters_m1.pct_trips_bunched = transit_matters_m1.pct_trips_bunched.fillna(0)

In [140]:
transit_matters_m2 = transit_matters_m1.drop_duplicates(
    subset=transit_matters_agg
).reset_index(drop=True)

In [141]:
transit_matters_m2.shape

(21130, 9)

In [151]:
transit_matters_m2.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,pct_trips_bunched
0,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15313,0.0,Eastbound,87,3.45
1,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15416,0.0,Eastbound,81,6.17
2,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15432,0.0,Eastbound,86,1.16
3,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15436,0.0,Eastbound,86,3.49
4,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15453,0.0,Eastbound,85,2.35


In [152]:
transit_matters_m2.pct_trips_bunched.describe()

count   21130.00
mean        2.51
std         4.98
min         0.00
25%         0.00
50%         0.00
75%         3.45
max        50.00
Name: pct_trips_bunched, dtype: float64

In [157]:
transit_matters_m2.loc[transit_matters_m2.pct_trips_bunched >= 10].shape

(1673, 9)

In [158]:
transit_matters_m2.loc[transit_matters_m2.pct_trips_bunched >= 10].sample(10)

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,pct_trips_bunched
15667,7cc0cb1871dfd558f11a2885c145d144,BAYSHORE,c47d8fe41d96232a4d3f2b18d9fed8d1,8,16054,0.0,Southbound,90,11.11
17693,7cc0cb1871dfd558f11a2885c145d144,MISSION RAPID,219712f1e41d4fb7c4de5c591d2cea22,14R,15529,0.0,Westbound,9,11.11
1112,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,0dcba66281656183f90616c00505ec66,251-13172,1746,0.0,Northbound,27,11.11
20845,cc53a0dbf5df90e3009b9cb5d89d80ba,DASH Pico Union/Echo Park,ace98a4725851e87e92b71b3a393c351,4869,5802500,1.0,Southbound,65,12.31
19408,7cc0cb1871dfd558f11a2885c145d144,VAN NESS-MISSION,189a8be8a1d11d5ccac1ac1dc40bbb29,49,15801,1.0,Northbound,146,13.01
6421,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,650398fd4bbe9d528b98587a8a8eade6,240-13172,7002,0.0,Eastbound,91,18.68
16894,7cc0cb1871dfd558f11a2885c145d144,GEARY RAPID,aece61421bdb79216286993adcedf25a,38R,14285,1.0,Eastbound,107,12.15
16185,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,2fe09ae2c449928945abe662a855ffb2,22,14618,0.0,Southbound,131,12.21
6663,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,68c79aa33ec7ab64426d2d5cd17c9fb1,4-13172,14422,1.0,Westbound,8,12.5
17398,7cc0cb1871dfd558f11a2885c145d144,MASONIC,afde3db7e3d6fa6626f3b080fd749995,43,15257,0.0,Southbound,83,10.84


In [142]:
transit_matters_m1.shape

(21130, 9)

#### Fillmore Test

In [144]:
fillmore.loc[
    (fillmore.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (fillmore.stop_id == "13086")
    & (fillmore.direction_id == 0)
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,passenger_op_perspective
13445,11489975_M31,13086,13,69106.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69048,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-58.0,1907.0,1445.5,960.66,0.66,24.09,Frequent bunching


In [148]:
2370/60

39.5

In [149]:
1440/60

24.0

In [145]:
transit_matters_df1.loc[
    (transit_matters_df1.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (transit_matters_df1.stop_id == "13086")
    & (transit_matters_df1.direction_id == 0)
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,pct_actual_schd_headway,bunched_y_n
452304,11489969_M31,13086,13,67199.0,7cc0cb1871dfd558f11a2885c145d144,b73ff68241fdcb9ff5a3f3be424b2268,67051,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-148.0,,,,not bunched
452305,11489975_M31,13086,13,69106.0,7cc0cb1871dfd558f11a2885c145d144,d30242b374225ed75a4aadd78fa8d7be,69048,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,-58.0,1997.0,1907.0,1.05,not bunched
452306,11489815_M31,13086,13,69466.0,7cc0cb1871dfd558f11a2885c145d144,186fd89b59a49ddc1e84cb4b89c066d8,69723,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,257.0,675.0,360.0,1.88,not bunched
452307,11489816_M31,13086,13,70006.0,7cc0cb1871dfd558f11a2885c145d144,5cd2523ccd8c33e277aaae0ac9af35c8,70421,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,415.0,698.0,540.0,1.29,not bunched
452308,11489817_M31,13086,13,72992.0,7cc0cb1871dfd558f11a2885c145d144,45830206e5f4a07c06e520968f4b789f,73064,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,72.0,2643.0,2986.0,0.89,not bunched
452309,11489861_M31,13086,13,74432.0,7cc0cb1871dfd558f11a2885c145d144,a28a9fc884812bbb9e404de1dd970ccd,75434,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,1002.0,2370.0,1440.0,1.65,not bunched
452310,11489818_M31,13086,13,75872.0,7cc0cb1871dfd558f11a2885c145d144,c20c26a42e6277dd327fe1280cead6a8,75943,22,1b678a66d0009c55bc573cfc37aa1029,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,0.0,Southbound,7.61,71.0,509.0,1440.0,0.35,not bunched


In [147]:
transit_matters_m2.loc[transit_matters_m2.route_id == "22"]

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,pct_trips_bunched
16137,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13086,0.0,Southbound,7,0.0
16138,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13285,0.0,Southbound,5,0.0
16139,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13293,0.0,Southbound,4,0.0
16140,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13299,0.0,Southbound,1,0.0
16141,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13301,0.0,Southbound,5,0.0
16142,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,13984,0.0,Southbound,5,0.0
16143,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,14005,0.0,Southbound,7,0.0
16144,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,14605,0.0,Southbound,7,0.0
16145,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,14606,0.0,Southbound,1,0.0
16146,7cc0cb1871dfd558f11a2885c145d144,FILLMORE,1b678a66d0009c55bc573cfc37aa1029,22,14609,0.0,Southbound,6,0.0


In [150]:
transit_matters_m2.loc[transit_matters_m2.route_id == "22"].pct_trips_bunched.describe()

count   153.00
mean      7.89
std       8.26
min       0.00
25%       0.00
50%       8.14
75%      15.62
max      28.15
Name: pct_trips_bunched, dtype: float64