## I tried turning `stop_times` to actual dates but it seems like seconds is easier to manipulate.
* 10_transit_bunching.ipynb contains timestamps attempts
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
import merge_data

In [5]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get high frequency routes

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [8]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [9]:
route_dir2 = route_dir.loc[route_dir.frequency_in_minutes <= 10]

In [10]:
route_dir2.head(10)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency,frequency_in_minutes
200,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,0.0,Eastbound,2024-05-22,6.71,8.94
202,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,0.0,Eastbound,2024-05-22,8.88,6.76
205,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,1.0,Westbound,2024-05-22,8.75,6.86
262,0666caf3ec1ecc96b74f4477ee4bc939,18-13172,0.0,Eastbound,2024-05-22,8.12,7.39
265,0666caf3ec1ecc96b74f4477ee4bc939,18-13172,1.0,Westbound,2024-05-22,7.75,7.74
268,0666caf3ec1ecc96b74f4477ee4bc939,180-13172,0.0,Eastbound,2024-05-22,6.0,10.0
280,0666caf3ec1ecc96b74f4477ee4bc939,2-13172,0.0,Eastbound,2024-05-22,6.62,9.06
316,0666caf3ec1ecc96b74f4477ee4bc939,207-13172,0.0,Northbound,2024-05-22,7.5,8.0
317,0666caf3ec1ecc96b74f4477ee4bc939,207-13172,1.0,Southbound,2024-05-22,6.0,10.0
319,0666caf3ec1ecc96b74f4477ee4bc939,207-13172,1.0,Southbound,2024-05-22,8.5,7.06


In [11]:
high_frequency_routes_crosswalk = (
    route_dir[
        [
            "schedule_gtfs_dataset_key",
            "route_id",
            "direction_id",
            "route_primary_direction",
            "frequency_in_minutes",
        ]
    ]
    .drop_duplicates(
        [
            "schedule_gtfs_dataset_key",
            "route_id",
            "direction_id",
            "route_primary_direction",
        ]
    )
    .reset_index(drop=True)
)

In [109]:
high_frequency_routes_crosswalk.route_id.nunique()

1338

### Get trips of high frequency routes

In [12]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [13]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [14]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
]

In [15]:
high_frequency_routes_crosswalk.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,frequency_in_minutes
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,65.22


In [16]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [17]:
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes_crosswalk,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [18]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,1770249a5a2e770ca90628434d4934b1,3408,c256553e28c4bba693e3136240b35419,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0,Westbound,51.28
1,1770249a5a2e770ca90628434d4934b1,3408,488e9e227288606249d0508961c0fa15,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0,Westbound,51.28


#### Why are some trip instance keys duplicated?

In [19]:
len(trips_freq_routes) - trips_freq_routes.trip_instance_key.nunique()

7

In [20]:
trips_freq_routes.trip_instance_key.value_counts().head(10)

1ace22b258649b391e24772b4386d7f1    2
8d3d8c1b3050a1118c654c1435f67d5f    2
caca82650ae961e6ca37ca17592b61cb    2
c7ccc08ec1ecc2420a28066056de0515    2
7ffb024120f14f74921e1655c3b61e15    2
ff8551829b06f27cdaabb0aa2a97d4e2    2
6ba6d0e7c8294b58de3f47ec80f8208a    2
0bb1b79782349602b6f2c556d4750560    1
64d3db543039c647eeef4414003c0a73    1
66d63247a3488d7addb421ea5654ea78    1
Name: trip_instance_key, dtype: int64

In [21]:
trips_freq_routes.loc[
    trips_freq_routes.trip_instance_key == "1ace22b258649b391e24772b4386d7f1"
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
5317,8eecb796518dafd3c1b971a99f8b8252,3241,1ace22b258649b391e24772b4386d7f1,2dd40a9897bd4f5f33ddd8780f5dddfd,5067febbcbf1cd61da2b11ed9a337eb8,03 Barstow City Hall - Lenwood,0.0,Westbound,103.45
5332,8eecb796518dafd3c1b971a99f8b8252,3241,1ace22b258649b391e24772b4386d7f1,2dd40a9897bd4f5f33ddd8780f5dddfd,5067febbcbf1cd61da2b11ed9a337eb8,03 Barstow City Hall - Lenwood,0.0,Westbound,103.45


In [22]:
trips_freq_routes.loc[
    trips_freq_routes.trip_instance_key == "ff8551829b06f27cdaabb0aa2a97d4e2"
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
5106,8eecb796518dafd3c1b971a99f8b8252,3220,ff8551829b06f27cdaabb0aa2a97d4e2,427c5866ad6d5adfd23ec18ddbf114dd,5067febbcbf1cd61da2b11ed9a337eb8,15 Barstow - Victorville - San Bernardino,1.0,Northbound,181.82
5109,8eecb796518dafd3c1b971a99f8b8252,3220,ff8551829b06f27cdaabb0aa2a97d4e2,427c5866ad6d5adfd23ec18ddbf114dd,5067febbcbf1cd61da2b11ed9a337eb8,15 Barstow - Victorville - San Bernardino,1.0,Northbound,181.82


In [23]:
trips_freq_routes = trips_freq_routes.drop_duplicates(subset=["trip_instance_key"])

In [24]:
len(trips_freq_routes) - trips_freq_routes.trip_instance_key.nunique()

0

### Get Stop Times
* What's the difference btwn `trip_id` and `trip_instance_key`?

In [25]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [26]:
rt_stop_times.head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434


In [27]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [28]:
len(rt_stop_times) - len(rt_stop_times2)

127535

In [29]:
rt_stop_times2.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,14971717-d196-4362-8572-e5059751f732,0e466c39-2ade-49f8-a79d-b929dde2cfe5,3,57720.0,1fd2f07342d966919b15d5d37fda8cc8,1bbf8cf8a0db82e6e56b9a20fe9414a6,57978,e24126d6-fbad-46b1-a498-75026e763636,4dbba8014c5e63cfb381249002aa683f,e9a188003e67026bf648e639cf4b3f9d,U4,0.0,Northbound,285.71
1,14971717-d196-4362-8572-e5059751f732,0f993eda-623d-40b4-bf47-dc2cd04262f8,4,57780.0,1fd2f07342d966919b15d5d37fda8cc8,1bbf8cf8a0db82e6e56b9a20fe9414a6,58080,e24126d6-fbad-46b1-a498-75026e763636,4dbba8014c5e63cfb381249002aa683f,e9a188003e67026bf648e639cf4b3f9d,U4,0.0,Northbound,285.71


###  Some scheduled arrival seconds span longer than a day: filter them out?
* There are 86,400 seconds in a day

In [30]:
rt_stop_times2.scheduled_arrival_sec.describe()

count   2458723.00
mean      49457.62
std       18037.90
min         720.00
25%       34140.00
50%       49200.00
75%       63180.00
max      111374.00
Name: scheduled_arrival_sec, dtype: float64

In [31]:
len(rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec > 86400])

44137

In [32]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec < 86400
].reset_index(drop=True)

In [33]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "feed_key",
        "shape_array_key",
        "route_id",
        "direction_id",
        "stop_sequence",
        "scheduled_arrival_sec",
    ]
).reset_index(drop=True)

### Calculate the difference btwn actual vs scheduled arrival.

In [34]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes): {min_delay_min}, {max_delay_min}")
    print(f"5th / 95th delay (minutes): {p5_delay_min}, {p95_delay_min}")

    return df

In [35]:
rt_stop_times4 = check_delay(rt_stop_times4)

count   2414347.00
mean        118.31
std        2037.13
min      -86387.00
5%         -150.00
10%         -92.00
50%          94.00
90%         491.00
95%         689.00
max       57878.00
Name: delay, dtype: float64
min / max delay (minutes): -1439.7833333333333, 964.6333333333333
5th / 95th delay (minutes): -2.5, 11.483333333333333


#### Filter out differences of actual-scheduled arrival time that are more or less than one hour

In [36]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"] <= 60 * 60].reset_index(
    drop=True
)

In [37]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [38]:
len(rt_stop_times4) - len(rt_stop_times5)

3971

In [39]:
rt_stop_times5.delay.describe()

count   2410376.00
mean        157.34
std         301.29
min       -3600.00
25%         -11.00
50%          94.00
75%         257.00
max        3600.00
Name: delay, dtype: float64

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [40]:
rt_stop_times5.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
0,29001,40471,2,25620.0,015d67d5b75b5cf2b710bbadadfb75f5,fc4fd5fc790a1a84e1b3663fa9bca4fd,25862,29,04ce2af2df4eebb3126f1d90a10b5a61,9529027364faa9b7dfbb3d7d7501b133,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,242.0
1,29005,40471,2,31620.0,015d67d5b75b5cf2b710bbadadfb75f5,46168565fecd863f123814d4d8e1b3a2,31737,29,04ce2af2df4eebb3126f1d90a10b5a61,9529027364faa9b7dfbb3d7d7501b133,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,117.0


In [41]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "feed_key",
    "shape_array_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [42]:
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

### Calculate scheduled headway

In [43]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [44]:
rt_stop_times5.head(10).drop(columns = drop_for_preview)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
0,40471,2,25620.0,25862,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,242.0,,
1,40471,2,31620.0,31737,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,117.0,5875.0,6000.0
2,40471,2,53820.0,54213,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,393.0,22476.0,22200.0
3,40471,2,57240.0,57258,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,18.0,3045.0,3420.0
4,41333,3,25680.0,25938,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,258.0,,
5,41333,3,28680.0,29106,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,426.0,3168.0,3000.0
6,41333,3,51180.0,51379,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,199.0,22273.0,22500.0
7,41333,3,53880.0,54343,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,463.0,2964.0,2700.0
8,41333,3,57300.0,57356,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,56.0,3013.0,3420.0
9,41334,4,25740.0,26011,29,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,271.0,,


### Delete out rows that are Nan??

In [45]:
#rt_stop_times6 = rt_stop_times5.loc[~rt_stop_times5.actual_headway.isna()]

In [46]:
#rt_stop_times6 = rt_stop_times6.loc[~rt_stop_times5.schd_headway.isna()].reset_index(drop=True)

In [47]:
#rt_stop_times6.head(10).drop(columns = drop_for_preview)

### Find the mean scheduled headway for the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [48]:
agg1 = (
    rt_stop_times5.groupby(groupby_cols
    )
    .agg({"schd_headway": "mean"})
    .reset_index()
    .rename(columns={"schd_headway": "avg_schd_headway_sec"})
)

In [49]:
m1 = pd.merge(
    rt_stop_times5,
    agg1,
    on=groupby_cols,
)

In [50]:
len(m1) == len(rt_stop_times5)

True

In [51]:
m1.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
0,29001,40471,2,25620.0,015d67d5b75b5cf2b710bbadadfb75f5,fc4fd5fc790a1a84e1b3663fa9bca4fd,25862,29,04ce2af2df4eebb3126f1d90a10b5a61,9529027364faa9b7dfbb3d7d7501b133,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,242.0,,,10540.0
1,29005,40471,2,31620.0,015d67d5b75b5cf2b710bbadadfb75f5,46168565fecd863f123814d4d8e1b3a2,31737,29,04ce2af2df4eebb3126f1d90a10b5a61,9529027364faa9b7dfbb3d7d7501b133,Downtown San Rafael - E. Corte Madera,1.0,Eastbound,240.0,117.0,5875.0,6000.0,10540.0


### Find standard deviation: how far the actual headway is from the mean scheduled headway for the same grain above.

In [52]:
std_dev = (
    m1.groupby(groupby_cols
    )
    .apply(lambda x: (x["actual_headway"] - x["avg_schd_headway_sec"]).std())
    .reset_index(name="std_dev_headway")
)

In [53]:
std_dev.head()

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,shape_array_key,route_id,direction_id,route_primary_direction,stop_sequence,stop_id,std_dev_headway
0,015d67d5b75b5cf2b710bbadadfb75f5,9529027364faa9b7dfbb3d7d7501b133,04ce2af2df4eebb3126f1d90a10b5a61,29,1.0,Eastbound,2,40471,10497.35
1,015d67d5b75b5cf2b710bbadadfb75f5,9529027364faa9b7dfbb3d7d7501b133,04ce2af2df4eebb3126f1d90a10b5a61,29,1.0,Eastbound,3,41333,9612.73
2,015d67d5b75b5cf2b710bbadadfb75f5,9529027364faa9b7dfbb3d7d7501b133,04ce2af2df4eebb3126f1d90a10b5a61,29,1.0,Eastbound,4,41334,10623.49
3,015d67d5b75b5cf2b710bbadadfb75f5,9529027364faa9b7dfbb3d7d7501b133,04ce2af2df4eebb3126f1d90a10b5a61,29,1.0,Eastbound,5,40325,7900.49
4,015d67d5b75b5cf2b710bbadadfb75f5,9529027364faa9b7dfbb3d7d7501b133,04ce2af2df4eebb3126f1d90a10b5a61,29,1.0,Eastbound,6,40323,7899.84


In [82]:
m2 = pd.merge(
    m1,
    std_dev,
    on=groupby_cols,
)

In [83]:
m2["bunching_coefficient"] = m2.std_dev_headway / m2.avg_schd_headway_sec

In [84]:
m2.loc[m2.bunching_coefficient.isna()].drop(columns = drop_for_preview).head(3)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient
91,41190,2,24960.0,25081,613,Paradise Cay - Redwood HS,1.0,Westbound,750.0,121.0,,,,,
92,40294,3,25080.0,25142,613,Paradise Cay - Redwood HS,1.0,Westbound,750.0,62.0,,,,,
93,40292,4,25080.0,25154,613,Paradise Cay - Redwood HS,1.0,Westbound,750.0,74.0,,,,,


### Drop duplicates

In [85]:
test = m2.loc[~m2.std_dev_headway.isna()]

In [86]:
test = test.loc[~m2.avg_schd_headway_sec.isna()]

In [87]:
len(test), len(m2)

(2347544, 2410376)

In [88]:
bunching_by_stops = test.drop_duplicates(
    subset=groupby_cols
).reset_index(drop = True)

In [89]:
len(m2)-len(bunching_by_stops)

2304196

In [90]:
len(bunching_by_stops)

106180

In [91]:
bunching_by_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106180 entries, 0 to 106179
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    106180 non-null  object 
 1   stop_id                    106180 non-null  object 
 2   stop_sequence              106180 non-null  int64  
 3   scheduled_arrival_sec      106180 non-null  float64
 4   schedule_gtfs_dataset_key  106180 non-null  object 
 5   trip_instance_key          106180 non-null  object 
 6   rt_arrival_sec             106180 non-null  int64  
 7   route_id                   106180 non-null  object 
 8   shape_array_key            106180 non-null  object 
 9   feed_key                   106180 non-null  object 
 10  route_long_name            105184 non-null  object 
 11  direction_id               106180 non-null  float64
 12  route_primary_direction    106180 non-null  object 
 13  frequency_in_minutes       10

In [92]:
len(bunching_by_stops.loc[bunching_by_stops.bunching_coefficient.isna()])

0

#### Why are some of the rows for `std_dev_headway` and `bunching_coefficient` `nan`??

In [95]:
bunching_by_stops.bunching_coefficient.describe()

count   106180.00
mean          inf
std           NaN
min          0.00
25%          0.20
50%          0.38
75%          0.62
max           inf
Name: bunching_coefficient, dtype: float64

### Bunching Coefficient

In [101]:
def coefficient_frequency(row):
    if row.bunching_coefficient <= 0.21:
        return "Service provided like clockwork"
    elif 0.21 < row.bunching_coefficient <= 0.3:
        return "Vehicles slightly off headway"
    elif 0.3 < row.bunching_coefficient <= 0.39:
        return "Vehicles often off headway"
    elif 0.39 < row.bunching_coefficient <= 0.52:
        return "Irregular headways, with some bunching"
    elif 0.52 < row.bunching_coefficient <= 0.74:
        return "Frequent bunching"
    
    else:
        return "Most vehicles bunched"


bunching_by_stops["passenger_op_perspective"] = bunching_by_stops.apply(lambda x: coefficient_frequency(x), axis=1)

In [102]:
bunching_by_stops.passenger_op_perspective.value_counts()

Service provided like clockwork           27739
Most vehicles bunched                     20213
Irregular headways, with some bunching    15864
Frequent bunching                         15814
Vehicles often off headway                14838
Vehicles slightly off headway             11712
Name: passenger_op_perspective, dtype: int64

In [108]:
bunching_by_stops.route_id.nunique()

831

In [106]:
bunching_by_stops['avg_schd_headway_min'] = bunching_by_stops.avg_schd_headway_sec/60

In [107]:
bunching_by_stops.loc[bunching_by_stops.passenger_op_perspective == "Most vehicles bunched"].sample(10).drop(columns = drop_for_preview)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,passenger_op_perspective,avg_schd_headway_min
75101,1555,38,18780.0,18979,28,Florida & New Chicago to Perris STC,0.0,Westbound,48.0,199.0,,,158.5,9316.84,0.98,Most vehicles bunched,2.64
47284,14377,16,38338.0,38455,35,EUREKA,0.0,Southbound,49.59,117.0,,,90.0,4695.71,0.87,Most vehicles bunched,1.5
27367,10718,14,15420.0,15351,70-13172,Metro Local Line,1.0,Westbound,11.9,-69.0,,,11.28,682.74,1.01,Most vehicles bunched,0.19
83314,5943,18,16740.0,16731,64,Huntington Beach - Tustin,0.0,Eastbound,20.83,-9.0,,,186.17,22935.99,2.05,Most vehicles bunched,3.1
25281,7862,13,16740.0,16833,28-13172,Metro Local Line,0.0,Eastbound,13.33,93.0,,,23.73,1055.59,0.74,Most vehicles bunched,0.4
22883,4799,28,14760.0,14701,30-13172,Metro Local Line,0.0,Eastbound,12.99,-59.0,,,380.0,33483.75,1.47,Most vehicles bunched,6.33
45450,15655,17,50019.0,51016,F,MARKET & WHARVES,0.0,Westbound,20.27,997.0,,,48.24,2693.51,0.93,Most vehicles bunched,0.8
48196,3190,33,29677.0,29831,545,Country Club-Malls,0.0,Westbound,75.95,154.0,,,67.22,4864.16,1.21,Most vehicles bunched,1.12
72462,1149,1512,27612.0,27711,37,Route 37,0.0,Westbound,55.56,99.0,,,73.64,6328.81,1.43,Most vehicles bunched,1.23
92674,3025,3480,29880.0,29974,185,Azusa – West Covina – Puente Hills Mall,1.0,Southbound,49.59,94.0,,,360.5,18032.64,0.83,Most vehicles bunched,6.01
