## I tried turning `stop_times` to actual dates but it seems like seconds is easier to manipulate.
* 10_transit_bunching.ipynb contains timestamps attempts
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
import merge_data

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
may_date = "2024-05-22"

In [5]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get high frequency routes
* Group by mean frequency minutes for the operator-route-direction grain.

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [8]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [9]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [10]:
high_frequency_routes = (
    route_dir.groupby(route_freq_groupby)
    .agg({"frequency_in_minutes": "mean"})
    .reset_index()
)

#### Grab routes in the 5th percentile of frequency for now.

In [11]:
high_frequency_routes["frequency_in_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3417.00
mean     234.64
std      312.42
min        4.00
5%        17.65
10%       23.40
50%       97.71
90%      750.00
95%     1000.00
max     1250.00
Name: frequency_in_minutes, dtype: float64

In [12]:
high_frequency_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3417 entries, 0 to 3416
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  3417 non-null   object 
 1   route_id                   3417 non-null   object 
 2   direction_id               3417 non-null   float64
 3   route_primary_direction    3417 non-null   object 
 4   frequency_in_minutes       3417 non-null   float64
dtypes: float64(2), object(3)
memory usage: 133.6+ KB


In [13]:
high_frequency_routes2 = high_frequency_routes.loc[
    high_frequency_routes.frequency_in_minutes <= 17.65
]

### Get trips of high frequency routes

In [14]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [15]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [16]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
]

In [17]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [18]:
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [19]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,cd1d4fc457d3a3fff6e77e47336bbc98,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67
1,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,180a069ab3aefcf8f3317a788b32c288,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67


#### Why are some trip instance keys duplicated?

In [20]:
len(trips_freq_routes) - trips_freq_routes.trip_instance_key.nunique()

0

In [21]:
trips_freq_routes.trip_instance_key.value_counts().head(10)

cd1d4fc457d3a3fff6e77e47336bbc98    1
d94e2fdba686dcfd0b1d49d76f2df439    1
8f161dedf0303023185c63a35fb71e7f    1
529494b21ce8100bf81c3e28ad3aaa77    1
81113671d6b5e279b38edc27b561951e    1
16ebd4de31b78dd93ff74a38c0a365f2    1
71fc5466d52fe1afb62d3564a3791ecb    1
c4289e090cdb8c3e58765ea24e1497ea    1
3b3ed34bf5cc46ac80e261ea75e9b574    1
a5a5e1aab21d7ecea2e91fde6ec41c51    1
Name: trip_instance_key, dtype: int64

### Get Stop Times
* What's the difference btwn `trip_id` and `trip_instance_key`?

In [22]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [23]:
rt_stop_times.shape

(2601262, 7)

In [24]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [25]:
len(rt_stop_times) - len(rt_stop_times2)

1903905

In [26]:
rt_stop_times2.shape

(697357, 14)

In [27]:
rt_stop_times2.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,922552,258,2,61249.0,efbbd5293be71f7a5de0cf82b59febe1,50617e0d3c1bbedd9803836728767a69,61995,3730,e10d20177f6b29f7d2de52645301f18f,0e75eaae4dc791180f05782fa8825254,Main St & Santa Monica Blvd/UCLA,1.0,Southbound,16.08
1,922552,310,9,62012.0,efbbd5293be71f7a5de0cf82b59febe1,50617e0d3c1bbedd9803836728767a69,62826,3730,e10d20177f6b29f7d2de52645301f18f,0e75eaae4dc791180f05782fa8825254,Main St & Santa Monica Blvd/UCLA,1.0,Southbound,16.08


###  Some scheduled arrival seconds span longer than a day: filter them out?
* There are 86,400 seconds in a day

In [28]:
rt_stop_times2.scheduled_arrival_sec.describe()

count   697357.00
mean     50526.22
std      19329.72
min       9420.00
25%      34320.00
50%      49740.00
75%      64380.00
max     108431.00
Name: scheduled_arrival_sec, dtype: float64

In [29]:
len(rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec > 86400])

27472

In [30]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec < 86400
].reset_index(drop=True)

In [31]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "feed_key",
        "shape_array_key",
        "stop_sequence",
        "scheduled_arrival_sec",
    ]
).reset_index(drop=True)

### Calculate the difference btwn actual vs scheduled arrival.

In [32]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes): {min_delay_min}, {max_delay_min}")
    print(f"5th / 95th delay (minutes): {p5_delay_min}, {p95_delay_min}")

    return df

In [33]:
rt_stop_times4 = check_delay(rt_stop_times4)

count   669746.00
mean        68.40
std       2841.12
min     -86381.00
5%        -167.00
10%       -107.00
50%         89.00
90%        515.00
95%        719.00
max      57878.00
Name: delay, dtype: float64
min / max delay (minutes): -1439.6833333333334, 964.6333333333333
5th / 95th delay (minutes): -2.783333333333333, 11.983333333333333


#### Filter out differences of actual-scheduled arrival time that are more or less than one hour

In [34]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"] <= 60 * 60].reset_index(
    drop=True
)

In [35]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [36]:
len(rt_stop_times4) - len(rt_stop_times5)

1100

In [37]:
rt_stop_times5.delay.describe()

count   668646.00
mean       158.11
std        306.54
min      -3559.00
25%        -24.00
50%         89.00
75%        269.00
max       3592.00
Name: delay, dtype: float64

In [38]:
3592 / 60

59.86666666666667

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain
* Do I need to include feed key and shape array key?

In [39]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "feed_key",
    "shape_array_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [40]:
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

### Calculate scheduled headway

In [41]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [42]:
rt_stop_times5.head(10).drop(columns=drop_for_preview)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
0,4201,2,56100.0,56133,105-13172,Metro Local Line,0.0,Westbound,14.59,33.0,,
1,4201,2,57300.0,57279,105-13172,Metro Local Line,0.0,Westbound,14.59,-21.0,1146.0,1200.0
2,4201,2,58500.0,58551,105-13172,Metro Local Line,0.0,Westbound,14.59,51.0,1272.0,1200.0
3,4201,2,60900.0,61144,105-13172,Metro Local Line,0.0,Westbound,14.59,244.0,2593.0,2400.0
4,4201,2,62100.0,61614,105-13172,Metro Local Line,0.0,Westbound,14.59,-486.0,470.0,1200.0
5,4201,2,63300.0,63384,105-13172,Metro Local Line,0.0,Westbound,14.59,84.0,1770.0,1200.0
6,4201,2,64560.0,65168,105-13172,Metro Local Line,0.0,Westbound,14.59,608.0,1784.0,1260.0
7,4201,2,65940.0,65585,105-13172,Metro Local Line,0.0,Westbound,14.59,-355.0,417.0,1380.0
8,4201,2,67860.0,68099,105-13172,Metro Local Line,0.0,Westbound,14.59,239.0,2514.0,1920.0
9,4198,3,56160.0,56211,105-13172,Metro Local Line,0.0,Westbound,14.59,51.0,,


### Delete out rows that are `nan`??
* `nans` are not impacted when calculating the mean scheduled headway and whatnot?

In [43]:
rt_stop_times6 = rt_stop_times5.loc[~rt_stop_times5.actual_headway.isna()]

In [44]:
rt_stop_times6 = rt_stop_times6.loc[~rt_stop_times5.schd_headway.isna()].reset_index(
    drop=True
)

In [45]:
rt_stop_times6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647207 entries, 0 to 647206
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    647207 non-null  object 
 1   stop_id                    647207 non-null  object 
 2   stop_sequence              647207 non-null  int64  
 3   scheduled_arrival_sec      647207 non-null  float64
 4   schedule_gtfs_dataset_key  647207 non-null  object 
 5   trip_instance_key          647207 non-null  object 
 6   rt_arrival_sec             647207 non-null  int64  
 7   route_id                   647207 non-null  object 
 8   shape_array_key            647207 non-null  object 
 9   feed_key                   647207 non-null  object 
 10  route_long_name            647207 non-null  object 
 11  direction_id               647207 non-null  float64
 12  route_primary_direction    647207 non-null  object 
 13  frequency_in_minutes       64

### Find the mean scheduled headway for the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [46]:
agg1 = (
    rt_stop_times6.groupby(groupby_cols)
    .agg({"schd_headway": "mean"})
    .reset_index()
    .rename(columns={"schd_headway": "avg_schd_headway_sec"})
)

In [47]:
agg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   feed_key                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   route_id                   19260 non-null  object 
 4   direction_id               19260 non-null  float64
 5   route_primary_direction    19260 non-null  object 
 6   stop_sequence              19260 non-null  int64  
 7   stop_id                    19260 non-null  object 
 8   avg_schd_headway_sec       19260 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.3+ MB


#### Check out missing values

In [48]:
agg1.loc[agg1.avg_schd_headway_sec.isna()].head()

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,shape_array_key,route_id,direction_id,route_primary_direction,stop_sequence,stop_id,avg_schd_headway_sec


In [49]:
agg1.loc[
    (agg1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (agg1.route_id == "51-13172")
    & (agg1.stop_sequence == 3)
    & (agg1.route_primary_direction == "Northbound")
    & (agg1.shape_array_key == "058b89eae795266d5e7dba2adba9be12")
]

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,shape_array_key,route_id,direction_id,route_primary_direction,stop_sequence,stop_id,avg_schd_headway_sec


In [50]:
rt_stop_times6.loc[
    (rt_stop_times6.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times6.route_id == "51-13172")
    & (rt_stop_times6.stop_sequence == 3)
    & (rt_stop_times6.route_primary_direction == "Northbound")
    & (rt_stop_times6.shape_array_key == "058b89eae795266d5e7dba2adba9be12")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway


#### Merge

In [51]:
m1 = pd.merge(
    rt_stop_times6,
    agg1,
    on=groupby_cols,
)

In [52]:
len(rt_stop_times6) - len(m1)

0

In [53]:
m1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 647207 entries, 0 to 647206
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    647207 non-null  object 
 1   stop_id                    647207 non-null  object 
 2   stop_sequence              647207 non-null  int64  
 3   scheduled_arrival_sec      647207 non-null  float64
 4   schedule_gtfs_dataset_key  647207 non-null  object 
 5   trip_instance_key          647207 non-null  object 
 6   rt_arrival_sec             647207 non-null  int64  
 7   route_id                   647207 non-null  object 
 8   shape_array_key            647207 non-null  object 
 9   feed_key                   647207 non-null  object 
 10  route_long_name            647207 non-null  object 
 11  direction_id               647207 non-null  float64
 12  route_primary_direction    647207 non-null  object 
 13  frequency_in_minutes       64

### Find standard deviation: how far the actual headway is from the mean scheduled headway for the same grain above.


In [54]:
# Takes 1 minute

std_dev = (
    m1.groupby(groupby_cols)
    .apply(lambda x: (x["actual_headway"] - x["avg_schd_headway_sec"]).std())
    .reset_index(name="std_dev_headway")
)

#### Why are so many missing?
Why are some Groupby and transform: When using groupby and transform, if a group has only one element, the standard deviation is NaN (Not a Number). This is because standard deviation requires at least two data points to calculate.

In [55]:
std_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   feed_key                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   route_id                   19260 non-null  object 
 4   direction_id               19260 non-null  float64
 5   route_primary_direction    19260 non-null  object 
 6   stop_sequence              19260 non-null  int64  
 7   stop_id                    19260 non-null  object 
 8   std_dev_headway            17781 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.3+ MB


In [56]:
m1 = pd.merge(
    m1,
    std_dev,
    on=groupby_cols,
)

In [57]:
m1.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway
0,10105002681555-DEC23,4201,2,57300.0,0666caf3ec1ecc96b74f4477ee4bc939,1c45265e242009e9589b98d1ec995907,57279,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,-21.0,1146.0,1200.0,1470.0,826.99
1,10105002681615-DEC23,4201,2,58500.0,0666caf3ec1ecc96b74f4477ee4bc939,5cac677d47265ba63da650b8878ab403,58551,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,51.0,1272.0,1200.0,1470.0,826.99
2,10105002681655-DEC23,4201,2,60900.0,0666caf3ec1ecc96b74f4477ee4bc939,22bf23b1d4929fe2020cab40e7a7341b,61144,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,244.0,2593.0,2400.0,1470.0,826.99
3,10105002681715-DEC23,4201,2,62100.0,0666caf3ec1ecc96b74f4477ee4bc939,4578120377cc0958d6206117870d4e8a,61614,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,-486.0,470.0,1200.0,1470.0,826.99
4,10105002681735-DEC23,4201,2,63300.0,0666caf3ec1ecc96b74f4477ee4bc939,46a44f4560e7a383fd5eda5d4ba8ed7b,63384,105-13172,6b06373e4a70b2cb094870285bd92bec,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,84.0,1770.0,1200.0,1470.0,826.99


In [58]:
m1["bunching_coefficient"] = m1.std_dev_headway / m1.avg_schd_headway_sec

In [59]:
m1.bunching_coefficient.describe()

count   645728.00
mean         0.72
std          0.43
min          0.00
25%          0.47
50%          0.63
75%          0.79
max          5.28
Name: bunching_coefficient, dtype: float64

In [60]:
bunching_by_stops = m1.drop_duplicates(subset=groupby_cols).reset_index(drop=True)

In [61]:
len(m1) - len(bunching_by_stops)

627947

In [62]:
len(bunching_by_stops)

19260

In [63]:
bunching_by_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    19260 non-null  object 
 1   stop_id                    19260 non-null  object 
 2   stop_sequence              19260 non-null  int64  
 3   scheduled_arrival_sec      19260 non-null  float64
 4   schedule_gtfs_dataset_key  19260 non-null  object 
 5   trip_instance_key          19260 non-null  object 
 6   rt_arrival_sec             19260 non-null  int64  
 7   route_id                   19260 non-null  object 
 8   shape_array_key            19260 non-null  object 
 9   feed_key                   19260 non-null  object 
 10  route_long_name            19260 non-null  object 
 11  direction_id               19260 non-null  float64
 12  route_primary_direction    19260 non-null  object 
 13  frequency_in_minutes       19260 non-null  flo

In [64]:
len(bunching_by_stops.loc[bunching_by_stops.avg_schd_headway_sec == 0])

0

In [65]:
bunching_by_stops.loc[bunching_by_stops.bunching_coefficient.isna()].head()

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient
118,10105002702025-DEC23,14378,74,78300.0,0666caf3ec1ecc96b74f4477ee4bc939,36ee13d32f4bf2db3f32fd63fe28b0c2,78933,105-13172,9fde99fa2d6b8a169d5d8292a8ed8569,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,633.0,18159.0,17760.0,17760.0,,
119,10105002702025-DEC23,16532,75,78420.0,0666caf3ec1ecc96b74f4477ee4bc939,36ee13d32f4bf2db3f32fd63fe28b0c2,80236,105-13172,9fde99fa2d6b8a169d5d8292a8ed8569,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,1816.0,19263.0,17760.0,17760.0,,
461,10108004260622-DEC23,5813,83,28200.0,0666caf3ec1ecc96b74f4477ee4bc939,f1f8d5c03f5eadd84b8da65441d17289,28948,108-13172,138b97aca980bbb9aff7385a1df831cb,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Eastbound,13.72,748.0,2670.0,1800.0,1800.0,,
539,10108004430722-DEC23,5896,79,33120.0,0666caf3ec1ecc96b74f4477ee4bc939,39af89e6e302860227156f8732ee7cc2,34034,108-13172,14354ac3912b3209f33b3378a8b79df6,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Eastbound,13.72,914.0,2638.0,2160.0,2160.0,,
975,10108004510555-DEC23,1635,3,21480.0,0666caf3ec1ecc96b74f4477ee4bc939,d8a16d74fc54fd3c17c96a4f351c0fd6,21521,108-13172,13d1f6a199bb8055089a2f96cea449f0,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Westbound,14.92,41.0,3304.0,3180.0,3180.0,,


In [66]:
rt_stop_times6.loc[
    (rt_stop_times6.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times6.route_id == "105-13172")
    & (rt_stop_times6.stop_sequence == 74)
    & (rt_stop_times6.route_primary_direction == "Westbound")
    & (rt_stop_times6.stop_id == "14378")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
5675,10105002702025-DEC23,14378,74,78300.0,0666caf3ec1ecc96b74f4477ee4bc939,36ee13d32f4bf2db3f32fd63fe28b0c2,78933,105-13172,9fde99fa2d6b8a169d5d8292a8ed8569,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,633.0,18159.0,17760.0


In [67]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939")
    & (rt_stop_times5.route_id == "105-13172")
    & (rt_stop_times5.stop_sequence == 74)
    & (rt_stop_times5.route_primary_direction == "Westbound")
    & (rt_stop_times5.stop_id == "14378")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
5793,10105002701506-DEC23,14378,74,60540.0,0666caf3ec1ecc96b74f4477ee4bc939,a262350b71374a3ab43d5f15cbd33a76,60774,105-13172,9fde99fa2d6b8a169d5d8292a8ed8569,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,234.0,,
5794,10105002702025-DEC23,14378,74,78300.0,0666caf3ec1ecc96b74f4477ee4bc939,36ee13d32f4bf2db3f32fd63fe28b0c2,78933,105-13172,9fde99fa2d6b8a169d5d8292a8ed8569,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,Westbound,14.59,633.0,18159.0,17760.0


#### Filter out rows without std_dev now

In [68]:
bunching_by_stops2 = bunching_by_stops.loc[
    ~bunching_by_stops.bunching_coefficient.isna()
]

In [69]:
bunching_by_stops2.bunching_coefficient.describe()

count   17781.00
mean        0.79
std         0.61
min         0.00
25%         0.42
50%         0.61
75%         0.92
max         5.28
Name: bunching_coefficient, dtype: float64

### Bunching Coefficient

In [70]:
def coefficient_frequency(row):
    if row.bunching_coefficient <= 0.21:
        return "Service provided like clockwork"
    elif 0.21 < row.bunching_coefficient <= 0.3:
        return "Vehicles slightly off headway"
    elif 0.3 < row.bunching_coefficient <= 0.39:
        return "Vehicles often off headway"
    elif 0.39 < row.bunching_coefficient <= 0.52:
        return "Irregular headways, with some bunching"
    elif 0.52 < row.bunching_coefficient <= 0.74:
        return "Frequent bunching"
    else:
        return "Most vehicles bunched"


bunching_by_stops2["passenger_op_perspective"] = bunching_by_stops2.apply(
    lambda x: coefficient_frequency(x), axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bunching_by_stops2["passenger_op_perspective"] = bunching_by_stops2.apply(


In [71]:
bunching_by_stops2.passenger_op_perspective.value_counts()

Most vehicles bunched                     6199
Frequent bunching                         4494
Irregular headways, with some bunching    3251
Vehicles often off headway                1545
Service provided like clockwork           1535
Vehicles slightly off headway              757
Name: passenger_op_perspective, dtype: int64

#### How are some routes missing??

In [72]:
bunching_by_stops2.route_id.nunique()

85

In [73]:
high_frequency_routes2.route_id.nunique()

93

In [74]:
bunching_by_stops2["avg_schd_headway_min"] = (
    bunching_by_stops2.avg_schd_headway_sec / 60
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bunching_by_stops2["avg_schd_headway_min"] = (


In [75]:
bunching_by_stops2["actual_headway_min"] = bunching_by_stops2.actual_headway / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bunching_by_stops2["actual_headway_min"] = bunching_by_stops2.actual_headway / 60


In [76]:
drop_for_preview.append("scheduled_arrival_sec")

In [77]:
drop_for_preview.append("std_dev_headway")

In [78]:
drop_for_preview.append("avg_schd_headway_sec")

In [79]:
drop_for_preview.append("actual_headway")

In [80]:
drop_for_preview.append("schd_headway")

In [81]:
drop_for_preview.append("rt_arrival_sec")

In [85]:
fillmore = bunching_by_stops2.loc[bunching_by_stops2.route_long_name == "FILLMORE"]

In [86]:
fillmore.passenger_op_perspective.value_counts()

Most vehicles bunched                     120
Frequent bunching                          24
Irregular headways, with some bunching      1
Vehicles slightly off headway               1
Service provided like clockwork             1
Name: passenger_op_perspective, dtype: int64

In [82]:
bunching_by_stops2.loc[bunching_by_stops2.route_long_name == "FILLMORE"].drop(
    columns=drop_for_preview
)

Unnamed: 0,stop_id,stop_sequence,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,bunching_coefficient,passenger_op_perspective,avg_schd_headway_min,actual_headway_min
14682,14630,2,22,FILLMORE,0.0,Southbound,7.61,161.0,0.49,"Irregular headways, with some bunching",29.18,39.2
14683,14609,7,22,FILLMORE,0.0,Southbound,7.61,-53.0,0.73,Frequent bunching,22.6,6.4
14684,14627,8,22,FILLMORE,0.0,Southbound,7.61,-98.0,0.99,Most vehicles bunched,22.6,5.63
14685,16754,9,22,FILLMORE,0.0,Southbound,7.61,115.0,0.82,Most vehicles bunched,24.17,36.67
14686,16491,10,22,FILLMORE,0.0,Southbound,7.61,43.0,0.73,Frequent bunching,24.17,35.57
14687,16488,11,22,FILLMORE,0.0,Southbound,7.61,21.0,0.68,Frequent bunching,24.15,35.07
14688,16493,12,22,FILLMORE,0.0,Southbound,7.61,-20.0,0.67,Frequent bunching,24.12,34.13
14689,13086,13,22,FILLMORE,0.0,Southbound,7.61,-58.0,0.66,Frequent bunching,24.09,33.28
14690,14605,14,22,FILLMORE,0.0,Southbound,7.61,-113.0,0.74,Frequent bunching,24.05,32.07
14691,14624,15,22,FILLMORE,0.0,Southbound,7.61,-84.0,0.67,Frequent bunching,24.02,32.27


In [83]:
check1 = (
    bunching_by_stops2.groupby(
        ["stop_sequence", "route_id", "schedule_gtfs_dataset_key"]
    )
    .agg({"route_primary_direction": "nunique"})
    .reset_index()
)

In [84]:
check1.route_primary_direction.describe()

count   4076.00
mean       1.89
std        0.31
min        1.00
25%        2.00
50%        2.00
75%        2.00
max        2.00
Name: route_primary_direction, dtype: float64