## I tried converting `stop_times` to actual timestamps, but seconds seem easier to manipulate.
* 10_transit_bunching.ipynb contains timestamps attempts
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# Catalog definition:
# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# NOTE(review): `GTFS_DATA_DICT` is also imported from `update_vars` above and
# re-imported from `segment_speed_utils.project_vars` below, so this binding
# is replaced by the later import -- confirm which source is intended.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
import merge_data

In [3]:
# Display settings for the whole notebook, all applied through `pd.set_option`.
pd.set_option("display.max_rows", None)  # show every row
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", None)  # never truncate cell text
pd.set_option("display.float_format", "{:.2f}".format)  # two decimal places

In [4]:
may_date = "2024-05-22"

In [5]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get high frequency routes
* Group by mean frequency minutes for the operator-route-direction grain.

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [8]:
# Convert `frequency` (presumably trips per hour -- TODO confirm) into minutes
# between trips. `assign` returns a new frame instead of writing into the
# column subset selected when `route_dir` was created, which avoids pandas'
# chained-assignment warning and keeps the cell safely re-runnable.
route_dir = route_dir.assign(frequency_in_minutes=60 / route_dir.frequency)

In [9]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [10]:
# Mean `frequency_in_minutes` for every combination of the grouping columns.
high_frequency_routes = (
    route_dir.groupby(route_freq_groupby)["frequency_in_minutes"]
    .mean()
    .reset_index()
)

#### Grab routes in the 5th percentile of frequency for now.

In [11]:
high_frequency_routes["frequency_in_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3417.00
mean     234.64
std      312.42
min        4.00
5%        17.65
10%       23.40
50%       97.71
90%      750.00
95%     1000.00
max     1250.00
Name: frequency_in_minutes, dtype: float64

In [12]:
high_frequency_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3417 entries, 0 to 3416
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  3417 non-null   object 
 1   route_id                   3417 non-null   object 
 2   direction_id               3417 non-null   float64
 3   route_primary_direction    3417 non-null   object 
 4   frequency_in_minutes       3417 non-null   float64
dtypes: float64(2), object(3)
memory usage: 133.6+ KB


In [13]:
# Keep routes at or below the 5th percentile of `frequency_in_minutes`.
# The cutoff is computed from the data rather than copied from the rounded
# `describe()` output, so it stays correct if the date or inputs change.
frequency_cutoff_min = high_frequency_routes.frequency_in_minutes.quantile(0.05)
high_frequency_routes2 = high_frequency_routes.loc[
    high_frequency_routes.frequency_in_minutes <= frequency_cutoff_min
]

### Get trips of high frequency routes

In [14]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [15]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [16]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
]

In [17]:
# Read only the columns that are needed instead of loading the whole trips
# table and subsetting afterwards, then rename the operator key so it matches
# the `schedule_gtfs_dataset_key` column used by the other tables.
trips = pd.read_parquet(FILE, columns=trips_subset).rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [18]:
# Keep only trips that belong to a high frequency route-direction and attach
# the route-level frequency columns to them.
# NOTE(review): `high_frequency_routes2` was grouped on `route_freq_groupby`,
# which also includes `route_primary_direction`; that column is not a join key
# here, so a route-direction with more than one primary direction would
# duplicate trips -- confirm uniqueness (e.g. with `validate="m:1"`).
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [19]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
0,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,cd1d4fc457d3a3fff6e77e47336bbc98,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67
1,cc53a0dbf5df90e3009b9cb5d89d80ba,4869,180a069ab3aefcf8f3317a788b32c288,7fca7ce64e1b773776b91ec1cf82c9ea,2cfdf0e33e9229d6b0ad124d956f5856,DASH Pico Union/Echo Park,0.0,Northbound,16.67


### Get Stop Times
* What's the difference btwn `trip_id` and `trip_instance_key`?

In [20]:
# Build the path from `may_date` so the stop times always match the service
# date used for the schedule and trips tables loaded earlier.
# NOTE(review): this prefix may be what `RT_SCHED_GCS` holds -- confirm and
# reuse the constant if so.
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/"
    f"schedule_rt_stop_times_{may_date}.parquet"
)

In [21]:
rt_stop_times.shape

(2601262, 7)

In [22]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [23]:
len(rt_stop_times) - len(rt_stop_times2)

1903905

In [24]:
rt_stop_times2.shape

(697357, 14)

###  Some scheduled arrival seconds span longer than a day: filter them out?
* There are 86,400 seconds in a day

In [25]:
rt_stop_times2.scheduled_arrival_sec.describe()

count   697357.00
mean     50526.22
std      19329.72
min       9420.00
25%      34320.00
50%      49740.00
75%      64380.00
max     108431.00
Name: scheduled_arrival_sec, dtype: float64

In [26]:
len(rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec > 86400])

27472

In [27]:
# Keep only stop times whose scheduled arrival is below 86,400 seconds (one day).
# NOTE: the strict `<` also drops rows exactly equal to 86400, which the
# `> 86400` count in the previous cell does not include.
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec < 86400
].reset_index(drop=True)

In [28]:
len(rt_stop_times3)

669746

In [29]:
rt_stop_times3.scheduled_arrival_sec.describe()

count   669746.00
mean     48760.53
std      17580.67
min       9420.00
25%      33720.00
50%      48540.00
75%      62640.00
max      86399.00
Name: scheduled_arrival_sec, dtype: float64

In [30]:
rt_stop_times3.rt_arrival_sec.describe()

count   669746.00
mean     48828.92
std      17647.56
min          0.00
25%      33814.00
50%      48581.00
75%      62840.00
max      86399.00
Name: rt_arrival_sec, dtype: float64

##### Help, which columns should I use to sort? Should I keep `feed_key` and `shape_array_key`?

In [31]:
# Order rows so that, within each operator / route / direction / feed / shape,
# stops run in sequence (1, 2, 3, 4, ...) and, for each stop sequence, trips
# are ordered by scheduled arrival time.
# `stop_id` is not a sort key: presumably the same stop_sequence can carry
# different stop ids across trips of a route -- TODO confirm.
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "feed_key",
        "shape_array_key",
        "stop_sequence",
        "scheduled_arrival_sec",
    ]
).reset_index(drop=True)

### Calculate the difference btwn actual vs scheduled arrival.

In [32]:
def check_delay(df):
    """
    Add a `delay` column (real-time minus scheduled arrival, in seconds)
    and print summary statistics for it.

    Prints the `describe()` output with the 5th/10th/90th/95th percentiles,
    followed by the min/max and the 5th/95th percentile delays in minutes.

    Returns a new dataframe carrying the `delay` column; the input frame
    is not modified.
    """
    with_delay = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec)
    delay_sec = with_delay.delay

    print(delay_sec.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    # Extremes and tail percentiles, converted from seconds to minutes.
    lowest_min = delay_sec.min() / 60
    highest_min = delay_sec.max() / 60
    p5_min = delay_sec.quantile(q=0.05) / 60
    p95_min = delay_sec.quantile(q=0.95) / 60

    print(f"min / max delay (minutes): {lowest_min}, {highest_min}")
    print(f"5th / 95th delay (minutes): {p5_min}, {p95_min}")

    return with_delay

In [33]:
rt_stop_times4 = check_delay(rt_stop_times4)

count   669746.00
mean        68.40
std       2841.12
min     -86381.00
5%        -167.00
10%       -107.00
50%         89.00
90%        515.00
95%        719.00
max      57878.00
Name: delay, dtype: float64
min / max delay (minutes): -1439.6833333333334, 964.6333333333333
5th / 95th delay (minutes): -2.783333333333333, 11.983333333333333


#### Filter out rows where `delay` is more than one hour late or more than one hour early

In [34]:
# Drop arrivals recorded more than one hour (60 * 60 seconds) late.
rt_stop_times5 = rt_stop_times4.loc[rt_stop_times4.delay.le(60 * 60)].reset_index(
    drop=True
)

In [35]:
# Also drop arrivals recorded more than one hour (3,600 seconds) early.
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [36]:
len(rt_stop_times4) - len(rt_stop_times5)

1100

In [37]:
rt_stop_times5.delay.describe()

count   668646.00
mean       158.11
std        306.54
min      -3559.00
25%        -24.00
50%         89.00
75%        269.00
max       3592.00
Name: delay, dtype: float64

### Calculate the actual headway at the `operator-route-direction_id-stop_sequence-stop_id` grain
* Do I need to include feed key and shape array key?

In [38]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "feed_key",
    "shape_array_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [39]:
# Actual headway: difference in `rt_arrival_sec` between consecutive rows of
# the same group. `diff()` uses the current row order, so this relies on the
# rows having been sorted by `scheduled_arrival_sec` earlier; the first row of
# each group has no predecessor and gets NaN.
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

### Calculate scheduled headway

In [40]:
# Scheduled headway: difference in `scheduled_arrival_sec` between consecutive
# rows of the same group, in current row order; the first row of each group
# gets NaN.
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [41]:
rt_stop_times5.loc[rt_stop_times5.actual_headway.isna()].head(10).drop(columns=drop_for_preview)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
0,4201,2,56100.0,56133,105-13172,Metro Local Line,0.0,Westbound,14.59,33.0,,
9,4198,3,56160.0,56211,105-13172,Metro Local Line,0.0,Westbound,14.59,51.0,,
19,10256,4,56280.0,56262,105-13172,Metro Local Line,0.0,Westbound,14.59,-18.0,,
29,10249,5,56340.0,56337,105-13172,Metro Local Line,0.0,Westbound,14.59,-3.0,,
39,36572,6,56460.0,56478,105-13172,Metro Local Line,0.0,Westbound,14.59,18.0,,
49,10243,7,56580.0,56589,105-13172,Metro Local Line,0.0,Westbound,14.59,9.0,,
59,10244,8,56700.0,56695,105-13172,Metro Local Line,0.0,Westbound,14.59,-5.0,,
69,10251,9,56820.0,56743,105-13172,Metro Local Line,0.0,Westbound,14.59,-77.0,,
79,10247,10,56940.0,56832,105-13172,Metro Local Line,0.0,Westbound,14.59,-108.0,,
89,10250,11,57060.0,56955,105-13172,Metro Local Line,0.0,Westbound,14.59,-105.0,,


In [42]:
1200/60

20.0

In [43]:
# rt_stop_times5.head(200).drop(columns=drop_for_preview)

### Delete rows that are `NaN`?
* I am not sure whether `NaN`s affect the calculation of the mean scheduled headway and similar statistics.
* These `NaN`s occur because the first row of each `operator-route-stop_id-stop_sequence` combination has no previous arrival to compare against.

In [44]:
rt_stop_times6 = rt_stop_times5.loc[~rt_stop_times5.actual_headway.isna()]

In [45]:
# Drop rows without a scheduled headway. The mask is built from the frame
# being filtered so the boolean index lines up row for row (a mask taken from
# `rt_stop_times5` only works through index alignment).
rt_stop_times6 = rt_stop_times6.loc[rt_stop_times6.schd_headway.notna()].reset_index(
    drop=True
)

In [46]:
len(rt_stop_times6),len(rt_stop_times5)

(647207, 668646)

### Find the mean scheduled headway for the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [47]:
# Mean scheduled headway (seconds) for each `groupby_cols` combination.
agg1 = (
    rt_stop_times6.groupby(groupby_cols)["schd_headway"]
    .mean()
    .rename("avg_schd_headway_sec")
    .reset_index()
)

In [48]:
agg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   feed_key                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   route_id                   19260 non-null  object 
 4   direction_id               19260 non-null  float64
 5   route_primary_direction    19260 non-null  object 
 6   stop_sequence              19260 non-null  int64  
 7   stop_id                    19260 non-null  object 
 8   avg_schd_headway_sec       19260 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.3+ MB


#### Merge

In [49]:
m1 = pd.merge(
    rt_stop_times6,
    agg1,
    on=groupby_cols,
)

In [50]:
len(rt_stop_times6) - len(m1)

0

In [51]:
m1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 647207 entries, 0 to 647206
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    647207 non-null  object 
 1   stop_id                    647207 non-null  object 
 2   stop_sequence              647207 non-null  int64  
 3   scheduled_arrival_sec      647207 non-null  float64
 4   schedule_gtfs_dataset_key  647207 non-null  object 
 5   trip_instance_key          647207 non-null  object 
 6   rt_arrival_sec             647207 non-null  int64  
 7   route_id                   647207 non-null  object 
 8   shape_array_key            647207 non-null  object 
 9   feed_key                   647207 non-null  object 
 10  route_long_name            647207 non-null  object 
 11  direction_id               647207 non-null  float64
 12  route_primary_direction    647207 non-null  object 
 13  frequency_in_minutes       64

### Find standard deviation: how far the actual headway is from the mean scheduled headway for the same grain above.


In [52]:
# Standard deviation, per group, of how far each actual headway is from the
# group's mean scheduled headway. The deviation column is computed once and
# the built-in grouped `std()` is used instead of a Python-level `apply` over
# every group. Groups with a single row yield NaN.
# NOTE(review): `avg_schd_headway_sec` is constant within a group, so this
# equals the standard deviation of `actual_headway`. If the deviation from
# each trip's own scheduled headway was intended, subtract `schd_headway`
# instead -- confirm.
std_dev = (
    m1.assign(headway_deviation=m1.actual_headway - m1.avg_schd_headway_sec)
    .groupby(groupby_cols)["headway_deviation"]
    .std()
    .reset_index(name="std_dev_headway")
)

#### Why are so many missing? Case 1
When computing a standard deviation per group (e.g. with groupby and transform), a group that has only one element gets NaN (Not a Number), because the standard deviation requires at least two data points.

In [53]:
std_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19260 entries, 0 to 19259
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  19260 non-null  object 
 1   feed_key                   19260 non-null  object 
 2   shape_array_key            19260 non-null  object 
 3   route_id                   19260 non-null  object 
 4   direction_id               19260 non-null  float64
 5   route_primary_direction    19260 non-null  object 
 6   stop_sequence              19260 non-null  int64  
 7   stop_id                    19260 non-null  object 
 8   std_dev_headway            17781 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.3+ MB


In [54]:
std_dev.loc[std_dev.std_dev_headway.isna()].sample(1)

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,shape_array_key,route_id,direction_id,route_primary_direction,stop_sequence,stop_id,std_dev_headway
1358,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,13d1f6a199bb8055089a2f96cea449f0,108-13172,1.0,Westbound,52,14644,


In [55]:
# Keep only groups whose headway standard deviation could be computed.
std_dev2 = std_dev.dropna(subset=["std_dev_headway"]).reset_index(drop=True)

In [56]:
std_dev2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17781 entries, 0 to 17780
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  17781 non-null  object 
 1   feed_key                   17781 non-null  object 
 2   shape_array_key            17781 non-null  object 
 3   route_id                   17781 non-null  object 
 4   direction_id               17781 non-null  float64
 5   route_primary_direction    17781 non-null  object 
 6   stop_sequence              17781 non-null  int64  
 7   stop_id                    17781 non-null  object 
 8   std_dev_headway            17781 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.2+ MB


#### Investigate missing rows
* This one seems to have some very unrealistic timestamps: the difference between the scheduled and actual arrival times is more than an hour.

In [57]:
m1.loc[(m1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (m1.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741") & 
                  (m1.route_id== "217-13172") &
                  (m1.direction_id == 1) &
                   (m1.stop_id == "15434")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
161685,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0,1800.0


In [58]:
rt_stop_times2.loc[(rt_stop_times2.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times2.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741") & 
                  (rt_stop_times2.route_id== "217-13172") &
                  (rt_stop_times2.direction_id == 1) &
                   (rt_stop_times2.stop_id == "15434")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
413908,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
419236,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
425286,10217003302432-DEC23,15434,3,88500.0,0666caf3ec1ecc96b74f4477ee4bc939,6699f5297ef2d670988b29937f33b56e,2031,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
479447,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92


In [59]:
rt_stop_times3.loc[(rt_stop_times3.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times3.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741") & 
                  (rt_stop_times3.route_id== "217-13172") &
                  (rt_stop_times3.direction_id == 1) &
                   (rt_stop_times3.stop_id == "15434")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
413484,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
417843,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92
462159,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92


In [60]:
rt_stop_times4.loc[(rt_stop_times4.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times4.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741") & 
                  (rt_stop_times4.route_id== "217-13172") &
                  (rt_stop_times4.direction_id == 1) &
                   (rt_stop_times4.stop_id == "15434")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
168196,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,30.0
168197,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0
168198,10217003302354-DEC23,15434,3,86220.0,0666caf3ec1ecc96b74f4477ee4bc939,2f10227a381957bbf2b4f388e7f2a3e9,303,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,-85917.0


In [61]:
rt_stop_times5.loc[(rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times5.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741") & 
                  (rt_stop_times5.route_id== "217-13172") &
                  (rt_stop_times5.direction_id == 1) &
                   (rt_stop_times5.stop_id == "15434")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
168062,10217003302253-DEC23,15434,3,82560.0,0666caf3ec1ecc96b74f4477ee4bc939,27d29b3a92104fdcb72b4095ef46fed6,82590,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,30.0,,
168063,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0


In [62]:
rt_stop_times6.loc[(rt_stop_times6.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times6.shape_array_key == "0688a14c97a2ebfe90f5674c1262d741") & 
                  (rt_stop_times6.route_id== "217-13172") &
                  (rt_stop_times6.direction_id == 1) &
                   (rt_stop_times6.stop_id == "15434")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
161685,10217003302323-DEC23,15434,3,84360.0,0666caf3ec1ecc96b74f4477ee4bc939,bc7a2481002d5e5b5938a991db4e69d1,84408,217-13172,0688a14c97a2ebfe90f5674c1262d741,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,13.92,48.0,1818.0,1800.0


#### Missing rows Case 2
* There are only 2 rows. At least 3 rows are needed to calculate the standard deviation, since the first row of a combination has no previous arrival to compare against, which leaves a single headway.

In [63]:
m1.loc[(m1.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (m1.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3") & 
                  (m1.route_id== "234-13172") &
                  (m1.direction_id == 1) &
                   (m1.stop_id == "15383")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec
179405,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0,1841.0,1800.0,1800.0


In [64]:
rt_stop_times2.loc[(rt_stop_times2.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times2.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3") & 
                  (rt_stop_times2.route_id== "234-13172") &
                  (rt_stop_times2.direction_id == 1) &
                   (rt_stop_times2.stop_id == "15383")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
256950,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42
482097,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42


In [65]:
rt_stop_times3.loc[(rt_stop_times3.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times3.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3") & 
                  (rt_stop_times3.route_id== "234-13172") &
                  (rt_stop_times3.direction_id == 1) &
                   (rt_stop_times3.stop_id == "15383")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
256533,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42
462564,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42


In [66]:
rt_stop_times4.loc[(rt_stop_times4.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times4.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3") & 
                  (rt_stop_times4.route_id== "234-13172") &
                  (rt_stop_times4.direction_id == 1) &
                   (rt_stop_times4.stop_id == "15383")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
186848,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,-3.0
186849,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0


In [67]:
rt_stop_times5.loc[(rt_stop_times5.schedule_gtfs_dataset_key == "0666caf3ec1ecc96b74f4477ee4bc939") &
                  (rt_stop_times5.shape_array_key == "6f33c9cd019664d5085f94294aeacfd3") & 
                  (rt_stop_times5.route_id== "234-13172") &
                  (rt_stop_times5.direction_id == 1) &
                   (rt_stop_times5.stop_id == "15383")]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
186708,10234000780403-DEC23,15383,58,17580.0,0666caf3ec1ecc96b74f4477ee4bc939,bd2c7c473164147ae73920b7ea99c3b6,17577,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,-3.0,,
186709,10234000780433-DEC23,15383,58,19380.0,0666caf3ec1ecc96b74f4477ee4bc939,3741cc1a8fd3d2ea5ddc59ba1766c0f5,19418,234-13172,6f33c9cd019664d5085f94294aeacfd3,608992664173210532aa3e6cc573be2f,Metro Local Line,1.0,Southbound,14.42,38.0,1841.0,1800.0


#### Merge

In [68]:
# Attach each group's headway standard deviation to the stop-time rows.
# `pd.merge` defaults to an inner join, so rows whose group was removed from
# `std_dev2` (NaN standard deviation) are dropped here.
m2 = pd.merge(
    m1,
    std_dev2,
    on=groupby_cols,
)

### Bunching coefficient is for the entire grain, rather than each row

In [69]:
m2["bunching_coefficient"] = m2.std_dev_headway / m2.avg_schd_headway_sec

In [70]:
m2.bunching_coefficient.describe()

count   645728.00
mean         0.72
std          0.43
min          0.00
25%          0.47
50%          0.63
75%          0.79
max          5.28
Name: bunching_coefficient, dtype: float64

In [71]:
m2["avg_schd_headway_min"] = (
    m2.avg_schd_headway_sec / 60
)

In [72]:
m2["actual_headway_min"] = m2.actual_headway / 60

In [90]:
m2["sched_headway_min"] = m2.schd_headway / 60

In [73]:
# One row per `groupby_cols` combination.
# NOTE: `drop_duplicates` keeps the first row of each group, so the trip-level
# columns that remain (e.g. `trip_id`, `delay`, `schd_headway`) describe only
# that first trip, not the group as a whole.
bunching_by_stops = m2.drop_duplicates(subset=groupby_cols).reset_index(drop=True).drop(columns = ["actual_headway"])

In [74]:
len(m2) - len(bunching_by_stops)

627947

In [75]:
len(bunching_by_stops)

17781

In [76]:
bunching_by_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17781 entries, 0 to 17780
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    17781 non-null  object 
 1   stop_id                    17781 non-null  object 
 2   stop_sequence              17781 non-null  int64  
 3   scheduled_arrival_sec      17781 non-null  float64
 4   schedule_gtfs_dataset_key  17781 non-null  object 
 5   trip_instance_key          17781 non-null  object 
 6   rt_arrival_sec             17781 non-null  int64  
 7   route_id                   17781 non-null  object 
 8   shape_array_key            17781 non-null  object 
 9   feed_key                   17781 non-null  object 
 10  route_long_name            17781 non-null  object 
 11  direction_id               17781 non-null  float64
 12  route_primary_direction    17781 non-null  object 
 13  frequency_in_minutes       17781 non-null  flo

### Bunching Coefficient

In [77]:
def coefficient_frequency(row):
    """
    Translate a row's bunching coefficient into a plain-language label
    describing how regular the headways are.

    Parameters
    ----------
    row : object exposing a ``bunching_coefficient`` attribute
        In this notebook it is a DataFrame row passed by ``apply(axis=1)``.

    Returns
    -------
    str or None
        One of the six headway-regularity labels, or ``None`` when the
        coefficient is missing (NaN / None / pd.NA).

    Notes
    -----
    NOTE(review): the cut-offs (0.21, 0.30, 0.39, 0.52, 0.74) look like the
    TCQSM headway-adherence level-of-service bands — confirm the source.
    """
    coefficient = row.bunching_coefficient

    # NaN compares False against every threshold, so without this guard a
    # missing coefficient would fall through to "Most vehicles bunched".
    if pd.isna(coefficient):
        return None

    # Each branch is only reached when the previous upper bound was exceeded,
    # so explicit lower bounds are unnecessary.
    if coefficient <= 0.21:
        return "Service provided like clockwork"
    if coefficient <= 0.3:
        return "Vehicles slightly off headway"
    if coefficient <= 0.39:
        return "Vehicles often off headway"
    if coefficient <= 0.52:
        return "Irregular headways, with some bunching"
    if coefficient <= 0.74:
        return "Frequent bunching"
    return "Most vehicles bunched"


# Label every row with its headway-regularity description.
# `coefficient_frequency` already takes the row, so it is passed directly.
bunching_by_stops["passenger_op_perspective"] = bunching_by_stops.apply(
    coefficient_frequency, axis=1
)

In [78]:
bunching_by_stops.passenger_op_perspective.value_counts()

Most vehicles bunched                     6199
Frequent bunching                         4494
Irregular headways, with some bunching    3251
Vehicles often off headway                1545
Service provided like clockwork           1535
Vehicles slightly off headway              757
Name: passenger_op_perspective, dtype: int64

#### Why are some routes missing?

In [79]:
bunching_by_stops.route_id.nunique()

85

In [80]:
high_frequency_routes2.route_id.nunique()

93

In [81]:
# Additional columns to hide when previewing results.
more_values = [
    "scheduled_arrival_sec",
    "std_dev_headway",
    "avg_schd_headway_sec",
    "schd_headway",
    "rt_arrival_sec",
]

In [82]:
# Extend the preview drop-list with `more_values`, skipping names already
# present so that re-running this cell does not keep appending duplicates.
drop_for_preview = drop_for_preview + [
    column for column in more_values if column not in drop_for_preview
]

In [83]:
drop_for_preview

['schedule_gtfs_dataset_key',
 'trip_instance_key',
 'shape_array_key',
 'feed_key',
 'trip_id',
 'scheduled_arrival_sec',
 'std_dev_headway',
 'avg_schd_headway_sec',
 'schd_headway',
 'rt_arrival_sec']

### Fillmore Test

In [84]:
# Subset to the rows whose long route name is exactly "FILLMORE".
# NOTE(review): the filter is on the name only, not on the operator key —
# confirm no other feed uses the same route_long_name.
fillmore = bunching_by_stops[bunching_by_stops["route_long_name"].eq("FILLMORE")]

In [85]:
fillmore.passenger_op_perspective.value_counts()

Most vehicles bunched                     120
Frequent bunching                          24
Irregular headways, with some bunching      1
Vehicles slightly off headway               1
Service provided like clockwork             1
Name: passenger_op_perspective, dtype: int64

In [86]:
fillmore.drop(columns = drop_for_preview)

Unnamed: 0,stop_id,stop_sequence,route_id,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,bunching_coefficient,avg_schd_headway_min,actual_headway_min,passenger_op_perspective
13438,14630,2,22,FILLMORE,0.0,Southbound,7.61,161.0,0.49,29.18,39.2,"Irregular headways, with some bunching"
13439,14609,7,22,FILLMORE,0.0,Southbound,7.61,-53.0,0.73,22.6,6.4,Frequent bunching
13440,14627,8,22,FILLMORE,0.0,Southbound,7.61,-98.0,0.99,22.6,5.63,Most vehicles bunched
13441,16754,9,22,FILLMORE,0.0,Southbound,7.61,115.0,0.82,24.17,36.67,Most vehicles bunched
13442,16491,10,22,FILLMORE,0.0,Southbound,7.61,43.0,0.73,24.17,35.57,Frequent bunching
13443,16488,11,22,FILLMORE,0.0,Southbound,7.61,21.0,0.68,24.15,35.07,Frequent bunching
13444,16493,12,22,FILLMORE,0.0,Southbound,7.61,-20.0,0.67,24.12,34.13,Frequent bunching
13445,13086,13,22,FILLMORE,0.0,Southbound,7.61,-58.0,0.66,24.09,33.28,Frequent bunching
13446,14605,14,22,FILLMORE,0.0,Southbound,7.61,-113.0,0.74,24.05,32.07,Frequent bunching
13447,14624,15,22,FILLMORE,0.0,Southbound,7.61,-84.0,0.67,24.02,32.27,Frequent bunching


In [87]:
fillmore.tail(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,actual_headway_min,passenger_op_perspective
13584,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,25957.0,10023.5,11333.78,1.13,167.06,436.3,Most vehicles bunched


#### How should we handle scheduled headways that are extreme outliers compared to the other rows?

In [92]:
# Scratch check: the 25957-second `schd_headway` shown in the row above,
# expressed in minutes.
25957.00/60

432.6166666666667

In [94]:
# NOTE(review): 436 looks like `actual_headway_min` (436.3), which is already
# in minutes; dividing by 60 therefore gives hours, not minutes — confirm intent.
436/60

7.266666666666667

In [91]:
# All `m2` rows for a single operator / shape / route 22 / direction 1 /
# stop 17769, to compare the row-level headways within one group.
m2[
    m2["schedule_gtfs_dataset_key"].eq("7cc0cb1871dfd558f11a2885c145d144")
    & m2["shape_array_key"].eq("fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & m2["route_id"].eq("22")
    & m2["direction_id"].eq(1)
    & m2["stop_id"].eq("17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient,avg_schd_headway_min,actual_headway_min,sched_headway_min
443917,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,26178.0,25957.0,10023.5,11333.78,1.13,167.06,436.3,432.62
443918,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0,3602.0,4320.0,10023.5,11333.78,1.13,167.06,60.03,72.0
443919,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0,1001.0,1800.0,10023.5,11333.78,1.13,167.06,16.68,30.0
443920,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0,8659.0,8017.0,10023.5,11333.78,1.13,167.06,144.32,133.62


In [93]:
# Same operator / shape / route / direction / stop filter, applied to
# `rt_stop_times2` to see which trips exist at this earlier stage.
rt_stop_times2[
    rt_stop_times2["schedule_gtfs_dataset_key"].eq("7cc0cb1871dfd558f11a2885c145d144")
    & rt_stop_times2["shape_array_key"].eq("fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & rt_stop_times2["route_id"].eq("22")
    & rt_stop_times2["direction_id"].eq(1)
    & rt_stop_times2["stop_id"].eq("17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
528875,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
529753,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
556189,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
688955,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
689073,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6


In [95]:
# Same operator / shape / route / direction / stop filter, applied to
# `rt_stop_times3`.
rt_stop_times3[
    rt_stop_times3["schedule_gtfs_dataset_key"].eq("7cc0cb1871dfd558f11a2885c145d144")
    & rt_stop_times3["shape_array_key"].eq("fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & rt_stop_times3["route_id"].eq("22")
    & rt_stop_times3["direction_id"].eq(1)
    & rt_stop_times3["stop_id"].eq("17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes
508536,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
509366,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
534538,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
661545,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6
661663,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6


In [96]:
# Same operator / shape / route / direction / stop filter, applied to
# `rt_stop_times4` (the frame that also carries the `delay` column).
rt_stop_times4[
    rt_stop_times4["schedule_gtfs_dataset_key"].eq("7cc0cb1871dfd558f11a2885c145d144")
    & rt_stop_times4["shape_array_key"].eq("fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & rt_stop_times4["route_id"].eq("22")
    & rt_stop_times4["direction_id"].eq(1)
    & rt_stop_times4["stop_id"].eq("17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay
462155,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,94.0
462156,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0
462157,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0
462158,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0
462159,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0


In [97]:
# Same operator / shape / route / direction / stop filter, applied to
# `rt_stop_times5` (the frame that also carries `actual_headway` and
# `schd_headway`).
rt_stop_times5[
    rt_stop_times5["schedule_gtfs_dataset_key"].eq("7cc0cb1871dfd558f11a2885c145d144")
    & rt_stop_times5["shape_array_key"].eq("fefbc78a6cf676d7fbd1d25b61ef7bfb")
    & rt_stop_times5["route_id"].eq("22")
    & rt_stop_times5["direction_id"].eq(1)
    & rt_stop_times5["stop_id"].eq("17769")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_primary_direction,frequency_in_minutes,delay,actual_headway,schd_headway
461677,11490074_M31,17769,7,41815.0,7cc0cb1871dfd558f11a2885c145d144,59dbf32fbb998fc9837ad6fe10146b64,41909,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,94.0,,
461678,11490047_M31,17769,7,67772.0,7cc0cb1871dfd558f11a2885c145d144,51672d1ed74ce3e66e4bfc273e6e9931,68087,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,315.0,26178.0,25957.0
461679,11490090_M31,17769,7,72092.0,7cc0cb1871dfd558f11a2885c145d144,6aaf49dfccb82109a1f77471fee6e43a,71689,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-403.0,3602.0,4320.0
461680,11490091_M31,17769,7,73892.0,7cc0cb1871dfd558f11a2885c145d144,e8ff0c3ae23fd4ba4ab3d251d0c11069,72690,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-1202.0,1001.0,1800.0
461681,11490099_M31,17769,7,81909.0,7cc0cb1871dfd558f11a2885c145d144,2c824de8cf7a1b5338f76d2501e6ebd5,81349,22,fefbc78a6cf676d7fbd1d25b61ef7bfb,7f69c2fdaa134642f14064a0b64d1495,FILLMORE,1.0,Northbound,7.6,-560.0,8659.0,8017.0


In [None]:
# For each stop_sequence / route / operator combination, count how many
# distinct primary directions appear in `bunching_by_stops`.
check1 = (
    bunching_by_stops.groupby(
        ["stop_sequence", "route_id", "schedule_gtfs_dataset_key"]
    )["route_primary_direction"]
    .nunique()
    .reset_index()
)

In [None]:
check1.route_primary_direction.describe()