## Check out Results

In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)
from shared_utils import geography_utils, portfolio_utils, rt_dates, schedule_rt_utils

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, format floats
# with two decimals, and never truncate rows or long cell values.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
GCS_PATH = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/"

In [5]:
# Month abbreviations, most recent first; each one is combined with "2023" to
# form the key looked up in rt_dates.DATES.
months = ["dec", "nov", "oct", "sep", "aug", "jul", "jun", "may", "apr", "mar"]

analysis_date_list = list(map(lambda month: rt_dates.DATES[f"{month}2023"], months))

In [6]:
analysis_date_list

['2023-12-13',
 '2023-11-15',
 '2023-10-11',
 '2023-09-13',
 '2023-08-15',
 '2023-07-12',
 '2023-06-14',
 '2023-05-17',
 '2023-04-12',
 '2023-03-15']

In [7]:
def check_out(df: pd.DataFrame):
    """
    Display `.describe()` summary statistics for the four trip-level
    metric columns of `df`, one after another.

    `df` must have the columns spatial_accuracy_pct, pings_per_min,
    rt_triptime_w_gtfs_pct and rt_v_scheduled_trip_time_pct.
    Nothing is returned; output goes through the notebook's `display`.
    """
    metric_cols = (
        "spatial_accuracy_pct",
        "pings_per_min",
        "rt_triptime_w_gtfs_pct",
        "rt_v_scheduled_trip_time_pct",
    )
    for metric in metric_cols:
        # Attribute-style column access, summarized and shown in order.
        display(getattr(df, metric).describe())

In [8]:
mar_df = pd.read_parquet(f"{GCS_PATH}2023-03-15_metrics.parquet")

In [9]:
check_out(mar_df)

count   69494.00
mean       94.10
std        12.34
min         0.00
25%        95.24
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83620.00
mean        2.46
std         0.68
min         0.00
25%         1.91
50%         2.86
75%         2.96
max         4.75
Name: pings_per_min, dtype: float64

count   83620.00
mean       95.20
std        14.55
min         0.28
25%        97.99
50%        99.58
75%       100.25
max       108.43
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   71797.00
mean       60.68
std       333.76
min       -87.37
25%        10.76
50%        25.19
75%        44.44
max     17909.79
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [10]:
apr_df = pd.read_parquet(f"{GCS_PATH}2023-04-12_metrics.parquet")

In [11]:
check_out(apr_df)

count   71094.00
mean       94.06
std        12.64
min         0.00
25%        95.45
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   84516.00
mean        2.45
std         0.67
min         0.01
25%         1.91
50%         2.83
75%         2.95
max         5.18
Name: pings_per_min, dtype: float64

count   84516.00
mean       95.23
std        14.48
min         0.56
25%        97.88
50%        99.53
75%       100.20
max       108.11
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   73471.00
mean       61.42
std       349.18
min       -86.02
25%        10.46
50%        25.10
75%        44.38
max     15903.70
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [12]:
may_df = pd.read_parquet(f"{GCS_PATH}2023-05-17_metrics.parquet")

In [13]:
check_out(may_df)

count   65385.00
mean       94.07
std        12.45
min         0.00
25%        95.00
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83606.00
mean        2.47
std         0.68
min         0.00
25%         1.95
50%         2.86
75%         2.95
max         5.14
Name: pings_per_min, dtype: float64

count   83606.00
mean       95.47
std        13.78
min         0.27
25%        97.76
50%        99.51
75%       100.21
max       108.43
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   67864.00
mean       60.61
std       329.38
min       -90.09
25%        10.96
50%        25.64
75%        44.61
max     17907.92
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [14]:
jun_df = pd.read_parquet(f"{GCS_PATH}2023-06-14_metrics.parquet")

In [15]:
check_out(jun_df)

count   64270.00
mean       93.92
std        12.67
min         0.00
25%        94.77
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   80331.00
mean        2.51
std         0.67
min         0.01
25%         1.98
50%         2.90
75%         2.96
max         4.93
Name: pings_per_min, dtype: float64

count   80331.00
mean       95.92
std        13.48
min         0.28
25%        98.43
50%        99.67
75%       100.28
max       107.78
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   66748.00
mean       63.20
std       345.10
min       -90.25
25%        11.69
50%        25.91
75%        45.15
max     12997.42
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [16]:
jul_df = pd.read_parquet(f"{GCS_PATH}2023-07-12_metrics.parquet")

In [17]:
check_out(jul_df)

count   67091.00
mean       93.83
std        13.15
min         0.00
25%        94.96
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   82044.00
mean        2.49
std         0.67
min         0.00
25%         1.95
50%         2.90
75%         2.97
max         5.76
Name: pings_per_min, dtype: float64

count   82044.00
mean       96.42
std        11.07
min         0.43
25%        98.28
50%        99.65
75%       100.25
max       108.60
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   69488.00
mean       56.85
std       291.52
min       -91.05
25%        11.82
50%        26.10
75%        45.69
max     17880.62
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [18]:
aug_df = pd.read_parquet(f"{GCS_PATH}2023-08-15_metrics.parquet")

In [19]:
sept_df = pd.read_parquet(f"{GCS_PATH}2023-09-13_metrics.parquet")

In [20]:
oct_df = pd.read_parquet(f"{GCS_PATH}2023-10-11_metrics.parquet")

In [21]:
nov_df = pd.read_parquet(f"{GCS_PATH}2023-11-15_metrics.parquet")

In [22]:
check_out(nov_df)

count   74891.00
mean       93.53
std        13.12
min         0.00
25%        93.94
50%        99.68
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86832.00
mean        2.51
std         0.63
min         0.00
25%         2.09
50%         2.86
75%         2.96
max         5.29
Name: pings_per_min, dtype: float64

count   86832.00
mean       95.93
std        12.15
min         0.45
25%        98.40
50%        99.67
75%       100.26
max       108.93
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   77194.00
mean       42.53
std       196.80
min       -88.16
25%        11.44
50%        26.10
75%        46.17
max     11797.08
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [23]:
len(nov_df[nov_df.rt_triptime_w_gtfs_pct > 100])

30287

In [24]:
len(nov_df)

86832

In [25]:
nov_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86832 entries, 0 to 86831
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   trip_instance_key             86832 non-null  object 
 1   rt_service_min                86832 non-null  float64
 2   min_w_atleast2_trip_updates   86832 non-null  int64  
 3   total_pings_for_trip          86832 non-null  int64  
 4   total_min_w_gtfs              86832 non-null  int64  
 5   total_vp                      74891 non-null  float64
 6   vp_in_shape                   74891 non-null  float64
 7   speed_mph                     77194 non-null  float64
 8   route_id                      75619 non-null  object 
 9   time_of_day                   77194 non-null  object 
 10  service_minutes               77194 non-null  float64
 11  pings_per_min                 86832 non-null  float64
 12  spatial_accuracy_pct          74891 non-null  float64
 13  r

In [26]:
analysis_date

'2023-12-13'

In [27]:
dec_df = pd.read_parquet(f"{GCS_PATH}2023-12-13_metrics.parquet")

In [28]:
check_out(dec_df)

count   74609.00
mean       93.55
std        13.20
min         0.00
25%        94.23
50%        99.64
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86128.00
mean        2.44
std         0.60
min         0.00
25%         2.05
50%         2.67
75%         2.94
max         5.15
Name: pings_per_min, dtype: float64

count   86128.00
mean       95.94
std        11.70
min         0.10
25%        98.36
50%        99.65
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   76878.00
mean       44.40
std       272.99
min       -86.89
25%        11.36
50%        25.56
75%        45.00
max     18873.69
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [29]:
dec_df.columns

Index(['trip_instance_key', 'rt_service_min', 'min_w_atleast2_trip_updates',
       'total_pings_for_trip', 'total_min_w_gtfs', 'total_vp', 'vp_in_shape',
       'speed_mph', 'service_minutes', 'pings_per_min', 'spatial_accuracy_pct',
       'rt_triptime_w_gtfs_pct', 'rt_v_scheduled_trip_time_pct'],
      dtype='object')

In [30]:
dec_df.rt_v_scheduled_trip_time_pct.describe()

count   76878.00
mean       44.40
std       272.99
min       -86.89
25%        11.36
50%        25.56
75%        45.00
max     18873.69
Name: rt_v_scheduled_trip_time_pct, dtype: float64

### Investigate why some trips have extreme `rt_v_scheduled_trip_time_pct` values

In [31]:
preview_cols = ["rt_service_min", "service_minutes", "rt_v_scheduled_trip_time_pct"]

In [32]:
dec_df.loc[dec_df.rt_v_scheduled_trip_time_pct > 10000][preview_cols].sample()

Unnamed: 0,rt_service_min,service_minutes,rt_v_scheduled_trip_time_pct
75715,7020.27,37.0,18873.69


In [33]:
dec_df.loc[dec_df.rt_v_scheduled_trip_time_pct > 10000][preview_cols]

Unnamed: 0,rt_service_min,service_minutes,rt_v_scheduled_trip_time_pct
39307,1297.03,12.0,10708.61
39308,1309.6,12.0,10813.33
39309,1300.33,12.0,10736.11
39359,1390.38,12.0,11486.53
39360,1408.93,12.0,11641.11
39361,1402.7,12.0,11589.17
39462,1341.02,12.0,11075.14
39463,1363.98,12.0,11266.53
39464,1366.92,12.0,11290.97
39602,1364.3,12.0,11269.17


### Test aggregating with Dec

#### Step 1: add missing cols
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py#L135

In [34]:
def add_scheduled_trip_columns(
    df: pd.DataFrame, analysis_date: str, group_cols: list = ["trip_instance_key"]
) -> pd.DataFrame:
    """
    Attach scheduled-trip attributes to trip-level RT metrics so that
    route-direction-time_of_day averages can be taken afterwards.

    Args:
        df: trip-level frame; must contain `trip_instance_key`.
        analysis_date: service date passed through to the schedule helpers.
        group_cols: extra columns requested from the scheduled trips table
            (in addition to the route / direction columns).

    Returns:
        `df` left-joined (on `trip_instance_key`) with the route, direction,
        most-common shape and route-name columns, and then with
        `service_hours`, `trip_first_departure_datetime_pacific` and
        `time_of_day`. Every row of `df` is kept.
    """
    route_name_parts = ["route_short_name", "route_long_name", "route_desc"]
    trip_cols = [
        "gtfs_dataset_key",
        "direction_id",
        "route_id",
        *route_name_parts,
        *group_cols,
    ]

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date, columns=trip_cols, get_pandas=True
    )

    # NOTE(review): the merge below keys on `schedule_gtfs_dataset_key` although
    # `gtfs_dataset_key` was requested -- presumably the helper renames it when
    # get_pandas=True; confirm against helpers.import_scheduled_trips.
    route_dir_keys = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
    most_common_shapes = sched_rt_utils.most_common_shape_by_route_direction(
        analysis_date
    )
    most_common_shapes = most_common_shapes[route_dir_keys + ["shape_array_key"]]

    # Inner join: scheduled trips whose route-direction has no entry in the
    # most-common-shape table are dropped from the crosswalk here.
    trips_with_shape = scheduled_trips.merge(
        most_common_shapes,
        on=route_dir_keys,
        how="inner",
    ).astype({"direction_id": "Int64"})

    # add_route_name consumes the three raw name columns; they are dropped after.
    trips_named = portfolio_utils.add_route_name(trips_with_shape).drop(
        columns=route_name_parts
    )

    trip_time_buckets = sched_rt_utils.get_trip_time_buckets(analysis_date)[
        [
            "trip_instance_key",
            "service_hours",
            "trip_first_departure_datetime_pacific",
            "time_of_day",
        ]
    ]

    with_route_info = pd.merge(df, trips_named, on="trip_instance_key", how="left")
    with_time_of_day = pd.merge(
        with_route_info, trip_time_buckets, on="trip_instance_key", how="left"
    )

    return with_time_of_day

In [35]:
dec_df2 = add_scheduled_trip_columns(dec_df, analysis_date, ["trip_instance_key"])

In [36]:
type(dec_df2)

pandas.core.frame.DataFrame

#### Aggregate avg speed by route
* Do I need to use the other functions in the script?
* Do we still drop rows that are above 70 mph?
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/export.py#L150


In [37]:
def average_route_speeds_for_export(
    df: pd.DataFrame,
    analysis_date: str,
    max_speed: int,
) -> "tuple[pd.DataFrame, gpd.GeoDataFrame]":
    """
    Aggregate trip-level metrics to route-direction-time_of_day and attach
    organization info plus the geometry of the most common shape.

    Args:
        df: trip-level frame that already carries the scheduled-trip columns
            (schedule_gtfs_dataset_key, time_of_day, route_id, direction_id,
            route_name_used, shape_array_key) and the metric columns.
        analysis_date: service date passed to the crosswalk / shapes helpers.
        max_speed: trips with `speed_mph` above this value are excluded
            before averaging.

    Returns:
        A 2-tuple `(filtered_trips, route_averages)`:
        * filtered_trips: the rows of `df` with `speed_mph <= max_speed`,
          index reset.
        * route_averages: one row per route-direction-time_of_day with
          averaged metrics, trip counts, organization columns and geometry.
    """
    # Use the caller-supplied cap (this was previously hard-coded to 70 and
    # `max_speed` was ignored). Rows with a missing speed_mph fail the
    # comparison and are therefore excluded as well.
    df2 = df.loc[df.speed_mph <= max_speed].reset_index(drop=True)

    group_cols = [
        "schedule_gtfs_dataset_key",
        "time_of_day",
        "route_id",
        "direction_id",
        "route_name_used",
        "shape_array_key",
    ]

    mean_cols = [
        "service_minutes",
        "rt_service_min",
        "speed_mph",
        "pings_per_min",
        "total_vp",
        "vp_in_shape",
    ]
    count_cols = ["trip_instance_key"]

    df3 = (
        df2.groupby(group_cols)
        .agg({**{e: "mean" for e in mean_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
    )

    df4 = df3.assign(
        rt_service_min=df3.rt_service_min.round(1),
        service_minutes=df3.service_minutes.round(1),
        speed_mph=df3.speed_mph.round(1),
        pings_per_min=df3.pings_per_min.round(1),
    ).rename(
        columns={
            "service_minutes": "avg_sched_trip_min",
            "rt_service_min": "avg_rt_trip_min",
            "trip_instance_key": "n_trips",
            "route_name_used": "route_name",
            "pings_per_min": "avg_pings_per_min",
            "schedule_gtfs_dataset_key": "gtfs_dataset_key",
        }
    )

    org_crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
        df4,
        analysis_date,
        quartet_data="schedule",
        dim_gtfs_dataset_cols=["key", "base64_url"],
        dim_organization_cols=["source_record_id", "name", "caltrans_district"],
    )

    # Inner join: route rows whose feed has no organization match are dropped.
    df_with_org = pd.merge(
        df4,
        org_crosswalk.rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"}),
        on="gtfs_dataset_key",
        how="inner",
    )

    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        get_pandas=True,
        crs=geography_utils.WGS84,
    )

    # `shapes` is the left frame so the merged result keeps its type
    # (presumably a GeoDataFrame -- confirm against import_scheduled_shapes).
    df_with_shape = pd.merge(
        shapes,
        df_with_org,
        on="shape_array_key",  # once merged, can drop shape_array_key
        how="inner",
    )

    # Ratio of the group means (mean vp_in_shape / mean total_vp), not the
    # mean of the per-trip percentages.
    df_with_shape["avg_pct_vp_shape"] = (
        df_with_shape.vp_in_shape / df_with_shape.total_vp * 100
    )

    # Percent difference of average RT vs. average scheduled trip minutes,
    # computed from the already-rounded averages.
    df_with_shape["avg_pct_rt_v_sched"] = (
        df_with_shape.avg_rt_trip_min / df_with_shape.avg_sched_trip_min - 1
    ) * 100

    agency_cols = ["organization_source_record_id", "organization_name"]
    route_id_cols = [
        "route_id",
        "route_name",
        "direction_id",
    ]

    col_order = (
        agency_cols
        + route_id_cols
        + [
            "time_of_day",
            "speed_mph",
            "n_trips",
            "avg_sched_trip_min",
            "avg_rt_trip_min",
            "base64_url",
            "caltrans_district",
            "geometry",
            "avg_pings_per_min",
            "avg_pct_vp_shape",
            "avg_pct_rt_v_sched",
        ]
    )

    # reindex keeps only the export columns, so total_vp, vp_in_shape and
    # shape_array_key fall away here without a separate drop.
    final_df = df_with_shape.reindex(columns=col_order).rename(
        columns={
            "organization_source_record_id": "org_id",
            "organization_name": "agency",
            "caltrans_district": "district_name",
        }
    )

    return df2, final_df

In [38]:
dec_intermediary, dec_final = average_route_speeds_for_export(
    dec_df2, analysis_date, 70
)

In [39]:
dec_final.avg_pings_per_min.describe()

count   11397.00
mean        2.38
std         0.56
min         0.10
25%         1.90
50%         2.50
75%         2.90
max         3.50
Name: avg_pings_per_min, dtype: float64

In [40]:
dec_final.shape

(11397, 16)

In [41]:
dec_final.drop(columns=["geometry", "base64_url"]).sample(3)

Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched
31,rec8zhnCPETu6qEiH,City of Redondo Beach,4819,Redondo Beach Pier / Greenline Station,1,AM Peak,7.2,6,46.5,119.7,07 - Los Angeles,2.4,99.61,157.42
8468,recSiaaMmBXW7fUZS,Stanislaus Regional Transit Authority,29,,1,PM Peak,8.5,5,27.0,29.4,10 - Stockton,3.0,83.92,8.89
2440,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,6,Berkeley - Telegraph - Oakland,1,AM Peak,6.3,15,32.2,46.9,04 - Oakland,2.8,87.65,45.65


In [42]:
dec_df.sample()

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,service_minutes,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct
29113,220b6e20e6957e20bceb14947d71d367,68.02,67,201,68,,,,,2.96,,99.98,


### Check results after aggregating up to route
* Why are the results so far off for route `265-13172`?

In [46]:
def checkout_route(
    og_df: pd.DataFrame,
    route_agg: gpd.GeoDataFrame,
    route_id: str,
    time_of_day: str,
    direction_id: int,
):
    """
    Compare one route-direction-time_of_day row of the aggregated output
    against the trip-level rows it was built from.

    Prints "final" and displays the matching row(s) of `route_agg` (without
    the geometry and base64_url columns), then prints "original", the means
    of pings_per_min, speed_mph, total_vp and vp_in_shape over the matching
    trips in `og_df`, and displays those trips. Returns nothing.
    """

    def _is_selected(frame):
        # Same three-way filter for both the aggregated and trip-level frames.
        return (
            (frame.route_id == route_id)
            & (frame.time_of_day == time_of_day)
            & (frame.direction_id == direction_id)
        )

    print("final")
    agg_rows = route_agg.loc[_is_selected(route_agg)]
    display(agg_rows.drop(columns=["geometry", "base64_url"]))

    print("original")
    trip_rows = og_df.loc[_is_selected(og_df)]

    print(f"pings per min {trip_rows.pings_per_min.mean()}")
    print(f"speed_mph {trip_rows.speed_mph.mean()}")
    print(f"total_vp {trip_rows.total_vp.mean()}")
    print(f"vp_in_shape {trip_rows.vp_in_shape.mean()}")

    trip_preview_cols = [
        "trip_instance_key",
        "time_of_day",
        "speed_mph",
        "rt_service_min",
        "service_minutes",
        "pings_per_min",
    ]
    display(trip_rows[trip_preview_cols])

In [47]:
dec_intermediary.columns

Index(['trip_instance_key', 'rt_service_min', 'min_w_atleast2_trip_updates',
       'total_pings_for_trip', 'total_min_w_gtfs', 'total_vp', 'vp_in_shape',
       'speed_mph', 'service_minutes', 'pings_per_min', 'spatial_accuracy_pct',
       'rt_triptime_w_gtfs_pct', 'rt_v_scheduled_trip_time_pct',
       'schedule_gtfs_dataset_key', 'direction_id', 'route_id',
       'shape_array_key', 'route_name_used', 'service_hours',
       'trip_first_departure_datetime_pacific', 'time_of_day'],
      dtype='object')

#### Scheduled trip minutes (`avg_sched_trip_min`, renamed from `service_minutes`) are much lower than the real-time trip minutes.

In [48]:
# checkout_route only prints/displays and has no return value, so the call is
# not bound to a name (matching the other checkout_route cells).
checkout_route(dec_intermediary, dec_final, "265-13172", "Early AM", 0)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched
6533,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,265-13172,PICO RIVERA - LAKEWOOD CTR MALL VIA PARAMOUNT BL,0,Early AM,6.9,2,59.0,99.6,07 - Los Angeles,2.7,70.09,68.81


original
pings per min 2.6849179704528776
speed_mph 6.887376703252869
total_vp 267.5
vp_in_shape 187.5


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min
24866,48a01217589c2faa46db395d6cf8317d,Early AM,9.4,95.02,58.0,2.65
24867,70674803a1c4416fc49f883bc3b2c18b,Early AM,4.38,104.13,60.0,2.72


In [49]:
checkout_route(dec_intermediary, dec_final, "5671", "Early AM", 1)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched
193,rec3u4aMplqObcoTR,Tahoe Transportation District,5671,Valley Express Daily,1,Early AM,2.2,1,38.0,313.5,03 - Marysville,1.4,22.65,725.0


original
pings per min 1.422723164442554
speed_mph 2.158633017384419
total_vp 446.0
vp_in_shape 101.0


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min
27758,6fadf197f5bb105ed916de0a337386ee,Early AM,2.16,313.48,38.0,1.42


In [50]:
checkout_route(dec_intermediary, dec_final, "38R", "AM Peak", 1)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched
9012,rechaapWbeffO33OX,City and County of San Francisco,38R,Weekdays 5am-10pm Weekends 6am-9pm,1,AM Peak,6.7,30,43.9,58.9,04 - Oakland,3.0,88.33,34.17


original
pings per min 2.961175851535439
speed_mph 6.682525584870673
total_vp 174.56666666666666
vp_in_shape 154.2


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min
61610,76fa3ed3fd8ef28a446eedb4c1e94e6a,AM Peak,9.39,48.82,39.0,2.97
61611,47d8da2afbc3e4c78f9be4c6c53a7776,AM Peak,10.0,49.38,39.0,2.96
61612,40590ba21f73ae1775a4538e34e67cc3,AM Peak,5.34,97.67,41.0,2.98
61613,aa1b90f05357a29abc97fae90cd5bafe,AM Peak,9.73,49.45,41.0,2.95
61614,4d7cb7ddea2191f062c1ea3165df67c8,AM Peak,8.58,57.95,41.0,2.97
61615,d4e16623b6cf749d8e389b905ab44089,AM Peak,4.49,93.43,42.0,2.99
61616,ac9822b2d06b84790c10b7cbbf694410,AM Peak,7.1,67.83,42.0,2.96
61617,092c4f20368b13bc5c92fda4fb16bd93,AM Peak,3.77,54.93,43.0,2.97
61618,88cf8d5d7cd5d0c74a9b78d5e62fd8a3,AM Peak,7.06,52.62,43.0,2.96
61619,2845f0ae70ae06c3618d58fa99a1cdde,AM Peak,8.57,51.58,43.0,2.97


In [51]:
checkout_route(dec_intermediary, dec_final, "Lynx", "AM Peak", 1)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched
5973,recIKnsnTdKQ0vsiv,Western Contra Costa Transit Authority,Lynx,Rodeo/Hercules/San Francisco Transbay Terminal,1,AM Peak,13.4,6,50.0,60.7,04 - Oakland,2.8,,21.4


original
pings per min 2.8253065827801014
speed_mph 13.403813542450534
total_vp nan
vp_in_shape nan


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min
12987,14c9fc6b379e1fe869ba37bfd79a8245,AM Peak,13.31,61.4,50.0,2.72
12990,2ee02fe17c9acad17ccb44eaaea4debe,AM Peak,12.69,63.18,50.0,2.9
12991,78a402dfe7d89e0b919a193ac59c69ae,AM Peak,12.61,63.18,50.0,2.82
12994,2bb4857e894a94d48a79620858c8384e,AM Peak,12.44,64.45,50.0,2.84
12997,b97a1995cd54253c58e82bb7c9ad3414,AM Peak,15.59,52.82,50.0,2.82
13004,baeeed7c3d6ab74ad9ff40f42a2f1da3,AM Peak,13.78,59.13,50.0,2.86


#### Test grouping

In [52]:
# NOTE(review): `stop` is an undefined name, so this cell raises NameError --
# presumably a deliberate barrier so "Run All" halts before the scratch cells
# below; confirm, and consider removing it once those cells are cleaned up.
stop

NameError: name 'stop' is not defined

In [None]:
df2.columns

In [None]:
route_groupby_cols = [
    "schedule_gtfs_dataset_key",
    "time_of_day",
    "route_id",
    "direction_id",
    "route_name_used",
    "shape_array_key",
]

In [None]:
route_groupby_cols

In [None]:
# Re-run the route-level aggregation by hand: mean of each metric column and a
# count of trips per route_groupby_cols group.
# NOTE(review): `df2` is not assigned by any top-level cell visible in this
# notebook (it only appears as a local inside functions) -- confirm which
# trip-level frame it should refer to before re-running.
test1 = (
    df2.groupby(route_groupby_cols, observed=False, group_keys=True)
    .agg(
        {
            **dict.fromkeys(
                [
                    "service_minutes",
                    "rt_service_min",
                    "speed_mph",
                    "pings_per_min",
                    "total_vp",
                    "vp_in_shape",
                ],
                "mean",
            ),
            "trip_instance_key": "count",
        }
    )
    .reset_index()
)

In [None]:
def checkout_test_groups(
    df: pd.DataFrame,
    route_id: str,
    time_of_day: str,
    direction_id: int,
    trips_df: pd.DataFrame = None,
):
    """
    Compare one aggregated group against the trip-level rows behind it.

    Displays, in order: the matching row(s) of the aggregated frame `df`;
    the means of pings_per_min, speed_mph, total_vp and vp_in_shape over the
    matching trip-level rows; and those trip-level rows themselves.

    Args:
        df: aggregated frame (one row per route-direction-time_of_day).
        route_id, time_of_day, direction_id: the group to inspect.
        trips_df: trip-level frame to compare against. When omitted, the
            notebook-global `df2` is used, which is what this function
            always read before the parameter existed.

    Returns nothing; output goes through the notebook's `display`.
    """
    if trips_df is None:
        # Backward-compatible fallback to the notebook-level variable.
        # Raises NameError if `df2` has not been defined in the kernel.
        trips_df = df2

    def _is_selected(frame):
        # Same three-way filter for the aggregated and trip-level frames.
        return (
            (frame.route_id == route_id)
            & (frame.time_of_day == time_of_day)
            & (frame.direction_id == direction_id)
        )

    test_cols = [
        "trip_instance_key",
        "service_minutes",
        "rt_service_min",
        "pings_per_min",
        "speed_mph",
        "total_vp",
        "vp_in_shape",
    ]
    display(df.loc[_is_selected(df)][test_cols])

    trip_cols = [
        "trip_instance_key",
        "time_of_day",
        "speed_mph",
        "rt_service_min",
        "service_minutes",
        "pings_per_min",
        "total_vp",
        "vp_in_shape",
    ]
    trips_filtered = trips_df.loc[_is_selected(trips_df)]
    display(trips_filtered.pings_per_min.mean())
    display(trips_filtered.speed_mph.mean())
    display(trips_filtered.total_vp.mean())
    display(trips_filtered.vp_in_shape.mean())
    display(trips_filtered[trip_cols])

In [None]:
checkout_test_groups(test1, "265-13172", "Early AM", 0)

In [None]:
checkout_test_groups(test1, "5671", "Early AM", 1)

In [None]:
checkout_test_groups(test1, "Lynx", "AM Peak", 1)

In [None]:
checkout_test_groups(test1, "38R", "AM Peak", 1)

In [None]:
checkout_test_groups(test1, "16611", "AM Peak", 0)