## Check out Results

In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)
from shared_utils import geography_utils, portfolio_utils, rt_dates, schedule_rt_utils

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
months = ["dec", "nov", "oct", "sep", "aug", "jul", "jun", "may", "apr", "mar"]

analysis_date_list = [rt_dates.DATES[f"{m}2023"] for m in months]

In [6]:
analysis_date_list

['2023-12-13',
 '2023-11-15',
 '2023-10-11',
 '2023-09-13',
 '2023-08-15',
 '2023-07-12',
 '2023-06-14',
 '2023-05-17',
 '2023-04-12',
 '2023-03-15']

In [None]:
"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/2023-12-13_metrics.parquet"

### Open all the files

In [37]:
def read_files_into_dataframes(
    dates: list,
    gcs_path: str = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/",
) -> dict:
    """
    Read files with given dates into separate pandas DataFrames.

    Parameters:
    - dates (list): List of date strings in the format 'YYYY-MM-DD'.
    - gcs_path (str): Folder prefix (including the trailing slash) that holds
      the `{date}_metrics.parquet` files. Defaults to the trip level metrics
      folder this notebook has always read from.

    Returns:
    - dfs (dict): A dictionary of DataFrames keyed by `df_YYYY_MM_DD`,
      in the same order as `dates`. An empty `dates` gives an empty dict.
    """
    # Key each frame by its date with dashes swapped for underscores,
    # e.g. '2023-12-13' -> 'df_2023_12_13'.
    return {
        f"df_{date.replace('-', '_')}": pd.read_parquet(
            f"{gcs_path}{date}_metrics.parquet"
        )
        for date in dates
    }

In [38]:
dataframes = read_files_into_dataframes(analysis_date_list)

In [40]:
type(dataframes)

dict

In [43]:
df_2023_12_13 = dataframes["df_2023_12_13"]

In [51]:
df_2023_11_15 = dataframes["df_2023_11_15"]
df_2023_10_11 = dataframes["df_2023_10_11"]
df_2023_09_13 = dataframes["df_2023_09_13"]
df_2023_08_15 = dataframes["df_2023_08_15"]
df_2023_07_12 = dataframes["df_2023_07_12"]
df_2023_06_14 = dataframes["df_2023_06_14"]
df_2023_05_17 = dataframes["df_2023_05_17"]
df_2023_04_12 = dataframes["df_2023_04_12"]
df_2023_03_15 = dataframes["df_2023_03_15"]

In [54]:
# `dataframes` keeps insertion order (Dec 2023 back to Mar 2023), so its
# values are the same frames, in the same order, as listing each one by hand.
all_dfs = list(dataframes.values())

In [47]:
def check_out(df: pd.DataFrame):
    """
    Display `describe()` summary statistics for the four
    trip-level quality metrics of one analysis date.
    """
    metric_cols = (
        "spatial_accuracy_pct",
        "pings_per_min",
        "rt_triptime_w_gtfs_pct",
        "rt_v_scheduled_trip_time_pct",
    )
    for metric in metric_cols:
        display(df[metric].describe())

In [55]:
# Summaries are displayed in the order of `all_dfs`.
for trip_metrics_df in all_dfs:
    check_out(trip_metrics_df)

count   74609.00
mean       93.55
std        13.20
min         0.00
25%        94.23
50%        99.64
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86128.00
mean        2.44
std         0.60
min         0.00
25%         2.05
50%         2.67
75%         2.94
max         5.15
Name: pings_per_min, dtype: float64

count   86128.00
mean       95.94
std        11.70
min         0.10
25%        98.36
50%        99.65
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   76878.00
mean       44.40
std       272.99
min       -86.89
25%        11.36
50%        25.56
75%        45.00
max     18873.69
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   74891.00
mean       93.53
std        13.12
min         0.00
25%        93.94
50%        99.68
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86832.00
mean        2.51
std         0.63
min         0.00
25%         2.09
50%         2.86
75%         2.96
max         5.29
Name: pings_per_min, dtype: float64

count   86832.00
mean       95.68
std        12.05
min         0.45
25%        98.40
50%        99.67
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   77194.00
mean       42.53
std       196.80
min       -88.16
25%        11.44
50%        26.10
75%        46.17
max     11797.08
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   73902.00
mean       94.06
std        12.62
min         0.00
25%        95.22
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86486.00
mean        2.48
std         0.66
min         0.00
25%         1.93
50%         2.87
75%         2.97
max         4.75
Name: pings_per_min, dtype: float64

count   86486.00
mean       96.36
std        10.91
min         0.14
25%        98.51
50%        99.69
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   76255.00
mean       43.72
std       232.88
min       -86.20
25%        11.80
50%        25.92
75%        45.07
max     11905.42
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   65175.00
mean       93.89
std        13.05
min         0.00
25%        94.83
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86133.00
mean        2.46
std         0.68
min         0.01
25%         1.92
50%         2.85
75%         2.96
max         4.86
Name: pings_per_min, dtype: float64

count   86133.00
mean       96.03
std        11.53
min         0.71
25%        98.36
50%        99.66
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   67169.00
mean       42.21
std       219.14
min       -85.54
25%        11.04
50%        24.77
75%        42.61
max     11905.69
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   72938.00
mean       93.92
std        12.79
min         0.00
25%        94.89
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   84422.00
mean        2.44
std         0.67
min         0.00
25%         1.92
50%         2.81
75%         2.96
max         5.48
Name: pings_per_min, dtype: float64

count   84422.00
mean       95.73
std        11.76
min         0.42
25%        98.10
50%        99.62
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   75286.00
mean       43.46
std       242.02
min       -87.18
25%        11.11
50%        25.43
75%        44.25
max     11905.42
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   67091.00
mean       93.83
std        13.15
min         0.00
25%        94.96
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   82044.00
mean        2.49
std         0.67
min         0.00
25%         1.95
50%         2.90
75%         2.97
max         5.76
Name: pings_per_min, dtype: float64

count   82044.00
mean       96.17
std        10.97
min         0.43
25%        98.28
50%        99.65
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   69488.00
mean       56.85
std       291.52
min       -91.05
25%        11.82
50%        26.10
75%        45.69
max     17880.62
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   64270.00
mean       93.92
std        12.67
min         0.00
25%        94.77
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   80331.00
mean        2.51
std         0.67
min         0.01
25%         1.98
50%         2.90
75%         2.96
max         4.93
Name: pings_per_min, dtype: float64

count   80331.00
mean       95.66
std        13.38
min         0.28
25%        98.43
50%        99.67
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   66748.00
mean       63.20
std       345.10
min       -90.25
25%        11.69
50%        25.91
75%        45.15
max     12997.42
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   65385.00
mean       94.07
std        12.45
min         0.00
25%        95.00
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83606.00
mean        2.47
std         0.68
min         0.00
25%         1.95
50%         2.86
75%         2.95
max         5.14
Name: pings_per_min, dtype: float64

count   83606.00
mean       95.22
std        13.69
min         0.27
25%        97.76
50%        99.51
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   67864.00
mean       60.61
std       329.38
min       -90.09
25%        10.96
50%        25.64
75%        44.61
max     17907.92
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   71094.00
mean       94.06
std        12.64
min         0.00
25%        95.45
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   84516.00
mean        2.45
std         0.67
min         0.01
25%         1.91
50%         2.83
75%         2.95
max         5.18
Name: pings_per_min, dtype: float64

count   84516.00
mean       94.99
std        14.38
min         0.56
25%        97.88
50%        99.53
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   73471.00
mean       61.42
std       349.18
min       -86.02
25%        10.46
50%        25.10
75%        44.38
max     15903.70
Name: rt_v_scheduled_trip_time_pct, dtype: float64

count   69494.00
mean       94.10
std        12.34
min         0.00
25%        95.24
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83620.00
mean        2.46
std         0.68
min         0.00
25%         1.91
50%         2.86
75%         2.96
max         4.75
Name: pings_per_min, dtype: float64

count   83620.00
mean       94.96
std        14.45
min         0.28
25%        97.99
50%        99.58
75%       100.00
max       100.00
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   71797.00
mean       60.68
std       333.76
min       -87.37
25%        10.76
50%        25.19
75%        44.44
max     17909.79
Name: rt_v_scheduled_trip_time_pct, dtype: float64

### Aggregating up to the route level

#### Step 1: add missing cols
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py#L135

In [57]:
def add_scheduled_trip_columns(
    df: pd.DataFrame, analysis_date: str, group_cols: list = ["trip_instance_key"]
) -> pd.DataFrame:
    """
    Attach scheduled-trip attributes to trip-level RT metrics.

    Left-merges `df` on `trip_instance_key` with:
    - a trip crosswalk carrying route_id, direction_id, the route name
      produced by `portfolio_utils.add_route_name`, and the shape_array_key
      returned by `most_common_shape_by_route_direction`;
    - the time-of-day bucket, service hours and first departure time
      of each trip.

    These are the columns needed to take
    route-direction-time_of_day averages afterwards.
    """
    route_name_cols = ["route_short_name", "route_long_name", "route_desc"]
    route_dir_keys = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

    # Scheduled trips: one row per trip with its route identifiers.
    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["gtfs_dataset_key", "direction_id", "route_id"]
        + route_name_cols
        + group_cols,
        get_pandas=True,
    )

    # Most common shape for each operator-route-direction.
    shapes_by_route_dir = sched_rt_utils.most_common_shape_by_route_direction(
        analysis_date
    )
    shapes_by_route_dir = shapes_by_route_dir[route_dir_keys + ["shape_array_key"]]

    # Inner merge: trips whose route-direction has no row in
    # shapes_by_route_dir are dropped from the crosswalk.
    trip_crosswalk = scheduled_trips.merge(
        shapes_by_route_dir,
        on=route_dir_keys,
        how="inner",
    )
    trip_crosswalk = trip_crosswalk.astype({"direction_id": "Int64"})

    # Add the route name column, then drop the raw name columns it was built from.
    trip_crosswalk = portfolio_utils.add_route_name(trip_crosswalk)
    trip_crosswalk = trip_crosswalk.drop(columns=route_name_cols)

    trip_time_buckets = sched_rt_utils.get_trip_time_buckets(analysis_date)
    trip_time_buckets = trip_time_buckets[
        [
            "trip_instance_key",
            "service_hours",
            "trip_first_departure_datetime_pacific",
            "time_of_day",
        ]
    ]

    # Left merges keep every RT trip, even without a schedule match.
    with_route = pd.merge(df, trip_crosswalk, on="trip_instance_key", how="left")
    with_time_of_day = with_route.merge(
        trip_time_buckets, on="trip_instance_key", how="left"
    )

    return with_time_of_day

In [58]:
def average_route_speeds_for_export(
    df: pd.DataFrame,
    analysis_date: str,
    max_speed: int,
) -> tuple:
    """
    Aggregate trip-level metrics to route-direction-time_of_day.
    Attach shape geometry to most common shape_id.

    Parameters:
    - df: trip-level metrics that already carry the scheduled trip columns
      (schedule_gtfs_dataset_key, route_id, direction_id, route_name_used,
      shape_array_key, time_of_day).
    - analysis_date: date string passed to the organization crosswalk
      and to the scheduled shapes import.
    - max_speed: trips with speed_mph above this value are excluded.
      Trips with a missing speed_mph are excluded too, because the
      comparison is False for NaN.

    Returns:
    - (trips, final_df): the filtered trip-level rows that went into the
      aggregation, and the aggregated route-level table with geometry
      (presumably a GeoDataFrame, since it is built from the shapes
      table -- confirm against `helpers.import_scheduled_shapes`).
    """
    # Honor the caller's threshold instead of a hard-coded 70.
    df2 = df.loc[df.speed_mph <= max_speed].reset_index(drop=True)

    group_cols = [
        "schedule_gtfs_dataset_key",
        "time_of_day",
        "route_id",
        "direction_id",
        "route_name_used",
        "shape_array_key",
    ]

    mean_cols = [
        "service_minutes",
        "rt_service_min",
        "speed_mph",
        "pings_per_min",
        "total_vp",
        "vp_in_shape",
        "total_min_w_gtfs",
        "min_w_atleast2_trip_updates",
    ]
    count_cols = ["trip_instance_key"]

    # groupby drops rows with a null in any grouping column by default.
    df3 = (
        df2.groupby(group_cols)
        .agg({**{e: "mean" for e in mean_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
    )

    df4 = df3.assign(
        rt_service_min=df3.rt_service_min.round(1),
        service_minutes=df3.service_minutes.round(1),
        speed_mph=df3.speed_mph.round(1),
        pings_per_min=df3.pings_per_min.round(1),
        min_w_atleast2_trip_updates=df3.min_w_atleast2_trip_updates.round(1),
        total_min_w_gtfs=df3.total_min_w_gtfs.round(1),
    ).rename(
        columns={
            "service_minutes": "avg_sched_trip_min",
            "rt_service_min": "avg_rt_trip_min",
            "trip_instance_key": "n_trips",
            "route_name_used": "route_name",
            "pings_per_min": "avg_pings_per_min",
            "schedule_gtfs_dataset_key": "gtfs_dataset_key",
            "total_min_w_gtfs": "avg_total_min_w_gtfs",
            "min_w_atleast2_trip_updates": "avg_min_w_atleast2_trip_updates",
            "speed_mph": "avg_speed_mph",
        }
    )

    org_crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
        df4,
        analysis_date,
        quartet_data="schedule",
        dim_gtfs_dataset_cols=["key", "base64_url"],
        dim_organization_cols=["source_record_id", "name", "caltrans_district"],
    )

    # Inner merge: rows without an organization match are dropped.
    df_with_org = pd.merge(
        df4,
        org_crosswalk.rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"}),
        on="gtfs_dataset_key",
        how="inner",
    )

    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        get_pandas=True,
        crs=geography_utils.WGS84,
    )

    df_with_shape = pd.merge(
        shapes,
        df_with_org,
        on="shape_array_key",  # once merged, can drop shape_array_key
        how="inner",
    )

    # NOTE: the percentages below are ratios of the averaged columns
    # (the minute columns were already rounded to 1 decimal above),
    # not averages of the trip-level percentages.
    df_with_shape["avg_pct_vp_shape"] = (
        df_with_shape.vp_in_shape / df_with_shape.total_vp * 100
    )

    df_with_shape["avg_pct_rt_v_sched"] = (
        df_with_shape.avg_rt_trip_min / df_with_shape.avg_sched_trip_min - 1
    ) * 100

    df_with_shape["avg_rt_triptime_w_gtfs_pct"] = (
        df_with_shape.avg_total_min_w_gtfs / df_with_shape.avg_rt_trip_min
    ) * 100

    agency_cols = ["organization_source_record_id", "organization_name"]
    route_id_cols = [
        "route_id",
        "route_name",
        "direction_id",
    ]

    col_order = (
        agency_cols
        + route_id_cols
        + [
            "time_of_day",
            "avg_speed_mph",
            "n_trips",
            "avg_sched_trip_min",
            "avg_rt_trip_min",
            "base64_url",
            "caltrans_district",
            "geometry",
            "avg_pings_per_min",
            "avg_pct_vp_shape",
            "avg_pct_rt_v_sched",
            "avg_rt_triptime_w_gtfs_pct",
            "avg_min_w_atleast2_trip_updates",
        ]
    )

    # reindex keeps only the export columns, so the helper columns
    # (total_vp, vp_in_shape, shape_array_key, ...) fall away here.
    final_df = df_with_shape.reindex(columns=col_order).rename(
        columns={
            "organization_source_record_id": "org_id",
            "organization_name": "agency",
            "caltrans_district": "district_name",
        }
    )

    return df2, final_df

In [59]:
# Pin the schedule date to the December file being analyzed rather than
# relying on whatever `analysis_date` is currently set to in project_vars.
dec_df2 = add_scheduled_trip_columns(
    df_2023_12_13, rt_dates.DATES["dec2023"], ["trip_instance_key"]
)

In [60]:
dec_df2.head(2)

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,service_minutes,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct,schedule_gtfs_dataset_key,direction_id,route_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day
0,5d25a4366c173007d9c29fdead0299d7,74.03,73,216,74,216.0,148.0,21.01,58.0,2.92,68.52,99.95,27.64,63029a23cb0e73f2a5d98a345c5e2e40,1,3428,0d0ca5bc40fb6266a03f400c3aa7e6cb,,0.97,2023-12-13 05:34:00,Early AM
1,4b72b80fc9cfe5e613bab95585cbe7e4,23.45,21,59,23,59.0,19.0,54.95,58.0,2.52,32.2,98.08,-59.57,63029a23cb0e73f2a5d98a345c5e2e40,1,3428,0d0ca5bc40fb6266a03f400c3aa7e6cb,,0.97,2023-12-13 06:34:00,Early AM


In [61]:
type(dec_df2)

pandas.core.frame.DataFrame

In [62]:
# Use the December date explicitly so the organization crosswalk and shapes
# match the 2023-12-13 trips, independent of project_vars.analysis_date.
dec_intermediary, dec_final = average_route_speeds_for_export(
    dec_df2, rt_dates.DATES["dec2023"], 70
)

In [63]:
dec_final.avg_pings_per_min.describe()

count   11397.00
mean        2.38
std         0.56
min         0.10
25%         1.90
50%         2.50
75%         2.90
max         3.50
Name: avg_pings_per_min, dtype: float64

In [64]:
dec_final.shape

(11397, 18)

In [65]:
dec_final.drop(columns=["geometry", "base64_url"]).sample(3)

Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
3370,rec4pgjrmdhCh4z01,City of Los Angeles,576,Midtown,0,PM Peak,8.6,7,38.0,47.6,07 - Los Angeles,2.6,95.21,25.26,84.66,38.3
1173,recaJnArpFEk5QooE,City of Elk Grove,3425,,0,AM Peak,19.9,5,36.2,46.0,03 - Marysville,2.9,100.0,27.07,99.13,45.0
9120,rechaapWbeffO33OX,City and County of San Francisco,S,Additional Weekday Service,0,Midday,15.4,9,15.0,24.8,04 - Oakland,2.7,96.78,65.33,85.89,20.8


In [66]:
# `dec_df` is not defined anywhere in this notebook; the raw December
# trip-level frame is `df_2023_12_13`.
df_2023_12_13.sample()

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,service_minutes,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct
75367,bdaf00fefb641fad797f64761bade1df,55.97,56,166,56,166.0,165.0,5.78,23.0,2.97,99.4,100.0,143.33


### Check results after aggregating up to route
* Why are the aggregated results so far off for route `265-13172`?

In [67]:
def checkout_route(
    og_df: pd.DataFrame,
    route_agg: gpd.GeoDataFrame,
    route_id: str,
    time_of_day: str,
    direction_id: int,
):
    """
    Compare one route-direction-time_of_day between the aggregated
    table and the trip-level rows.

    Displays the matching aggregated row(s), prints the plain means of
    the trip-level metrics, then displays the individual trips so the
    averages can be checked by eye. Returns nothing.
    """

    def _is_selected(frame):
        # Same three-way filter for both the aggregated and trip-level frames.
        return (
            (frame.route_id == route_id)
            & (frame.time_of_day == time_of_day)
            & (frame.direction_id == direction_id)
        )

    print("final")
    agg_rows = route_agg.loc[_is_selected(route_agg)]
    display(agg_rows.drop(columns=["geometry", "base64_url"]))

    print("original")
    trips = og_df.loc[_is_selected(og_df)]

    # (printed label, trip-level column) pairs, in display order.
    labelled_means = [
        ("pings per min", "pings_per_min"),
        ("speed_mph", "speed_mph"),
        ("total_vp", "total_vp"),
        ("vp_in_shape", "vp_in_shape"),
        ("min w gtfs", "total_min_w_gtfs"),
        ("min w at least 2 pings", "min_w_atleast2_trip_updates"),
    ]
    for label, column in labelled_means:
        print(f"{label} {trips[column].mean()}")

    display(
        trips[
            [
                "trip_instance_key",
                "time_of_day",
                "speed_mph",
                "rt_service_min",
                "service_minutes",
                "pings_per_min",
                "total_min_w_gtfs",
                "min_w_atleast2_trip_updates",
            ]
        ]
    )

#### `avg_sched_trip_min` (renamed from `service_minutes`) is much lower than the real-time trip time (`avg_rt_trip_min`).

In [68]:
# checkout_route only prints/displays and returns None, so there is nothing to assign.
checkout_route(dec_intermediary, dec_final, "265-13172", "Early AM", 0)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
6533,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,265-13172,PICO RIVERA - LAKEWOOD CTR MALL VIA PARAMOUNT BL,0,Early AM,6.9,2,59.0,99.6,07 - Los Angeles,2.7,70.09,68.81,99.9,92.0


original
pings per min 2.6849179704528776
speed_mph 6.887376703252869
total_vp 267.5
vp_in_shape 187.5
min w gtfs 99.5
min w at least 2 pings 92.0


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min,total_min_w_gtfs,min_w_atleast2_trip_updates
24866,48a01217589c2faa46db395d6cf8317d,Early AM,9.4,95.02,58.0,2.65,95,87
24867,70674803a1c4416fc49f883bc3b2c18b,Early AM,4.38,104.13,60.0,2.72,104,97


In [69]:
checkout_route(dec_intermediary, dec_final, "5671", "Early AM", 1)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
193,rec3u4aMplqObcoTR,Tahoe Transportation District,5671,Valley Express Daily,1,Early AM,2.2,1,38.0,313.5,03 - Marysville,1.4,22.65,725.0,50.08,152.0


original
pings per min 1.422723164442554
speed_mph 2.158633017384419
total_vp 446.0
vp_in_shape 101.0
min w gtfs 157.0
min w at least 2 pings 152.0


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min,total_min_w_gtfs,min_w_atleast2_trip_updates
27758,6fadf197f5bb105ed916de0a337386ee,Early AM,2.16,313.48,38.0,1.42,157,152


In [70]:
checkout_route(dec_intermediary, dec_final, "38R", "AM Peak", 1)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
9012,rechaapWbeffO33OX,City and County of San Francisco,38R,Weekdays 5am-10pm Weekends 6am-9pm,1,AM Peak,6.7,30,43.9,58.9,04 - Oakland,3.0,88.33,34.17,100.17,58.3


original
pings per min 2.961175851535439
speed_mph 6.682525584870673
total_vp 174.56666666666666
vp_in_shape 154.2
min w gtfs 58.96666666666667
min w at least 2 pings 58.3


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min,total_min_w_gtfs,min_w_atleast2_trip_updates
61610,76fa3ed3fd8ef28a446eedb4c1e94e6a,AM Peak,9.39,48.82,39.0,2.97,49,49
61611,47d8da2afbc3e4c78f9be4c6c53a7776,AM Peak,10.0,49.38,39.0,2.96,50,49
61612,40590ba21f73ae1775a4538e34e67cc3,AM Peak,5.34,97.67,41.0,2.98,98,97
61613,aa1b90f05357a29abc97fae90cd5bafe,AM Peak,9.73,49.45,41.0,2.95,49,49
61614,4d7cb7ddea2191f062c1ea3165df67c8,AM Peak,8.58,57.95,41.0,2.97,58,58
61615,d4e16623b6cf749d8e389b905ab44089,AM Peak,4.49,93.43,42.0,2.99,94,93
61616,ac9822b2d06b84790c10b7cbbf694410,AM Peak,7.1,67.83,42.0,2.96,68,67
61617,092c4f20368b13bc5c92fda4fb16bd93,AM Peak,3.77,54.93,43.0,2.97,55,54
61618,88cf8d5d7cd5d0c74a9b78d5e62fd8a3,AM Peak,7.06,52.62,43.0,2.96,52,52
61619,2845f0ae70ae06c3618d58fa99a1cdde,AM Peak,8.57,51.58,43.0,2.97,52,51


In [71]:
checkout_route(dec_intermediary, dec_final, "Lynx", "AM Peak", 1)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
5973,recIKnsnTdKQ0vsiv,Western Contra Costa Transit Authority,Lynx,Rodeo/Hercules/San Francisco Transbay Terminal,1,AM Peak,13.4,6,50.0,60.7,04 - Oakland,2.8,,21.4,98.02,57.7


original
pings per min 2.8253065827801014
speed_mph 13.403813542450534
total_vp nan
vp_in_shape nan
min w gtfs 59.5
min w at least 2 pings 57.666666666666664


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min,total_min_w_gtfs,min_w_atleast2_trip_updates
12987,14c9fc6b379e1fe869ba37bfd79a8245,AM Peak,13.31,61.4,50.0,2.72,57,56
12990,2ee02fe17c9acad17ccb44eaaea4debe,AM Peak,12.69,63.18,50.0,2.9,63,62
12991,78a402dfe7d89e0b919a193ac59c69ae,AM Peak,12.61,63.18,50.0,2.82,62,60
12994,2bb4857e894a94d48a79620858c8384e,AM Peak,12.44,64.45,50.0,2.84,63,62
12997,b97a1995cd54253c58e82bb7c9ad3414,AM Peak,15.59,52.82,50.0,2.82,53,50
13004,baeeed7c3d6ab74ad9ff40f42a2f1da3,AM Peak,13.78,59.13,50.0,2.86,59,56


### Test aggregating with March

In [72]:
mar_date = "2023-03-15"

In [73]:
# `mar_df` is not defined anywhere in this notebook; the raw March
# trip-level frame is `df_2023_03_15`.
df_2023_03_15.sample()

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,service_minutes,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct
63831,3d8b42c5ef16df6405029c9c87f91615,38.72,28,67,37,67.0,36.0,9.38,19.0,1.73,53.73,95.57,103.77


In [74]:
mar_df2 = add_scheduled_trip_columns(df_2023_03_15, mar_date, ["trip_instance_key"])

In [75]:
mar_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83620 entries, 0 to 83619
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   trip_instance_key                      83620 non-null  object        
 1   rt_service_min                         83620 non-null  float64       
 2   min_w_atleast2_trip_updates            83620 non-null  int64         
 3   total_pings_for_trip                   83620 non-null  int64         
 4   total_min_w_gtfs                       83620 non-null  int64         
 5   total_vp                               69494 non-null  float64       
 6   vp_in_shape                            69494 non-null  float64       
 7   speed_mph                              71797 non-null  float64       
 8   service_minutes                        71797 non-null  float64       
 9   pings_per_min                          83620 non-null  float6

In [76]:
mar_intermediary, mar_final = average_route_speeds_for_export(mar_df2, mar_date, 70)

In [77]:
mar_final.columns

Index(['org_id', 'agency', 'route_id', 'route_name', 'direction_id',
       'time_of_day', 'avg_speed_mph', 'n_trips', 'avg_sched_trip_min',
       'avg_rt_trip_min', 'base64_url', 'district_name', 'geometry',
       'avg_pings_per_min', 'avg_pct_vp_shape', 'avg_pct_rt_v_sched',
       'avg_rt_triptime_w_gtfs_pct', 'avg_min_w_atleast2_trip_updates'],
      dtype='object')

In [78]:
mar_final.sample(3).drop(columns=["base64_url", "geometry"])

Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
1780,recfma7GNR5lQTTTg,Orange County Transportation Authority,54,Garden Grove - Orange via Chapman Ave,1,Midday,11.7,10,75.5,112.5,12 - Irvine,2.9,99.04,49.01,96.62,106.9
8979,recJcXMNC5MUm2uDe,Victor Valley Transit Authority,3215,Victor Valley Mall - Victor Valley College,0,Evening,10.0,1,19.0,43.0,08 - San Bernardino,2.8,100.0,126.32,97.67,40.0
4227,recANs4M9yDhvDyob,Livermore / Amador Valley Transit Authority,611,Ruby Hill,1,PM Peak,13.4,1,42.0,47.2,04 - Oakland,3.0,87.14,12.38,99.58,47.0


In [79]:
checkout_route(mar_intermediary, mar_final, "4763", "PM Peak", 0)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
3720,reckQmUdXUzHFmlVf,City of Ojai,4763,20,0,PM Peak,26.1,4,49.0,33.0,07 - Los Angeles,2.9,100.0,-32.65,100.61,32.2


original
pings per min 2.909125279321039
speed_mph 26.116502281263557
total_vp 96.75
vp_in_shape 96.75
min w gtfs 33.25
min w at least 2 pings 32.25


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min,total_min_w_gtfs,min_w_atleast2_trip_updates
36452,7ea7312469de416b5c96c2df46b07c85,PM Peak,46.12,16.67,34.0,2.88,16,16
36453,3d6760f1db43f24d68d2e780ac2be0db,PM Peak,15.04,49.05,54.0,2.94,50,48
37260,1af731c948711c3f047f29f922d24ee6,PM Peak,14.26,51.68,54.0,2.96,52,51
37278,49ce553a741ac1fcd734f035f299b81b,PM Peak,29.06,14.68,54.0,2.86,15,14


In [80]:
checkout_route(mar_intermediary, mar_final, "332", "Evening", 0)

final


Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,avg_speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,district_name,avg_pings_per_min,avg_pct_vp_shape,avg_pct_rt_v_sched,avg_rt_triptime_w_gtfs_pct,avg_min_w_atleast2_trip_updates
1285,recRBcrX4ZvTyvSnm,North County Transit District,332,Vista TC - Buena Creek Station,0,Evening,16.0,2,32.0,46.2,11 - San Diego,2.6,100.0,44.38,99.57,43.0


original
pings per min 2.5985041071389485
speed_mph 15.952292627153918
total_vp 120.0
vp_in_shape 120.0
min w gtfs 46.0
min w at least 2 pings 43.0


Unnamed: 0,trip_instance_key,time_of_day,speed_mph,rt_service_min,service_minutes,pings_per_min,total_min_w_gtfs,min_w_atleast2_trip_updates
41418,e54c1ac191dc0b57df34834df825d0ad,Evening,10.83,47.1,31.0,2.59,47,43
41433,e1d2ca1ad28f5fe1a3ec9a772cfae369,Evening,21.07,45.27,33.0,2.61,45,43
