# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime

import dask
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    CONFIG_PATH,
    GCS_FILE_PATH,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SEGMENT_GCS,
    analysis_date,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp
#### First time reading `vp_usable`

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.
    """
    vp_trip_df = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters=[
            [
                ("gtfs_dataset_name", "==", operator),
                ("schedule_gtfs_dataset_key", "==", gtfs_key),
            ]
        ],
        columns=["trip_instance_key"],
    )

    vp_trip_df = vp_trip_df.drop_duplicates().reset_index(drop=True)

    # Make sure we have a shape geometry too
    # otherwise map_partitions will throw error
    shapes = (
        pd.read_parquet(
            f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
            columns=["shape_array_key"],
        )
        .dropna()
        .drop_duplicates()
    )

    trips_with_shape = (
        helpers.import_scheduled_trips(
            analysis_date,
            columns=["trip_instance_key", "shape_array_key"],
            get_pandas=True,
        )
        .merge(shapes, on="shape_array_key", how="inner")
        .merge(vp_trip_df, on="trip_instance_key", how="inner")
        .drop_duplicates()
        .dropna()
        .reset_index(drop=True)
    )

    return trips_with_shape

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
trips_with_shape_geom = vp_spatial_accuracy.buffer_shapes(
    analysis_date, trips_with_shape, 35
)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

#### Second time reading in the same file, streamline

In [10]:
vp = dd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
        ]
    ],
)

### Full function

In [13]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame, trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.
    """
    # COMMENT BACK IN ONCE I MERGE WITH MAIN
    # vp_gdf = wrangle_shapes.vp_as_gdf(vp)

    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)

    vp_gdf = vp_gdf.rename(
        columns={
            0: "gtfs_dataset_name",
            1: "schedule_gtfs_dataset_key",
            2: "trip_id",
            3: "trip_instance_key",
            4: "location_timestamp",
            5: "location_timestamp_local",
            6: "x",
            7: "y",
            8: "vp_idx",
            9: "gtfs_dataset_key",
            10: "vp_dir_xnorm",
            11: "vp_dir_ynorm",
            12: "vp_primary_direction",
        }
    )

    vp2 = pd.merge(
        vp_gdf, trips_with_shape_geom, on="trip_instance_key", how="inner"
    ).reset_index(drop=True)

    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    vp2 = vp2.assign(is_within=vp2.geometry_x.within(vp2.geometry_y)).query(
        "is_within==True"
    )

    vps_in_shape = (
        vp2.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "vp_in_shape"})
    )

    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    count_df["total_vp_in_shape_pct"] = count_df.vp_in_shape / count_df.total_vp * 100

    return count_df

In [14]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [15]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [16]:
muni.trip_instance_key.nunique()

9240

In [17]:
(muni.vp_in_shape / muni.total_vp).describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
dtype: float64

In [18]:
muni.total_vp_in_shape_pct.describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
Name: total_vp_in_shape_percentage, dtype: float64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

#### Keep only relevant `trips instance keys`?
* Trips have certain attributes: name, key, gtfs key, etc in addition to trip instance key
* Start from `vp_usable` join to `shapes` as first pass and get total positions in, add a column that counts counts number of pings
* Read dataset in once, derive a bunch of rows. 
* I have 100 vps for one trip -> turn the trip into one row with summary statistics
* Pass output of dataset into three different ways
* Each pass will summarize vp-usable from thousands of rows to just one per trip
* Pass `vp_usable` into three different functions.
* Final step: merge all on trips.
* Each function is individual, can be pandas.

In [19]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

#### Third time reading in `vp_usable`

#### Density

6ef4805f0104b95614b86a2b1c374d23
* Minutes skipped: 6:12 to 6:18
* 6:19-6:26
* 6:28-7:33 etc etc
* Trip started at 5:48, ended at 8:31 

Can also check w/ 38247cbee93b6f85d58bf1812ae553b9
* Began at 16:01:00, end at 17:00:000
* Skips having data: jumps from 4:38 to 4:54
    * Trip: know the min and the max 
    * Bin the minutes: everything else between 4:38 to 4:54 can't be binned. 
    * Dem: gap between max and min 
* Vp doesn't necessarily update every minute 
* TO DO
    * Correct trip duration numerator: average pings per minute or ten minutes or five minutes...Help us understand coverage.  
    * How to express time lapses: time deletas in pandas

In [20]:
completeness_cols = [
    "vp_idx",
    "location_timestamp_local",
    "trip_instance_key",
    "gtfs_dataset_key",
]

In [21]:
vp_filtered = pd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    columns=completeness_cols,
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
            ("trip_instance_key", "in", relevant_trips),
        ]
    ],
)

In [22]:
def pct_of_pings(df: pd.DataFrame):

    # Determine which rows have 2+ pings per minute
    df2 = df.assign(
        atleast2_trip_updates=df.apply(
            lambda x: 1 if x.number_of_pings_per_minute >= 2 else 0, axis=1
        )
    )

    # Calculate total trip time
    df2["max_time"] = df2.location_timestamp_local
    df3 = (
        df2.groupby(["trip_instance_key"])
        .agg(
            {
                "location_timestamp_local": "min",
                "max_time": "max",
                "atleast2_trip_updates": "sum",
            }
        )
        .reset_index()
        .rename(columns={"location_timestamp_local": "min_time"})
    )

    # Add an extra minute
    df3["total_trip_time"] = (df3.max_time - df3.min_time) / pd.Timedelta(minutes=1) + 1

    # Find % of each trip in which one minute has 2+ pings
    df3 = df3.assign(
        pct_with_2_pings_per_min=df3.atleast2_trip_updates.divide(df3.total_trip_time) * 100
    )

    return df3

In [23]:
def density_of_pings(df: pd.DataFrame):

    # Count number of pings that occur
    # on average per 5 minutes of the trip
    df2 = (
        df.groupby(
            [
                *["trip_instance_key"],
                pd.Grouper(key="location_timestamp_local", freq="5Min"),
            ]
        )
        .sum()
        .reset_index()
        .rename(columns={"number_of_pings_per_minute": "number_of_pings_per_5_min"})
    )

    df3 = (
        df2.groupby(["trip_instance_key"])
        .agg({"number_of_pings_per_5_min": "median"})
        .reset_index()
    )

    df3["avg_pings_per_5_min"] = df3.number_of_pings_per_5_min / 5

    return df3

In [24]:
def total_minutes_with_gtfs(df: pd.DataFrame):
    """
    Total minutes of a trip that recorded any
    vehicle positions.
    """
    df2 = (
        df.groupby(["trip_instance_key"])
        .agg({"gtfs_dataset_key": "count"})
        .rename(columns={"gtfs_dataset_key": "total_minutes_w_gtfs"})
        .reset_index()
    )
    return df2

In [25]:
def update_completeness(df: pd.DataFrame):
    # Find number of pings per minute for each trip
    df = df.sort_values(["vp_idx"]).reset_index(drop=True)

    df2 = (
        df.groupby(
            [
                *["trip_instance_key"],
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    pings_df = pct_of_pings(df2)
    density_df = density_of_pings(df2)
    total_minutes_df = total_minutes_with_gtfs(df2)

    m1 = pings_df.merge(density_df, on="trip_instance_key", how="inner").merge(
        total_minutes_df, on="trip_instance_key", how="inner"
    )

    m1["pct_of_trip_w_gtfs"] = m1.total_minutes_w_gtfs / m1.total_trip_time * 100
    m1 = m1.drop(columns=["min_time", "max_time", "number_of_pings_per_5_min"])
    return m1

In [26]:
update_completeness_df = update_completeness(vp_filtered)

In [27]:
update_completeness_df.sample(3)

Unnamed: 0,trip_instance_key,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs,pct_of_trip_w_gtfs
8654,f0bb3ec282e839a64996debf49cfde5f,14,16.0,0.88,2.4,16,1.0
5510,9951ffb6c7cb7a83dea1373d6304069c,65,66.0,0.98,3.0,66,1.0
8705,f238ff93a0b8b6732bee41cb84c4a2af,15,17.0,0.88,2.6,17,1.0


In [28]:
update_completeness_df.pct_of_trip_w_gtfs.describe()

count   9240.00
mean       0.99
std        0.05
min        0.04
25%        1.00
50%        1.00
75%        1.00
max        1.00
Name: pct_of_trip_w_gtfs, dtype: float64

In [29]:
update_completeness_df.avg_pings_per_5_min.describe()

count   9240.00
mean       2.96
std        0.18
min        0.30
25%        3.00
50%        3.00
75%        3.00
max        3.00
Name: avg_pings_per_5_min, dtype: float64

In [30]:
update_completeness_df.pct_with_2_pings_per_min.describe()

count   9240.00
mean       0.98
std        0.06
min        0.02
25%        0.98
50%        0.99
75%        1.00
max        1.00
Name: pct_with_2_pings_per_min, dtype: float64

## How many minutes a trip took and the average speeds?

In [31]:
vp_filtered.sample()

Unnamed: 0,vp_idx,location_timestamp_local,trip_instance_key,gtfs_dataset_key
12615548,12615548,2023-10-11 13:46:53,a0cb0738267cdb45dc7925f60c7e122d,c0e3039da063db95ebabd3fe4ee611a4


In [32]:
crosswalk = (
    vp_filtered[["gtfs_dataset_key", "trip_instance_key"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [153]:
crosswalk.shape

(9240, 2)

In [33]:
trip_speeds = pd.read_parquet(
    f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",
    columns=[
        "trip_instance_key",
        "speed_mph",
        "route_id",
        "time_of_day",
        "service_minutes",
        "direction_id",
        "route_name_used",
    ],
)

In [34]:
trip_speeds.sample()

Unnamed: 0,trip_instance_key,speed_mph,route_id,time_of_day,service_minutes,direction_id,route_name_used
6542,16187aef9abd74be67361544fd43b178,6.84,3739,AM Peak,83.0,1,CEDAR


### Are we still filtering out trips with overly high/low speeds?
* What happens if a route doesn't have an id or used name?

In [35]:
# Filtered
trip_speeds2 = trip_speeds.query("speed_mph <= 80")

In [36]:
trip_speeds2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76255 entries, 0 to 76254
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trip_instance_key  76255 non-null  object 
 1   speed_mph          76255 non-null  float64
 2   route_id           74710 non-null  object 
 3   time_of_day        76255 non-null  object 
 4   service_minutes    76255 non-null  float64
 5   direction_id       74710 non-null  Int64  
 6   route_name_used    74710 non-null  object 
dtypes: Int64(1), float64(2), object(4)
memory usage: 4.7+ MB


In [155]:
# trip_speeds2 = trip_speeds2.fillna('NA')

In [37]:
len(trip_speeds2), len(trip_speeds)

(76255, 76255)

In [38]:
# HOW are there some that are right only??
pd.merge(
    trip_speeds2, crosswalk, on=["trip_instance_key"], how="outer", indicator=True
)[["_merge"]].value_counts()

_merge    
left_only     67113
both           9142
right_only       98
dtype: int64

In [39]:
trip_speeds2 = pd.merge(
    trip_speeds2, crosswalk, on=["trip_instance_key"], how="outer"
)

In [40]:
len(trip_speeds2), len(trip_speeds)

(76353, 76255)

In [41]:
vp_filtered.sample()

Unnamed: 0,vp_idx,location_timestamp_local,trip_instance_key,gtfs_dataset_key
12646659,12646659,2023-10-11 10:23:22,1d7936f5b857112948210c296adb8b30,c0e3039da063db95ebabd3fe4ee611a4


In [42]:
trip_speeds2_muni = trip_speeds2.loc[trip_speeds2.gtfs_dataset_key == "c0e3039da063db95ebabd3fe4ee611a4"]

In [43]:
trip_speeds2_muni.shape

(9240, 8)

In [44]:
trip_speeds2_muni.trip_instance_key.nunique()

9240

In [45]:
# trip_speeds_all  = pd.read_parquet(f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",)

In [46]:
# https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py#L18C1-L48C17
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.5), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.2), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.8), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

In [53]:
peak = calculate_avg_speeds(
    trip_speeds2_muni[trip_speeds2_muni.time_of_day.isin(["AM Peak", "PM Peak"])],
    ["gtfs_dataset_key","route_name_used", "route_id"],
)

In [54]:
peak2 = calculate_avg_speeds(
    trip_speeds2_muni[trip_speeds2_muni.time_of_day.isin(["AM Peak", "PM Peak"])],
    ["trip_instance_key","route_name_used", "route_id"],
)

In [55]:
all_day = calculate_avg_speeds(
    trip_speeds2_muni,
    ["gtfs_dataset_key","route_name_used", "route_id"],
)

In [56]:
all_day.loc[(all_day.route_name_used == "5am-10pm daily") & (all_day.route_id == "14R")]

Unnamed: 0,gtfs_dataset_key,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph
9,c0e3039da063db95ebabd3fe4ee611a4,5am-10pm daily,14R,3.46,261,3.01,3.48


In [57]:
stats = pd.concat([
        all_day.assign(time_of_day = "all_day"),
        peak.assign(time_of_day = "peak")
    ], axis=0)

In [162]:
stats.sample()

Unnamed: 0,gtfs_dataset_key,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
58,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak


In [160]:
stats2 = pd.merge(stats, trip_speeds2_muni.drop(columns = ['time_of_day']), on = ['gtfs_dataset_key', 'route_id','route_name_used'])

In [165]:
stats2.loc[(stats2.route_id == '5R') & (stats2.time_of_day != 'all_day')][['speed_mph']].describe()

Unnamed: 0,speed_mph
count,144.0
mean,4.29
std,1.48
min,2.73
25%,3.29
50%,3.71
75%,4.41
max,10.5


In [167]:
stats2.loc[(stats2.route_id == '5R') & (stats2.time_of_day != 'all_day')][['speed_mph']].describe()

Unnamed: 0,speed_mph
count,144.0
mean,4.29
std,1.48
min,2.73
25%,3.29
50%,3.71
75%,4.41
max,10.5


In [164]:
stats2.loc[(stats2.route_id == '5R') & (stats2.time_of_day != 'all_day')]

Unnamed: 0,gtfs_dataset_key,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph,time_of_day,trip_instance_key,speed_mph,service_minutes,direction_id
17543,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,000c62b70d9438b7951457a74a4c89b2,4.16,49.0,1
17544,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,041051b27cc7ffaa41fa33966f2805b1,3.5,49.0,0
17545,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,0acac1f10f9bc1460b75665914de59ec,3.97,50.0,1
17546,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,0ad24178641c9756e10466f664566b9d,3.32,48.0,0
17547,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,0d3353e83550b6eb5cc174b9a9ab2c19,5.86,50.0,0
17548,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,0d64cb8a7768d8123be9b7788a0f91d4,3.22,49.0,1
17549,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,0dec5877b6456ff182d3017e97dbd139,3.28,47.0,0
17550,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,12d9b085138f675bf9fd5c1048036796,3.2,50.0,0
17551,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,12de3d260e9fe09fa878cb4cdb2d6898,6.35,36.0,1
17552,c0e3039da063db95ebabd3fe4ee611a4,Weekdays 7am-7pm,5R,2.79,84,2.76,2.83,peak,1b097ba04412ab08bfc1871bd88ea80a,3.67,44.0,0


## Outer Join
* Add scheduled trips:

    * How to use map partitions
        * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/shapely_project_vp.py#L61
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/config.yml

In [125]:
# Inner merge for now b/c only SF Muni
m1 = muni.merge(update_completeness_df, on="trip_instance_key", how="outer").merge(
    trip_speeds2, on="trip_instance_key", how="inner"
)

### Rt vs. Scheduled
* A trip that was scheudled to run 50 minutes but ran 83 minutes is took 60% longer

In [131]:
m1['rt_vs_scheduled_pct'] = (m1.total_trip_time/m1.service_minutes - 1) * 100

In [132]:
# (74/69 - 1) * 100

In [141]:
 m1.rt_vs_scheduled_pct = ((m1.rt_vs_scheduled_pct/5).fillna(0).round().astype(int) * 5)

In [150]:
m1.rt_vs_scheduled_pct.describe()

count   9240.00
mean      41.89
std       55.54
min      -70.00
25%       10.00
50%       30.00
75%       60.00
max     1350.00
Name: rt_vs_scheduled_pct, dtype: float64

In [143]:
m1[['rt_vs_scheduled_pct','total_trip_time', 'service_minutes']].sample(10)

Unnamed: 0,rt_vs_scheduled_pct,total_trip_time,service_minutes
5425,20,56.0,46.0
109,60,99.0,63.0
461,40,53.0,37.0
8353,30,54.0,42.0
9231,80,46.0,25.0
3576,20,69.0,57.0
5707,40,40.0,28.0
5340,0,45.0,44.0
2897,10,36.0,33.0
2311,80,97.0,53.0


In [145]:
m1.loc[m1.rt_vs_scheduled_pct == 1350]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_percentage,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs,pct_of_trip_w_gtfs,speed_mph,route_id,time_of_day,service_minutes,direction_id,route_name_used,gtfs_dataset_key,rt_vs_scheduled_pct
2132,3be5e98c0350a773dabd34e7a74ae1a8,87,18,0.21,29,29.0,1.0,3.0,29,1.0,2.54,CA,Midday,2.0,0,7 am-8:30 pm daily,c0e3039da063db95ebabd3fe4ee611a4,1350


In [146]:
m1.loc[m1.rt_vs_scheduled_pct == -70]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_percentage,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs,pct_of_trip_w_gtfs,speed_mph,route_id,time_of_day,service_minutes,direction_id,route_name_used,gtfs_dataset_key,rt_vs_scheduled_pct
1873,3513c07b09c3da765ff43dd7317a7689,45,42,0.93,15,16.0,0.94,2.9,16,1.0,15.97,49,PM Peak,50.0,1,5am-12 midnight daily,c0e3039da063db95ebabd3fe4ee611a4,-70
4230,768cb2018f0e2f716b7f4ecc9e2a66c9,40,32,0.8,14,14.0,1.0,2.3,14,1.0,18.75,38R,Midday,47.0,1,Weekdays 5am-10pm Weekends 6am-9pm,c0e3039da063db95ebabd3fe4ee611a4,-70
5289,9352e26ef303febc50a799f0a6885a20,40,40,1.0,13,14.0,0.93,2.2,14,1.0,16.47,F,Midday,52.0,0,7am-10pm daily,c0e3039da063db95ebabd3fe4ee611a4,-70
8083,e0f64b5821efb178f43da88f8a6300fa,32,32,1.0,11,11.0,1.0,1.8,11,1.0,20.72,J,Midday,37.0,0,5am-12 midnight daily,c0e3039da063db95ebabd3fe4ee611a4,-70


In [152]:
m1.loc[m1.rt_vs_scheduled_pct == 40].sample()

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_percentage,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs,pct_of_trip_w_gtfs,speed_mph,route_id,time_of_day,service_minutes,direction_id,route_name_used,gtfs_dataset_key,rt_vs_scheduled_pct
4218,763ffd514c3cb058e6858134052e5e35,188,188,1.0,63,64.0,0.98,3.0,64,1.0,3.75,38,AM Peak,47.0,1,24 hour service daily,c0e3039da063db95ebabd3fe4ee611a4,40


In [149]:
# 7;55 to 8:10: 15 minutes
vp_filtered.loc[vp_filtered.trip_instance_key == "3513c07b09c3da765ff43dd7317a7689"]

Unnamed: 0,vp_idx,location_timestamp_local,trip_instance_key,gtfs_dataset_key
12346313,12346313,2023-10-11 19:55:34,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346314,12346314,2023-10-11 19:55:50,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346315,12346315,2023-10-11 19:56:20,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346316,12346316,2023-10-11 19:56:36,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346317,12346317,2023-10-11 19:56:51,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346318,12346318,2023-10-11 19:57:06,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346319,12346319,2023-10-11 19:57:37,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346320,12346320,2023-10-11 19:57:53,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346321,12346321,2023-10-11 19:58:08,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4
12346322,12346322,2023-10-11 19:58:39,3513c07b09c3da765ff43dd7317a7689,c0e3039da063db95ebabd3fe4ee611a4


In [144]:
m1.rt_vs_scheduled_pct.describe()

count   9240.00
mean      41.89
std       55.54
min      -70.00
25%       10.00
50%       30.00
75%       60.00
max     1350.00
Name: rt_vs_scheduled_pct, dtype: float64