# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime

import dask
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    CONFIG_PATH,
    GCS_FILE_PATH,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SEGMENT_GCS,
    analysis_date,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp
#### First time reading `vp_usable`

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Build a trip_instance_key -> shape_array_key crosswalk.

    Reads the trip_instance_key column of vp_usable for the operator
    selected by the notebook-level `operator` / `gtfs_key` variables,
    then keeps only scheduled trips that (a) show up in vp and
    (b) have a shape_array_key present in the routelines file.
    """
    operator_filter = [
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
        ]
    ]

    # Unique trips that have at least one row in vp_usable
    vp_trips = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters=operator_filter,
        columns=["trip_instance_key"],
    )
    vp_trips = vp_trips.drop_duplicates().reset_index(drop=True)

    # Only shapes that exist in routelines are usable downstream;
    # per the original note, map_partitions errors out otherwise.
    shape_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns=["shape_array_key"],
    )
    shape_keys = shape_keys.dropna().drop_duplicates()

    scheduled = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Inner merges: a trip must satisfy both conditions to survive
    crosswalk = scheduled.merge(shape_keys, on="shape_array_key", how="inner")
    crosswalk = crosswalk.merge(vp_trips, on="trip_instance_key", how="inner")
    crosswalk = crosswalk.drop_duplicates().dropna().reset_index(drop=True)

    return crosswalk

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   trip_instance_key  9240 non-null   object
 1   shape_array_key    9240 non-null   object
dtypes: object(2)
memory usage: 144.5+ KB


In [8]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [9]:
# This is trips_with_shape_geom
trips_with_shape_geom = vp_spatial_accuracy.buffer_shapes(
    analysis_date, trips_with_shape, 35
)

In [10]:
trips_with_shape_geom.shape

(9240, 3)

In [11]:
trips_with_shape_geom.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 9240 entries, 0 to 9239
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   shape_array_key    9240 non-null   object  
 1   geometry           9240 non-null   geometry
 2   trip_instance_key  9240 non-null   object  
dtypes: geometry(1), object(2)
memory usage: 288.8+ KB


In [12]:
trips_with_shape_geom.columns

Index(['shape_array_key', 'geometry', 'trip_instance_key'], dtype='object')

#### Second time reading in the same file, streamline

In [13]:
vp = dd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
        ]
    ],
)

### Full function

In [14]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame, trips_with_shape_geom: gpd.GeoDataFrame
) -> pd.DataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.

    Args:
        vp: vehicle positions; must contain x, y, trip_instance_key and
            location_timestamp_local. A dask dataframe is computed into
            memory here; a pandas dataframe is used as-is.
        trips_with_shape_geom: one buffered shape geometry per
            trip_instance_key.

    Returns:
        Per-trip counts with columns trip_instance_key, total_vp,
        vp_in_shape and total_vp_in_shape_pct (0-100).
    """
    # Materialize before building the GeoDataFrame. Handing the lazy dask
    # frame straight to the constructor loses the column names (they come
    # back as 0..N), which used to be patched with a hard-coded positional
    # rename that breaks silently whenever the parquet schema changes.
    vp_df = vp.compute() if hasattr(vp, "compute") else vp
    vp_df = vp_df.reset_index(drop=True)

    # TODO: replace with wrangle_shapes.vp_as_gdf(vp) once merged with main
    vp_gdf = gpd.GeoDataFrame(
        vp_df, geometry=gpd.points_from_xy(vp_df.x, vp_df.y), crs=WGS84
    ).to_crs(PROJECT_CRS)

    # Both sides carry a "geometry" column, so the merge suffixes them:
    # geometry_x is the vp point, geometry_y is the buffered shape.
    vp2 = pd.merge(
        vp_gdf, trips_with_shape_geom, on="trip_instance_key", how="inner"
    ).reset_index(drop=True)

    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    # Keep only the points that fall inside their own trip's buffered shape
    vp2 = vp2.assign(is_within=vp2.geometry_x.within(vp2.geometry_y)).query(
        "is_within==True"
    )

    vps_in_shape = (
        vp2.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "vp_in_shape"})
    )

    # Left merge: trips with no point inside the shape get vp_in_shape = 0
    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    count_df["total_vp_in_shape_pct"] = count_df.vp_in_shape / count_df.total_vp * 100

    return count_df

In [15]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [16]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [17]:
muni.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9240 entries, 0 to 9239
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_instance_key      9240 non-null   object 
 1   total_vp               9240 non-null   int32  
 2   vp_in_shape            9240 non-null   int32  
 3   total_vp_in_shape_pct  9240 non-null   float64
dtypes: float64(1), int32(2), object(1)
memory usage: 288.8+ KB


In [18]:
muni.trip_instance_key.nunique()

9240

In [19]:
(muni.vp_in_shape / muni.total_vp).describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
dtype: float64

In [20]:
muni.total_vp_in_shape_pct.describe()

count   9240.00
mean      93.34
std       13.91
min        0.00
25%       94.15
50%      100.00
75%      100.00
max      100.00
Name: total_vp_in_shape_pct, dtype: float64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

#### Keep only relevant `trips instance keys`?
* Trips have certain attributes: name, key, gtfs key, etc in addition to trip instance key
* Start from `vp_usable`, join to `shapes` as a first pass to get the total positions, then add a column that counts the number of pings.
* Read dataset in once, derive a bunch of rows. 
* I have 100 vps for one trip -> turn the trip into one row with summary statistics
* Pass output of dataset into three different ways
* Each pass will summarize vp-usable from thousands of rows to just one per trip
* Pass `vp_usable` into three different functions.
* Final step: merge all on trips.
* Each function is individual, can be pandas.

In [21]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

#### Third time reading in `vp_usable`

#### Density

6ef4805f0104b95614b86a2b1c374d23
* Minutes skipped: 6:12 to 6:18
* 6:19-6:26
* 6:28-7:33 etc etc
* Trip started at 5:48, ended at 8:31 

Can also check w/ 38247cbee93b6f85d58bf1812ae553b9
* Began at 16:01:00, ended at 17:00:00
* Skips having data: jumps from 4:38 to 4:54
    * Trip: know the min and the max 
    * Bin the minutes: everything else between 4:38 to 4:54 can't be binned. 
    * Dem: gap between max and min 
* Vp doesn't necessarily update every minute 
* TO DO
    * Correct trip duration numerator: average pings per minute or ten minutes or five minutes...Help us understand coverage.  
    * How to express time lapses: timedeltas in pandas

In [22]:
completeness_cols = [
    "vp_idx",
    "location_timestamp_local",
    "trip_instance_key",
    "gtfs_dataset_key",
]

In [23]:
vp_filtered = pd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    columns=completeness_cols,
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
            ("trip_instance_key", "in", relevant_trips),
        ]
    ],
)

In [24]:
def pct_of_pings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Share of each trip's duration made up of minutes with 2+ pings.

    Args:
        df: one row per trip and minute, with columns trip_instance_key,
            location_timestamp_local (datetime) and
            number_of_pings_per_minute.

    Returns:
        One row per trip_instance_key with min_time, max_time,
        atleast2_trip_updates (count of minutes with 2+ pings),
        total_trip_time (minutes) and pct_with_2_pings_per_min.
    """
    # Flag the minutes with 2+ pings. Vectorized comparison instead of a
    # row-wise apply: same 0/1 values, no Python call per row.
    has_2_pings = (df.number_of_pings_per_minute >= 2).astype(int)
    df2 = df.assign(atleast2_trip_updates=has_2_pings)

    # First / last timestamp and number of flagged minutes per trip
    df3 = (
        df2.groupby("trip_instance_key")
        .agg(
            min_time=("location_timestamp_local", "min"),
            max_time=("location_timestamp_local", "max"),
            atleast2_trip_updates=("atleast2_trip_updates", "sum"),
        )
        .reset_index()
    )

    # Add an extra minute so a trip whose first and last rows share a
    # timestamp still has a duration of 1 rather than 0
    df3["total_trip_time"] = (df3.max_time - df3.min_time) / pd.Timedelta(minutes=1) + 1

    # Find % of each trip in which one minute has 2+ pings
    df3["pct_with_2_pings_per_min"] = (
        df3.atleast2_trip_updates.divide(df3.total_trip_time) * 100
    )

    return df3

In [25]:
def density_of_pings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Typical ping density of each trip.

    Pings are summed into 5-minute bins per trip, the median bin is taken
    per trip, and that median is divided by 5.

    Args:
        df: one row per trip and minute, with columns trip_instance_key,
            location_timestamp_local (datetime) and
            number_of_pings_per_minute.

    Returns:
        One row per trip_instance_key with number_of_pings_per_5_min
        (median pings in a 5-minute bin) and avg_pings_per_5_min.
        NOTE: despite its name, avg_pings_per_5_min is the 5-minute median
        divided by 5, i.e. a per-minute rate. The name is kept because
        downstream cells reference it.
    """
    five_min_bins = pd.Grouper(key="location_timestamp_local", freq="5min")

    # Sum only the ping counts. Summing every column of the frame is
    # wasted work and fails as soon as a non-summable column is passed in.
    df2 = (
        df.groupby(["trip_instance_key", five_min_bins])["number_of_pings_per_minute"]
        .sum()
        .reset_index()
        .rename(columns={"number_of_pings_per_minute": "number_of_pings_per_5_min"})
    )

    df3 = (
        df2.groupby("trip_instance_key")
        .agg({"number_of_pings_per_5_min": "median"})
        .reset_index()
    )

    df3["avg_pings_per_5_min"] = df3.number_of_pings_per_5_min / 5

    return df3

In [26]:
def total_minutes_with_gtfs(df: pd.DataFrame):
    """
    Count, per trip, the rows that have a non-null gtfs_dataset_key.

    When the input holds one row per trip and minute, this is the
    number of minutes of the trip that recorded any vehicle positions.
    """
    minutes_per_trip = df.groupby("trip_instance_key").agg(
        total_minutes_w_gtfs=("gtfs_dataset_key", "count")
    )
    return minutes_per_trip.reset_index()

In [27]:
def update_completeness(df: pd.DataFrame):
    """
    Summarize vp update completeness with one row per trip.

    Bins the vp rows into 1-minute buckets per trip, counts the rows in
    each bucket, then combines the outputs of pct_of_pings,
    density_of_pings and total_minutes_with_gtfs on trip_instance_key.
    """
    ordered = df.sort_values(["vp_idx"]).reset_index(drop=True)

    # Number of pings in each minute of each trip. count() is applied to
    # every remaining column; the vp_idx count is the one renamed below.
    minute_bins = pd.Grouper(key="location_timestamp_local", freq="1Min")
    per_minute = ordered.groupby(["trip_instance_key", minute_bins]).count()
    per_minute = per_minute.reset_index().rename(
        columns={"vp_idx": "number_of_pings_per_minute"}
    )

    # Three independent per-trip summaries of the same per-minute table
    pings_df = pct_of_pings(per_minute)
    density_df = density_of_pings(per_minute)
    total_minutes_df = total_minutes_with_gtfs(per_minute)

    merged = pings_df.merge(density_df, on="trip_instance_key", how="inner")
    merged = merged.merge(total_minutes_df, on="trip_instance_key", how="inner")

    # Intermediate columns are not part of the final table
    helper_cols = ["min_time", "max_time", "number_of_pings_per_5_min"]
    return merged.drop(columns=helper_cols)

In [28]:
update_completeness_df = update_completeness(vp_filtered)

In [29]:
update_completeness_df.sample(3)

Unnamed: 0,trip_instance_key,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs
1870,350860b22af18fe23c8fe1fc268293a9,86,86.0,100.0,3.0,86
2465,4479c2be170d782986df20ad8593fb12,58,58.0,100.0,3.0,58
2923,5263c231e44af7edbc241a8eece1b682,67,69.0,97.1,3.0,69


In [33]:
update_completeness_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9240 entries, 0 to 9239
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   trip_instance_key         9240 non-null   object 
 1   atleast2_trip_updates     9240 non-null   int64  
 2   total_trip_time           9240 non-null   float64
 3   pct_with_2_pings_per_min  9240 non-null   float64
 4   avg_pings_per_5_min       9240 non-null   float64
 5   total_minutes_w_gtfs      9240 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 505.3+ KB


In [31]:
# update_completeness_df.pct_of_trip_w_gtfs.describe()

In [32]:
update_completeness_df.avg_pings_per_5_min.describe()

count   9240.00
mean       2.96
std        0.18
min        0.30
25%        3.00
50%        3.00
75%        3.00
max        3.00
Name: avg_pings_per_5_min, dtype: float64

In [None]:
update_completeness_df.pct_with_2_pings_per_min.describe()

## How many minutes a trip took and the average speeds?

In [None]:
vp_filtered.sample()

In [None]:
crosswalk = (
    vp_filtered[["gtfs_dataset_key", "trip_instance_key"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
crosswalk.shape

In [None]:
trip_speeds = pd.read_parquet(
    f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",
    columns=[
        "trip_instance_key",
        "speed_mph",
        "route_id",
        "time_of_day",
        "service_minutes",
        "direction_id",
        "route_name_used",
    ],
)

In [None]:
trip_speeds.sample()

### Average speed questions:
* Are we still filtering out trips with overly high/low speeds?
* What happens if a route doesn't have an id or used name? How do you merge it back?
* DO I still use the average function?
    * The results I get are kind of different than when I average

In [None]:
# Filtered
trip_speeds2 = trip_speeds.query("speed_mph <= 80")

In [None]:
len(trip_speeds), len(trip_speeds2)

In [None]:
trip_speeds2.info()

In [None]:
trip_speeds2.speed_mph.describe()

In [None]:
# HOW are there some that are right only??
pd.merge(
    trip_speeds2, crosswalk, on=["trip_instance_key"], how="outer", indicator=True
)[["_merge"]].value_counts()

In [None]:
trip_speeds2 = pd.merge(trip_speeds2, crosswalk, on=["trip_instance_key"], how="outer")

In [None]:
len(trip_speeds2), len(trip_speeds)

In [None]:
vp_filtered.sample()

In [None]:
trip_speeds2_muni = trip_speeds2.loc[
    trip_speeds2.gtfs_dataset_key == "c0e3039da063db95ebabd3fe4ee611a4"
]

In [None]:
trip_speeds2_muni.shape

In [None]:
trip_speeds2_muni.trip_instance_key.nunique()

In [None]:
# trip_speeds_all  = pd.read_parquet(f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",)

In [None]:
# https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py#L18C1-L48C17
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.

    Args:
        df: one row per trip, with a speed_mph column plus `group_cols`.
        group_cols: columns to group by.

    Returns:
        One row per group with p50_mph, n_trips, p20_mph and p80_mph;
        the speed columns are rounded to 2 decimals.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False)
        .agg({"speed_mph": lambda x: sorted(x)})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    speed_lists = df2.speed_mph_list

    # np.percentile expects q on a 0-100 scale (np.quantile is the 0-1 one).
    # Passing 0.5 / 0.2 / 0.8 here returned values just above the group
    # minimum instead of the median / p20 / p80.
    df2 = df2.assign(
        p50_mph=speed_lists.map(lambda speeds: np.percentile(speeds, 50)),
        n_trips=speed_lists.map(len).astype("int"),
        p20_mph=speed_lists.map(lambda speeds: np.percentile(speeds, 20)),
        p80_mph=speed_lists.map(lambda speeds: np.percentile(speeds, 80)),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

In [None]:
peak = calculate_avg_speeds(
    trip_speeds2_muni[trip_speeds2_muni.time_of_day.isin(["AM Peak", "PM Peak"])],
    ["gtfs_dataset_key", "route_name_used", "route_id"],
)

In [None]:
all_day = calculate_avg_speeds(
    trip_speeds2_muni,
    ["gtfs_dataset_key", "route_name_used", "route_id"],
)

In [None]:
trip_speeds2_muni.shape

In [None]:
trip_speeds2_muni.time_of_day.unique()

In [None]:
stats = pd.concat(
    [all_day.assign(time_of_day="all_day"), peak.assign(time_of_day="peak")], axis=0
)

In [None]:
stats.sample()

In [None]:
trip_speeds2_muni.sample(3)

In [None]:
stats2 = pd.merge(
    stats,
    trip_speeds2_muni.drop(columns=["time_of_day"]),
    on=["gtfs_dataset_key", "route_id", "route_name_used"],
)

In [None]:
stats2.loc[(stats2.route_id == "49") & (stats2.time_of_day == "peak")][
    ["speed_mph"]
].describe()

In [None]:
# stats2.loc[(stats2.route_id == "5R") & (stats2.time_of_day != "all_day")]

In [None]:
stats2.trip_instance_key.nunique()

In [None]:
stats2.shape

In [None]:
stats2.columns

In [None]:
update_completeness_df.columns

## Outer Join
* Add scheduled trips:

    * How to use map partitions
        * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/shapely_project_vp.py#L61
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/config.yml

In [None]:
update_completeness_df.shape

In [None]:
muni.shape

In [None]:
# Outer merge: trips present in only one of the two tables are kept
# (their columns from the other table are left as NaN).
m1 = muni.merge(update_completeness_df, on="trip_instance_key", how="outer")

#### TO DO: stats2 merges to 18,000 rows from 9000 because of change in peak...find a way to get it back to normal

In [None]:
test = m1.merge(stats2, on = "trip_instance_key", how = "inner")

In [None]:
test.shape

In [None]:
# m1 = m1.drop(columns = ['gtfs_dataset_key'])

In [None]:
m1.shape

In [None]:
m1.columns

## More Metrics
### Trip Timeliness
* A trip that was scheduled to run 46 minutes but took 43 minutes per GTFS is 5% faster
* A trip scheduled to run 82 minutes but ran 138 minutes is 70% slower.
* Median: a trip is around 35% longer than what it is scheduled to take
* Use a percentage instead of minutes, which can be arbitrary: a trip that was scheduled for 30 minutes and runs 15 minutes late makes a bigger difference than a trip scheduled for 90 minutes that runs 15 minutes late. Trying to capture this nuance.

To think about
* A trip can be faster than scheduled, but some of the rows are dramatically faster (above 70%) because it collected only 15 minutes of data when the trip is scheduled for 50 minutes. Obviously the trip can't go that quickly...How to account for this?

In [None]:
m1["trip_timeliness_pct"] = (m1.total_trip_time / m1.service_minutes - 1) * 100

In [None]:
# (74/69 - 1) * 100

In [None]:
m1.trip_timeliness_pct = (m1.trip_timeliness_pct / 5).fillna(0).round().astype(int) * 5

In [None]:
m1.trip_timeliness_pct.describe()

In [None]:
m1.loc[m1.trip_timeliness_pct < 0].shape

In [None]:
m1.loc[m1.trip_timeliness_pct  < -10].sample()

In [None]:
m1[["trip_timeliness_pct", "total_trip_time", "service_minutes"]].head(10)

In [None]:
138/82

In [None]:
# m1.loc[m1.trip_timeliness_pct == 1350]

### Metric: tracking missing minutes of data.
* These trips below for the same operator were scheduled to run 40-50 minutes. Their total trip time is 11-16 minutes long. Each minute captured at least 2 pings, so it seems like nothing is going wrong with the GTFS data collection. However, this is not a complete collection of data.
* Maybe redo the completeness??

In [None]:
m1['service_vs_total_trip_pct'] = (m1.total_trip_time / m1.service_minutes * 100).astype(int)

In [None]:
# Mask any value above 100 
m1.service_vs_total_trip_pct = m1.service_vs_total_trip_pct.mask(m1.service_vs_total_trip_pct > 100).fillna(100)

In [None]:
m1.service_vs_total_trip_pct.describe()

In [None]:
m1.loc[m1.trip_timeliness_pct == -75]

In [None]:
m1.loc[m1.service_vs_total_trip_pct != 100].shape

* Figure out a way to distinguish between a trip going faster than scheduled versus missing a lot of GTFS data

In [None]:
m1.loc[m1.service_vs_total_trip_pct != 100][['service_vs_total_trip_pct', 'total_trip_time', 'service_minutes', 'trip_timeliness_pct']].head(10)

In [None]:
m1.loc[m1.service_vs_total_trip_pct < 85].shape

#### Idea
* If a trip is more than x percent faster than scheduled service minutes..correct gtfs density
    * However describe is giving overly "dramatic" results, like a trip that ran 14 minutes but scheduled for 18 is flagged 
    * Using 30% as an arbitrary threshold 
* Can delete service vs total trip pct since it's very similar to timeliness?

In [None]:
m1.loc[m1.trip_timeliness_pct < 0][['trip_timeliness_pct']].describe()

In [None]:
m1.loc[m1.trip_timeliness_pct < 0][['trip_timeliness_pct']].value_counts()

In [None]:
m1.loc[m1.service_vs_total_trip_pct != 100][['service_vs_total_trip_pct']].describe()

In [None]:
# Take an explicit copy: a new column is assigned to test1 further down,
# and writing to a slice of m1 triggers pandas' chained-assignment warning
# (and may or may not modify m1).
test1 = m1.loc[(m1.trip_timeliness_pct <= -35)].copy()

In [None]:
test1.shape

In [None]:
preview = ['total_vp_in_shape_pct',
       'total_trip_time', 'pct_with_2_pings_per_min','total_minutes_w_gtfs', 'route_name_used',
       'route_id',
       'speed_mph', 'service_minutes','trip_timeliness_pct','service_vs_total_trip_pct','pct_trip_captured_by_gtfs']

In [None]:
16/24

In [None]:
test1['pct_trip_captured_by_gtfs'] = (test1.total_minutes_w_gtfs / test1.service_minutes * 100).astype(int)

In [None]:
test1.columns

test1[preview].sort_values(['trip_timeliness_pct'])

#### How to apply this to the whole df and account for trips that did go realistically faster than scheduled.
* Filter out the trips with missing data
* Calculate the rt time vs scheduled time for trips with missing data
* Delete those trips out of the original df
* Do the same calculation for the remaining trips
* Concat everything together

In [None]:
trip_keys_list = list(test1.trip_instance_key.unique())

In [None]:
# Drop the rows whose trip_instance_key is in trip_keys_list.
# The filter must be built from the key column: DataFrame.isin over the
# whole frame yields a cell-wise mask, and indexing with it only blanks
# matching cells to NaN while keeping every row.
m2 = m1[~m1.trip_instance_key.isin(trip_keys_list)]

In [None]:
m2.shape, m1.shape

In [None]:
# Apply to whole df
m1['pct_trip_captured_by_gtfs'] = (m1.total_minutes_w_gtfs / m1.service_minutes * 100).astype(int)

In [None]:
# Mask
m1.pct_trip_captured_by_gtfs = m1.pct_trip_captured_by_gtfs.mask(m1.pct_trip_captured_by_gtfs > 100).fillna(100)

In [None]:
m1['pct_trip_captured_by_gtfs'].describe()

In [None]:
m1.head()[preview]

In [None]:
m1.loc[m1['pct_trip_captured_by_gtfs'] == 5][preview]

In [None]:
m1.loc[m1['trip_timeliness_pct'] == -75][preview]

In [None]:
# 7;55 to 8:10: 15 minutes
# vp_filtered.loc[vp_filtered.trip_instance_key == "3513c07b09c3da765ff43dd7317a7689"]