# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime

import dask
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    CONFIG_PATH,
    GCS_FILE_PATH,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SEGMENT_GCS,
    analysis_date,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp
#### First time reading `vp_usable`

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.
    """
    vp_trip_df = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters=[
            [
                ("gtfs_dataset_name", "==", operator),
                ("schedule_gtfs_dataset_key", "==", gtfs_key),
            ]
        ],
        columns=["trip_instance_key"],
    )

    vp_trip_df = vp_trip_df.drop_duplicates().reset_index(drop=True)

    # Make sure we have a shape geometry too
    # otherwise map_partitions will throw error
    shapes = (
        pd.read_parquet(
            f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
            columns=["shape_array_key"],
        )
        .dropna()
        .drop_duplicates()
    )

    trips_with_shape = (
        helpers.import_scheduled_trips(
            analysis_date,
            columns=["trip_instance_key", "shape_array_key"],
            get_pandas=True,
        )
        .merge(shapes, on="shape_array_key", how="inner")
        .merge(vp_trip_df, on="trip_instance_key", how="inner")
        .drop_duplicates()
        .dropna()
        .reset_index(drop=True)
    )

    return trips_with_shape

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
trips_with_shape_geom = vp_spatial_accuracy.buffer_shapes(
    analysis_date, trips_with_shape, 35
)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

#### Second time reading in the same file, streamline

In [10]:
vp = dd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
        ]
    ],
)

### Full function

In [11]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame, trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.
    """
    # COMMENT BACK IN ONCE I MERGE WITH MAIN
    # vp_gdf = wrangle_shapes.vp_as_gdf(vp)

    vp_gdf = gpd.GeoDataFrame(
        vp, geometry=gpd.points_from_xy(vp.x, vp.y), crs=WGS84
    ).to_crs(PROJECT_CRS)

    vp_gdf = vp_gdf.rename(
        columns={
            0: "gtfs_dataset_name",
            1: "schedule_gtfs_dataset_key",
            2: "trip_id",
            3: "trip_instance_key",
            4: "location_timestamp",
            5: "location_timestamp_local",
            6: "x",
            7: "y",
            8: "vp_idx",
            9: "gtfs_dataset_key",
            10: "vp_dir_xnorm",
            11: "vp_dir_ynorm",
            12: "vp_primary_direction",
        }
    )

    vp2 = pd.merge(
        vp_gdf, trips_with_shape_geom, on="trip_instance_key", how="inner"
    ).reset_index(drop=True)

    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    vp2 = vp2.assign(is_within=vp2.geometry_x.within(vp2.geometry_y)).query(
        "is_within==True"
    )

    vps_in_shape = (
        vp2.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "vp_in_shape"})
    )

    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    count_df["total_vp_in_shape_pct"] = count_df.vp_in_shape / count_df.total_vp * 100

    return count_df

In [12]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [13]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [14]:
muni.trip_instance_key.nunique()

9240

In [15]:
(muni.vp_in_shape / muni.total_vp).describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
dtype: float64

In [16]:
muni.total_vp_in_shape_pct.describe()

count   9240.00
mean      93.34
std       13.91
min        0.00
25%       94.15
50%      100.00
75%      100.00
max      100.00
Name: total_vp_in_shape_pct, dtype: float64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

#### Keep only relevant `trips instance keys`?
* Trips have certain attributes: name, key, gtfs key, etc in addition to trip instance key
* Start from `vp_usable` join to `shapes` as first pass and get total positions in, add a column that counts counts number of pings
* Read dataset in once, derive a bunch of rows. 
* I have 100 vps for one trip -> turn the trip into one row with summary statistics
* Pass output of dataset into three different ways
* Each pass will summarize vp-usable from thousands of rows to just one per trip
* Pass `vp_usable` into three different functions.
* Final step: merge all on trips.
* Each function is individual, can be pandas.

In [17]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

#### Third time reading in `vp_usable`

#### Density

6ef4805f0104b95614b86a2b1c374d23
* Minutes skipped: 6:12 to 6:18
* 6:19-6:26
* 6:28-7:33 etc etc
* Trip started at 5:48, ended at 8:31 

Can also check w/ 38247cbee93b6f85d58bf1812ae553b9
* Began at 16:01:00, end at 17:00:000
* Skips having data: jumps from 4:38 to 4:54
    * Trip: know the min and the max 
    * Bin the minutes: everything else between 4:38 to 4:54 can't be binned. 
    * Dem: gap between max and min 
* Vp doesn't necessarily update every minute 
* TO DO
    * Correct trip duration numerator: average pings per minute or ten minutes or five minutes...Help us understand coverage.  
    * How to express time lapses: time deletas in pandas

In [18]:
completeness_cols = [
    "vp_idx",
    "location_timestamp_local",
    "trip_instance_key",
    "gtfs_dataset_key",
]

In [19]:
vp_filtered = pd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    columns=completeness_cols,
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
            ("trip_instance_key", "in", relevant_trips),
        ]
    ],
)

In [20]:
def pct_of_pings(df: pd.DataFrame):

    # Determine which rows have 2+ pings per minute
    df2 = df.assign(
        atleast2_trip_updates=df.apply(
            lambda x: 1 if x.number_of_pings_per_minute >= 2 else 0, axis=1
        )
    )

    # Calculate total trip time
    df2["max_time"] = df2.location_timestamp_local
    df3 = (
        df2.groupby(["trip_instance_key"])
        .agg(
            {
                "location_timestamp_local": "min",
                "max_time": "max",
                "atleast2_trip_updates": "sum",
            }
        )
        .reset_index()
        .rename(columns={"location_timestamp_local": "min_time"})
    )

    # Add an extra minute
    df3["total_trip_time"] = (df3.max_time - df3.min_time) / pd.Timedelta(minutes=1) + 1

    # Find % of each trip in which one minute has 2+ pings
    df3 = df3.assign(
        pct_with_2_pings_per_min=df3.atleast2_trip_updates.divide(df3.total_trip_time)
        * 100
    )

    return df3

In [21]:
def density_of_pings(df: pd.DataFrame):

    # Count number of pings that occur
    # on average per 5 minutes of the trip
    df2 = (
        df.groupby(
            [
                *["trip_instance_key"],
                pd.Grouper(key="location_timestamp_local", freq="5Min"),
            ]
        )
        .sum()
        .reset_index()
        .rename(columns={"number_of_pings_per_minute": "number_of_pings_per_5_min"})
    )

    df3 = (
        df2.groupby(["trip_instance_key"])
        .agg({"number_of_pings_per_5_min": "median"})
        .reset_index()
    )

    df3["avg_pings_per_5_min"] = df3.number_of_pings_per_5_min / 5

    return df3

In [22]:
def total_minutes_with_gtfs(df: pd.DataFrame):
    """
    Total minutes of a trip that recorded any
    vehicle positions.
    """
    df2 = (
        df.groupby(["trip_instance_key"])
        .agg({"gtfs_dataset_key": "count"})
        .rename(columns={"gtfs_dataset_key": "total_minutes_w_gtfs"})
        .reset_index()
    )
    return df2

In [23]:
def update_completeness(df: pd.DataFrame):
    # Find number of pings per minute for each trip
    df = df.sort_values(["vp_idx"]).reset_index(drop=True)

    df2 = (
        df.groupby(
            [
                *["trip_instance_key"],
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    pings_df = pct_of_pings(df2)
    density_df = density_of_pings(df2)
    total_minutes_df = total_minutes_with_gtfs(df2)

    m1 = pings_df.merge(density_df, on="trip_instance_key", how="inner").merge(
        total_minutes_df, on="trip_instance_key", how="inner"
    )

    # m1["pct_of_trip_w_gtfs"] = m1.total_minutes_w_gtfs / m1.total_trip_time * 100
    m1 = m1.drop(columns=["min_time", "max_time", "number_of_pings_per_5_min"])
    return m1

In [24]:
update_completeness_df = update_completeness(vp_filtered)

In [25]:
update_completeness_df.sample(3)

Unnamed: 0,trip_instance_key,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs
7455,cf957f0cf035da1ce22c5dd5311f1df1,75,75.0,100.0,3.0,75
1667,2f796977ba5835faea59ae2f266ffa75,93,93.0,100.0,3.0,93
754,15289e832dbef429b4fca144e9eafcf9,50,50.0,100.0,3.0,50


In [27]:
# update_completeness_df.pct_of_trip_w_gtfs.describe()

In [28]:
update_completeness_df.avg_pings_per_5_min.describe()

count   9240.00
mean       2.96
std        0.18
min        0.30
25%        3.00
50%        3.00
75%        3.00
max        3.00
Name: avg_pings_per_5_min, dtype: float64

In [29]:
update_completeness_df.pct_with_2_pings_per_min.describe()

count   9240.00
mean      97.50
std        5.89
min        2.04
25%       97.62
50%       98.67
75%      100.00
max      100.00
Name: pct_with_2_pings_per_min, dtype: float64

## How many minutes a trip took and the average speeds?

In [30]:
vp_filtered.sample()

Unnamed: 0,vp_idx,location_timestamp_local,trip_instance_key,gtfs_dataset_key
11771165,11771165,2023-10-11 11:48:49,1f31a1602d6f0da057e1716ae7e300b2,c0e3039da063db95ebabd3fe4ee611a4


In [31]:
crosswalk = (
    vp_filtered[["gtfs_dataset_key", "trip_instance_key"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [32]:
crosswalk.shape

(9240, 2)

In [33]:
trip_speeds = pd.read_parquet(
    f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",
    columns=[
        "trip_instance_key",
        "speed_mph",
        "route_id",
        "time_of_day",
        "service_minutes",
        "direction_id",
        "route_name_used",
    ],
)

In [34]:
trip_speeds.sample()

Unnamed: 0,trip_instance_key,speed_mph,route_id,time_of_day,service_minutes,direction_id,route_name_used
52991,b2b20c73cb81f463b786f2363f6d69a9,9.71,45-13168,PM Peak,79.0,1,LINCOLN HEIGHTS-DTWN LA-HARBR FWY STA VIA BROADWAY


### Average speed questions:
* Are we still filtering out trips with overly high/low speeds?
* What happens if a route doesn't have an id or used name? How do you merge it back?
* DO I still use the average function?
    * The results I get are kind of different

In [35]:
# Filtered
trip_speeds2 = trip_speeds.query("speed_mph <= 80")

In [36]:
trip_speeds2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76255 entries, 0 to 76254
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trip_instance_key  76255 non-null  object 
 1   speed_mph          76255 non-null  float64
 2   route_id           74710 non-null  object 
 3   time_of_day        76255 non-null  object 
 4   service_minutes    76255 non-null  float64
 5   direction_id       74710 non-null  Int64  
 6   route_name_used    74710 non-null  object 
dtypes: Int64(1), float64(2), object(4)
memory usage: 4.7+ MB


In [37]:
trip_speeds2.speed_mph.describe()

count   76255.00
mean       10.03
std         5.94
min         0.04
25%         6.02
50%         8.85
75%        12.57
max        79.56
Name: speed_mph, dtype: float64

In [38]:
# trip_speeds2 = trip_speeds2.fillna('NA')

In [39]:
len(trip_speeds2), len(trip_speeds)

(76255, 76255)

In [40]:
# HOW are there some that are right only??
pd.merge(
    trip_speeds2, crosswalk, on=["trip_instance_key"], how="outer", indicator=True
)[["_merge"]].value_counts()

_merge    
left_only     67113
both           9142
right_only       98
dtype: int64

In [41]:
trip_speeds2 = pd.merge(trip_speeds2, crosswalk, on=["trip_instance_key"], how="outer")

In [42]:
len(trip_speeds2), len(trip_speeds)

(76353, 76255)

In [43]:
vp_filtered.sample()

Unnamed: 0,vp_idx,location_timestamp_local,trip_instance_key,gtfs_dataset_key
11595213,11595213,2023-10-11 10:57:35,df819485ae9b6dcc8e74579480eeed57,c0e3039da063db95ebabd3fe4ee611a4


In [44]:
trip_speeds2_muni = trip_speeds2.loc[
    trip_speeds2.gtfs_dataset_key == "c0e3039da063db95ebabd3fe4ee611a4"
]

In [45]:
trip_speeds2_muni.shape

(9240, 8)

In [46]:
trip_speeds2_muni.trip_instance_key.nunique()

9240

In [47]:
# trip_speeds_all  = pd.read_parquet(f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",)

In [48]:
# https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py#L18C1-L48C17
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.5), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.2), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.8), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

In [49]:
peak = calculate_avg_speeds(
    trip_speeds2_muni[trip_speeds2_muni.time_of_day.isin(["AM Peak", "PM Peak"])],
    ["gtfs_dataset_key", "route_name_used", "route_id"],
)

In [50]:
peak2 = calculate_avg_speeds(
    trip_speeds2_muni[trip_speeds2_muni.time_of_day.isin(["AM Peak", "PM Peak"])],
    ["trip_instance_key", "route_name_used", "route_id"],
)

In [51]:
all_day = calculate_avg_speeds(
    trip_speeds2_muni,
    ["gtfs_dataset_key", "route_name_used", "route_id"],
)

In [219]:
trip_speeds2_muni.shape

(9240, 8)

In [52]:
all_day.loc[(all_day.route_name_used == "5am-10pm daily") & (all_day.route_id == "14R")]

Unnamed: 0,gtfs_dataset_key,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph
9,c0e3039da063db95ebabd3fe4ee611a4,5am-10pm daily,14R,3.46,261,3.01,3.48


In [53]:
stats = pd.concat(
    [all_day.assign(time_of_day="all_day"), peak.assign(time_of_day="peak")], axis=0
)

In [54]:
stats.sample()

Unnamed: 0,gtfs_dataset_key,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
30,c0e3039da063db95ebabd3fe4ee611a4,6:30am-7pm daily,2,1.99,39,1.97,2.0,peak


In [223]:
trip_speeds2_muni.sample()

Unnamed: 0,trip_instance_key,speed_mph,route_id,time_of_day,service_minutes,direction_id,route_name_used,gtfs_dataset_key
69795,eaadaeebd44abcf317cbc098314a4b13,5.83,67,PM Peak,17.0,0,Weekdays 5am-10pm Weekends 6am-10pm,c0e3039da063db95ebabd3fe4ee611a4


In [222]:
trip_speeds2_muni.columns

Index(['trip_instance_key', 'speed_mph', 'route_id', 'time_of_day',
       'service_minutes', 'direction_id', 'route_name_used',
       'gtfs_dataset_key'],
      dtype='object')

In [221]:
trip_speeds2_muni.columns

Index(['trip_instance_key', 'speed_mph', 'route_id', 'time_of_day',
       'service_minutes', 'direction_id', 'route_name_used',
       'gtfs_dataset_key'],
      dtype='object')

In [55]:
stats2 = pd.merge(
    stats,
    trip_speeds2_muni.drop(columns=["time_of_day"]),
    on=["gtfs_dataset_key", "route_id", "route_name_used"],
)

In [56]:
stats2.loc[(stats2.route_id == "49") & (stats2.time_of_day == "peak")][
    ["speed_mph"]
].describe()

Unnamed: 0,speed_mph
count,347.0
mean,5.55
std,2.04
min,0.15
25%,3.72
50%,5.95
75%,6.85
max,15.97


In [57]:
# stats2.loc[(stats2.route_id == "5R") & (stats2.time_of_day != "all_day")]

In [58]:
stats2.trip_instance_key.nunique()

9142

In [218]:
stats2.shape

(18145, 12)

In [59]:
stats2.columns

Index(['gtfs_dataset_key', 'route_name_used', 'route_id', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day', 'trip_instance_key', 'speed_mph',
       'service_minutes', 'direction_id'],
      dtype='object')

In [60]:
update_completeness_df.columns

Index(['trip_instance_key', 'atleast2_trip_updates', 'total_trip_time',
       'pct_with_2_pings_per_min', 'avg_pings_per_5_min',
       'total_minutes_w_gtfs'],
      dtype='object')

## Outer Join
* Add scheduled trips:

    * How to use map partitions
        * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/shapely_project_vp.py#L61
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/config.yml

In [212]:
update_completeness_df.shape

(9240, 6)

In [213]:
muni.shape

(9240, 4)

In [214]:
# Inner merge for now b/c only SF Muni
m1 = muni.merge(update_completeness_df, on="trip_instance_key", how="outer")

#### TO DO: stats2 merges to 18,000 rows from 9000 because of change in peak...find a way to get it back to normal

In [215]:
test = m1.merge(stats2, on = "trip_instance_key", how = "inner")

In [216]:
test.shape

(18145, 20)

In [210]:
m1 = m1.drop(columns = ['gtfs_dataset_key'])

In [211]:
m1.shape

(18145, 19)

## More Metrics
### Trip Timeliness
* A trip that was scheduled to run 46 minutes but took 43 minutes per GTFS is 5% faster
* A trip scheduled to run 82 minutes but ran 138 minutes is 70% slower.
* Median: a trip is around 35% longer than what it is scheduled to take
* Instead of using minutes, which can be arbitrary. A trip that was scheduled for 30 minutes and runs 15 minutes late makes a bigger difference than a trip shceduled for 90 minutes and runs 15 minutes late. Trying to capture this nuance.

To think about
* A trip can be faster than scheduled, but some of the rows are dramatically faster (above 70%) because it collected only 15 minutes of data when the trip is scheduled for 50 minutes. Obviously the trip can't go that quickly...How to account for this?

In [105]:
m1["trip_timeliness_pct"] = (m1.total_trip_time / m1.service_minutes - 1) * 100

In [106]:
# (74/69 - 1) * 100

In [107]:
m1.trip_timeliness_pct = (m1.trip_timeliness_pct / 5).fillna(0).round().astype(int) * 5

In [108]:
m1.trip_timeliness_pct.describe()

count   18145.00
mean       42.31
std        55.70
min       -75.00
25%        15.00
50%        35.00
75%        55.00
max      1350.00
Name: trip_timeliness_pct, dtype: float64

In [109]:
m1.loc[m1.trip_timeliness_pct < 0].shape

(1272, 22)

In [111]:
m1.loc[m1.trip_timeliness_pct  < -10].sample()

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_pct,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph,time_of_day,speed_mph,service_minutes,direction_id,trip_timeliness_pct,service_min_vs_total_trip_time,service_vs_total_trip_pct
10456,94ead7250ffa9090c64d818ab1dd7357,33,33,100.0,11,11.0,100.0,1.8,11,9am-7pm daily,39,2.25,30,2.25,2.26,peak,4.85,16.0,1,-30,68.0,68.0


In [112]:
m1[["trip_timeliness_pct", "total_trip_time", "service_minutes"]].head(10)

Unnamed: 0,trip_timeliness_pct,total_trip_time,service_minutes
0,-5,43.0,46.0
1,-5,43.0,46.0
2,5,52.0,49.0
3,5,52.0,49.0
4,70,138.0,82.0
5,70,138.0,82.0
6,20,69.0,57.0
7,20,69.0,57.0
8,40,76.0,55.0
9,40,76.0,55.0


In [113]:
138/82

1.6829268292682926

In [114]:
# m1.loc[m1.trip_timeliness_pct == 1350]

### Metric: tracking missing minutes of data.
* These trips below for the same operator were scheduled to run 40-50 minutes. Their total trip time is 11-16 minutes long. Each minute captured at least 2 pings, so it seems like nothing is going wrong with the GTFS data collection. However, this is not a complete collection of data.
* Maybe redo the completeness??

In [115]:
m1['service_vs_total_trip_pct'] = (m1.total_trip_time / m1.service_minutes * 100).astype(int)

In [116]:
# Mask any value above 100 
m1.service_vs_total_trip_pct = m1.service_vs_total_trip_pct.mask(m1.service_vs_total_trip_pct > 100).fillna(100)

In [117]:
m1.service_vs_total_trip_pct.describe()

count   18145.00
mean       98.98
std         4.95
min        26.00
25%       100.00
50%       100.00
75%       100.00
max       100.00
Name: service_vs_total_trip_pct, dtype: float64

In [118]:
m1.loc[m1.trip_timeliness_pct == -75]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_pct,atleast2_trip_updates,total_trip_time,pct_with_2_pings_per_min,avg_pings_per_5_min,total_minutes_w_gtfs,route_name_used,route_id,p50_mph,n_trips,p20_mph,p80_mph,time_of_day,speed_mph,service_minutes,direction_id,trip_timeliness_pct,service_min_vs_total_trip_time,service_vs_total_trip_pct
10373,9352e26ef303febc50a799f0a6885a20,40,40,100.0,13,14.0,92.86,2.2,14,7am-10pm daily,F,2.0,141,1.99,2.01,all_day,16.47,52.0,0,-75,26.0,26.0
10374,9352e26ef303febc50a799f0a6885a20,40,40,100.0,13,14.0,92.86,2.2,14,7am-10pm daily,F,2.06,71,2.04,2.08,peak,16.47,52.0,0,-75,26.0,26.0


In [119]:
m1.loc[m1.service_vs_total_trip_pct != 100].shape

(1446, 22)

* Figure out a way to distinguish between a trip going faster than scheduled versus missing a lot of GTFS data

In [120]:
m1.loc[m1.service_vs_total_trip_pct != 100][['service_vs_total_trip_pct', 'total_trip_time', 'service_minutes', 'trip_timeliness_pct']].head(10)

Unnamed: 0,service_vs_total_trip_pct,total_trip_time,service_minutes,trip_timeliness_pct
0,93.0,43.0,46.0,-5
1,93.0,43.0,46.0,-5
60,95.0,64.0,67.0,-5
61,95.0,64.0,67.0,-5
70,97.0,40.0,41.0,0
71,97.0,40.0,41.0,0
80,72.0,31.0,43.0,-30
81,72.0,31.0,43.0,-30
88,94.0,36.0,38.0,-5
89,94.0,36.0,38.0,-5


In [122]:
m1.loc[m1.service_vs_total_trip_pct < 85].shape

(352, 22)

#### Idea
* If a trip is more than x percent faster than scheduled service minutes..correct gtfs density
    * However describe is giving overly "dramatic" results, like a trip that ran 14 minutes but scheduled for 18 is flagged 
    * Using 30% as an arbitrary threshold 
* Can delete service vs total trip pct since it's the very similar to timeliness??

In [110]:
m1.loc[m1.trip_timeliness_pct < 0][['trip_timeliness_pct']].describe()

Unnamed: 0,trip_timeliness_pct
count,1272.0
mean,-13.79
std,12.82
min,-75.0
25%,-15.0
50%,-10.0
75%,-5.0
max,-5.0


In [144]:
m1.loc[m1.trip_timeliness_pct < 0][['trip_timeliness_pct']].value_counts()

trip_timeliness_pct
-5                     529
-10                    303
-15                    150
-20                     78
-25                     50
-35                     42
-30                     42
-40                     22
-60                     16
-55                     10
-50                     10
-45                      8
-70                      6
-65                      4
-75                      2
dtype: int64

In [181]:
m1.loc[m1.service_vs_total_trip_pct != 100][['service_vs_total_trip_pct']].describe()

Unnamed: 0,service_vs_total_trip_pct
count,1446.0
mean,87.26
std,12.6
min,26.0
25%,85.0
50%,92.0
75%,95.0
max,99.0


In [190]:
test1 = m1.loc[(m1.trip_timeliness_pct <= -35)]

In [191]:
test1.shape

(120, 23)

In [192]:
preview = ['total_vp_in_shape_pct',
       'total_trip_time', 'pct_with_2_pings_per_min','total_minutes_w_gtfs', 'route_name_used',
       'route_id',
       'speed_mph', 'service_minutes','trip_timeliness_pct','service_vs_total_trip_pct','pct_trip_captured_by_gtfs']

In [193]:
16/24

0.6666666666666666

In [194]:
test1['pct_trip_captured_by_gtfs'] = (test1.total_minutes_w_gtfs / test1.service_minutes * 100).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test1['pct_trip_captured_by_gtfs'] = (test1.total_minutes_w_gtfs / test1.service_minutes * 100).astype(int)


In [195]:
test1.columns

Index(['trip_instance_key', 'total_vp', 'vp_in_shape', 'total_vp_in_shape_pct',
       'atleast2_trip_updates', 'total_trip_time', 'pct_with_2_pings_per_min',
       'avg_pings_per_5_min', 'total_minutes_w_gtfs', 'route_name_used',
       'route_id', 'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day',
       'speed_mph', 'service_minutes', 'direction_id', 'trip_timeliness_pct',
       'service_min_vs_total_trip_time', 'service_vs_total_trip_pct',
       'pct_trip_captured_by_gtfs'],
      dtype='object')

test1[preview].sort_values(['trip_timeliness_pct'])

#### How to apply this to the whole df and account for trips that did go realistically faster than scheduled.
* Filter out the trips with missing data
* Calculate the rt time vs scheduled time for trips with missing data
* Delete those trips out of the original df
* Do the same calculation for the remaining trips
* Concat everythign together

In [196]:
trip_keys_list = list(test1.trip_instance_key.unique())

In [206]:
# Delete out trip keys 
m2 = m1[~m1.isin(trip_keys_list)]

In [207]:
m2.shape, m1.shape

((18145, 23), (18145, 23))

In [198]:
# Apply to whole df
m1['pct_trip_captured_by_gtfs'] = (m1.total_minutes_w_gtfs / m1.service_minutes * 100).astype(int)

In [199]:
# Mask
m1.pct_trip_captured_by_gtfs = m1.pct_trip_captured_by_gtfs.mask(m1.pct_trip_captured_by_gtfs > 100).fillna(100)

In [200]:
m1['pct_trip_captured_by_gtfs'].describe()

count   18145.00
mean       98.76
std         6.05
min         5.00
25%       100.00
50%       100.00
75%       100.00
max       100.00
Name: pct_trip_captured_by_gtfs, dtype: float64

In [201]:
m1.head()[preview]

Unnamed: 0,total_vp_in_shape_pct,total_trip_time,pct_with_2_pings_per_min,total_minutes_w_gtfs,route_name_used,route_id,speed_mph,service_minutes,trip_timeliness_pct,service_vs_total_trip_pct,pct_trip_captured_by_gtfs
0,100.0,43.0,97.67,43,5am-10pm daily,14R,10.77,46.0,-5,93.0,93.0
1,100.0,43.0,97.67,43,5am-10pm daily,14R,10.77,46.0,-5,93.0,93.0
2,92.11,52.0,96.15,52,Weekdays 7am-7pm,5R,4.16,49.0,5,100.0,100.0
3,92.11,52.0,96.15,52,Weekdays 7am-7pm,5R,4.16,49.0,5,100.0,100.0
4,76.27,138.0,100.0,138,5am-12 midnight daily,43,3.77,82.0,70,100.0,100.0


In [202]:
m1.loc[m1['pct_trip_captured_by_gtfs'] == 5][preview]

Unnamed: 0,total_vp_in_shape_pct,total_trip_time,pct_with_2_pings_per_min,total_minutes_w_gtfs,route_name_used,route_id,speed_mph,service_minutes,trip_timeliness_pct,service_vs_total_trip_pct,pct_trip_captured_by_gtfs
13413,100.0,49.0,2.04,2,5am-12 midnight daily,K,5.09,39.0,25,100.0,5.0
13414,100.0,49.0,2.04,2,5am-12 midnight daily,K,5.09,39.0,25,100.0,5.0


In [203]:
m1.loc[m1['trip_timeliness_pct'] == -75][preview]

Unnamed: 0,total_vp_in_shape_pct,total_trip_time,pct_with_2_pings_per_min,total_minutes_w_gtfs,route_name_used,route_id,speed_mph,service_minutes,trip_timeliness_pct,service_vs_total_trip_pct,pct_trip_captured_by_gtfs
10373,100.0,14.0,92.86,14,7am-10pm daily,F,16.47,52.0,-75,26.0,26.0
10374,100.0,14.0,92.86,14,7am-10pm daily,F,16.47,52.0,-75,26.0,26.0


In [204]:
# 7;55 to 8:10: 15 minutes
# vp_filtered.loc[vp_filtered.trip_instance_key == "3513c07b09c3da765ff43dd7317a7689"]