## Map Partitions Test - Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/nearest_vp_to_stop.py
* The functions should all start from `vp_usable`
* cd rt_segment_speeds && pip install -r requirements.txt && cd

In [1]:
import datetime
from typing import Literal

import dask
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import utils
from calitp_data_analysis.geography_utils import WGS84
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, sched_rt_utils, segment_calcs, wrangle_shapes
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    CONFIG_PATH,
    GCS_FILE_PATH,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SEGMENT_GCS,
    analysis_date,
)

# cd rt_segment_speeds && pip install -r requirements.txt && cd
from shared_utils import portfolio_utils, schedule_rt_utils
from shared_utils.rt_utils import MPH_PER_MPS

In [2]:
# Times
import datetime

from loguru import logger

In [3]:
# Notebook display settings: show up to 100 columns, every row, untruncated
# cell contents, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# 14,514,960 rows
vp_usable = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}")

In [None]:
gtfs_keys = ["7cc0cb1871dfd558f11a2885c145d144", "d2b09fbd392b28d767c28ea26529b0cd"]

In [None]:
# Test a subset
# vp_usable_subset = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key.isin(gtfs_keys)]

### % of total trip time with 2 pings per minute - 2 minutes total

In [7]:
def two_pings_per_min(vp_usable_df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize, per trip, how densely vehicle positions were reported.

    Pings (`vp_idx`) are first counted for every trip and every 1-minute
    bin of `location_timestamp_local` that contains at least one ping.
    Those per-minute counts are then rolled up to one row per
    `trip_instance_key`.

    Parameters
    ----------
    vp_usable_df:
        Vehicle positions with at least the columns `trip_instance_key`,
        `location_timestamp_local` (datetime) and `vp_idx`.

    Returns
    -------
    pd.DataFrame with one row per trip and the columns:
        trip_instance_key
        minutes_w_atleast2_trip_updates: number of minutes with 2+ pings
        median_pings_per_min: median of the per-minute ping counts (float)
        total_minute_w_gtfs: number of minutes with at least one ping
        total_trip_time: minutes between the first and last minute bin,
            plus one so that the last minute is included

    NOTE(review): when this is applied per partition, a trip whose pings
    are split across partitions would produce one row per partition --
    confirm that the input is partitioned by trip.
    """
    # Number of pings in each (trip, minute) bin
    pings_per_minute = (
        vp_usable_df.groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Flag the minutes with 2+ pings. A vectorized comparison replaces the
    # row-wise apply: it is faster and also well-defined on an empty frame.
    pings_per_minute = pings_per_minute.assign(
        minutes_w_atleast2_trip_updates=(
            pings_per_minute.number_of_pings_per_minute >= 2
        ).astype("int64")
    )

    # Roll up to one row per trip. Named aggregation lets the same source
    # column feed several outputs without making scratch copies of it.
    by_trip = (
        pings_per_minute.groupby("trip_instance_key")
        .agg(
            min_time=("location_timestamp_local", "min"),
            max_time=("location_timestamp_local", "max"),
            minutes_w_atleast2_trip_updates=(
                "minutes_w_atleast2_trip_updates",
                "sum",
            ),
            median_pings_per_min=("number_of_pings_per_minute", "median"),
            total_minute_w_gtfs=("number_of_pings_per_minute", "count"),
        )
        .reset_index()
    )

    # min_time / max_time are minute-bin labels, so their difference misses
    # the final minute; add it back.
    by_trip["total_trip_time"] = (by_trip.max_time - by_trip.min_time) / pd.Timedelta(
        minutes=1
    ) + 1

    return by_trip.drop(columns=["min_time", "max_time"])

In [None]:
# df1 = two_pings_per_min(vp_usable_pd)

In [None]:
# df1.info()

In [None]:
len(vp_usable)

In [None]:
start = datetime.datetime.now()
print(start)
# Run the per-trip ping summary on every partition of vp_usable.
partitions_test1 = vp_usable.map_partitions(
    two_pings_per_min,
    meta={
        "trip_instance_key": "object",
        "minutes_w_atleast2_trip_updates": "int64",
        # The median of integer counts is a float in pandas, so the meta
        # must say float64 to match what each partition really returns.
        "median_pings_per_min": "float64",
        "total_minute_w_gtfs": "int64",
        "total_trip_time": "float64",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
type(partitions_test1)

In [None]:

start = datetime.datetime.now()
print(start)
partitions_test1 = partitions_test1.compute()

In [None]:
partitions_test1.median_pings_per_min.describe()

In [None]:
partitions_test1.sample(3)

In [None]:
# df2.loc[df2.trip_instance_key ==  "00068c2e2316950af50ffaa9584c7a46"]

### Density: on average, how many pings occur per minute
* Takes 34 secs
* Double check this

In [8]:
def density_pings_5_min(vp_usable_df: pd.DataFrame) -> pd.DataFrame:
    """
    Find, for each trip, the median number of pings per 5-minute window.

    Pings (`vp_idx`) are counted per trip and per 5-minute bin of
    `location_timestamp_local`; the median of those counts is then taken
    for each `trip_instance_key`.

    Returns a dataframe with the columns `trip_instance_key` and
    `median_pings_per_5_min`. The value is left as pings per 5 minutes
    (it is not divided by 5 to get a per-minute rate).
    """
    five_minute_bins = pd.Grouper(key="location_timestamp_local", freq="5Min")

    # One row per (trip, 5-minute bin) holding the ping count in `vp_idx`
    pings_per_bin = (
        vp_usable_df.groupby(["trip_instance_key", five_minute_bins])["vp_idx"]
        .count()
        .reset_index()
    )

    # Collapse to one row per trip
    median_by_trip = pings_per_bin.groupby("trip_instance_key")["vp_idx"].median()

    return median_by_trip.rename("median_pings_per_5_min").reset_index()

In [None]:
# df2 = density_pings_5_min(vp_usable_pd)

In [None]:
# df2.info()

In [None]:
start = datetime.datetime.now()
print(start)
partitions_test2 = vp_usable.map_partitions(
    density_pings_5_min,
    meta={"trip_instance_key": "object", "median_pings_per_5_min": "float64"},
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
# len(partitions_test2)

In [None]:
# len(partitions_test1)

In [None]:
partitions_test2 = partitions_test2.compute()

### Spatial Accuracy - 18 mins
* Do I use shapes or trips_with_shape?

#### Test to see difference between `shapes` and `trips_with_shape`

In [None]:
shapes = (
    pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns=["shape_array_key"],
    )
    .dropna()
    .drop_duplicates()
)

In [None]:
trips_with_shape = helpers.import_scheduled_trips(
    analysis_date,
    columns=["trip_instance_key", "shape_array_key"],
    get_pandas=True,
)

In [None]:
trips_with_shape.head()

In [None]:
trips_with_shape.trip_instance_key.nunique(), trips_with_shape.shape

In [None]:
trips_with_shape_shapes = set(trips_with_shape.shape_array_key.unique().tolist())
shapes_shapes = set(shapes.shape_array_key.unique().tolist())
trips_with_shape_shapes - shapes_shapes

In [None]:
shapes_shapes - trips_with_shape_shapes

#### `grab_shape_keys_in_vp` Takes 1 second

In [9]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> dd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.

    Parameters
    ----------
    vp_usable:
        Vehicle positions (dask); only `trip_instance_key` is used.
    analysis_date:
        Date passed to `helpers.import_scheduled_trips` to pick the
        scheduled trips to join against.

    Returns
    -------
    A lazy dask dataframe with the columns `trip_instance_key` and
    `shape_array_key`. The merge has a dask frame on the left, so the
    result is not computed here; callers must `.compute()` it themselves.
    """
    # Unique trips present in vp (kept under a new name so the input
    # argument is not rebound)
    trips_in_vp = (
        vp_usable[["trip_instance_key"]].drop_duplicates().reset_index(drop=True)
    )

    trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Inner merge: trips without a scheduled match are dropped.
    # trip_instance_key and shape_array_key are the only 2 cols left.
    return dd.merge(trips_in_vp, trips_with_shape, on="trip_instance_key", how="inner")

In [None]:

start = datetime.datetime.now()
print(start)
spatial_df1 = grab_shape_keys_in_vp(vp_usable, analysis_date)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

#### `buffer_shapes` takes 3 mins

In [10]:
def buffer_shapes(
    trips_with_shape: dd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
):
    """
    Filter scheduled shapes down to the shapes that appear in vp.
    Buffer these.

    Attach the shape geometry for a subset of shapes or trips.

    Parameters
    ----------
    trips_with_shape:
        Dask dataframe with `trip_instance_key` and `shape_array_key`
        (`.compute()` is called on its unique shape keys, so a plain
        pandas dataframe will not work here).
    analysis_date:
        Date passed to `helpers.import_scheduled_shapes`.
    buffer_meters:
        Distance passed to `geometry.buffer`.
        NOTE(review): assumes PROJECT_CRS is in meters -- confirm.

    Returns
    -------
    The computed (in-memory) merge of buffered shapes with
    `trips_with_shape`, i.e. one row per trip carrying its buffered
    shape geometry.
    """
    # Shape keys that actually occur in vp
    subset = trips_with_shape.shape_array_key.unique().compute().tolist()

    # Restrict the read to those shapes so that less geometry has to be
    # loaded and buffered
    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=[[("shape_array_key", "in", subset)]],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

    shapes = shapes.assign(geometry=shapes.geometry.buffer(buffer_meters))

    trips_with_shape_geom = dd.merge(
        shapes, trips_with_shape, on="shape_array_key", how="inner"
    )

    # Materialize here so the result can be handed to map_partitions
    # as a regular in-memory argument
    return trips_with_shape_geom.compute()

In [None]:
start = datetime.datetime.now()
print(start)
# The function defined in this notebook is `buffer_shapes`;
# `buffer_shapes2` does not exist and would raise a NameError.
spatial_df2 = buffer_shapes(spatial_df1, analysis_date, 35)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
type(spatial_df2)

#### Tiffany's function
* Takes 15 mins

In [11]:
def vp_in_shape(
    vp_usable: pd.DataFrame, trips_with_buffered_shape: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Flag each vehicle position as inside or outside its trip's buffered shape.

    Meant to run on one in-memory partition at a time (e.g. through
    `map_partitions`), which is why `vp_usable` is a pandas dataframe here.

    Parameters
    ----------
    vp_usable:
        Vehicle positions with `trip_instance_key`, `x`, `y` and
        `location_timestamp_local`. `x` / `y` are read as WGS84 coordinates.
    trips_with_buffered_shape:
        One buffered shape geometry per `trip_instance_key`, expected to
        be in PROJECT_CRS so it is comparable with the projected points.

    Returns
    -------
    A frame with `trip_instance_key`, `location_timestamp_local` and the
    boolean `is_within`. Positions whose trip has no buffered shape are
    dropped by the inner merge.
    NOTE(review): the returned columns contain no geometry, so the result
    may be a plain pandas DataFrame rather than a GeoDataFrame -- confirm.
    """
    keep = ["trip_instance_key", "x", "y", "location_timestamp_local"]
    vp_subset = vp_usable[keep]

    # Build points from lon/lat and project them to the CRS of the shapes
    vp_gdf = gpd.GeoDataFrame(
        vp_subset, geometry=gpd.points_from_xy(vp_subset.x, vp_subset.y), crs=WGS84
    ).to_crs(PROJECT_CRS)

    # Both frames have a `geometry` column, so the merge suffixes them:
    # geometry_x is the vp point, geometry_y is the buffered shape.
    gdf = pd.merge(
        vp_gdf, trips_with_buffered_shape, on="trip_instance_key", how="inner"
    )

    gdf = gdf.assign(is_within=gdf.geometry_x.within(gdf.geometry_y))

    return gdf[["trip_instance_key", "location_timestamp_local", "is_within"]]

In [None]:
start = datetime.datetime.now()
print(start)
result = vp_usable.map_partitions(
    vp_in_shape,
    spatial_df2,
    meta={
        "trip_instance_key": "object",
        # "vp_idx": "int",
        "location_timestamp_local": "datetime64[ns]",
        "is_within":"bool",
    },
    align_dataframes=False,
).persist()
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

#### Final grouping of all the vps in shape
* Takes 3 secs

In [12]:
def total_counts(result: pd.DataFrame) -> pd.DataFrame:
    """
    Count, per trip, the total vps and the vps that fall inside the shape.

    Parameters
    ----------
    result:
        In-memory frame with `trip_instance_key`,
        `location_timestamp_local` and the boolean `is_within`.

    Returns
    -------
    One row per trip with `trip_instance_key`, `total_vp` and
    `vp_in_shape` (both int32). Trips with no vp inside the shape get
    `vp_in_shape = 0`.

    NOTE(review): `total_vp` comes from
    `vp_spatial_accuracy.total_vp_counts_by_trip`; it is assumed to return
    one row per `trip_instance_key` with a `total_vp` column -- confirm.
    """
    # Total vps for the trip
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(result)

    # `is_within` is already boolean, so it can be used directly as a mask
    within_shape = result.loc[
        result.is_within, ["trip_instance_key", "location_timestamp_local"]
    ].reset_index(drop=True)

    vps_in_shape = (
        within_shape.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "vp_in_shape"})
    )

    # Left merge keeps every trip, including those with nothing in the shape
    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    return count_df

In [None]:
result_pd= result.compute()

In [None]:
len(result_pd)

In [None]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_final2 = total_counts(result_pd)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
# `result2` only exists inside total_counts(), so referencing it here raises
# a NameError. Show the per-trip counts from the previous cell instead.
spatial_accuracy_final2

In [None]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_final = result.map_partitions(
    total_counts,
    meta={"trip_instance_key": "object", "total_vp": "int32", "vp_in_shape": "int32"},
    align_dataframes=False,
).persist()
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

### Put it all together

In [13]:
def load_trip_speeds(analysis_date):
    """
    Read the trip-level speed summary parquet for `analysis_date`.

    Only the trip key, speed, route, time-of-day and scheduled service
    minutes columns are loaded.
    """
    trip_speeds_path = f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet"
    keep_cols = [
        "trip_instance_key",
        "speed_mph",
        "route_id",
        "time_of_day",
        "service_minutes",
    ]

    return pd.read_parquet(trip_speeds_path, columns=keep_cols)

In [30]:
def test_complete_func(analysis_date: str) -> pd.DataFrame:
    """
    Run the full RT quality pipeline for one date and return trip-level metrics.

    Steps:
      1. Update completeness: per-trip ping summary (`two_pings_per_min`).
      2. Spatial accuracy: share of vps inside the buffered scheduled
         shape (`grab_shape_keys_in_vp` -> `buffer_shapes` ->
         `vp_in_shape` -> `total_counts`).
      3. Trip speeds / scheduled service minutes (`load_trip_speeds`).

    The three results are outer-merged on `trip_instance_key`, so a trip
    that is missing from one source still appears, with NaN in that
    source's columns.

    NOTE(review): the per-partition steps return one row per trip *per
    partition*. If a trip's vps can span partitions, the merged output
    will contain duplicate trips -- confirm how vp_usable is partitioned.
    """
    vp_usable = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}")

    ## Update Completeness ##

    # Find total min with gtfs, total trip time,
    # median pings per minute
    pings_trip_time_df = vp_usable.map_partitions(
        two_pings_per_min,
        meta={
            "trip_instance_key": "object",
            "minutes_w_atleast2_trip_updates": "int64",
            # A median of integer counts is a float in pandas
            "median_pings_per_min": "float64",
            "total_minute_w_gtfs": "int64",
            "total_trip_time": "float64",
        },
        align_dataframes=False,
    ).persist()

    # The median pings per 5 minutes (density_pings_5_min) is intentionally
    # not computed here: it did not add any insight beyond the metrics above.

    ## Spatial accuracy ##

    # Determine which trips have shapes associated with them
    trips_with_shapes_df = grab_shape_keys_in_vp(vp_usable, analysis_date)

    # Buffer the shapes
    buffered_shapes_df = buffer_shapes(trips_with_shapes_df, analysis_date, 35)

    # Find the vps that fall into buffered shapes
    in_shape_df = vp_usable.map_partitions(
        vp_in_shape,
        buffered_shapes_df,
        meta={
            "trip_instance_key": "object",
            "location_timestamp_local": "datetime64[ns]",
            "is_within": "bool",
        },
        align_dataframes=False,
    ).persist()

    # Compare total vps for a trip versus total vps that
    # fell in the recorded shape
    spatial_accuracy_df = in_shape_df.map_partitions(
        total_counts,
        meta={
            "trip_instance_key": "object",
            "total_vp": "int32",
            "vp_in_shape": "int32",
        },
        align_dataframes=False,
    ).persist()

    # Load trip speeds
    trip_speeds_df = load_trip_speeds(analysis_date)

    # Bring the dask results into memory before merging with pandas
    pings_trip_time_df = pings_trip_time_df.compute()
    spatial_accuracy_df = spatial_accuracy_df.compute()

    m1 = pings_trip_time_df.merge(
        spatial_accuracy_df, on=["trip_instance_key"], how="outer"
    ).merge(trip_speeds_df, on=["trip_instance_key"], how="outer")

    return m1

In [31]:
start = datetime.datetime.now()
print(start)
final = test_complete_func(analysis_date)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-06 13:31:05.527378


2023-12-06 13:53:53.015 | INFO     | __main__:<module>:5 - execution time: 0:22:47.487440


In [33]:
final.shape

(86486, 11)

* This metric finds the % the trip went over or under the scheduled time
* If a trip ran 1 hour but was scheduled for 30 minutes, it ran 100% over its scheduled time: (60 / 30 - 1) * 100 = 100

In [34]:

final["trip_timeliness_pct"] = (final.total_trip_time / final.service_minutes - 1) * 100

* This metric compares the number of minutes in which a trip has GTFS data against its total trip time (the span between its first and last timestamps).
* Ex: a trip ran from 10-11am but it didn't record any data from 10:30-10:45.
    * As such, this trip's total trip time is 1 hour but its total minutes with GTFS data is 45 minutes
    * Thus this trip only has 75% of its trip covered with GTFS tracking

In [47]:

final['gtfs_coverage_v_total_trip_time_pct'] = (final.total_minute_w_gtfs/final.total_trip_time) * 100

#### Del metric?
* Seems too similar to `gtfs_coverage_v_total_trip_time_pct`
* Should I mask any value above 100, because the actual trip time is often longer than scheduled?
* This metric shows how many minutes of gtfs data a trip has v scheduled.

In [55]:
# final['gtfs_coverage_v_scheduled_pct'] = (final.total_minute_w_gtfs / final.service_minutes * 100)

In [49]:
# Mask any value above 100 because often total trip time is slower than scheduled
# final.gtfs_coverage_v_scheduled_pct = final.gtfs_coverage_v_scheduled_pct.mask(final.gtfs_coverage_v_scheduled_pct > 100).fillna(100)

* How many vps fell into the route shape we have on file.

In [50]:
final['pct_vp_in_route_shape'] = (final.vp_in_shape/final.total_vp) * 100

AttributeError: 'DataFrame' object has no attribute 'vp_in_shape'

In [53]:
# final = final.drop(columns = ['total_vp','vp_in_shape'])

In [56]:
final.describe()

Unnamed: 0,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
count,86486.0,86486.0,86486.0,86486.0,76255.0,76255.0,76255.0,86486.0,86486.0,73902.0
mean,58.14,2.63,64.11,68.55,10.03,52.63,43.68,96.65,97.69,94.06
std,38.03,0.6,36.48,62.31,5.94,26.63,232.84,10.97,9.11,12.62
min,0.0,1.0,2.0,11.0,0.04,2.0,-87.06,0.14,2.2,0.0
25%,32.0,2.0,40.0,41.0,6.02,32.0,11.76,100.0,100.0,95.22
50%,53.0,3.0,60.0,62.0,8.85,50.0,25.86,100.0,100.0,100.0
75%,77.0,3.0,82.0,84.0,12.57,69.0,45.0,100.0,100.0,100.0
max,1440.0,5.0,1440.0,1450.0,79.56,300.0,11900.0,100.0,100.0,100.0


#### How to handle trips that are going "too fast"?
* These trips are going faster than the value in `service_minutes`
* How to gauge whether a trip is going too fast to be accurate?

In [57]:
fast = final.loc[final.trip_timeliness_pct < 0].reset_index(drop = True)

In [60]:
fast.describe()

Unnamed: 0,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
count,7326.0,7326.0,7326.0,7326.0,7326.0,7326.0,7326.0,7326.0,7326.0,6987.0
mean,38.07,2.49,43.87,45.76,12.72,52.11,-12.65,96.07,84.11,97.03
std,25.29,0.68,25.54,26.01,8.57,27.35,13.39,11.35,16.54,9.4
min,0.0,1.0,2.0,11.0,0.06,12.0,-87.06,3.03,2.2,0.0
25%,20.0,2.0,25.0,26.0,7.5,31.0,-15.79,98.51,80.0,98.52
50%,33.0,3.0,39.0,41.0,10.52,47.0,-8.0,100.0,90.0,100.0
75%,51.0,3.0,58.0,60.0,15.16,66.0,-4.0,100.0,95.0,100.0
max,224.0,4.0,225.0,241.0,79.42,270.0,-0.0,100.0,100.0,100.0


In [66]:
len(final.loc[final.trip_timeliness_pct < -16].reset_index(drop = True))

1816

* Many of these are small differences: e.g. a trip that ran 21 minutes instead of 25, a trip that ran 25 minutes instead of 32, or a trip that ran 20 minutes but was scheduled for 24.

In [68]:
below_25_percentile = final.loc[final.trip_timeliness_pct < -16].reset_index(drop = True)

In [72]:
below_25_percentile.sample(5)

Unnamed: 0,trip_instance_key,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,route_id,time_of_day,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
1159,bb151a9ba8476cc7ce00cae7c286628f,23,3.0,24,24.0,9.32,4869,Midday,53.0,-54.72,100.0,45.28,100.0
1689,e850a237b36e565f0e55fa28373fa27a,23,3.0,24,24.0,10.02,39,AM Peak,32.0,-25.0,100.0,75.0,100.0
1496,147fb0cfe8a14a03ab97df8896dd96b5,17,3.0,19,19.0,6.59,2,Midday,35.0,-45.71,100.0,54.29,72.22
1382,78cfddf36e68cdea2eb1ca4d7373ad84,20,3.0,20,20.0,21.3,39,PM Peak,24.0,-16.67,100.0,83.33,100.0
1555,6ee96719ae3be425bb12f2835a447c78,32,3.0,33,33.0,6.58,5R,AM Peak,50.0,-34.0,100.0,66.0,100.0


In [71]:
below_25_percentile.describe()

Unnamed: 0,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
count,1816.0,1816.0,1816.0,1816.0,1816.0,1816.0,1816.0,1816.0,1816.0,1751.0
mean,24.96,2.43,29.37,31.64,16.33,47.27,-31.08,93.82,64.89,94.2
std,17.4,0.72,17.32,18.23,12.3,25.73,14.96,15.42,18.06,14.95
min,0.0,1.0,2.0,11.0,0.09,14.0,-87.06,3.03,2.2,0.0
25%,13.0,2.0,17.0,18.0,8.52,29.0,-37.14,96.67,56.6,96.88
50%,22.0,3.0,25.0,27.0,12.35,40.0,-25.79,100.0,71.05,100.0
75%,33.0,3.0,38.0,40.0,19.38,60.25,-20.0,100.0,78.39,100.0
max,161.0,3.0,171.0,192.0,79.42,245.0,-16.0,100.0,84.0,100.0


In [75]:
# Narrow down to even faster trips...using mean from above
fast2 = fast.loc[fast.trip_timeliness_pct < -32].reset_index(drop = True)

In [76]:
fast2.describe()

Unnamed: 0,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
count,619.0,619.0,619.0,619.0,619.0,619.0,619.0,619.0,619.0,606.0
mean,20.63,2.49,24.18,26.55,22.46,53.45,-48.08,91.71,47.7,91.03
std,13.54,0.7,13.45,13.74,15.6,25.7,13.39,18.26,15.87,19.57
min,0.0,1.0,3.0,11.0,0.09,17.0,-87.06,6.0,4.0,0.0
25%,11.0,2.0,14.5,16.0,11.07,34.0,-57.49,95.24,36.63,93.42
50%,18.0,3.0,21.0,23.0,17.8,50.0,-43.84,100.0,51.43,100.0
75%,28.0,3.0,30.0,34.0,29.45,69.0,-36.88,100.0,61.87,100.0
max,80.0,3.0,80.0,84.0,79.42,138.0,-32.08,100.0,67.92,100.0


* This trip lasted 16 minutes but was scheduled for 61 minutes.
* That's probably unlikely
* How to capture this? 
    * The metric `gtfs_coverage_v_total_trip_time_pct` looks good because 100% of the total trip time minutes has at least one ping but in reality, this is missing 75% of the data.

In [65]:
fast.loc[fast.trip_instance_key == "e0672609f2cf72285094898d77ef2880"]

Unnamed: 0,trip_instance_key,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,route_id,time_of_day,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
3143,e0672609f2cf72285094898d77ef2880,12,2.0,16,16.0,36.34,486,AM Peak,61.0,-73.77,100.0,26.23,100.0


In [78]:
preview = ['total_minute_w_gtfs', 'total_trip_time', 'service_minutes', 'trip_timeliness_pct', 'trip_instance_key']

In [80]:
fast2[preview].sort_values(['trip_timeliness_pct'])

Unnamed: 0,total_minute_w_gtfs,total_trip_time,service_minutes,trip_timeliness_pct,trip_instance_key
206,6,11.0,85.0,-87.06,cb4a9bc995808a421731494ba65f2185
204,12,12.0,84.0,-85.71,c708c4cef5a7ee467280684d741770ff
474,16,16.0,106.0,-84.91,0e9ef8cd2b3e5bbd520049519ecd8fac
300,12,12.0,77.0,-84.42,41601d45a881b0ee40e9f9cd670b1291
347,17,17.0,97.0,-82.47,f43e271b18e33d5cb6f1670186073955
177,15,15.0,82.0,-81.71,76745d6cafe19770b0adbc103cb2015d
484,19,19.0,103.0,-81.55,f44a2bcee3286852333f7dcca1e01232
599,11,13.0,69.0,-81.16,2ac9ce3aa62ceb206acb30fb5bdbfa9e
185,21,21.0,109.0,-80.73,9a64f2ce1671ab6a40d3496e7807119e
170,12,13.0,67.0,-80.6,49e6314f537c5003777cb2c7536d6407


#### How to handle overly long trips?
* The bus is usually slow but these trips are way too slow


In [84]:
# Check up trips that are double the time scheduled to run
slow1 = final.loc[final.trip_timeliness_pct > 100]

In [85]:
slow1.trip_instance_key.nunique()

4014

In [87]:
slow1[preview].sample(10)

Unnamed: 0,total_minute_w_gtfs,total_trip_time,service_minutes,trip_timeliness_pct,trip_instance_key
55089,73,74.0,33.0,124.24,d1afe766ceeb43886718a76da7fcfc90
60156,49,49.0,22.0,122.73,eeb919ce26afcc44bf3b9c5e42591516
41652,58,58.0,12.0,383.33,d062ee14e876c3ab6ea37eb6fa5ea980
68568,102,102.0,39.0,161.54,306c6d61d22f715812a8f6286728b932
5413,30,43.0,20.0,115.0,04dbbe991f55e27afeb8876eeb7431b6
48598,51,55.0,24.0,129.17,9bbd38a2a9218d9f43a4cfa9330e5e4b
14920,130,130.0,57.0,128.07,041eed4879ffb8ead37227a3eac940f2
69533,56,56.0,15.0,273.33,4b1128beea3c4e8d728d3d509bbbe25e
76372,74,222.0,23.0,865.22,cc7c0f1ac6107b29c7fa76039f491963
46291,141,141.0,59.0,138.98,f45a0f7a20876a51f39de3df33aa6ba1


In [88]:
slow1.head()

Unnamed: 0,trip_instance_key,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,route_id,time_of_day,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
108,00a3977f7ece89de096253945da28251,58,3.0,58,58.0,12.57,3421,Early AM,27.0,114.81,100.0,100.0,80.92
120,0bdaac2d30b3d3007ce90cdba424f308,219,3.0,220,220.0,36.26,3439,Early AM,60.0,266.67,100.0,100.0,48.78
151,3345f021ab85ef2afc258cc227c61855,96,3.0,96,96.0,6.79,3422,Early AM,38.0,152.63,100.0,100.0,57.19
154,34b71ccfce7ed030296eab81f24bd19c,121,3.0,121,121.0,8.68,3428,PM Peak,58.0,108.62,100.0,100.0,55.12
160,37151e76d8696c1140cb8267cc7afacd,107,3.0,107,107.0,9.81,3435,AM Peak,50.0,114.0,100.0,100.0,93.46


In [89]:
final.describe()

Unnamed: 0,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
count,86486.0,86486.0,86486.0,86486.0,76255.0,76255.0,76255.0,86486.0,86486.0,73902.0
mean,58.14,2.63,64.11,68.55,10.03,52.63,43.68,96.65,97.69,94.06
std,38.03,0.6,36.48,62.31,5.94,26.63,232.84,10.97,9.11,12.62
min,0.0,1.0,2.0,11.0,0.04,2.0,-87.06,0.14,2.2,0.0
25%,32.0,2.0,40.0,41.0,6.02,32.0,11.76,100.0,100.0,95.22
50%,53.0,3.0,60.0,62.0,8.85,50.0,25.86,100.0,100.0,100.0
75%,77.0,3.0,82.0,84.0,12.57,69.0,45.0,100.0,100.0,100.0
max,1440.0,5.0,1440.0,1450.0,79.56,300.0,11900.0,100.0,100.0,100.0


In [86]:
slow1.describe()

Unnamed: 0,minutes_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_trip_time,speed_mph,service_minutes,trip_timeliness_pct,gtfs_coverage_v_total_trip_time_pct,gtfs_coverage_v_scheduled_pct,pct_vp_in_route_shape
count,4014.0,4014.0,4014.0,4014.0,4014.0,4014.0,4014.0,4014.0,4014.0,3891.0
mean,65.0,2.61,72.08,118.9,8.16,27.37,343.98,89.29,99.61,77.44
std,79.66,0.59,78.77,228.64,6.53,19.66,961.06,22.9,4.18,25.79
min,0.0,1.0,6.0,13.0,0.05,2.0,100.82,0.67,6.08,0.0
25%,29.0,2.0,35.0,38.0,4.11,14.0,116.75,95.24,100.0,59.1
50%,43.0,3.0,51.0,57.0,6.6,21.0,142.86,100.0,100.0,88.24
75%,71.75,3.0,78.0,95.0,10.3,35.0,214.29,100.0,100.0,100.0
max,1261.0,3.0,1262.0,1442.0,69.44,169.0,11900.0,100.0,100.0,100.0
