## Improving on Script
* Feedback: https://github.com/cal-itp/data-analyses/pull/961

In [53]:
import datetime

import dask
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from segment_speed_utils import wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)

In [54]:
# Notebook display settings: show up to 100 columns, every row, full cell
# contents, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Filter columns 
* Only read the columns needed downstream: `['trip_instance_key', 'location_timestamp_local', 'x', 'y', 'vp_idx']`

In [69]:
# Name of a single GTFS-RT vehicle positions feed.
# NOTE(review): neither value is referenced in the visible cells — presumably
# they are used to filter down to one operator while testing; confirm.
operator = "Bay Area 511 Muni VehiclePositions"
# Key string paired with the feed above (exact key type is not shown here).
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"


In [89]:
def load_vp_usable(analysis_date):
    """
    Read the vp_usable parquet for `analysis_date` from SEGMENT_GCS,
    keeping only the columns used by the functions in this notebook.
    """
    keep_cols = [
        "trip_instance_key",
        "location_timestamp_local",
        "x",
        "y",
        "vp_idx",
    ]
    path = f"{SEGMENT_GCS}vp_usable_{analysis_date}"

    return dd.read_parquet(path, columns=keep_cols)

In [90]:
# Read vp_usable for the analysis_date imported from project_vars.
vp_usable = load_vp_usable(analysis_date)

### Total Trip Time
* Addresses "<i>in this function, min_time, max_time are created on the grouped df (vp_usable grouped by trip and binned minute)...I think to be safer, it should be created on vp_usable grouped by trip.</i>"

In [59]:
def total_trip_time(vp_usable_df: pd.DataFrame) -> pd.DataFrame:
    """
    For each trip: find the total service minutes
    recorded in real time data so we can compare it with
    scheduled service minutes.

    The min and max timestamps are taken over all rows of a trip
    (grouped by trip_instance_key only, not by binned minute).

    Args:
        vp_usable_df: vehicle positions with at least the columns
            `trip_instance_key` and `location_timestamp_local`.
            The input is not modified.

    Returns:
        One row per trip_instance_key with `rt_service_minutes`:
        minutes between the first and last timestamp, plus one.
    """
    subset = ["location_timestamp_local", "trip_instance_key"]

    # The dict-style agg below needs a second column to take the max of.
    # assign() builds a new frame, so we never set a column on a slice
    # of the caller's dataframe (avoids SettingWithCopyWarning).
    trip_times = vp_usable_df[subset].assign(
        max_time=vp_usable_df.location_timestamp_local
    )

    # Find the max and the min time based on location timestamp
    df = (
        trip_times.groupby(["trip_instance_key"])
        .agg({"location_timestamp_local": "min", "max_time": "max"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "min_time"})
    )

    # Find total rt service mins and add an extra minute
    df = df.assign(
        rt_service_minutes=(df.max_time - df.min_time) / pd.Timedelta(minutes=1) + 1
    )

    # Return only one row per trip with the total trip time
    return df.drop(columns=["max_time", "min_time"])

### Update Completeness

In [60]:
def two_pings_per_min(vp_usable_df: pd.DataFrame) -> pd.DataFrame:
    """
    For each trip: find the median GTFS pings per minute,
    the total minutes with at least 1 GTFS ping per minute,
    and total minutes with at least 2 GTFS pings per minute.

    Args:
        vp_usable_df: pandas dataframe of vehicle positions with at least
            `trip_instance_key`, `location_timestamp_local` (datetime)
            and `vp_idx`.

    Returns:
        One row per trip_instance_key with the columns
        `min_w_atleast2_trip_updates` (minutes with 2+ pings),
        `median_pings_per_min` and
        `total_minute_w_gtfs` (minutes with any ping).
    """
    subset = ["location_timestamp_local", "trip_instance_key", "vp_idx"]
    vp_usable_df = vp_usable_df[subset]

    # Find number of pings each minute
    pings_per_minute = (
        vp_usable_df.groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Flag which trip-minutes have 2+ pings. A vectorised comparison
    # replaces a row-wise apply: faster, and still yields a Series
    # when there are no rows.
    pings_per_minute = pings_per_minute.assign(
        min_w_atleast2_trip_updates=(
            pings_per_minute.number_of_pings_per_minute >= 2
        ).astype("int64")
    )

    # Summarize to one row per trip. Named aggregation lets the
    # pings-per-minute column feed both the median and the count of
    # minutes, so no duplicate helper column is needed.
    df = (
        pings_per_minute.groupby(["trip_instance_key"])
        .agg(
            min_w_atleast2_trip_updates=("min_w_atleast2_trip_updates", "sum"),
            median_pings_per_min=("number_of_pings_per_minute", "median"),
            total_minute_w_gtfs=("number_of_pings_per_minute", "count"),
        )
        .reset_index()
    )

    return df

In [61]:
# NOTE(review): `vp_usable_pd` is not defined in the visible cells —
# presumably a pandas (computed) version of `vp_usable`; confirm it exists
# before running this cell.
update = two_pings_per_min(vp_usable_pd)

### Spatial Accuracy
* Addresses "<i>in next draft, work on grouping functions that belong together, such as this one. total_counts and total_counts_by_trip sound basically equivalent, and they are nearly doing the same thing, except total_counts actually creates 2 columns. work on logically grouping or absorbing functions or rewriting functions so the same function can now be used twice.
Adapt this function to be used twice
Compare it to this to find where they have stuff in common and which part should be removed from the generic function</i>"

In [62]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Build a crosswalk from trip_instance_key to shape_array_key
    for the trips that show up in vp.

    The unique trip_instance_keys in vp are inner-merged with the
    scheduled trips for `analysis_date`, so trips found in only one
    of the two are dropped.

    NOTE(review): the merge goes through dd.merge with a dask left
    frame, so the result is presumably a dask dataframe rather than
    the annotated pandas one - confirm.
    """
    crosswalk_cols = ["trip_instance_key", "shape_array_key"]

    # One row per trip present in vp
    trips_in_vp = vp_usable[["trip_instance_key"]].drop_duplicates()
    trips_in_vp = trips_in_vp.reset_index(drop=True)

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=crosswalk_cols,
        get_pandas=True,
    )

    # trip_instance_key and shape_array_key are the only 2 cols left
    return dd.merge(
        trips_in_vp, scheduled_trips, on="trip_instance_key", how="inner"
    )

In [63]:
def buffer_shapes(
    trips_with_shape: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
):
    """
    Import the scheduled shapes that appear in `trips_with_shape`,
    buffer them by `buffer_meters`, and attach the buffered geometry
    to each trip.

    `trips_with_shape` must support `.compute()` on its columns,
    since the unique shape keys are computed before filtering.
    Returns the computed (in-memory) merge of shapes and trips.

    NOTE(review): the buffer distance is in the units of PROJECT_CRS,
    presumably meters - confirm.
    """
    shape_keys = trips_with_shape.shape_array_key.unique().compute().tolist()

    # Only import the shapes we need, already projected to PROJECT_CRS
    shapes_in_vp = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=[[("shape_array_key", "in", shape_keys)]],
        crs=PROJECT_CRS,
        get_pandas=False,
    )
    shapes_in_vp = helpers.remove_shapes_outside_ca(shapes_in_vp)

    # Replace each shape's geometry with its buffered version
    buffered_geometry = shapes_in_vp.geometry.buffer(buffer_meters)
    shapes_in_vp = shapes_in_vp.assign(geometry=buffered_geometry)

    merged = dd.merge(
        shapes_in_vp, trips_with_shape, on="shape_array_key", how="inner"
    )

    return merged.compute()

In [67]:
def vp_in_shape(
    vp_usable: dd.DataFrame, trips_with_buffered_shape: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    For each vp, flag whether the point falls within the buffered
    shape of its own trip.

    Args:
        vp_usable: vehicle positions with at least `trip_instance_key`,
            `x`, `y` and `location_timestamp_local`.
        trips_with_buffered_shape: one buffered shape geometry per
            trip_instance_key.

    Returns:
        `trip_instance_key`, `location_timestamp_local` and the boolean
        `is_within`, one row per vp whose trip has a buffered shape
        (inner merge).
    """
    keep = ["trip_instance_key", "x", "y", "location_timestamp_local"]
    vp_usable = vp_usable[keep]

    # Turn the vp into point geometries.
    # NOTE(review): vp_as_gdf is a project helper not shown here; if it
    # expects a pandas dataframe, vp_usable must be computed first - confirm.
    vp_gdf = wrangle_shapes.vp_as_gdf(vp_usable)

    # Both frames carry a geometry column, so the merge suffixes them:
    # geometry_x is the vp point, geometry_y is the buffered shape.
    gdf = pd.merge(
        vp_gdf, trips_with_buffered_shape, on="trip_instance_key", how="inner"
    )

    gdf = gdf.assign(is_within=gdf.geometry_x.within(gdf.geometry_y))

    return gdf[["trip_instance_key", "location_timestamp_local", "is_within"]]

#### Adapt this to be used in multiple places

In [91]:
def total_vp_counts_by_trip(vp: gpd.GeoDataFrame, new_col_title:str) -> pd.DataFrame:
    """
    Count the vp timestamps for each trip_instance_key in `vp`.

    The caller decides what is being counted by choosing which rows
    to pass in (all vp, or only those within the buffered shape);
    the count is returned in a column named `new_col_title`.
    """
    by_trip = vp.groupby("trip_instance_key", observed=True, group_keys=False)

    # Non-null timestamps per trip
    counts = by_trip.agg({"location_timestamp_local": "count"})
    counts = counts.reset_index()

    return counts.rename(columns={"location_timestamp_local": new_col_title})

In [92]:
def total_counts(result: dd.DataFrame) -> pd.DataFrame:
    """
    For each trip: count all vp and the vp that fall within the
    buffered shape.

    Args:
        result: one row per vp with `trip_instance_key`,
            `location_timestamp_local` and the boolean `is_within`.
            NOTE(review): annotated as dask, but the counts are combined
            with pd.merge, so a pandas (Geo)DataFrame is presumably
            what is passed in - confirm.

    Returns:
        One row per trip_instance_key with int32 columns `total_vp`
        and `vp_in_shape`. Trips with no vp inside the shape get
        `vp_in_shape` = 0.
    """
    # Find the total number of vps for each trip
    total_vp = total_vp_counts_by_trip(result, "total_vp")

    # Keep only the vps that actually fall within the shape, then
    # count those (the filtered frame, not the full one).
    subset = ["trip_instance_key", "location_timestamp_local"]
    within_shape = result.loc[result.is_within == True, subset].reset_index(  # noqa: E712
        drop=True
    )
    vps_in_shape = total_vp_counts_by_trip(within_shape, "vp_in_shape")

    # Left merge keeps every trip; trips missing from vps_in_shape
    # had no vp inside the shape and are filled with 0 below.
    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    return count_df