## Improving on Script
* Feedback: https://github.com/cal-itp/data-analyses/pull/961

In [1]:
import datetime

import dask
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from loguru import logger
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Filter columns 
* ['trip_instance_key', 'location_timestamp_local', 'x','y','vp_idx']

In [3]:
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

In [57]:
def load_vp_usable(analysis_date):

    # Delete schedule_gtfs_dataset_key later
    df = dd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns=[
            "schedule_gtfs_dataset_key",
            "trip_instance_key",
            "location_timestamp_local",
            "x",
            "y",
            "vp_idx",
        ],
    )

    # Create a copy of location timestamp for the total_trip_time function
    # to avoid type setting
    df["max_time"] = df.location_timestamp_local
    return df

In [58]:
vp_usable = load_vp_usable(analysis_date)

In [59]:
vp_usable.columns

Index(['schedule_gtfs_dataset_key', 'trip_instance_key',
       'location_timestamp_local', 'x', 'y', 'vp_idx', 'max_time'],
      dtype='object')

In [62]:
# Filter for now
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [63]:
vp_usable2 = vp_usable2.compute()

### Total Trip Time
* Addresses "<i>in this function, min_time, max_time are created on the grouped df (vp_usable grouped by trip and binned minute)...I think to be safer, it should be created on vp_usable grouped by trip.</i>"
* The copy setting is now turned on?? How to get it to go away?

In [64]:
def total_trip_time(vp_usable_df: pd.DataFrame):
    """
    For each trip: find the total service minutes
    recorded in real time data so we can compare it with
    scheduled service minutes.
    """
    subset = ["location_timestamp_local", "trip_instance_key", "max_time"]
    vp_usable_df = vp_usable_df[subset]

    # Need an extra copy of the column to find the max

    # Find the max and the min time based on location timestamp
    df = (
        vp_usable_df.groupby(["trip_instance_key"])
        .agg({"location_timestamp_local": "min", "max_time": "max"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "min_time"})
    )

    # Find total rt service mins and add an extra minute
    df["rt_service_min"] = (df.max_time - df.min_time) / pd.Timedelta(minutes=1) + 1

    # Return only one row per trip with the total trip time
    df = df.drop(columns=["max_time", "min_time"])

    return df

In [73]:
start = datetime.datetime.now()
print(start)
total_trip_time_df = vp_usable.map_partitions(
    total_trip_time,
    meta={
        "trip_instance_key": "object",
        "rt_service_min": "float64",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-11 09:34:27.716717


2023-12-11 09:35:40.937 | INFO     | __main__:<module>:13 - execution time: 0:01:13.220638


In [105]:
start = datetime.datetime.now()
print(start)
test = total_trip_time(vp_usable)

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-11 10:36:13.603 | INFO     | __main__:<module>:6 - execution time: 0:00:00.026458


2023-12-11 10:36:13.576859


In [106]:
type(test)

dask.dataframe.core.DataFrame

In [104]:
len(total_trip_time_df)

86832

In [111]:
total_trip_time_df = total_trip_time_df.compute()

AttributeError: 'DataFrame' object has no attribute 'compute'

### Update Completeness

In [76]:
def two_pings_per_min(vp_usable_df: pd.DataFrame) -> pd.DataFrame:
    """
    For each trip: find the median GTFS pings per minute,
    the total minutes with at least 1 GTFS ping per minute,
    and total minutes with at least 2 GTFS pings per minute.
    """
    subset = ["location_timestamp_local", "trip_instance_key", "vp_idx"]
    vp_usable_df = vp_usable_df[subset]

    # Find number of pings each minute
    df = (
        vp_usable_df.groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Determine which rows have 2+ pings per minute
    df = df.assign(
        min_w_atleast2_trip_updates=df.apply(
            lambda x: 1 if x.number_of_pings_per_minute >= 2 else 0, axis=1
        )
    )

    # Need a copy of numer of pings per minute to count for total minutes w gtfs
    df["total_minute_w_gtfs"] = df.number_of_pings_per_minute

    # Find the total min with at least 2 pings per min
    df = (
        df.groupby(["trip_instance_key"])
        .agg(
            {
                "min_w_atleast2_trip_updates": "sum",
                "number_of_pings_per_minute": "median",
                "total_minute_w_gtfs": "count",
            }
        )
        .reset_index()
        .rename(
            columns={
                "number_of_pings_per_minute": "median_pings_per_min",
            }
        )
    )

    return df

In [77]:
update = two_pings_per_min(vp_usable2)

In [110]:
update.head()

Unnamed: 0,trip_instance_key,min_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs
0,00040782d9c6ac2e2179b660a78172af,48,3.0,49
1,00068beec6e7a45f81e8d39d099aaa06,55,3.0,56
2,000c57a57a2a162789915932a04709b9,64,3.0,65
3,00118123ee98d8b4590d4c8f46249c79,14,3.0,15
4,00172767ca26437a73e3852c1471920b,58,3.0,59


In [79]:
start = datetime.datetime.now()
print(start)
update_df = vp_usable.map_partitions(
    two_pings_per_min,
    meta={
        "trip_instance_key": "object",
        "min_w_atleast2_trip_updates": "int64",
        "median_pings_per_min": "float64",
        "total_minute_w_gtfs": "int64",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-11 09:44:04.116676


2023-12-11 09:45:49.131 | INFO     | __main__:<module>:15 - execution time: 0:01:45.014798


### Spatial Accuracy
* Addresses "<i>in next draft, work on grouping functions that belong together, such as this one. total_counts and total_counts_by_trip sound basically equivalent, and they are nearly doing the same thing, except total_counts actually creates 2 columns. work on logically grouping or absorbing functions or rewriting functions so the same function can now be used twice.
Adapt this function to be used twice
Compare it to this to find where they have stuff in common and which part should be removed from the generic function</i>"

In [80]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.
    """
    vp_usable = (
        vp_usable[["trip_instance_key"]].drop_duplicates().reset_index(drop=True)
    )

    trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Only one row per trip/shape
    # trip_instance_key and shape_array_key are the only 2 cols left
    m1 = dd.merge(vp_usable, trips_with_shape, on="trip_instance_key", how="inner")

    return m1

In [81]:
shapes_df = grab_shape_keys_in_vp(vp_usable, analysis_date)

In [83]:
def buffer_shapes(
    trips_with_shape: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
):
    """
    Filter scheduled shapes down to the shapes that appear in vp.
    Buffer these.

    Attach the shape geometry for a subset of shapes or trips.
    """
    subset = trips_with_shape.shape_array_key.unique().compute().tolist()

    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=[[("shape_array_key", "in", subset)]],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

    # to_crs takes awhile, so do a filtering on only shapes we need
    shapes = shapes.assign(geometry=shapes.geometry.buffer(buffer_meters))

    trips_with_shape_geom = dd.merge(
        shapes, trips_with_shape, on="shape_array_key", how="inner"
    )

    trips_with_shape_geom = trips_with_shape_geom.compute()
    return trips_with_shape_geom

In [84]:
buffer_df = buffer_shapes(shapes_df, analysis_date, 35)

In [90]:
def vp_in_shape(
    vp_usable: dd.DataFrame, trips_with_buffered_shape: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:

    keep = ["trip_instance_key", "x", "y", "location_timestamp_local"]
    vp_usable = vp_usable[keep]

    vp_gdf = wrangle_shapes.vp_as_gdf(vp_usable)

    gdf = pd.merge(
        vp_gdf, trips_with_buffered_shape, on="trip_instance_key", how="inner"
    )

    gdf = gdf.assign(is_within=gdf.geometry_x.within(gdf.geometry_y))
    gdf = gdf[["trip_instance_key", "location_timestamp_local", "is_within"]]

    return gdf

In [91]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_df1 = vp_usable.map_partitions(
    vp_in_shape,
    buffer_df,
    meta={
        "trip_instance_key": "object",
        "location_timestamp_local": "datetime64[ns]",
        "is_within": "bool",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-11 09:52:03.995864


2023-12-11 10:05:53.240 | INFO     | __main__:<module>:14 - execution time: 0:13:49.244378


#### Adapt this to be used in multiple places

In [95]:
def total_vp_counts_by_trip(vp: gpd.GeoDataFrame, new_col_title: str) -> pd.DataFrame:
    """
    Get a count of vp for each trip, whether or not those fall
    within buffered shape or not
    """
    count_vp = (
        vp.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": new_col_title})
    )

    return count_vp

In [98]:
def total_counts(result: dd.DataFrame):

    # Find the total number of vps for each route
    total_vp_df = total_vp_counts_by_trip(result, "total_vp")

    # Find the total number of vps that actually fall within the  route shape
    subset = ["trip_instance_key", "location_timestamp_local"]
    result2 = result.loc[result.is_within == True].reset_index(drop=True)[subset]

    vps_in_shape = total_vp_counts_by_trip(result2, "vp_in_shape")

    # Count total vps for the trip
    count_df = pd.merge(total_vp_df, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    return count_df

In [99]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_df2 = spatial_accuracy_df1.map_partitions(
    total_counts,
    meta={"trip_instance_key": "object", "total_vp": "int32", "vp_in_shape": "int32"},
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-11 10:11:12.640232


2023-12-11 10:11:14.805 | INFO     | __main__:<module>:13 - execution time: 0:00:02.164918


In [101]:
spatial_accuracy_df2 = spatial_accuracy_df2.compute()

In [102]:
spatial_accuracy_df2.head()

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape
0,04f5b4b55721bc7660d35070f517a0ba,156,156
1,058eab708e0ac2975eed59769824834e,81,81
2,05943c816dcc57bf29e7a7fc1d5a6ebb,448,248
3,06915ade82a0b28d3dd71f86a4400080,108,108
4,06944145dd4971a65428be69c89b2b77,145,145


In [112]:
type(update)

pandas.core.frame.DataFrame

test_m = (total_trip_time_df.merge(update, on = "trip_instance_key", how = "outer")
         .merge(spatial_accuracy_df2, on ="trip_instance_key", how = "outer")) 

### Read back in the file 12/11

In [4]:
full_df = pd.read_parquet("./scripts/rt_v_schedule_trip_metrics.parquet")

In [7]:
full_df.sample()

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,median_pings_per_min,total_minute_w_gtfs,total_vp,vp_in_shape,speed_mph,route_id,time_of_day,service_minutes
75597,ee6e77a507b432d82e6a1cefed9b544f,71.13,70,3.0,71,211.0,211.0,7.09,19,Early AM,52.0


In [8]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86832 entries, 0 to 86831
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   trip_instance_key            86832 non-null  object 
 1   rt_service_min               86832 non-null  float64
 2   min_w_atleast2_trip_updates  86832 non-null  int64  
 3   median_pings_per_min         86832 non-null  float64
 4   total_minute_w_gtfs          86832 non-null  int64  
 5   total_vp                     74891 non-null  float64
 6   vp_in_shape                  74891 non-null  float64
 7   speed_mph                    77194 non-null  float64
 8   route_id                     75619 non-null  object 
 9   time_of_day                  77194 non-null  object 
 10  service_minutes              77194 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 7.9+ MB


In [6]:
full_df.trip_instance_key.nunique(), len(full_df)

(86832, 86832)