## Improving on Script
* Feedback: https://github.com/cal-itp/data-analyses/pull/961

In [1]:
import datetime

import dask
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from loguru import logger
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)

In [2]:
analysis_date

'2023-11-15'

In [3]:
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [4]:
f"{GCS_FILE_PATH}rt_vs_schedule/"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [5]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [6]:
from scripts import update_vars2

In [7]:
update_vars2.analysis_date_list

['2023-04-12', '2023-10-11', '2023-09-13']

### Filter columns 
* ['trip_instance_key', 'location_timestamp_local', 'x','y','vp_idx']

In [8]:
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

In [9]:
def load_vp_usable(analysis_date):

    # Delete schedule_gtfs_dataset_key later
    df = dd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns=[
            "schedule_gtfs_dataset_key",
            "trip_instance_key",
            "location_timestamp_local",
            "x",
            "y",
            "vp_idx",
        ],
    )

    # Create a copy of location timestamp for the total_trip_time function
    # to avoid type setting
    df["max_time"] = df.location_timestamp_local
    return df

In [10]:
vp_usable = load_vp_usable(analysis_date)
# Filter for now
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [11]:
vp_usable.columns

Index(['schedule_gtfs_dataset_key', 'trip_instance_key',
       'location_timestamp_local', 'x', 'y', 'vp_idx', 'max_time'],
      dtype='object')

In [13]:
# vp_usable2 = vp_usable2.compute()

In [14]:
type(vp_usable2)

dask.dataframe.core.DataFrame

In [15]:
vp_usable2.schedule_gtfs_dataset_key.unique()

Dask Series Structure:
npartitions=1
    object
       ...
Name: schedule_gtfs_dataset_key, dtype: object
Dask Name: unique-agg, 10 graph layers

### Total Trip Time
* Addresses "<i>in this function, min_time, max_time are created on the grouped df (vp_usable grouped by trip and binned minute)...I think to be safer, it should be created on vp_usable grouped by trip.</i>"
* The copy setting is now turned on?? How to get it to go away?

In [16]:
def total_trip_time(vp_usable_df: pd.DataFrame):
    """
    For each trip: find the total service minutes
    recorded in real time data so we can compare it with
    scheduled service minutes.
    """
    subset = ["location_timestamp_local", "trip_instance_key", "max_time"]
    vp_usable_df = vp_usable_df[subset]

    # Need an extra copy of the column to find the max

    # Find the max and the min time based on location timestamp
    df = (
        vp_usable_df.groupby(["trip_instance_key"])
        .agg({"location_timestamp_local": "min", "max_time": "max"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "min_time"})
    )

    # Find total rt service mins and add an extra minute
    df["rt_service_min"] = (df.max_time - df.min_time) / pd.Timedelta(minutes=1) + 1

    # Return only one row per trip with the total trip time
    df = df.drop(columns=["max_time", "min_time"])

    return df

#### Change in script: remove map partitions

In [17]:
start = datetime.datetime.now()
print(start)
test = total_trip_time(vp_usable)

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-13 11:30:02.714 | INFO     | __main__:<module>:6 - execution time: 0:00:00.060655


2023-12-13 11:30:02.654182


In [18]:
# total_trip_time_df = total_trip_time_df.compute()

### Update Completeness
#### Break apart?

In [19]:
def trips_by_one_min(vp_usable_df: pd.DataFrame):
    """
    For each trip: count how many rows are associated with each minute
    then tag whether or not a minute has 2+ pings. 
    """
    subset = ["location_timestamp_local", "trip_instance_key", "vp_idx"]
    vp_usable_df = vp_usable_df[subset]

    # Find number of pings each minute
    df = (
        vp_usable_df.groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Determine which rows have 2+ pings per minute
    df = df.assign(
        min_w_atleast2_trip_updates=df.apply(
            lambda x: 1 if x.number_of_pings_per_minute >= 2 else 0, axis=1
        )
    )
    
    df = df.drop(columns = ['location_timestamp_local'])
    return df

In [20]:
# map partitions here 
# test = trips_by_min(vp_usable2)

In [21]:
start = datetime.datetime.now()
print(start)
one_min_ping_df = vp_usable.map_partitions(
    trips_by_one_min,
    meta={
        "trip_instance_key": "object",
        "number_of_pings_per_minute": "int64",
        "min_w_atleast2_trip_updates":"int64"
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-13 11:30:02.769780


2023-12-13 11:32:17.554 | INFO     | __main__:<module>:14 - execution time: 0:02:14.785107


In [22]:
len(one_min_ping_df)

5554717

In [23]:
type(one_min_ping_df)

dask.dataframe.core.DataFrame

In [24]:
one_min_ping_df

Unnamed: 0_level_0,trip_instance_key,number_of_pings_per_minute,min_w_atleast2_trip_updates
npartitions=96,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,int64,int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [25]:
def update_completeness(trips_by_one_min: pd.DataFrame):
    """
    For each trip: find the median GTFS pings per minute,
    the total minutes with at least 1 GTFS ping per minute,
    and total minutes with at least 2 GTFS pings per minute.
    """
    # Need a copy of numer of pings per minute to count for total minutes w gtfs
    df["total_min_w_gtfs"] = df.number_of_pings_per_minute
    
    # Find the total min with at least 2 pings per min
    df = (
        df.groupby(["trip_instance_key"])
        .agg(
            {
                "min_w_atleast2_trip_updates": "sum",
                "number_of_pings_per_minute": "mean",
                "total_min_w_gtfs": "count",
            }
        )
        .reset_index()
        .rename(
            columns={
                "number_of_pings_per_minute": "avg_pings_per_min",
            }
        )
    )

    return df

In [26]:
start = datetime.datetime.now()
print(start)
update_df = update_completeness(one_min_ping_df)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-13 11:32:17.984788


UnboundLocalError: local variable 'df' referenced before assignment

### Spatial Accuracy
* Addresses "<i>in next draft, work on grouping functions that belong together, such as this one. total_counts and total_counts_by_trip sound basically equivalent, and they are nearly doing the same thing, except total_counts actually creates 2 columns. work on logically grouping or absorbing functions or rewriting functions so the same function can now be used twice.
Adapt this function to be used twice
Compare it to this to find where they have stuff in common and which part should be removed from the generic function</i>"

In [27]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.
    """
    vp_usable = (
        vp_usable[["trip_instance_key"]].drop_duplicates().reset_index(drop=True)
    )

    trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Only one row per trip/shape
    # trip_instance_key and shape_array_key are the only 2 cols left
    m1 = dd.merge(vp_usable, trips_with_shape, on="trip_instance_key", how="inner")

    return m1

### Why is September not working


In [31]:
date2= '2023-09-13'

In [54]:
vp_usable = load_vp_usable(date2)
# Filter for now
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [55]:
vp_usable2 = vp_usable2.compute()

In [61]:
vp_usable2.schedule_gtfs_dataset_key.unique()

array(['7cc0cb1871dfd558f11a2885c145d144'], dtype=object)

In [65]:
test_keys = vp_usable2[['trip_instance_key']].drop_duplicates()

In [67]:
test_keys.shape

(9360, 1)

In [69]:
test_keys.head()

Unnamed: 0,trip_instance_key
0,cb78e2e2d2bdd74e897574f57629ebf3
106,ac45dac6886a6c20c532396e674d1f39
168,456b7c997b0c63be4e0eeba930ce9460
327,68f864dd5831697753c11e29479967e8
423,edd4e5cfb00cc83cdeb7f15649826bc3


In [68]:
 trips_with_shape = helpers.import_scheduled_trips(
        date2,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

In [70]:
m1 = pd.merge(test_keys, trips_with_shape, on="trip_instance_key", how="inner")

In [71]:
m1.shape

(0, 2)

In [62]:
shapes_df = grab_shape_keys_in_vp(vp_usable2, date2)

In [63]:
type(shapes_df)

pandas.core.frame.DataFrame

In [64]:
shapes_df.shape

(0, 2)

In [52]:
subset = shapes_df.shape_array_key.unique().tolist()

In [53]:
subset

[]

In [29]:
def buffer_shapes(
    trips_with_shape: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
):
    """
    Filter scheduled shapes down to the shapes that appear in vp.
    Buffer these.

    Attach the shape geometry for a subset of shapes or trips.
    """
    subset = trips_with_shape.shape_array_key.unique().compute().tolist()

    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=[[("shape_array_key", "in", subset)]],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

    # to_crs takes awhile, so do a filtering on only shapes we need
    shapes = shapes.assign(geometry=shapes.geometry.buffer(buffer_meters))

    trips_with_shape_geom = dd.merge(
        shapes, trips_with_shape, on="shape_array_key", how="inner"
    )

    trips_with_shape_geom = trips_with_shape_geom.compute()
    return trips_with_shape_geom

In [36]:
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS

In [37]:
COMPILED_CACHED_VIEWS

'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'

In [38]:
FILE = f"{COMPILED_CACHED_VIEWS}routelines_{date2}.parquet"

In [39]:
FILE

'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/routelines_2023-09-13.parquet'

In [40]:
# gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/routelines_2023-09-13.parquet

In [50]:
shapes = helpers.import_scheduled_shapes(
        date2,
        columns=["shape_array_key", "geometry"],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

In [42]:
import dask_geopandas as dg

In [43]:
test = dg.read_parquet(FILE)

In [44]:
test2 = gpd.read_parquet(FILE)

In [49]:
test2.columns

Index(['feed_key', 'feed_timezone', 'service_date',
       'shape_first_departure_datetime_pacific',
       'shape_last_arrival_datetime_pacific', 'shape_id', 'shape_array_key',
       'n_trips', 'geometry'],
      dtype='object')

In [33]:
buffer_df = buffer_shapes(shapes_df, date2, 35)

ArrowTypeError: Array type doesn't match type of values set: string vs null

In [None]:
def vp_in_shape(
    vp_usable: dd.DataFrame, trips_with_buffered_shape: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:

    keep = ["trip_instance_key", "x", "y", "location_timestamp_local"]
    vp_usable = vp_usable[keep]

    vp_gdf = wrangle_shapes.vp_as_gdf(vp_usable)

    gdf = pd.merge(
        vp_gdf, trips_with_buffered_shape, on="trip_instance_key", how="inner"
    )

    gdf = gdf.assign(is_within=gdf.geometry_x.within(gdf.geometry_y))
    gdf = gdf[["trip_instance_key", "location_timestamp_local", "is_within"]]

    return gdf

In [None]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_df1 = vp_usable.map_partitions(
    vp_in_shape,
    buffer_df,
    meta={
        "trip_instance_key": "object",
        "location_timestamp_local": "datetime64[ns]",
        "is_within": "bool",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

#### Adapt this to be used in multiple places

In [None]:
def total_vp_counts_by_trip(vp: gpd.GeoDataFrame, new_col_title: str) -> pd.DataFrame:
    """
    Get a count of vp for each trip, whether or not those fall
    within buffered shape or not
    """
    count_vp = (
        vp.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": new_col_title})
    )

    return count_vp

In [None]:
def total_counts(result: dd.DataFrame):

    # Find the total number of vps for each route
    total_vp_df = total_vp_counts_by_trip(result, "total_vp")

    # Find the total number of vps that actually fall within the  route shape
    subset = ["trip_instance_key", "location_timestamp_local"]
    result2 = result.loc[result.is_within == True].reset_index(drop=True)[subset]

    vps_in_shape = total_vp_counts_by_trip(result2, "vp_in_shape")

    # Count total vps for the trip
    count_df = pd.merge(total_vp_df, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    return count_df

In [None]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_df2 = spatial_accuracy_df1.map_partitions(
    total_counts,
    meta={"trip_instance_key": "object", "total_vp": "int32", "vp_in_shape": "int32"},
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
spatial_accuracy_df2 = spatial_accuracy_df2.compute()

In [None]:
spatial_accuracy_df2.head()

In [None]:
type(update)

test_m = (total_trip_time_df.merge(update, on = "trip_instance_key", how = "outer")
         .merge(spatial_accuracy_df2, on ="trip_instance_key", how = "outer")) 

### Read back in the file 12/11

In [None]:
full_df = pd.read_parquet("./scripts/rt_v_schedule_trip_metrics.parquet")

In [None]:
full_df.sample()

In [None]:
full_df.info()

In [None]:
full_df.trip_instance_key.nunique(), len(full_df)