## Improving on Script
* Feedback: https://github.com/cal-itp/data-analyses/pull/961

In [1]:
import datetime

import dask
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
"""
from scripts import vp_spatial_accuracy

from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)
"""

'\nfrom scripts import vp_spatial_accuracy\n\nfrom segment_speed_utils import helpers, wrangle_shapes\nfrom segment_speed_utils.project_vars import (\n    GCS_FILE_PATH,\n    PROJECT_CRS,\n    SEGMENT_GCS,\n    analysis_date,\n)\n'

In [2]:
# Notebook-wide display settings: show up to 100 columns, never truncate
# rows or cell text, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Read in files

In [3]:
may_df = pd.read_parquet('gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/2023-05-17_metrics.parquet')

In [4]:
april_df = pd.read_parquet('gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/2023-04-12_metrics.parquet')

In [5]:
mar_df = pd.read_parquet('gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/2023-03-15_metrics.parquet')

In [6]:
may_df.shape

(83606, 15)

### To do
* `rt_w_gtfs_pct`: cap values above 100 at 100 (the observed max is currently ~108).

In [7]:
def check_out(df:pd.DataFrame):
    """
    Display `describe()` summary statistics for each of the four
    trip-level metric columns, in a fixed order.
    """
    metric_cols = [
        "spatial_accuracy_pct",
        "pings_per_min",
        "rt_w_gtfs_pct",
        "rt_v_scheduled_trip_time_pct",
    ]
    for col in metric_cols:
        display(df[col].describe())

In [8]:
check_out(mar_df)

count   69494.00
mean       94.10
std        12.34
min         0.00
25%        95.24
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83620.00
mean        2.46
std         0.68
min         0.00
25%         1.91
50%         2.86
75%         2.96
max         4.75
Name: pings_per_min, dtype: float64

count   83620.00
mean       95.20
std        14.55
min         0.28
25%        97.99
50%        99.58
75%       100.25
max       108.43
Name: rt_w_gtfs_pct, dtype: float64

count   71797.00
mean       60.68
std       333.76
min       -87.37
25%        10.76
50%        25.19
75%        44.44
max     17909.79
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [9]:
check_out(april_df)

count   71094.00
mean       94.06
std        12.64
min         0.00
25%        95.45
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   84516.00
mean        2.45
std         0.67
min         0.01
25%         1.91
50%         2.83
75%         2.95
max         5.18
Name: pings_per_min, dtype: float64

count   84516.00
mean       95.23
std        14.48
min         0.56
25%        97.88
50%        99.53
75%       100.20
max       108.11
Name: rt_w_gtfs_pct, dtype: float64

count   73471.00
mean       61.42
std       349.18
min       -86.02
25%        10.46
50%        25.10
75%        44.38
max     15903.70
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [10]:
check_out(may_df)

count   65385.00
mean       94.07
std        12.45
min         0.00
25%        95.00
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83606.00
mean        2.47
std         0.68
min         0.00
25%         1.95
50%         2.86
75%         2.95
max         5.14
Name: pings_per_min, dtype: float64

count   83606.00
mean       95.47
std        13.78
min         0.27
25%        97.76
50%        99.51
75%       100.21
max       108.43
Name: rt_w_gtfs_pct, dtype: float64

count   67864.00
mean       60.61
std       329.38
min       -90.09
25%        10.96
50%        25.64
75%        44.61
max     17907.92
Name: rt_v_scheduled_trip_time_pct, dtype: float64

### Filter columns 
* ['trip_instance_key', 'location_timestamp_local', 'x','y','vp_idx']

In [None]:
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

In [None]:
def load_vp_usable(analysis_date):
    """
    Read the `vp_usable_{analysis_date}` parquet dataset under SEGMENT_GCS
    with dask, keeping only the columns needed for the trip-level metrics,
    and add a `max_time` column that duplicates `location_timestamp_local`.
    """
    # schedule_gtfs_dataset_key is only kept for filtering; delete it later
    keep_cols = [
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "location_timestamp_local",
        "x",
        "y",
        "vp_idx",
    ]
    vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}", columns=keep_cols)

    # total_trip_time aggregates the timestamp twice (min and max), so it
    # needs a second copy of the column; creating it here avoids setting a
    # column on a subset later.
    vp = vp.assign(max_time=vp.location_timestamp_local)
    return vp

In [None]:
vp_usable = load_vp_usable(analysis_date)
# Filter for now
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [None]:
# vp_usable2 = vp_usable2.compute()

In [None]:
type(vp_usable2)

### Total Trip Time
* Addresses "<i>in this function, min_time, max_time are created on the grouped df (vp_usable grouped by trip and binned minute)...I think to be safer, it should be created on vp_usable grouped by trip.</i>"
* The pandas copy warning (presumably `SettingWithCopyWarning`) now appears. How can it be avoided?

In [None]:
def total_trip_time(vp_usable_df: pd.DataFrame):
    """
    Return one row per trip with `rt_service_min`: the span in minutes
    between the trip's first and last vehicle-position timestamp, plus one.
    This is the real-time service duration that gets compared against
    scheduled service minutes.

    Expects the columns `trip_instance_key`, `location_timestamp_local`
    and `max_time` (a copy of `location_timestamp_local`).
    """
    time_cols = vp_usable_df[
        ["location_timestamp_local", "trip_instance_key", "max_time"]
    ]

    # Earliest timestamp comes from the original column, latest from its copy
    trip_bounds = time_cols.groupby(["trip_instance_key"]).agg(
        {"location_timestamp_local": "min", "max_time": "max"}
    )
    trip_bounds = trip_bounds.reset_index().rename(
        columns={"location_timestamp_local": "min_time"}
    )

    # Elapsed minutes, with one extra minute added
    one_minute = pd.Timedelta(minutes=1)
    trip_bounds["rt_service_min"] = (
        trip_bounds.max_time - trip_bounds.min_time
    ) / one_minute + 1

    # Only the trip key and its total time are returned
    return trip_bounds.drop(columns=["max_time", "min_time"])

#### Change in script: remove map partitions

In [None]:
# Time the call to total_trip_time on the full vp_usable frame.
# NOTE(review): vp_usable comes from dd.read_parquet, so this likely only
# measures building the dask graph, not the actual computation - confirm.
start = datetime.datetime.now()
print(start)
test = total_trip_time(vp_usable)

end = datetime.datetime.now()
# `logger` is never defined in this notebook, so report with print instead
print(f"execution time: {end-start}")

In [None]:
# total_trip_time_df = total_trip_time_df.compute()

In [None]:
test.head()

### Update Completeness
#### Break apart?

In [None]:
def trips_by_one_min(vp_usable_df: pd.DataFrame):
    """
    For each trip, count the vehicle positions recorded in every one-minute
    bin, then flag whether that minute has 2+ pings.

    Expects the columns `trip_instance_key`, `location_timestamp_local`
    (datetime) and `vp_idx`.

    Returns one row per trip per minute bin that contains data, with columns:
    * `trip_instance_key`
    * `number_of_pings_per_minute`: count of `vp_idx` in that minute
    * `min_w_atleast2_trip_updates`: 1 if the minute has 2+ pings, else 0
    """
    subset = ["location_timestamp_local", "trip_instance_key", "vp_idx"]
    vp_usable_df = vp_usable_df[subset]

    # Find number of pings each minute
    df = (
        vp_usable_df.groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Determine which rows have 2+ pings per minute.
    # Vectorized comparison instead of a row-wise apply: faster, and the
    # explicit cast keeps the int64 dtype even when a partition has no rows.
    df = df.assign(
        min_w_atleast2_trip_updates=(df.number_of_pings_per_minute >= 2).astype(
            "int64"
        )
    )

    # The minute bin itself is not needed downstream
    df = df.drop(columns=["location_timestamp_local"])
    return df

In [None]:
# map partitions here 
# test = trips_by_min(vp_usable2)

In [None]:
# Apply trips_by_one_min to every partition of vp_usable and persist it.
# `meta` declares the output columns/dtypes produced by trips_by_one_min.
start = datetime.datetime.now()
print(start)
one_min_ping_df = vp_usable.map_partitions(
    trips_by_one_min,
    meta={
        "trip_instance_key": "object",
        "number_of_pings_per_minute": "int64",
        "min_w_atleast2_trip_updates":"int64"
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
# `logger` is never defined in this notebook, so report with print instead
print(f"execution time: {end-start}")

In [None]:
len(one_min_ping_df)

In [None]:
type(one_min_ping_df)

In [None]:
def update_completeness(df: pd.DataFrame):
    """
    Aggregate the per-minute ping counts to one row per trip.

    Expects the columns `trip_instance_key`, `number_of_pings_per_minute`
    and `min_w_atleast2_trip_updates` (one row per trip per minute bin).

    Returns, per trip:
    * `min_w_atleast2_trip_updates`: number of minutes with 2+ pings
    * `total_pings_for_trip`: sum of pings across all minutes
    * `total_min_w_gtfs`: number of minute rows counted for the trip

    The input frame is left unmodified.
    """
    # A second copy of the pings-per-minute column is needed so it can be
    # both summed and counted. Use assign so the caller's frame is not
    # mutated (the original added the column in place).
    df = df.assign(total_min_w_gtfs=df.number_of_pings_per_minute)

    # Roll up to trip level
    df = (
        df.groupby(["trip_instance_key"])
        .agg(
            {
                "min_w_atleast2_trip_updates": "sum",
                "number_of_pings_per_minute": "sum",
                "total_min_w_gtfs": "count",
            }
        )
        .reset_index()
        .rename(
            columns={
                "number_of_pings_per_minute": "total_pings_for_trip",
            }
        )
    )

    return df

In [None]:
# Time the trip-level roll-up of the per-minute ping counts.
start = datetime.datetime.now()
print(start)
update_df = update_completeness(one_min_ping_df)
end = datetime.datetime.now()
# `logger` is never defined in this notebook, so report with print instead
print(f"execution time: {end-start}")

In [None]:
update_df

#### Testing new way to find pings per min

In [None]:
m1 = dd.merge(update_df, test, on = ['trip_instance_key'], how = 'inner')

In [None]:
m1 = m1.compute()

In [None]:
m1['ping_per_min'] = (m1.total_pings_for_trip / m1.rt_service_min)

In [None]:
m1.columns

In [None]:
full_df = pd.read_parquet("./scripts/rt_v_schedule_trip_metrics.parquet")

In [None]:
subset = ['trip_instance_key','ping_per_min', 'rt_service_min']

In [None]:
pings_test = pd.merge(m1[subset], full_df[['trip_instance_key', 'avg_pings_per_min', 'rt_service_min']], on = 'trip_instance_key', how = "inner")

In [None]:
pings_test['difference'] = pings_test.ping_per_min - pings_test.avg_pings_per_min

In [None]:
pings_test['difference'].describe()

In [None]:
pings_test.loc[pings_test.difference < 0].sort_values(by = ['difference'], ascending = True).head(30)

In [None]:
m1.loc[m1.trip_instance_key == "923ff90bea616b5ac4ec909d79503424"]

In [None]:
one_trip = one_min_ping_df.loc[one_min_ping_df.trip_instance_key == "923ff90bea616b5ac4ec909d79503424"].compute()

In [None]:
one_trip_og = vp_usable.loc[vp_usable.trip_instance_key == "923ff90bea616b5ac4ec909d79503424"].compute()

In [None]:
# one_trip_og

In [None]:
 # one_trip

In [None]:
one_trip.number_of_pings_per_minute.mean()

In [None]:
one_trip.number_of_pings_per_minute.sum()/198.6

### Spatial Accuracy
* Addresses "<i>in next draft, work on grouping functions that belong together, such as this one. total_counts and total_counts_by_trip sound basically equivalent, and they are nearly doing the same thing, except total_counts actually creates 2 columns. work on logically grouping or absorbing functions or rewriting functions so the same function can now be used twice.
Adapt this function to be used twice
Compare it to this to find where they have stuff in common and which part should be removed from the generic function</i>"

In [None]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Build a crosswalk from trip_instance_key to shape_array_key for the
    trips that appear in the vehicle positions.

    Takes the unique trip_instance_keys in `vp_usable` and inner-joins them
    to the scheduled trips for `analysis_date`, so the result has only the
    two key columns, one row per trip/shape.
    """
    unique_trips = (
        vp_usable[["trip_instance_key"]].drop_duplicates().reset_index(drop=True)
    )

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Inner join drops vp trips that have no scheduled trip for this date
    return dd.merge(
        unique_trips, scheduled_trips, on="trip_instance_key", how="inner"
    )

### Why is September not working?


In [None]:
date2= '2023-09-13'

In [None]:
vp_usable = load_vp_usable(date2)
# Filter for now
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [None]:
vp_usable2 = vp_usable2.compute()

In [None]:
vp_usable2.schedule_gtfs_dataset_key.unique()

In [None]:
test_keys = vp_usable2[['trip_instance_key']].drop_duplicates()

In [None]:
test_keys.shape

In [None]:
test_keys.head()

In [None]:
 trips_with_shape = helpers.import_scheduled_trips(
        date2,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

In [None]:
m1 = pd.merge(test_keys, trips_with_shape, on="trip_instance_key", how="inner")

In [None]:
m1.shape

In [None]:
shapes_df = grab_shape_keys_in_vp(vp_usable2, date2)

In [None]:
type(shapes_df)

In [None]:
shapes_df.shape

In [None]:
subset = shapes_df.shape_array_key.unique().tolist()

In [None]:
subset

In [None]:
def buffer_shapes(
    trips_with_shape: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
):
    """
    Filter scheduled shapes down to the shapes that appear in vp,
    buffer them by `buffer_meters`, and attach the buffered geometry
    to each trip.

    `trips_with_shape` needs `trip_instance_key` and `shape_array_key`;
    it may be a pandas or a dask DataFrame.

    Returns the computed (in-memory) result of merging the buffered
    shapes with `trips_with_shape` on `shape_array_key`.
    """
    shape_keys = trips_with_shape.shape_array_key.unique()
    # Only dask objects need .compute(); for a pandas frame, unique() already
    # returns an in-memory array and calling .compute() on it would raise.
    if hasattr(shape_keys, "compute"):
        shape_keys = shape_keys.compute()
    subset = shape_keys.tolist()

    # Per the original note, to_crs takes a while, so only the shapes that
    # are actually needed are loaded (see `filters`).
    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=[[("shape_array_key", "in", subset)]],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

    # Buffer distance is in the units of PROJECT_CRS
    # NOTE(review): presumably meters, per the parameter name - confirm.
    shapes = shapes.assign(geometry=shapes.geometry.buffer(buffer_meters))

    trips_with_shape_geom = dd.merge(
        shapes, trips_with_shape, on="shape_array_key", how="inner"
    )

    # shapes is loaded with get_pandas=False, so the merge result is lazy
    trips_with_shape_geom = trips_with_shape_geom.compute()
    return trips_with_shape_geom

In [None]:
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS

In [None]:
COMPILED_CACHED_VIEWS

In [None]:
FILE = f"{COMPILED_CACHED_VIEWS}routelines_{date2}.parquet"

In [None]:
FILE

In [None]:
# gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/routelines_2023-09-13.parquet

In [None]:
shapes = helpers.import_scheduled_shapes(
        date2,
        columns=["shape_array_key", "geometry"],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

In [None]:
import dask_geopandas as dg

In [None]:
test = dg.read_parquet(FILE)

In [None]:
test2 = gpd.read_parquet(FILE)

In [None]:
test2.columns

In [None]:
buffer_df = buffer_shapes(shapes_df, date2, 35)

In [None]:
def vp_in_shape(
    vp_usable: dd.DataFrame, trips_with_buffered_shape: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    For every vehicle position, flag whether it falls within the buffered
    shape of its own trip.

    Returns `trip_instance_key`, `location_timestamp_local` and the boolean
    `is_within`, one row per vp that has a matching buffered shape.
    """
    vp_cols = vp_usable[["trip_instance_key", "x", "y", "location_timestamp_local"]]

    # presumably turns the x/y columns into a point geometry - verify in
    # wrangle_shapes.vp_as_gdf
    vp_points = wrangle_shapes.vp_as_gdf(vp_cols)

    # After the merge, geometry_x is the vp geometry and geometry_y is the
    # buffered shape geometry
    joined = pd.merge(
        vp_points, trips_with_buffered_shape, on="trip_instance_key", how="inner"
    )

    inside_shape = joined.geometry_x.within(joined.geometry_y)
    joined = joined.assign(is_within=inside_shape)

    return joined[["trip_instance_key", "location_timestamp_local", "is_within"]]

In [None]:
# Flag, partition by partition, whether each vp falls within its trip's
# buffered shape, then persist the result.
start = datetime.datetime.now()
print(start)
spatial_accuracy_df1 = vp_usable.map_partitions(
    vp_in_shape,
    buffer_df,
    meta={
        "trip_instance_key": "object",
        "location_timestamp_local": "datetime64[ns]",
        "is_within": "bool",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
# `logger` is never defined in this notebook, so report with print instead
print(f"execution time: {end-start}")

#### Adapt this to be used in multiple places

In [None]:
def total_vp_counts_by_trip(vp: gpd.GeoDataFrame, new_col_title: str) -> pd.DataFrame:
    """
    Count the vehicle positions (non-null `location_timestamp_local`) for
    each trip_instance_key in `vp`, and name the count column
    `new_col_title`.

    The function does no filtering itself, so the caller decides whether
    `vp` holds all positions or only those inside the buffered shape.
    """
    by_trip = vp.groupby("trip_instance_key", observed=True, group_keys=False)
    timestamp_counts = by_trip["location_timestamp_local"].count()

    return timestamp_counts.reset_index(name=new_col_title)

In [None]:
def total_counts(result: dd.DataFrame):
    """
    For each trip, count all vehicle positions (`total_vp`) and the ones
    flagged `is_within` (`vp_in_shape`). Trips with no vp inside the shape
    get `vp_in_shape` = 0. Both counts are returned as int32.
    """
    # All vps recorded for each trip
    all_vp_counts = total_vp_counts_by_trip(result, "total_vp")

    # Only the vps that fall within the trip's shape
    inside_only = result.loc[result.is_within == True].reset_index(drop=True)
    inside_only = inside_only[["trip_instance_key", "location_timestamp_local"]]
    inside_counts = total_vp_counts_by_trip(inside_only, "vp_in_shape")

    # Left join keeps trips that have zero vps inside the shape
    merged = pd.merge(
        all_vp_counts, inside_counts, on="trip_instance_key", how="left"
    )

    in_shape_int = merged.vp_in_shape.fillna(0).astype("int32")
    total_int = merged.total_vp.fillna(0).astype("int32")

    return merged.assign(vp_in_shape=in_shape_int, total_vp=total_int)

In [None]:
# Count total vps and vps-in-shape per trip within each partition.
# NOTE(review): counts are per partition; if a trip spans several
# partitions it may appear in more than one row - confirm before merging.
start = datetime.datetime.now()
print(start)
spatial_accuracy_df2 = spatial_accuracy_df1.map_partitions(
    total_counts,
    meta={"trip_instance_key": "object", "total_vp": "int32", "vp_in_shape": "int32"},
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
# `logger` is never defined in this notebook, so report with print instead
print(f"execution time: {end-start}")

In [None]:
spatial_accuracy_df2 = spatial_accuracy_df2.compute()

In [None]:
spatial_accuracy_df2.head()

In [None]:
# NOTE(review): `update` and `total_trip_time_df` are not defined in any
# visible cell of this notebook, so this cell raises NameError on a fresh
# kernel. The closest earlier names are `update_df` (update_completeness
# result) and `test` (total_trip_time result, but `test` is later rebound to
# the routelines read) - confirm which frames are intended.
type(update)

# Outer-merge the three trip-level metric frames on trip_instance_key
test_m = (total_trip_time_df.merge(update, on = "trip_instance_key", how = "outer")
         .merge(spatial_accuracy_df2, on ="trip_instance_key", how = "outer")) 

### Read back in the file 12/11

In [None]:
full_df = pd.read_parquet("./scripts/rt_v_schedule_trip_metrics.parquet")

In [None]:
full_df.sample()

In [None]:
full_df.info()

In [None]:
full_df.trip_instance_key.nunique(), len(full_df)