## Map Partitions Test - Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/nearest_vp_to_stop.py
* The functions should all start from `vp_usable`

In [None]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

from typing import Literal
import numpy as np

from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils

# cd rt_segment_speeds && pip install -r requirements.txt && cd
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs

In [None]:
# Times
import datetime
from loguru import logger

In [None]:
# Notebook display settings: show up to 100 columns, every row, full column
# contents, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Load the vp_usable dataset for analysis_date as a dask DataFrame.
# 14,514,960 rows
vp_usable= dd.read_parquet(
      f"{SEGMENT_GCS}vp_usable_{analysis_date}"
)

In [None]:
# schedule_gtfs_dataset_key values used to subset vp_usable for testing
gtfs_keys = ["7cc0cb1871dfd558f11a2885c145d144",
             "d2b09fbd392b28d767c28ea26529b0cd"]

In [None]:
# Test a subset: keep only rows whose schedule_gtfs_dataset_key is in gtfs_keys
vp_usable_subset = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key.isin(gtfs_keys)]

### % of total trip time with 2 pings per minute
* Takes 1:23 (min:sec)
* Count how many rows appear per minute for each `trip_instance_key` to determine how many GTFS pings occur.

In [None]:
def two_pings_per_min(vp_usable_df:pd.DataFrame) -> pd.DataFrame:
    """
    Summarize, per trip, how many one-minute bins have at least 2 pings.

    Expects the columns `trip_instance_key`, `location_timestamp_local`
    (datetime) and `vp_idx`.

    Returns one row per `trip_instance_key` with:
    * minutes_w_atleast2_trip_updates: number of one-minute bins with 2+ pings
    * total_minute_w_gtfs: number of one-minute bins present for the trip
    * total_trip_time: minutes between the first and last bin, plus one

    NOTE(review): when this is applied through `map_partitions`, each partition
    is summarized on its own, so a trip whose rows are split across partitions
    would produce one row per partition -- confirm how the input is partitioned.
    """
    # Number of pings in each one-minute bin of each trip
    pings_per_minute = (
        vp_usable_df.groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Flag bins with 2+ pings. A vectorized comparison replaces the row-wise
    # apply; int64 keeps the dtype declared in the map_partitions meta.
    pings_per_minute["minutes_w_atleast2_trip_updates"] = (
        pings_per_minute.number_of_pings_per_minute >= 2
    ).astype("int64")

    # First/last bin per trip, count of bins with 2+ pings, and count of bins
    by_trip = (
        pings_per_minute.groupby(["trip_instance_key"])
        .agg(
            min_time=("location_timestamp_local", "min"),
            max_time=("location_timestamp_local", "max"),
            minutes_w_atleast2_trip_updates=(
                "minutes_w_atleast2_trip_updates",
                "sum",
            ),
            total_minute_w_gtfs=("number_of_pings_per_minute", "count"),
        )
        .reset_index()
    )

    # Total trip time in minutes; the extra minute accounts for the last bin
    by_trip["total_trip_time"] = (
        by_trip.max_time - by_trip.min_time
    ) / pd.Timedelta(minutes=1) + 1

    return by_trip.drop(columns=["min_time", "max_time"])

In [None]:
# df1 = two_pings_per_min(vp_usable_pd)

In [None]:
# df1.info()

In [None]:
# Time running two_pings_per_min on every partition of the full vp_usable.
start = datetime.datetime.now()
print(start)
# meta declares the output columns and dtypes up front so dask does not infer them.
# NOTE(review): the function aggregates each partition independently, so a trip
# whose rows span several partitions would appear once per partition -- confirm
# how vp_usable is partitioned.
partitions_test1 = vp_usable.map_partitions(
       two_pings_per_min,
        meta = {'trip_instance_key':'object', 
                'minutes_w_atleast2_trip_updates':'int64', 
                'total_minute_w_gtfs':'int64',
                'total_trip_time':'float64',},
        align_dataframes = False
    ).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

#### Look at one trip

In [None]:
# df1.loc[df1.trip_instance_key == "00068c2e2316950af50ffaa9584c7a46"]

In [None]:
# df2.loc[df2.trip_instance_key ==  "00068c2e2316950af50ffaa9584c7a46"]

### Density: on average, how many pings occur per minute (median pings per 5-minute bin, divided by 5)
* Takes 34 secs

In [None]:
def density_pings_5_min(vp_usable_df:pd.DataFrame) -> pd.DataFrame:
    """
    Per trip, take the median number of pings per 5-minute bin and
    express it as pings per minute.

    Expects the columns `trip_instance_key`, `location_timestamp_local`
    (datetime) and `vp_idx`. Returns one row per `trip_instance_key`
    with the column `median_pings_per_5_min` (median bin count / 5).
    """
    five_minute_bins = pd.Grouper(key="location_timestamp_local", freq="5Min")

    # Pings in each 5-minute bin of each trip
    pings_per_bin = (
        vp_usable_df.groupby(["trip_instance_key", five_minute_bins])["vp_idx"]
        .count()
        .reset_index()
    )

    # Median bin count per trip
    per_trip = (
        pings_per_bin.groupby(["trip_instance_key"])["vp_idx"]
        .median()
        .rename("median_pings_per_5_min")
        .reset_index()
    )

    # Convert from "per 5 minutes" to "per minute"
    per_trip["median_pings_per_5_min"] = per_trip["median_pings_per_5_min"] / 5

    return per_trip

In [None]:
# df2 = density_pings_5_min(vp_usable_pd)

In [None]:
# df2.info()

In [None]:
# Time running density_pings_5_min on every partition of the full vp_usable.
start = datetime.datetime.now()
print(start)
# meta declares the output columns and dtypes up front so dask does not infer them.
# NOTE(review): the median is taken within each partition, so a trip whose rows
# span several partitions would get one median per partition -- confirm how
# vp_usable is partitioned.
partitions_test2 = vp_usable.map_partitions(
       density_pings_5_min,
        meta = {'trip_instance_key':'object', 
                'median_pings_per_5_min':'float64'},
        align_dataframes = False
    ).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
# Inner join of the two per-trip metric tables on trip_instance_key.
# NOTE(review): if either input holds more than one row per trip_instance_key,
# this join multiplies rows -- check uniqueness before relying on the result.
update_completeness = partitions_test1.merge(partitions_test2, on="trip_instance_key", how="inner")

In [None]:
type(update_completeness)

In [None]:
# len(update_completeness)

In [None]:
# update_completeness.trip_instance_key.nunique().compute()

### Spatial Accuracy
* Do I use shapes or trips_with_shape?

#### Test to see difference between `shapes` and `trips_with_shape`

In [None]:
 # Unique, non-null shape_array_key values from the routelines parquet for analysis_date
 shapes = (
        pd.read_parquet(
            f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
            columns=["shape_array_key"],
        )
        .dropna()
        .drop_duplicates()
    )


In [None]:
# Scheduled trips for analysis_date, keeping only trip_instance_key and
# shape_array_key (presumably returned as pandas because get_pandas=True).
trips_with_shape = (
        helpers.import_scheduled_trips(
            analysis_date,
            columns=["trip_instance_key", "shape_array_key"],
            get_pandas=True,
        ))

In [None]:
# Compare the shape_array_key values found in each source.
trips_with_shape_shapes = set(trips_with_shape.shape_array_key.unique().tolist())
shapes_shapes = set(shapes.shape_array_key.unique().tolist())
# Keys referenced by scheduled trips that are missing from routelines
trips_with_shape_shapes - shapes_shapes

In [None]:
# Keys present in routelines that no scheduled trip references
shapes_shapes - trips_with_shape_shapes 

In [None]:
trips_with_shape.head()

In [None]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Create a crosswalk linking trip_instance_key to shape_array_key
    for the trips that appear in vp_usable.

    Takes the unique trip_instance_keys in `vp_usable` and inner-joins them
    to the scheduled trips for `analysis_date`.

    Returns a pandas DataFrame: the dask merge is computed before returning,
    so the result is fully materialized.
    """
    # Only the distinct trip keys are needed for the join
    vp_trip_keys = (
        vp_usable[["trip_instance_key"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=False,
    )

    crosswalk = dd.merge(
        vp_trip_keys, trips_with_shape, on="trip_instance_key", how="inner"
    )

    return crosswalk.compute()

In [None]:
# Time building the trip_instance_key -> shape_array_key crosswalk.
# grab_shape_keys_in_vp calls .compute() before returning, so this timing
# includes the merge itself.
start = datetime.datetime.now()
print(start)
spatial_df1 = grab_shape_keys_in_vp(vp_usable, analysis_date)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
type(spatial_df1)

In [None]:
spatial_df1.shape

In [None]:
spatial_df1.head(2)

In [None]:
def buffer_shapes2(
    trips_with_shape: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
    **kwargs
):
    """
    Attach a buffered shape geometry to each row of `trips_with_shape`.

    Only the shapes whose shape_array_key appears in `trips_with_shape`
    are imported (in PROJECT_CRS); each geometry is buffered by
    `buffer_meters` and inner-joined back on shape_array_key.

    `**kwargs` is accepted but not used by this function.
    """
    # Restrict the import to the shapes that are actually needed
    shape_keys = trips_with_shape.shape_array_key.unique().tolist()
    only_needed_shapes = [[("shape_array_key", "in", shape_keys)]]

    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=only_needed_shapes,
        crs=PROJECT_CRS,
        get_pandas=True,
    )

    buffered_shapes = shapes.assign(
        geometry=shapes.geometry.buffer(buffer_meters)
    )

    return pd.merge(
        buffered_shapes,
        trips_with_shape,
        on="shape_array_key",
        how="inner",
    )


In [None]:
# Time importing and buffering (35 m) the shapes for the trips in spatial_df1.
start = datetime.datetime.now()
print(start)
spatial_df2 = buffer_shapes2(spatial_df1,
       analysis_date,
       35) 
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
type(spatial_df2)

In [None]:
spatial_df2.info()

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this
# cell raises NameError -- presumably a deliberate way to halt "Run All" before
# the slow cells below; confirm.
stop

#### Redo `merge_vp_with_shape_and_count` because it takes super long

In [None]:
keep = ['trip_instance_key','x','y','location_timestamp_local']

In [None]:
def vp_usable_to_gdf(vp_usable: pd.DataFrame, 
                        trips_with_shape_geom: gpd.GeoDataFrame)-> gpd.GeoDataFrame:
    """
    Turn the x/y columns of vp_usable into point geometry in PROJECT_CRS.

    Written to be applied per partition (e.g. through `map_partitions`),
    where `vp_usable` is the pandas DataFrame of a single partition.

    `trips_with_shape_geom` is accepted but not used by this function.

    Returns a GeoDataFrame with the columns trip_instance_key,
    location_timestamp_local and geometry.
    """
    keep = ['trip_instance_key','x','y','location_timestamp_local']
    vp_points = vp_usable[keep]

    # x/y are read as WGS84 coordinates, then projected to PROJECT_CRS
    vp_gdf = gpd.GeoDataFrame(
        vp_points,
        geometry=gpd.points_from_xy(vp_points.x, vp_points.y),
        crs=WGS84,
    ).to_crs(PROJECT_CRS)

    # The columns keep their string names through the GeoDataFrame constructor,
    # so no renaming is needed before selecting the output columns.
    return vp_gdf[["trip_instance_key", "location_timestamp_local", "geometry"]]

In [None]:
# test = vp_usable_to_gdf(vp_usable_subset, spatial_df2)

In [None]:
# Time converting each partition of the vp subset into point geometry.
start = datetime.datetime.now()
print(start)
# spatial_df2 is forwarded as the second positional argument
# (trips_with_shape_geom), which vp_usable_to_gdf does not use.
spatial_accuracy_df1 = vp_usable_subset.map_partitions(
        vp_usable_to_gdf,
        spatial_df2,
        meta = {'trip_instance_key':'object', 
                'location_timestamp_local':'datetime64[ns]', 
                'geometry':'geometry'},
        align_dataframes = False
    ).persist()
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
spatial_accuracy_df1.columns

In [None]:
type(spatial_accuracy_df1)

In [None]:
def merge_vp_shapes(vp_gdf:gpd.GeoDataFrame, buffered_gdf:dd.DataFrame) -> gpd.GeoDataFrame:
    """
    Inner-join vp rows to their buffered shape on trip_instance_key.

    `buffered_gdf` first has 'geometry' set as its geometry column and
    PROJECT_CRS set as its CRS; the join itself goes through `dd.merge`.

    NOTE(review): `.set_geometry` is called on `buffered_gdf` and the join uses
    `dd.merge`, so the annotations (GeoDataFrame / dask DataFrame) look swapped
    relative to how the arguments are used -- confirm.
    """
    shapes_with_crs = buffered_gdf.set_geometry('geometry').set_crs(PROJECT_CRS)

    return dd.merge(
        vp_gdf,
        shapes_with_crs,
        on="trip_instance_key",
        how="inner",
    )

In [None]:
# Time joining the vp points to their buffered shapes.
# NOTE(review): dd.merge is lazy, so unless the result is computed or persisted
# this measures building the task graph rather than the merge itself.
start = datetime.datetime.now()
print(start)
test1 = merge_vp_shapes(spatial_accuracy_df1, spatial_df2)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
type(test1)

In [None]:
def total_counts(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Count, per trip, the total vps and the vps that fall within the shape.

    Expects the columns trip_instance_key, location_timestamp_local,
    geometry_x and geometry_y. Each row's geometry_x is tested for being
    within that row's geometry_y (presumably the vp point and the buffered
    shape left by merging the two on trip_instance_key -- confirm).

    Returns one row per trip from
    `vp_spatial_accuracy.total_vp_counts_by_trip`, with `total_vp` and
    `vp_in_shape` as int32; trips with no vp inside the shape get 0.
    """
    # Count total vps for the trip
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(gdf)

    # Keep only the vps that fall inside their shape. Both geometry columns
    # come from `gdf`; no other dataframe is in scope here.
    is_within = gdf.geometry_x.within(gdf.geometry_y)
    vp_within_shape = gdf.loc[is_within]

    vps_in_shape = (
        vp_within_shape.groupby("trip_instance_key", observed=True, group_keys=False)
        .agg({"location_timestamp_local": "count"})
        .reset_index()
        .rename(columns={"location_timestamp_local": "vp_in_shape"})
    )

    # Left join so trips with zero vps inside the shape are kept
    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")

    count_df = count_df.assign(
        vp_in_shape=count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp=count_df.total_vp.fillna(0).astype("int32"),
    )

    return count_df

In [None]:
# Time counting total vps and vps within the shape per trip.
# NOTE(review): total_counts reads geometry_x and geometry_y, but the meta
# declared for spatial_accuracy_df1 only has trip_instance_key,
# location_timestamp_local and geometry. The merged `test1` is presumably the
# intended input -- confirm.
start = datetime.datetime.now()
print(start)
test2 = total_counts(spatial_accuracy_df1)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")