## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
* https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66

In [None]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

# For speeds
from typing import Literal
import numpy as np

In [None]:
# Notebook display settings: up to 100 columns, no cap on rows or column
# width, and floats rendered with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Source data: calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
# `operator` is matched against gtfs_dataset_name and `gtfs_key` against
# schedule_gtfs_dataset_key in the parquet filters used throughout this notebook.
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

### Grab_shape_keys_in_vp

In [None]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Build a trip_instance_key -> shape_array_key crosswalk for one operator.

    Reads the `vp_usable` parquet for `analysis_date`, filtered to the
    notebook-level `operator` / `gtfs_key`, and keeps one row per
    trip_instance_key. Scheduled trips are then inner-merged against the
    shape keys present in the routelines file and against those vp trips,
    so only trips that have both vp and a shape remain.
    """
    operator_filter = [[
        ("gtfs_dataset_name", "==", operator),
        ("schedule_gtfs_dataset_key", "==", gtfs_key),
    ]]

    vp_trips = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = operator_filter,
        columns = ["trip_instance_key"],
    )
    vp_trips = vp_trips.drop_duplicates(subset = "trip_instance_key")
    vp_trips = vp_trips.reset_index(drop = True)

    # Make sure we have a shape geometry too
    # otherwise map_partitions will throw error
    shape_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    )
    shape_keys = shape_keys.dropna().drop_duplicates()

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True,
    )

    # Keep scheduled trips that have a shape, then those that also have vp.
    has_shape = scheduled_trips.merge(
        shape_keys, on = "shape_array_key", how = "inner"
    )
    has_shape_and_vp = has_shape.merge(
        vp_trips, on = "trip_instance_key", how = "inner"
    )

    return has_shape_and_vp.drop_duplicates().dropna().reset_index(drop = True)

In [None]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [None]:
trips_with_shape.shape

In [None]:
# Try using the original vp dataset (not vp_usable)
def grab_shape_keys_in_vp2(analysis_date: str) -> pd.DataFrame:
    """
    Variant of the crosswalk builder that reads `vp_{analysis_date}.parquet`
    instead of `vp_usable`.

    The vp file is filtered to the notebook-level `operator` / `gtfs_key`
    and reduced to one row per trip_instance_key. Scheduled trips are then
    inner-merged with the routelines shape keys and with those vp rows.

    NOTE(review): no `columns` argument is passed to the vp read, so every
    vp column is loaded and carried through the merge, and the final
    `dropna()` looks at all of those columns, not just the two keys.
    """
    vp_source = f"{SEGMENT_GCS}vp_{analysis_date}.parquet"
    operator_filter = [[
        ("gtfs_dataset_name", "==", operator),
        ("schedule_gtfs_dataset_key", "==", gtfs_key),
    ]]

    one_row_per_trip = (
        pd.read_parquet(vp_source, filters = operator_filter)
        .drop_duplicates(subset = "trip_instance_key")
        .reset_index(drop = True)
    )

    # Make sure we have a shape geometry too
    # otherwise map_partitions will throw error
    routeline_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    )
    routeline_keys = routeline_keys.dropna().drop_duplicates()

    scheduled = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True,
    )

    merged = scheduled.merge(
        routeline_keys, on = "shape_array_key", how = "inner"
    ).merge(
        one_row_per_trip, on = "trip_instance_key", how = "inner"
    )

    return merged.drop_duplicates().dropna().reset_index(drop = True)

In [None]:
# trips_with_shape_og = grab_shape_keys_in_vp2(analysis_date)

In [None]:
# 9374 rows
# trips_with_shape_og.shape

### Buffer shapes

In [None]:
# This is trips_with_shape_geom: the crosswalk run through buffer_shapes.
# The third argument (35) is passed straight to buffer_shapes — presumably
# the buffer distance; confirm the units in vp_spatial_accuracy.
trips_with_shape_geom = vp_spatial_accuracy.buffer_shapes(
    analysis_date,
    trips_with_shape,
    35,
)

In [None]:
type(trips_with_shape_geom)

In [None]:
trips_with_shape_geom.shape

### Actual function

#### Looking at original vp not `vp_usable`
* Everything plots correctly

In [None]:
# NOTE(review): `stop` is not assigned anywhere in the visible notebook, so
# this cell raises NameError — presumably a deliberate brake so "Run All"
# halts before the exploratory cells below. Confirm before removing.
stop

In [None]:
og_vp = dg.read_parquet(f"{SEGMENT_GCS}vp_{analysis_date}.parquet",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [None]:
og_vp.head(1)

In [None]:
# og_vp[['geometry','trip_instance_key']].head().explore('trip_instance_key',marker_kwds= {'radius':25})

In [None]:
og_vp = og_vp.to_crs(PROJECT_CRS)

In [None]:
og_vp = og_vp.assign(
            x = og_vp.geometry.x,
            y = og_vp.geometry.y
        ).drop(columns = "geometry")

In [None]:
type(og_vp)

In [None]:
og_vp = gpd.GeoDataFrame(
        og_vp,
        geometry = gpd.points_from_xy(og_vp.x, og_vp.y),
        crs = PROJECT_CRS
    )

In [None]:
og_vp.columns

In [None]:
og_vp = og_vp.rename(columns = {
    0:'gtfs_dataset_name',
    1: 'schedule_gtfs_dataset_key', 
    2:'trip_id',
    3:'trip_instance_key',
    4: 'location_timestamp', 
    5: 'location_timestamp_local',
    6:'x',
    7:'y', 
    8:'vp_idx', 
    9:'gtfs_dataset_key', 
    10:'vp_dir_xnorm', 
    11:'vp_dir_ynorm',
    12:'vp_primary_direction'})

In [None]:
og_vp.columns

In [None]:
# og_vp[['trip_instance_key','geometry','x','y']].head(5).explore('trip_instance_key',marker_kwds= {'radius':25})

In [None]:
# Attach each trip's buffered shape to its vp rows. The buffered shapes are
# held in `trips_with_shape_geom` (output of buffer_shapes); the name
# `buffered_shapes` used here before is never assigned in this notebook.
og_vp2 = pd.merge(
    og_vp,
    trips_with_shape_geom,
    on = "trip_instance_key",
    how = "inner"
).reset_index(drop=True)

In [None]:
total_og_vp2 = vp_spatial_accuracy.total_vp_counts_by_trip(og_vp2)

In [None]:
og_vp2 = og_vp2.assign(
        is_within = og_vp2.geometry_x.within(og_vp2.geometry_y)
    ).query('is_within==True')

In [None]:
og_vp2.is_within.value_counts()

#### Use vp_usable

In [None]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [None]:
total_vp_test = vp_spatial_accuracy.total_vp_counts_by_trip(vp)

In [None]:
total_vp_test.head()

In [None]:
len(total_vp_test)

* For some reason, the only point that shows up on the map is all the way in Stanislaus County, no matter what I try.

In [None]:
vp.head(2)

In [None]:
# WGS84

In [None]:
vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)

In [None]:
vp_gdf = vp_gdf.rename(columns = {
    0:'gtfs_dataset_name',
    1: 'schedule_gtfs_dataset_key', 
    2:'trip_id',
    3:'trip_instance_key',
    4: 'location_timestamp', 
    5: 'location_timestamp_local',
    6:'x',
    7:'y', 
    8:'vp_idx', 
    9:'gtfs_dataset_key', 
    10:'vp_dir_xnorm', 
    11:'vp_dir_ynorm',
    12:'vp_primary_direction'})

In [None]:
# vp_gdf[['trip_instance_key','geometry','x','y']].tail(10).explore('trip_instance_key',marker_kwds= {'radius':25})

In [None]:
# Testing to make sure the two CRS look the same
# vp_gdf = vp_gdf.to_crs(PROJECT_CRS)

In [None]:
# vp_gdf[['trip_instance_key','geometry','x','y']].tail(10).explore('trip_instance_key',marker_kwds= {'radius':25})

#### Troubleshoot: not every `vp_idx` in `vp_usable` also appears in `og_vp`.

In [None]:
analysis_date

In [None]:
# Try some other dates
march = '2023-03-15'
may = '2023-05-17'
july = '2023-07-12'
sept = '2023-09-13'

In [None]:
# pd.read_parquet(f"{SEGMENT_GCS}vp_{march}.parquet").sample(1)

In [None]:
# pd.read_parquet(f"{SEGMENT_GCS}vp_{may}.parquet").sample(1)

In [None]:
# pd.read_parquet(f"{SEGMENT_GCS}vp_{july}.parquet").sample(1)

In [None]:
# pd.read_parquet(f"{SEGMENT_GCS}vp_{sept}.parquet").sample(1)

In [None]:
#vp_usable_vps = set(vp_gdf.vp_idx.unique().tolist())
#og_vp_vps = set(og_vp.vp_idx.unique().tolist())
#len(vp_usable_vps - og_vp_vps)

In [None]:
#len(vp_usable_vps), len(og_vp_vps)

In [None]:
#len(og_vp)-len(vp_gdf)

In [None]:
#len(og_vp), len(vp_gdf)

In [None]:
# Can't find the same vp_idx?? 
#og_vp.loc[og_vp.vp_idx == "11412288"]

In [None]:
#og_vp.loc[og_vp.vp_idx == "11412289"]

In [None]:
#vp_gdf.head(2)

In [None]:
# vp_gdf[['trip_instance_key','geometry','x','y']].sample(10).explore('trip_instance_key',marker_kwds= {'radius':25})

##### Delete out `vp_idx` in October

In [None]:
og_vp = gpd.read_parquet(f"{SEGMENT_GCS}vp_{analysis_date}.parquet")

In [None]:
og_vp.head(2)

In [None]:
og_vp.gtfs_dataset_name.nunique()

In [None]:
og_vp.shape

#### Continuing on with the function

In [None]:
# Attach each trip's buffered shape to its vp rows. The buffered shapes are
# held in `trips_with_shape_geom` (output of buffer_shapes); the name
# `buffered_shapes` used here before is never assigned in this notebook.
vp2 = pd.merge(
    vp_gdf,
    trips_with_shape_geom,
    on = "trip_instance_key",
    how = "inner"
).reset_index(drop=True)

In [None]:
type(vp2)

In [None]:
total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

In [None]:
total_vp[['total_vp']].sample(5)

In [None]:
vp2 = vp2.assign(
       is_within = vp2.geometry_x.within(vp2.geometry_y)
  )

In [None]:
# Out of curiosity: how many vp fall inside vs. outside the buffered shape?
vp2[['is_within']].value_counts()

In [None]:

vp2 = vp2.query('is_within==True')

In [None]:
vps_in_shape = (vp2.groupby("trip_instance_key", 
                                observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )
        

In [None]:
count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )

In [None]:
count_df.shape

In [None]:
count_df.head()

### Function

In [None]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [None]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame,
    trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.

    Parameters
    ----------
    vp
        Vehicle positions with at least `x`, `y`, `trip_instance_key` and
        `location_timestamp_local` columns. May be a dask DataFrame (it is
        computed here) or an already-materialized pandas DataFrame.
        `x` / `y` are treated as WGS84 coordinates.
    trips_with_shape_geom
        One buffered shape geometry per `trip_instance_key`.

    Returns
    -------
    One row per trip_instance_key with `total_vp` (from
    vp_spatial_accuracy.total_vp_counts_by_trip) and `vp_in_shape`, both
    int32. Trips with no vp inside their buffered shape get vp_in_shape = 0.
    NOTE(review): the annotation says GeoDataFrame, but the value returned is
    the result of a plain pd.merge of two count tables — confirm and adjust.
    """
    # Materialize lazy input up front so the real column names survive.
    # The previous version built the GeoDataFrame from the dask object and
    # then relabelled integer columns 0-12 by position, which silently
    # mislabels data if the column order of the parquet ever changes.
    if hasattr(vp, "compute"):
        vp = vp.compute().reset_index(drop = True)

    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)

    # Both sides carry a `geometry` column, so the merge yields
    # geometry_x (vp point) and geometry_y (buffered shape).
    vp2 = pd.merge(
        vp_gdf,
        trips_with_shape_geom,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)

    # Totals are taken before filtering to points inside the shape.
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    vp2 = vp2.assign(
        is_within = vp2.geometry_x.within(vp2.geometry_y)
    ).query('is_within==True')

    vps_in_shape = (vp2.groupby("trip_instance_key",
                                observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )

    # Left merge keeps trips whose vp all fell outside the shape.
    count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )

    count_df = count_df.assign(
        vp_in_shape = count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp = count_df.total_vp.fillna(0).astype("int32")
    )

    return count_df

In [None]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [None]:
muni.sample(10)

## How many minutes a trip took and the average speeds?
* Temporary place until I move the spatial stuff out
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C2_triangulate_vp.py

In [None]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

In [None]:
type(muni)

In [None]:
len(relevant_trips)

In [None]:
# Test if I can filter for relevant trips
# Yes, can filter for only relevant trips up here? 
# https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py#L190-L193
trips_with_shape_trips = set(trips_with_shape.trip_instance_key.unique().tolist())
muni_trips = set(muni.trip_instance_key.unique().tolist())
trips_with_shape_trips - muni_trips

In [None]:
muni_trips - trips_with_shape_trips

In [None]:
vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                    ('trip_instance_key', 'in', relevant_trips)]])

In [None]:
# len(vp_filtered)

In [None]:
# vp_filtered[['trip_instance_key']].compute().nunique()

### Stop_seg_dict
* Also no more `grouping_col`? https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C2_triangulate_vp.py#L94
    * https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/config.yml
* How to use function from `rt_segment_speeds/scripts/A3_valid_vehicle_positions.py`?

In [None]:
# Not working
# STOP_SEG_DICT = helpers.get_parameters('/rt_segment_speeds/scripts/config.yml', "stop_segments")

In [None]:
def merge_usable_vp_with_sjoin_vpidx(
    usable_vp_file: str,
    sjoin_results_file: str,
    sjoin_filtering: tuple = None,
    **kwargs
) -> dd.DataFrame:
    """
    Inner-join usable vp to the vp-to-segment sjoin results on `vp_idx`.

    usable_vp_file: file name under SEGMENT_GCS holding the usable vp;
        read with any extra **kwargs forwarded to dd.read_parquet, then
        repartitioned to 100 partitions.
    sjoin_results_file: file name under SEGMENT_GCS/vp_sjoin/ holding the
        sjoin results.
    sjoin_filtering: passed as `filters` when reading the sjoin results.

    Returns the lazily merged dask DataFrame.
    """
    usable_path = f"{SEGMENT_GCS}{usable_vp_file}"
    sjoin_path = f"{SEGMENT_GCS}vp_sjoin/{sjoin_results_file}"

    # All the usable vp, spread over a fixed number of partitions
    all_usable = dd.read_parquet(usable_path, **kwargs)
    all_usable = all_usable.repartition(npartitions=100)

    # vp_idx joined to segments
    sjoin_crosswalk = dd.read_parquet(sjoin_path, filters = sjoin_filtering)

    return dd.merge(
        all_usable,
        sjoin_crosswalk,
        on = "vp_idx",
        how = "inner"
    )

In [None]:
# Hand-written stand-in for the "stop_segments" config entry, since loading
# it with helpers.get_parameters is not working.
# NOTE(review): subset_usable_vp also reads dict_inputs["segments_file"],
# which is not defined here.
temp_dict = dict(
    stage1 = "vp_usable",
    stage2 = "nearest_vp",
    stage3 = "stop_arrivals",
    stage4 = "speed_stop_segments",
    segment_identifier_cols = ["shape_array_key", "stop_sequence"],
    timestamp_col = "location_timestamp_local",
    time_min_cutoff = 10,
    pct_segment_minimum = 0.3,
)

In [None]:
temp_dict['stage1']

In [None]:
def subset_usable_vp(usable_vp_w_shapes:pd.DataFrame, dict_inputs: dict) -> np.ndarray:
    """
    Subset all the usable vp and keep a sample of triangulated
    vp per trip.

    usable_vp_w_shapes: DataFrame with a `trip_instance_key` column; only
        those trips are read from the usable vp file. A plain string is
        still accepted and used directly as the usable vp file name.
    dict_inputs: must provide "segments_file", "stage1" and "stage2".

    Returns whatever `triangulate_vp` returns (vp_idx as an np array,
    per the call site comment).

    NOTE(review): `triangulate_vp` is not defined or imported in this
    notebook; it has to be brought into scope before this can run.
    """
    SEGMENT_FILE = f'{dict_inputs["segments_file"]}_{analysis_date}'
    SJOIN_FILE = f'{dict_inputs["stage2"]}_{analysis_date}'

    read_kwargs = {"columns": ["trip_instance_key", "vp_idx"]}

    if isinstance(usable_vp_w_shapes, str):
        USABLE_FILE = usable_vp_w_shapes
    else:
        # A DataFrame cannot be interpolated into a file path: read the
        # stage1 file and restrict it to the trips present in the frame.
        USABLE_FILE = f'{dict_inputs["stage1"]}_{analysis_date}'
        keep_trips = usable_vp_w_shapes.trip_instance_key.unique().tolist()
        read_kwargs["filters"] = [[("trip_instance_key", "in", keep_trips)]]

    all_shapes = pd.read_parquet(
        f"{SEGMENT_GCS}{SEGMENT_FILE}.parquet",
        columns = ["shape_array_key"]
    ).shape_array_key.unique().tolist()

    # Use this function to attach the crosswalk of sjoin results
    # back to usable_vp. The filter column must be a single column name;
    # the values are shape_array_keys, so filter on that column rather
    # than on the segment_identifier_cols list.
    ddf = merge_usable_vp_with_sjoin_vpidx(
        USABLE_FILE,
        SJOIN_FILE,
        sjoin_filtering = [("shape_array_key", "in", all_shapes)],
        **read_kwargs
    )

    # Results are just vp_idx as np array
    results = triangulate_vp(
        ddf,
        ["trip_instance_key"]
    )

    return results

In [None]:
results = subset_usable_vp(muni, temp_dict)