## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py

In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = 'Bay Area 511 Muni VehiclePositions'
gtfs_key = '7cc0cb1871dfd558f11a2885c145d144'

### Grab_shape_keys_in_vp

In [4]:
# Change to dask? 
vp_trip_df  = pd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]],
        columns = ['trip_instance_key'])

In [5]:
vp_trip_df  = vp_trip_df.drop_duplicates(subset="trip_instance_key").reset_index(drop = True)

In [6]:
type(vp_trip_df)

pandas.core.frame.DataFrame

In [7]:
vp_trip_df.head()

Unnamed: 0,trip_instance_key
0,5e067d518df2c7d76b7465f1c7d0901e
1,367f856cd7327baa25e2b60983354b0b
2,09611dca0bc4621a15ff3190a60c98eb
3,6c3756e38210a805672cb5af4db5695d
4,7592a99c1dd9393aac1aae75a5e99756


In [8]:
# Routes of that day
shapes = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    ).dropna().drop_duplicates()

In [9]:
shapes.head()

Unnamed: 0,shape_array_key
0,e3435a4b882913d92a12563910f7193d
1,1280fbe0539903ceb0e4323ec64a6f03
2,c56094b2391edb1c4b3d60671bf9fd7d
3,23d4243fa5bbed460ab4f0b1a86340c8
4,cedcf94b8182617c885319fcd7406c40


In [10]:
# Acts as a crosswalk between vp usable and shapes
# Just previewing
"""
scheduled =  helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True)
        """

'\nscheduled =  helpers.import_scheduled_trips(\n        analysis_date,\n        columns = ["trip_instance_key", "shape_array_key"],\n        get_pandas = True)\n        '

In [11]:
trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True
    ).merge(
        shapes,
        on = "shape_array_key",
        how = "inner"
    ).merge(
        vp_trip_df,
        on = "trip_instance_key",
        how = "inner"
    ).drop_duplicates().dropna().reset_index(drop=True)

In [12]:
type(trips_with_shape)

pandas.core.frame.DataFrame

In [50]:
trips_with_shape.shape

(9240, 2)

In [51]:
vp_trip_df.shape

(9240, 1)

### Buffer shapes

In [13]:
def buffer_shapes(
    analysis_date: str,
    trips_with_shape_subset: dd.DataFrame,
    buffer_meters: int = 35,
    **kwargs
) -> gpd.GeoDataFrame:
    """
    Filter scheduled shapes down to the shapes that appear in vp.
    Buffer these.
    
    Attach the shape geometry for a subset of shapes or trips.
    """
    shapes_subset = trips_with_shape_subset.shape_array_key.unique().tolist()
    
    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key", "geometry"],
        filters = [[("shape_array_key", "in", shapes_subset)]],
        crs = PROJECT_CRS,
        get_pandas = True
    )
    
    # to_crs takes awhile, so do a filtering on only shapes we need
    shapes = shapes.assign(
        geometry = shapes.geometry.buffer(buffer_meters)
    )
    
    trips_with_shape_geom = dd.merge(
        shapes,
        trips_with_shape_subset,
        on = "shape_array_key",
        how = "inner"
    )
    
    return trips_with_shape_geom

In [14]:
# This is trips_with_shape_geom
buffered_shapes = buffer_shapes(analysis_date, 
                                trips_with_shape,
                                35)

In [15]:
type(buffered_shapes)

geopandas.geodataframe.GeoDataFrame

In [16]:
# buffered_shapes.plot('shape_array_key')

### Actual function

#### Looking at original vp not `vp_usable`
* Everything plots correctly

In [17]:
og_vp = dg.read_parquet(f"{SEGMENT_GCS}vp_{analysis_date}.parquet",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]]).to_crs(PROJECT_CRS)

In [19]:
og_vp[['geometry','trip_instance_key']].head().explore('trip_instance_key',marker_kwds= {'radius':25})

In [20]:
og_vp = og_vp.assign(
            x = og_vp.geometry.x,
            y = og_vp.geometry.y
        ).drop(columns = "geometry")

In [21]:
og_vp = gpd.GeoDataFrame(
        og_vp,
        geometry = gpd.points_from_xy(og_vp.x, og_vp.y),
        crs = PROJECT_CRS
    )

In [22]:
og_vp.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 'geometry'], dtype='object')

In [23]:
og_vp = og_vp.rename(columns = {
    0:'gtfs_dataset_name',
    1: 'schedule_gtfs_dataset_key', 
    2:'trip_id',
    3:'trip_instance_key',
    4: 'location_timestamp', 
    5: 'location_timestamp_local',
    6:'x',
    7:'y', 
    8:'vp_idx', 
    9:'gtfs_dataset_key', 
    10:'vp_dir_xnorm', 
    11:'vp_dir_ynorm',
    12:'vp_primary_direction'})

In [24]:
og_vp.columns

Index(['gtfs_dataset_name', 'schedule_gtfs_dataset_key', 'trip_id',
       'trip_instance_key', 'location_timestamp', 'location_timestamp_local',
       'x', 'y', 'vp_idx', 'geometry'],
      dtype='object')

In [25]:
og_vp[['trip_instance_key','geometry','x','y']].head(5).explore('trip_instance_key',marker_kwds= {'radius':25})

In [46]:
og_vp2 = pd.merge(
        og_vp,
        buffered_shapes,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)

In [47]:
total_og_vp2 = vp_spatial_accuracy.total_vp_counts_by_trip(og_vp2)

In [48]:
og_vp2 = og_vp2.assign(
        is_within = og_vp2.geometry_x.within(og_vp2.geometry_y)
    ).query('is_within==True')

In [49]:
og_vp2.is_within.value_counts()

True    1495711
Name: is_within, dtype: int64

#### Use vp_usable

In [29]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [30]:
total_vp_test = vp_spatial_accuracy.total_vp_counts_by_trip(vp)

In [31]:
total_vp_test.head()

Unnamed: 0,trip_instance_key,total_vp
0,5e067d518df2c7d76b7465f1c7d0901e,111
1,367f856cd7327baa25e2b60983354b0b,81
2,09611dca0bc4621a15ff3190a60c98eb,145
3,6c3756e38210a805672cb5af4db5695d,106
4,7592a99c1dd9393aac1aae75a5e99756,144


* For some reason the only point that shows up is all the way in Stanislaus County no matter what

In [32]:
vp.head(2)

Unnamed: 0,gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,x,y,vp_idx,gtfs_dataset_key,vp_dir_xnorm,vp_dir_ynorm,vp_primary_direction
11412288,Bay Area 511 Muni VehiclePositions,7cc0cb1871dfd558f11a2885c145d144,11389544_M11,5e067d518df2c7d76b7465f1c7d0901e,2023-10-11 11:21:53+00:00,2023-10-11 04:21:53,-122.45,37.78,11412288,c0e3039da063db95ebabd3fe4ee611a4,,,Unknown
11412289,Bay Area 511 Muni VehiclePositions,7cc0cb1871dfd558f11a2885c145d144,11389544_M11,5e067d518df2c7d76b7465f1c7d0901e,2023-10-11 11:22:08+00:00,2023-10-11 04:22:08,-122.45,37.78,11412289,c0e3039da063db95ebabd3fe4ee611a4,,,Unknown


In [35]:
vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = PROJECT_CRS
    )

In [39]:
vp_gdf = vp_gdf.rename(columns = {
    0:'gtfs_dataset_name',
    1: 'schedule_gtfs_dataset_key', 
    2:'trip_id',
    3:'trip_instance_key',
    4: 'location_timestamp', 
    5: 'location_timestamp_local',
    6:'x',
    7:'y', 
    8:'vp_idx', 
    9:'gtfs_dataset_key', 
    10:'vp_dir_xnorm', 
    11:'vp_dir_ynorm',
    12:'vp_primary_direction'})

In [44]:
# Can't find the same vp_idx?? 
og_vp.loc[og_vp.vp_idx == "11412288"]

Unnamed: 0,gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,x,y,vp_idx,geometry


In [45]:
og_vp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1607052 entries, 0 to 1607051
Data columns (total 10 columns):
 #   Column                     Non-Null Count    Dtype              
---  ------                     --------------    -----              
 0   gtfs_dataset_name          1607052 non-null  object             
 1   schedule_gtfs_dataset_key  1607052 non-null  object             
 2   trip_id                    1607052 non-null  object             
 3   trip_instance_key          1607052 non-null  object             
 4   location_timestamp         1607052 non-null  datetime64[ns, UTC]
 5   location_timestamp_local   1607052 non-null  datetime64[ns]     
 6   x                          1607052 non-null  object             
 7   y                          1607052 non-null  object             
 8   vp_idx                     1607052 non-null  object             
 9   geometry                   1607052 non-null  geometry           
dtypes: datetime64[ns, UTC](1), datetim

In [41]:
vp_gdf.head(2)

Unnamed: 0,gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,x,y,vp_idx,gtfs_dataset_key,vp_dir_xnorm,vp_dir_ynorm,vp_primary_direction,geometry
0,Bay Area 511 Muni VehiclePositions,7cc0cb1871dfd558f11a2885c145d144,11389544_M11,5e067d518df2c7d76b7465f1c7d0901e,2023-10-11 11:21:53+00:00,2023-10-11 04:21:53,-122.45,37.78,11412288,c0e3039da063db95ebabd3fe4ee611a4,,,Unknown,POINT (-122.447 37.784)
1,Bay Area 511 Muni VehiclePositions,7cc0cb1871dfd558f11a2885c145d144,11389544_M11,5e067d518df2c7d76b7465f1c7d0901e,2023-10-11 11:22:08+00:00,2023-10-11 04:22:08,-122.45,37.78,11412289,c0e3039da063db95ebabd3fe4ee611a4,,,Unknown,POINT (-122.447 37.784)


In [38]:
vp_gdf[['trip_instance_key','geometry','x','y']].sample(10).explore('trip_instance_key',marker_kwds= {'radius':25})

KeyError: "['trip_instance_key', 'x', 'y'] not in index"

In [None]:
vp2 = pd.merge(
        vp_gdf,
        buffered_shapes,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)

In [None]:
vp2.crs

In [None]:
type(vp_gdf)

In [None]:
total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

In [None]:
total_vp[['total_vp']].sample(5)

* This brings up only false!!

In [None]:

vp2 = vp2.assign(
       is_within = vp2.geometry_x.within(vp2.geometry_y)
  )

In [None]:
vp2[['is_within']].value_counts()

In [None]:
stop

In [None]:
muni_spatial_accuracy = merge_vp_with_shape_and_count(vp, buffered_shapes)

In [None]:
muni_spatial_accuracy.shape

In [None]:
muni_spatial_accuracy.vp_in_shape.value_counts()

In [None]:
muni_spatial_accuracy.head()