# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

# For speeds c2
from typing import Literal
import numpy as np

# for speeds c3
# also import numpy 
from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = 'Bay Area 511 Muni VehiclePositions'
gtfs_key = '7cc0cb1871dfd558f11a2885c145d144'

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Build a trip_instance_key -> shape_array_key crosswalk for one operator.

    Reads the usable vehicle positions for `analysis_date`, restricted to the
    notebook-level `operator` / `gtfs_key` selection, and keeps one row per
    trip_instance_key. Those trips are then inner-joined to the scheduled
    trips and to the shapes listed in the routelines file, so only trips
    that have both vp and a shape_array_key in routelines are returned.
    """
    # Same operator/feed restriction that the other vp_usable reads use.
    operator_filter = [[
        ("gtfs_dataset_name", "==", operator),
        ("schedule_gtfs_dataset_key", "==", gtfs_key),
    ]]

    vp_trips = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = operator_filter,
        columns = ["trip_instance_key"],
    )
    vp_trips = (
        vp_trips
        .drop_duplicates(subset="trip_instance_key")
        .reset_index(drop=True)
    )

    # Only shapes present in routelines are kept; a trip without a shape
    # would make the later map_partitions step throw an error.
    shape_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    )
    shape_keys = shape_keys.dropna().drop_duplicates()

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True,
    )

    crosswalk = scheduled_trips.merge(
        shape_keys, on="shape_array_key", how="inner"
    )
    crosswalk = crosswalk.merge(
        vp_trips, on="trip_instance_key", how="inner"
    )

    return crosswalk.drop_duplicates().dropna().reset_index(drop=True)

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
trips_with_shape_geom =  vp_spatial_accuracy.buffer_shapes(analysis_date, 
                                trips_with_shape,
                                35)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

In [10]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [11]:
vp[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

In [12]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame,
    trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.

    Parameters
    ----------
    vp:
        Vehicle positions with at least `x`, `y`, `trip_instance_key` and
        `location_timestamp_local` columns. A dask dataframe is computed
        into memory first; a pandas dataframe is used as-is.
    trips_with_shape_geom:
        One row per trip_instance_key with the buffered shape polygon.
        Assumed to carry its polygon in a column named `geometry`, since
        the merge below relies on the `geometry_x` / `geometry_y` suffixes
        -- confirm against `vp_spatial_accuracy.buffer_shapes`.

    Returns
    -------
    One row per trip_instance_key with `total_vp` (from
    `vp_spatial_accuracy.total_vp_counts_by_trip`) and `vp_in_shape`,
    both as int32. Trips with no vp inside the buffer get vp_in_shape = 0.
    """
    # Materialize dask input before building the GeoDataFrame. Passing the
    # lazy frame directly produced integer column labels (0, 1, 2, ...),
    # which used to be patched with a hard-coded positional rename that
    # silently breaks whenever the column order or count changes.
    if isinstance(vp, dd.DataFrame):
        vp = vp.compute()
    vp = vp.reset_index(drop=True)

    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)

    # Both sides have a `geometry` column, so after the merge the vp point
    # is `geometry_x` and the buffered shape is `geometry_y`.
    vp2 = pd.merge(
        vp_gdf,
        trips_with_shape_geom,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)

    # Totals must be taken before the within-shape filter below.
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    vp2 = vp2.assign(
        is_within = vp2.geometry_x.within(vp2.geometry_y)
    ).query('is_within==True')

    vps_in_shape = (vp2.groupby("trip_instance_key",
                                observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )

    # Left merge keeps trips whose vp all fall outside the buffer;
    # their missing vp_in_shape is filled with 0 below.
    count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )

    count_df = count_df.assign(
        vp_in_shape = count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp = count_df.total_vp.fillna(0).astype("int32")
    )

    return count_df

In [13]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [14]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [15]:
muni.trip_instance_key.nunique()

9240

In [16]:
muni.loc[muni.vp_in_shape == 0]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape
167,04ae6ea9655473bdcaf9b80d443558ae,111,0
351,0a1553b7a738967a422b7b8960560ded,75,0
408,0bc53a1063efae8a32eca4a9034d2a21,61,0
1128,1fe28ce8669a7f946f272faf80d80ce7,70,0
1981,37aed2e5ad125f8aede20922b38bf6b9,78,0
3022,54e54e2839a6945b1005322398a89ae9,132,0
3127,580684a7a6b43ce5130f60a441681713,46,0
3213,5a82f8c02f2036db77b1979f4ae228f5,41,0
3861,6cbdc2d321ca63df78bce01eeb06fe07,47,0
3937,6ee4a9165f47b59cb3e10b0125dd72e8,34,0


In [17]:
vp[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

In [18]:
from segment_speed_utils.project_vars import PREDICTIONS_GCS

In [19]:
PREDICTIONS_GCS

'gs://calitp-analytics-data/data-analyses/rt_predictions/'

### Translate to `vp_usable`

In [20]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

In [21]:
completeness_cols = ['vp_idx',
      'location_timestamp_local', 'trip_instance_key',
      'gtfs_dataset_key']

In [22]:
vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = completeness_cols, 
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                   ('trip_instance_key', 'in', relevant_trips)]])

In [23]:
vp_filtered.columns

Index(['vp_idx', 'location_timestamp_local', 'trip_instance_key',
       'gtfs_dataset_key'],
      dtype='object')

In [24]:
fresh_updates =vp_filtered.sort_values(['vp_idx'])

In [25]:
two_cols = ['trip_instance_key','gtfs_dataset_key']

In [26]:
fresh_updates_df = fresh_updates.compute()

In [27]:
fresh_updates_df = fresh_updates_df.assign(fresh = 1)

In [28]:
"""fresh_updates2 = (fresh_updates_df.groupby([
        *two_cols, 
         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')
    ])
    .count()
    .reset_index()
    .rename(columns = {'vp_idx':'number_of_pings_per_minute'})
                    )"""

"fresh_updates2 = (fresh_updates_df.groupby([\n        *two_cols, \n         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')\n    ])\n    .count()\n    .reset_index()\n    .rename(columns = {'vp_idx':'number_of_pings_per_minute'})\n                    )"

In [29]:
test_group1 = (fresh_updates_df.groupby([
        *two_cols, 
         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')
    ])
    .count()).reset_index()

In [30]:
test_group2 = (fresh_updates_df.groupby([
        *['trip_instance_key'], 
         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')
    ])
    .count()
    .reset_index()
    .rename(columns = {'vp_idx':'number_of_pings_per_minute'})
        )

In [31]:
test_group3 = (fresh_updates_df.groupby([
        *['trip_instance_key', 'gtfs_dataset_key'], 
         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')
    ])
    .count()
    .reset_index()
    .rename(columns = {'vp_idx':'number_of_pings_per_minute'})
        )

### Checks

In [32]:
test_group1.sample()

Unnamed: 0,trip_instance_key,gtfs_dataset_key,location_timestamp_local,vp_idx,fresh
9945560,ab15d6fda7115825603d29b574f3a8b7,c0e3039da063db95ebabd3fe4ee611a4,2023-10-11 10:44:00,3,3


In [33]:
test_group2.sample()

Unnamed: 0,trip_instance_key,location_timestamp_local,number_of_pings_per_minute,gtfs_dataset_key,fresh
54592,1979f2da580305edbe42c58457499ed1,2023-10-11 05:41:00,3,3,3


In [34]:
test_group2.sample()

Unnamed: 0,trip_instance_key,location_timestamp_local,number_of_pings_per_minute,gtfs_dataset_key,fresh
72124,224955f470277311c7eb73e019d0d030,2023-10-11 20:21:00,3,3,3


In [35]:
test_group2.fresh.describe()

count   541681.00
mean         2.96
std          0.47
min          1.00
25%          3.00
50%          3.00
75%          3.00
max          4.00
Name: fresh, dtype: float64

In [36]:
test_group3.fresh.describe()

count   14978040.00
mean           0.11
std            0.56
min            0.00
25%            0.00
50%            0.00
75%            0.00
max            4.00
Name: fresh, dtype: float64

In [37]:
# Flag each (trip, minute) bin that holds at least 2 vehicle positions.
# A vectorized comparison gives the same 0/1 values as the row-wise
# apply it replaces, without looping over every row in Python.
test_group2 = test_group2.assign(
    atleast2_trip_updates = (test_group2["fresh"] >= 2).astype("int64")
)
    

In [38]:
test_group2.atleast2_trip_updates.describe()

count   541681.00
mean         0.99
std          0.11
min          0.00
25%          1.00
50%          1.00
75%          1.00
max          1.00
Name: atleast2_trip_updates, dtype: float64

In [39]:
test_group2.atleast2_trip_updates.value_counts()

1    534886
0      6795
Name: atleast2_trip_updates, dtype: int64

### Trip minutes is wrong
* Check w/ 38247cbee93b6f85d58bf1812ae553b9
* Began at 16:01:00, ended at 17:00:00
* Skips having data: jumps from 4:38 to 4:54

In [40]:
test_group2_grouped = (test_group2.groupby(two_cols)
           .agg({
               "location_timestamp_local": "size",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            "location_timestamp_local": "trip_min_elapsed"
    })
    

In [41]:
test_group2_grouped.loc[test_group2_grouped.trip_instance_key == "38247cbee93b6f85d58bf1812ae553b9"]

Unnamed: 0,trip_instance_key,gtfs_dataset_key,trip_min_elapsed,atleast2_trip_updates
7113,38247cbee93b6f85d58bf1812ae553b9,2,7,7
7114,38247cbee93b6f85d58bf1812ae553b9,3,33,33
7115,38247cbee93b6f85d58bf1812ae553b9,4,5,5


In [42]:
test_group2_grouped2 = (test_group2.groupby(two_cols)
           .agg({
               "location_timestamp_local": "count",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            "location_timestamp_local": "trip_min_elapsed"
    })

In [43]:
test_group2_grouped2.loc[test_group2_grouped2.trip_instance_key == "38247cbee93b6f85d58bf1812ae553b9"]

Unnamed: 0,trip_instance_key,gtfs_dataset_key,trip_min_elapsed,atleast2_trip_updates
7113,38247cbee93b6f85d58bf1812ae553b9,2,7,7
7114,38247cbee93b6f85d58bf1812ae553b9,3,33,33
7115,38247cbee93b6f85d58bf1812ae553b9,4,5,5


In [44]:
test_group2_grouped3 = (test_group2.groupby(['trip_instance_key'])
           .agg({
               "location_timestamp_local": "count",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            "location_timestamp_local": "trip_min_elapsed"
    })

In [45]:
test_group2_grouped3.loc[test_group2_grouped3.trip_instance_key == "38247cbee93b6f85d58bf1812ae553b9"]

Unnamed: 0,trip_instance_key,trip_min_elapsed,atleast2_trip_updates
2002,38247cbee93b6f85d58bf1812ae553b9,45,45


#### 4 works the best

In [60]:
test_group2['max_time'] = test_group2.location_timestamp_local

In [70]:
test_group2_grouped4 = (test_group2
                        .groupby(['trip_instance_key'])
                        .agg({'location_timestamp_local':'min','max_time':'max', 
                             'atleast2_trip_updates':'sum'})
                        .reset_index()
                        .rename(columns = {'location_timestamp_local':'min_time'})
                       )

In [71]:
test_group2_grouped4['trip_min_elapsed'] = (test_group2_grouped4.max_time - test_group2_grouped4.min_time) / pd.Timedelta(minutes=1)

In [72]:
test_group2_grouped4.loc[test_group2_grouped4.trip_instance_key == "38247cbee93b6f85d58bf1812ae553b9"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed
2002,38247cbee93b6f85d58bf1812ae553b9,2023-10-11 16:01:00,2023-10-11 17:00:00,45,59.0


In [74]:
test_group2_grouped4 = test_group2_grouped4.assign(
        pct_update_complete = test_group2_grouped4.atleast2_trip_updates.divide(
            test_group2_grouped4.trip_min_elapsed)
    ) 

* 568caf4acf76125fb5db063f8737e5a8 = began at 8:32 am, end at 9:45. 
* b063e289ac71ce8cac8f3e34895a7a1e = began at 5:03, end at 6:26

In [75]:
test_group2_grouped4.shape

(9240, 6)

In [76]:
test_group2_grouped4.trip_min_elapsed.describe()

count   9240.00
mean      58.18
std       22.27
min       10.00
25%       43.00
50%       60.00
75%       73.00
max      196.00
Name: trip_min_elapsed, dtype: float64

In [77]:
test_group2_grouped4.pct_update_complete.describe()

count   9240.00
mean       1.00
std        0.06
min        0.02
25%        1.00
50%        1.00
75%        1.02
max        1.10
Name: pct_update_complete, dtype: float64

In [89]:
test_group2_grouped4.trip_min_elapsed.describe()

count   9240.00
mean      58.18
std       22.27
min       10.00
25%       43.00
50%       60.00
75%       73.00
max      196.00
Name: trip_min_elapsed, dtype: float64

### Tests
* Why do some trips have a `pct_update_complete` greater than 100%?

In [83]:
len(test_group2_grouped4.loc[test_group2_grouped4.pct_update_complete == 1.1])

10

In [82]:
test_group2_grouped4.loc[test_group2_grouped4.pct_update_complete == 1.1].sample()

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
1972,37622040815e89d063272bb6e37acc65,2023-10-11 05:31:00,2023-10-11 05:41:00,11,10.0,1.1


In [100]:
# How many rows have a larger atleast2_trip_updates than trip_min_elapsed (these show up as negative values of `larger`)
test_group2_grouped4['larger'] =test_group2_grouped4.trip_min_elapsed -  test_group2_grouped4.atleast2_trip_updates 

In [101]:
test_group2_grouped4.larger.describe()

count   9240.00
mean       0.29
std        3.82
min       -1.00
25%       -1.00
50%        0.00
75%        0.00
max      105.00
Name: larger, dtype: float64

In [107]:
def mask_overly_large_min(row):
    """
    Cap a row's atleast2_trip_updates at its trip_min_elapsed.

    Returns trip_min_elapsed when atleast2_trip_updates is strictly larger,
    otherwise returns atleast2_trip_updates unchanged.
    """
    exceeds_elapsed = row.atleast2_trip_updates > row.trip_min_elapsed
    return row.trip_min_elapsed if exceeds_elapsed else row.atleast2_trip_updates

In [108]:
# Apply the function
test_group2_grouped4["test_mask"] = test_group2_grouped4.apply(mask_overly_large_min, axis=1)

In [113]:
test_group2_grouped4.loc[test_group2_grouped4.larger == -1].sample(2)

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask
8211,e4b7661d54ea8d861cff3e35e4caec28,2023-10-11 02:19:00,2023-10-11 02:42:00,24,23.0,1.04,-1.0,23.0
2374,4258fcb5df6feb5288d0b64e58d9f85a,2023-10-11 04:40:00,2023-10-11 05:15:00,36,35.0,1.03,-1.0,35.0


In [114]:
test_group2_grouped4.loc[test_group2_grouped4.larger == 3].sample(2)

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask
5039,8cc5578efefd252ced8bae2d3e75cfff,2023-10-11 13:51:00,2023-10-11 14:29:00,35,38.0,0.92,3.0,35.0
429,0c22f6a2911a877afe1b8311fa877228,2023-10-11 09:48:00,2023-10-11 10:40:00,49,52.0,0.94,3.0,49.0


In [112]:
test_group2_grouped4.larger.value_counts().head()/len(test_group2_grouped4)*100

0.00    42.19
-1.00   40.05
1.00    10.76
2.00     1.53
3.00     1.07
Name: larger, dtype: float64

#### 37622040815e89d063272bb6e37acc65
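* The trip spans ten elapsed minutes (05:31 to 05:41), but there are eleven rows: the 1-minute bins include both the first and the last minute, so a trip with no gaps has (max_time - min_time) + 1 bins and `atleast2_trip_updates` can exceed `trip_min_elapsed` by one.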

In [86]:
test_group2_grouped4.loc[test_group2_grouped4.trip_instance_key == "37622040815e89d063272bb6e37acc65"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
1972,37622040815e89d063272bb6e37acc65,2023-10-11 05:31:00,2023-10-11 05:41:00,11,10.0,1.1


In [88]:
len(fresh_updates_df.loc[fresh_updates_df.trip_instance_key == "37622040815e89d063272bb6e37acc65"])

32

In [94]:
# fresh_updates_df.loc[fresh_updates_df.trip_instance_key == "37622040815e89d063272bb6e37acc65"]

In [92]:
len(test_group2.loc[test_group2.trip_instance_key == "37622040815e89d063272bb6e37acc65"])

11

In [95]:
# test_group2.loc[test_group2.trip_instance_key == "37622040815e89d063272bb6e37acc65"]

In [85]:
len(fresh_updates_df.loc[fresh_updates_df.trip_instance_key == "37622040815e89d063272bb6e37acc65"])

32

#### 568caf4acf76125fb5db063f8737e5a8

In [119]:
test_group2_grouped4.loc[test_group2_grouped4.trip_instance_key == "568caf4acf76125fb5db063f8737e5a8"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask
3077,568caf4acf76125fb5db063f8737e5a8,2023-10-11 08:32:00,2023-10-11 09:45:00,74,73.0,1.01,-1.0,73.0


#### 38247cbee93b6f85d58bf1812ae553b9

In [118]:
test_group2_grouped4.loc[test_group2_grouped4.trip_instance_key == "38247cbee93b6f85d58bf1812ae553b9"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask
2002,38247cbee93b6f85d58bf1812ae553b9,2023-10-11 16:01:00,2023-10-11 17:00:00,45,59.0,0.76,14.0,45.0


## How many minutes a trip took and the average speeds?
* Temporary place until I move the spatial stuff out

In [None]:
type(muni)

In [None]:
len(relevant_trips)

In [None]:
# Test if I can filter for relevant trips
# Yes, can filter for only relevant trips up here? 
# https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py#L190-L193
#trips_with_shape_trips = set(trips_with_shape.trip_instance_key.unique().tolist())
#muni_trips = set(muni.trip_instance_key.unique().tolist())
#trips_with_shape_trips - muni_trips

In [None]:
# muni_trips - trips_with_shape_trips

In [None]:
# len(vp_filtered)

In [None]:
# vp_filtered[['trip_instance_key']].compute().nunique()

In [None]:
vp_filtered.columns

### C2 
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C2_triangulate_vp.py
* Break it apart and check it out later to understand what's happening

In [None]:
def triangulate_vp(
    ddf: dd.DataFrame, 
    group_cols: list = ["trip_instance_key"]
) -> list:
    """
    Grab a sample of vehicle positions for each trip to triangulate distance.
    These vp already sjoined onto the shape.
    Roughly pick vp at equally spaced intervals.

    Dask aggregation can't group and use lambda to create list of possible 
    vp_idx.

    Parameters
    ----------
    ddf:
        Dask dataframe of vehicle positions with a `vp_idx` column and the
        columns named in `group_cols`.
    group_cols:
        Columns identifying one trip. The default list is only read,
        never mutated.

    Returns
    -------
    Flat list of vp_idx values: for every group, the minimum, the values
    25% / 50% / 75% of the way from minimum to maximum (rounded to the
    nearest integer), and the maximum. These are positions in the vp_idx
    range, not vp_idx values confirmed to exist for the group, and the
    list may contain duplicates when a group's range is small.
    """
    grouped_ddf = ddf.groupby(group_cols, observed=True, group_keys=False)

    min_df = (grouped_ddf
              .agg({"vp_idx": "min"})
              .rename(columns = {"vp_idx": "min_vp_idx"})
             )

    max_df = (grouped_ddf
              .agg({"vp_idx": "max"})
              .rename(columns = {"vp_idx": "max_vp_idx"})
             )

    # Both aggregations are indexed by the group columns, so join on index.
    vp_range = dd.merge(
        min_df,
        max_df,
        left_index = True,
        right_index = True,
        how = "inner"
    )

    # Persist once so the min/max aggregation isn't recomputed for each
    # of the derived columns below.
    vp_range = vp_range.persist()

    range_diff = vp_range.max_vp_idx - vp_range.min_vp_idx

    # Interior sample points at fixed fractions of each group's vp_idx range.
    fractions = {
        "p25_vp_idx": 0.25,
        "p50_vp_idx": 0.5,
        "p75_vp_idx": 0.75,
    }
    vp_range = vp_range.assign(**{
        col: (range_diff * frac + vp_range.min_vp_idx
             ).round(0).astype("int64")
        for col, frac in fractions.items()
    })

    vp_idx_cols = [
        "min_vp_idx", 
        "p25_vp_idx",
        "p50_vp_idx", 
        "p75_vp_idx",
        "max_vp_idx"
    ]

    # Row-major flatten: the 5 sample points of one group are adjacent.
    flat_idx = vp_range[vp_idx_cols].compute().to_numpy().flatten()

    return list(flat_idx)

In [None]:
triangulate_muni = triangulate_vp(vp_filtered, 'trip_instance_key')

In [None]:
vp_results = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = [
            "gtfs_dataset_key", "trip_instance_key",
            "location_timestamp_local",
            "x", "y", "vp_idx"],
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                    ('vp_idx', 'in', triangulate_muni)]]).compute()

In [None]:
vp_results.sample(2)

In [None]:
len(vp_results.loc[vp_results.trip_instance_key == "5e067d518df2c7d76b7465f1c7d0901e"])

In [None]:
len(vp_filtered.loc[vp_filtered.trip_instance_key == "5e067d518df2c7d76b7465f1c7d0901e"])

In [None]:
len(vp_results.loc[vp_results.trip_instance_key == "37e64d454f8679b38b2d35ed0d256df0"])

In [None]:
len(vp_filtered.loc[vp_filtered.trip_instance_key == "37e64d454f8679b38b2d35ed0d256df0"])

* `vp_results` is a pandas df, but `merge_rt_scheduled_trips` is annotated as taking a dask dataframe... it still works!

In [None]:
type(vp_results)

In [None]:
def merge_rt_scheduled_trips(
    rt_trips: dd.DataFrame,
    analysis_date: str,
    group_cols: list = ["trip_instance_key"]) -> dd.DataFrame:
    """
    Attach shape_array_key to RT trips (vehicle positions).

    Scheduled trips for `analysis_date` are imported with only the
    `group_cols` plus shape_array_key, then left-joined onto `rt_trips`
    so every RT row is kept. Other scheduled trip columns are left out
    on purpose; they get pulled in after aggregation is done.
    """
    crosswalk_cols = group_cols + ["shape_array_key"]

    scheduled = helpers.import_scheduled_trips(
        analysis_date,
        columns = crosswalk_cols,
        get_pandas = True
    )

    return dd.merge(
        rt_trips,
        scheduled,
        on = group_cols,
        how = "left",
    )

In [None]:
vp_with_sched = (
        merge_rt_scheduled_trips(
            vp_results, 
            analysis_date, 
            group_cols = ["trip_instance_key"]
        ).sort_values("vp_idx")
        .reset_index(drop=True)
    )

In [None]:
vp_with_sched.head()

### C3
https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C3_trip_route_speed.py

In [None]:
vp_with_sched.sample(2)

In [None]:
c3 = gpd.GeoDataFrame(
        vp_with_sched,
        geometry = gpd.points_from_xy(vp_with_sched.x, vp_with_sched.y, crs=WGS84)
    ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"])

In [None]:
# c3.sample(10).drop(columns = ['location_timestamp_local']).explore('shape_array_key', marker_kwds= {'radius':25})

In [None]:
shapes_list = c3.shape_array_key.unique().tolist()

In [None]:
shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key","geometry"],
        filters = [[("shape_array_key", "in", shapes_list)]],
        get_pandas = True,
        crs = PROJECT_CRS
    )

In [None]:
c3_m = pd.merge(
        c3,
        shapes,
        on = "shape_array_key",
        how = "inner"
    ).rename(columns = {"geometry_x": "vp_geometry", 
                        "geometry_y": "shape_geometry"}
            ).set_geometry("vp_geometry")

In [None]:
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
        c3_m,
        "shape_geometry",
        "vp_geometry",
    )


In [None]:
shape_meters_geoseries.head()

In [None]:
c3_m["shape_meters"] = shape_meters_geoseries

In [None]:
def distance_and_seconds_elapsed(
    df: pd.DataFrame, 
    group_cols: list
) -> pd.DataFrame:
    """
    If every trip has 3 vp, we want the change in time and distance
    between 1st and 2nd, 2nd and 3rd.
    Then, sum up the change in time and change by trip.

    Parameters
    ----------
    df:
        One row per vp with `shape_meters`, `location_timestamp_local`,
        `vp_idx` and the columns in `group_cols`. The index must be unique,
        because the prior-row values are aligned back onto `df` by index.
    group_cols:
        Columns identifying one trip.

    Returns
    -------
    One row per group with `change_meters`, `change_sec` and `speed_mph`.
    A group whose summed `change_sec` is 0 (e.g. a single vp) ends up with
    a NaN or infinite `speed_mph`, since the division is not guarded.
    """
    dist_col = "shape_meters"
    time_col = "location_timestamp_local"
    sort_cols = group_cols + ["vp_idx"]

    # Sort once and use the vectorized GroupBy.shift to get each row's
    # previous vp within its trip. The first vp of a trip gets NaN / NaT.
    prior = (df.sort_values(sort_cols)
             .groupby(group_cols, observed=True, group_keys=False)
            )

    # Get the change in distance, time for each row
    df = df.assign(
        prior_dist = prior[dist_col].shift(1),
        prior_time = prior[time_col].shift(1),
    )

    # distance should be positive, but sometimes it's not, 
    # so use absolute value
    df = df.assign(
        change_meters = (df[dist_col] - df.prior_dist).abs(),
        change_sec = (df[time_col] - df.prior_time).divide(
                       np.timedelta64(1, 's'))
    )

    # For a trip, sum up the total change in distance and time 
    # Easier to calculate the speed this way, than
    # taking a weighted average later
    df2 = (df.groupby(group_cols, 
                     observed=True, group_keys=False)
           .agg({"change_meters": "sum", 
                 "change_sec": "sum"})
           .reset_index()
          )

    df2 = df2.assign(
        speed_mph = (df2.change_meters.divide(df2.change_sec) * 
                     MPH_PER_MPS)
    )

    return df2


In [None]:
speed = distance_and_seconds_elapsed(
        c3_m,
        group_cols = ["gtfs_dataset_key", "trip_instance_key"]
    )
    