# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

# For speeds c2
from typing import Literal
import numpy as np

# for speeds c3
# also import numpy 
from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = 'Bay Area 511 Muni VehiclePositions'
gtfs_key = '7cc0cb1871dfd558f11a2885c145d144'

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.
    """
    vp_trip_df  = pd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]],
        columns = ['trip_instance_key'])
    
    vp_trip_df  = vp_trip_df.drop_duplicates(subset="trip_instance_key").reset_index(drop = True)
    
    # Make sure we have a shape geometry too
    # otherwise map_partitions will throw error
    shapes = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    ).dropna().drop_duplicates()
    
    trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True
    ).merge(
        shapes,
        on = "shape_array_key",
        how = "inner"
    ).merge(
        vp_trip_df,
        on = "trip_instance_key",
        how = "inner"
    ).drop_duplicates().dropna().reset_index(drop=True)

    return trips_with_shape

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
trips_with_shape_geom =  vp_spatial_accuracy.buffer_shapes(analysis_date, 
                                trips_with_shape,
                                35)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

In [10]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [11]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame,
    trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.
    """
    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)
    
    vp_gdf = vp_gdf.rename(columns = {
    0:'gtfs_dataset_name',
    1: 'schedule_gtfs_dataset_key', 
    2:'trip_id',
    3:'trip_instance_key',
    4: 'location_timestamp', 
    5: 'location_timestamp_local',
    6:'x',
    7:'y', 
    8:'vp_idx', 
    9:'gtfs_dataset_key', 
    10:'vp_dir_xnorm', 
    11:'vp_dir_ynorm',
    12:'vp_primary_direction'})
    
    vp2 = pd.merge(
        vp_gdf,
        trips_with_shape_geom,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)
    
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)
    
    vp2 = vp2.assign(
        is_within = vp2.geometry_x.within(vp2.geometry_y)
    ).query('is_within==True')
    
    vps_in_shape = (vp2.groupby("trip_instance_key", 
                                observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )
        
    count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )
    
    count_df = count_df.assign(
        vp_in_shape = count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp = count_df.total_vp.fillna(0).astype("int32")
    )
    
    return count_df

In [12]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [13]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [14]:
muni.loc[muni.vp_in_shape == 0]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape
167,04ae6ea9655473bdcaf9b80d443558ae,111,0
351,0a1553b7a738967a422b7b8960560ded,75,0
408,0bc53a1063efae8a32eca4a9034d2a21,61,0
1128,1fe28ce8669a7f946f272faf80d80ce7,70,0
1981,37aed2e5ad125f8aede20922b38bf6b9,78,0
3022,54e54e2839a6945b1005322398a89ae9,132,0
3127,580684a7a6b43ce5130f60a441681713,46,0
3213,5a82f8c02f2036db77b1979f4ae228f5,41,0
3861,6cbdc2d321ca63df78bce01eeb06fe07,47,0
3937,6ee4a9165f47b59cb3e10b0125dd72e8,34,0


In [15]:
vp[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

In [16]:
from segment_speed_utils.project_vars import PREDICTIONS_GCS

In [17]:
PREDICTIONS_GCS

'gs://calitp-analytics-data/data-analyses/rt_predictions/'

### Translate to `vp_usable`

In [18]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

In [19]:
#vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
#        filters = [[('gtfs_dataset_name', "==", operator),
#                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
#                    ('trip_instance_key', 'in', relevant_trips)]])

In [20]:
operators2 = ['Anaheim Resort VehiclePositions', 'Bay Area 511 Dumbarton Express VehiclePositions',
             'Bay Area 511 Fairfield and Suisun Transit Vehicle Positions',
             'Bear Vehicle Positions', 'Santa Cruz Vehicle Positions']

In [21]:
operators = ['Anaheim Resort VehiclePositions', 
             'Bear Vehicle Positions', 'Santa Cruz Vehicle Positions']

In [22]:
# Trying with file back in March to compare
vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "in", operators)]])

In [23]:
vp_filtered[['location_timestamp','location_timestamp_local']].head()

Unnamed: 0,location_timestamp,location_timestamp_local
374035,2023-10-12 01:56:10+00:00,2023-10-11 18:56:10
374036,2023-10-12 01:57:50+00:00,2023-10-11 18:57:50
374037,2023-10-12 01:58:49+00:00,2023-10-11 18:58:49
374038,2023-10-12 01:59:49+00:00,2023-10-11 18:59:49
374039,2023-10-12 02:00:22+00:00,2023-10-11 19:00:22


In [24]:
metric_timestamp_col = "location_timestamp_local_sec"

In [25]:
timestamp_col = "location_timestamp_local"

In [26]:
vp_filtered = segment_calcs.convert_timestamp_to_seconds(vp_filtered, ['location_timestamp_local'])

In [27]:
# vp_filtered.columns

* For now turn this into a normal dataframe because dask doesn't sort by multiple columns

In [28]:
vp_filtered_df = vp_filtered.compute()

In [29]:
vp_filtered_df.columns

Index(['gtfs_dataset_name', 'schedule_gtfs_dataset_key', 'trip_id',
       'trip_instance_key', 'location_timestamp', 'location_timestamp_local',
       'x', 'y', 'vp_idx', 'gtfs_dataset_key', 'vp_dir_xnorm', 'vp_dir_ynorm',
       'vp_primary_direction', 'location_timestamp_local_sec'],
      dtype='object')

In [30]:
vp_filtered_df.schedule_gtfs_dataset_key.unique()

array(['a8d5f90bfd689badb7e1deb041408e96',
       '43d8d305ee692724a532f30ea63a1cbe',
       'b7a6cd6a1a06406c35fa9abd16ad9754'], dtype=object)

In [31]:
vp_filtered_df.gtfs_dataset_key.nunique()

3

* Experiment more with stop_cols and understand the original cols chosen for this list

In [32]:
stop_cols = ['trip_instance_key', 'gtfs_dataset_key', 
            'gtfs_dataset_name',]

In [33]:
vp_filtered_df.trip_id.nunique(), vp_filtered_df.trip_instance_key.nunique()

(1658, 1658)

In [34]:
vp_filtered_df.shape, vp_filtered_df.vp_idx.nunique()

((201184, 14), 201184)

In [35]:
fresh_updates =(
        (vp_filtered_df.sort_values(
            stop_cols + [timestamp_col, metric_timestamp_col])
         .drop_duplicates(subset = stop_cols + [metric_timestamp_col])
        )[stop_cols + [timestamp_col, metric_timestamp_col]]
        .assign(fresh = 1)
    )

In [36]:
df_with_fresh_flag = pd.merge(
        vp_filtered_df,
        fresh_updates,
        on = stop_cols + [
            timestamp_col, metric_timestamp_col],
        how = "left",
    )

In [37]:
df_with_fresh_flag.head(2)

Unnamed: 0,gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,x,y,vp_idx,gtfs_dataset_key,vp_dir_xnorm,vp_dir_ynorm,vp_primary_direction,location_timestamp_local_sec,fresh
0,Bear Vehicle Positions,a8d5f90bfd689badb7e1deb041408e96,1,69a1a350dd6041284928921b28c34f60,2023-10-12 01:56:10+00:00,2023-10-11 18:56:10,-122.3,37.87,374035,1a5a0ff50ac6b3f4fa4ee1b7b9b7c205,,,Unknown,68170,1
1,Bear Vehicle Positions,a8d5f90bfd689badb7e1deb041408e96,1,69a1a350dd6041284928921b28c34f60,2023-10-12 01:57:50+00:00,2023-10-11 18:57:50,-122.3,37.87,374036,1a5a0ff50ac6b3f4fa4ee1b7b9b7c205,0.02,1.0,Northbound,68270,1


In [38]:
df_with_fresh_flag.fresh = df_with_fresh_flag.fresh.fillna(0).astype(int)

In [39]:
df_with_fresh_flag[metric_timestamp_col].sample(5)

40319     39282
40594     72370
167092    83615
33655     67051
91722     52757
Name: location_timestamp_local_sec, dtype: int64

In [40]:
stop_cols

['trip_instance_key', 'gtfs_dataset_key', 'gtfs_dataset_name']

In [42]:
vp_filtered_df.shape

(201184, 14)

In [41]:
df_with_fresh_flag.shape

(201184, 15)

In [43]:
# df_with_fresh_flag2 = parse_hour_min(df_with_fresh_flag, [timestamp_col])

In [48]:
grouped_df = df_with_fresh_flag.groupby([
        *stop_cols, 
         pd.Grouper(key = timestamp_col, freq = '1Min')
    ])

In [46]:
type(grouped_df)

pandas.core.groupby.generic.DataFrameGroupBy

In [49]:
grouped_df2 = grouped_df.agg({'fresh':'sum'}).reset_index()

In [52]:
grouped_df2 = grouped_df2.assign(
        atleast2_trip_updates = grouped_df2.apply(
            lambda x: 1 if x["fresh"] >= 2
            else 0, axis=1)
    )    
    

In [67]:
grouped_df2.atleast2_trip_updates.value_counts()

0    21680423
1       75853
Name: atleast2_trip_updates, dtype: int64

In [68]:
timestamp_col

'location_timestamp_local'

In [53]:
grouped_df3 = (grouped_df2.groupby(stop_cols)
           .agg({
               timestamp_col: "size",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            timestamp_col: "trip_min_elapsed"
    })
    

In [54]:
grouped_df3 = grouped_df3.assign(
        pct_update_complete = grouped_df3.atleast2_trip_updates.divide(
            grouped_df3.trip_min_elapsed)
    ) 

In [55]:
grouped_df3.shape

(14922, 6)

In [64]:
grouped_df3.atleast2_trip_updates.describe()

count   14922.00
mean        5.08
std        22.84
min         0.00
25%         0.00
50%         0.00
75%         0.00
max       964.00
Name: atleast2_trip_updates, dtype: float64

In [57]:
cols = [
    "atleast2_trip_updates", 
    "trip_min_elapsed",
    "pct_update_complete"]

In [61]:
def describe_to_df(df, operator: str, metric_cols: list) -> pd.DataFrame:
    """
    Convert df.column.describe() to a 
    horizontally concatted dataframe.
    """
    # Filter for operator
    df = df[df.gtfs_dataset_name == operator].reset_index(drop=True)
    
    operator = operator.replace('TripUpdates','').strip()
    
    final = pd.DataFrame()

    for i in metric_cols:
        df2 = pd.DataFrame({i: df[i].describe()})
        final = pd.concat([final, df2], axis=1)

    final = final.reset_index().rename(columns={"index": "Measure"})


    final.Measure = final.Measure.str.title()
    
    # https://stackoverflow.com/questions/59535426/can-you-change-the-caption-font-size-using-pandas-styling
    final = final.style.set_caption(f"Summary for {operator}").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '16px')
    ]}]).format(precision=1)

    return final


In [59]:
grouped_df3.columns

Index(['trip_instance_key', 'gtfs_dataset_key', 'gtfs_dataset_name',
       'trip_min_elapsed', 'atleast2_trip_updates', 'pct_update_complete'],
      dtype='object')

In [62]:
for i in grouped_df3.gtfs_dataset_name.unique():
    display(
       describe_to_df(
            grouped_df3,
            i,
            cols,
        )
    )

Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,4974.0,4974.0,4974.0
1,Mean,8.4,1458.0,0.0
2,Std,33.4,0.0,0.0
3,Min,0.0,1458.0,0.0
4,25%,0.0,1458.0,0.0
5,50%,0.0,1458.0,0.0
6,75%,0.0,1458.0,0.0
7,Max,964.0,1458.0,0.7


Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,4974.0,4974.0,4974.0
1,Mean,0.3,1458.0,0.0
2,Std,2.0,0.0,0.0
3,Min,0.0,1458.0,0.0
4,25%,0.0,1458.0,0.0
5,50%,0.0,1458.0,0.0
6,75%,0.0,1458.0,0.0
7,Max,34.0,1458.0,0.0


Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,4974.0,4974.0,4974.0
1,Mean,6.5,1458.0,0.0
2,Std,20.2,0.0,0.0
3,Min,0.0,1458.0,0.0
4,25%,0.0,1458.0,0.0
5,50%,0.0,1458.0,0.0
6,75%,0.0,1458.0,0.0
7,Max,110.0,1458.0,0.1


In [None]:
stop

### Break apart old function

In [None]:
from segment_speed_utils.project_vars import PREDICTIONS_GCS

In [None]:
df = pd.read_parquet(
    f"{PREDICTIONS_GCS}rt_sched_stop_times_2023-03-15.parquet")
df._gtfs_dataset_name.unique()

In [None]:
#prediction_cols = set(df.columns.tolist())
#muni_cols = set(muni.columns.tolist())

In [None]:
#list(set(prediction_cols).intersection(muni_cols))

In [None]:
#scheduled.sample()

#### flag_first_trip_update_prediction_for_stop

In [None]:
timestamp_col = "_extract_ts_local"
metric_timestamp_col = "trip_update_timestamp_local"

In [None]:
stop_cols = [
        "gtfs_dataset_key", "_gtfs_dataset_name", 
        "service_date", 
        "shape_id", "route_id",
        "trip_id", 
        "stop_id", "stop_sequence",
        "scheduled_arrival", "actual_stop_arrival_pacific", 
    ]

In [None]:
df[['_extract_ts_local','trip_update_timestamp_local']].sample(5)

In [None]:
def exclude_predictions_after_actual_stop_arrival(
    df: pd.DataFrame,
    timestamp_col: str
) -> pd.DataFrame:
    """
    Drop the predictions within a trip. Previous exclusion 
    only dropped predictions after trip ended.
    Now, drop predictions for a stop after the stop has arrived.
    """
    df2 = df[df[timestamp_col] <= df.actual_stop_arrival_pacific
            ].reset_index(drop=True)
    
    return df2

In [None]:
df = exclude_predictions_after_actual_stop_arrival(df, timestamp_col)

#### this is flag_first_trip_update_prediction_for_stop

In [None]:
df.shape

In [None]:
fresh_updates =(
        (df.sort_values(
            stop_cols + [timestamp_col, metric_timestamp_col])
         .drop_duplicates(subset = stop_cols + [metric_timestamp_col])
        )[stop_cols + [timestamp_col, metric_timestamp_col,
                       "predicted_pacific"]]
        .assign(fresh = 1)
    )

In [None]:
fresh_updates.shape

In [None]:
fresh_updates.head(2)

In [None]:
fresh_updates.fresh.value_counts()

In [None]:
fresh_updates.shape, df.shape

In [None]:
df_with_fresh_flag = pd.merge(
        df,
        fresh_updates,
        on = stop_cols + [
            timestamp_col, metric_timestamp_col, 
            "predicted_pacific"],
        how = "left",
    )

In [None]:
df_with_fresh_flag.head(2)

In [None]:
df_with_fresh_flag.shape

In [None]:
df_with_fresh_flag = df_with_fresh_flag.assign(
        fresh = df_with_fresh_flag.fresh.fillna(0).astype(int)
    )
    

In [None]:
def parse_hour_min(
    df: pd.DataFrame, 
    timestamp_col: list
) -> pd.DataFrame:
    """
    Parse out the hour and minute from some 
    timestamp_column (header_timestamp, trip_update_timestamp)
    """
    for c in timestamp_col:
        df = df.assign(
            hour = df[c].dt.hour,
            minute = df[c].dt.minute
        ).rename(columns = {
            "hour": f"{c}_hour", 
            "minute": f"{c}_min"})
    
    return df


#### Why parse the time?
* new cols are _extract_ts_local_hour and _extract_ts_local_min

In [None]:
df_with_fresh_flag2 = parse_hour_min(df_with_fresh_flag, [timestamp_col])

In [None]:
df_with_fresh_flag2.head(1)

In [None]:
df_with_fresh_flag.head(1)

#### atleast2_updates_by_trip_stop
* Having trouble previewing this to see it's correct.
* Try to convert metric_timestamp_col to an integer to sum for the step below but it didn't work

In [None]:
# df_with_fresh_flag2['test'] = pd.to_datetime(df_with_fresh_flag2[metric_timestamp_col]).astype(int)/ 10**9

In [None]:
grouped_df = df_with_fresh_flag2.groupby([*stop_cols, 
         pd.Grouper(key = timestamp_col, freq = '1Min')
    ])


In [None]:
type(grouped_df)

##### "sum" isn't working for metric_timestamp_col.
* Need to change it to "fresh"

In [None]:
metric_timestamp_col

In [None]:
# Sum didn't work with metric_timestamp_col
# Count 
df2 = (grouped_df
           .agg({"fresh": "sum"})
           .reset_index()
    )

In [None]:
type(df2)

In [None]:
df2 = df2.assign(
        atleast2_trip_updates = df2.apply(
            lambda x: 1 if x["fresh"] >= 2
            else 0, axis=1)
    )    

In [None]:
df2.columns

In [None]:
df3 = (df2.groupby(stop_cols)
           .agg({
               timestamp_col: "size",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            timestamp_col: "trip_min_elapsed"
    })
    

In [None]:
df3.columns

In [None]:
df3 = df3.assign(
        pct_update_complete = df3.atleast2_trip_updates.divide(
            df3.trip_min_elapsed)
    ) 
    

In [None]:
df2.head(2)

In [None]:
df3.head(2)

#### Santa Cruz and Bear Trip Updates are different

In [None]:
for i in df3._gtfs_dataset_name.unique():
    display(
       describe_to_df(
            df3,
            i,
            cols,
        )
    )

In [None]:
stop

## How many minutes a trip took and the average speeds?
* Temporary place until I move the spatial stuff out

In [None]:
type(muni)

In [None]:
len(relevant_trips)

In [None]:
# Test if I can filter for relevant trips
# Yes, can filter for only relevant trips up here? 
# https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py#L190-L193
#trips_with_shape_trips = set(trips_with_shape.trip_instance_key.unique().tolist())
#muni_trips = set(muni.trip_instance_key.unique().tolist())
#trips_with_shape_trips - muni_trips

In [None]:
# muni_trips - trips_with_shape_trips

In [None]:
# len(vp_filtered)

In [None]:
# vp_filtered[['trip_instance_key']].compute().nunique()

In [None]:
vp_filtered.columns

### C2 
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C2_triangulate_vp.py
* Break it apart and check it out later to undersatnd what's happening

In [None]:
def triangulate_vp(
    ddf: dd.DataFrame, 
    group_cols: list = ["trip_instance_key"]
) -> np.ndarray:
    """
    Grab a sample of vehicle positions for each trip to triangulate distance.
    These vp already sjoined onto the shape.
    Roughly pick vp at equally spaced intervals.
    
    Dask aggregation can't group and use lambda to create list of possible 
    vp_idx.
    """        
    grouped_ddf = ddf.groupby(group_cols, observed=True, group_keys=False)

    min_df = (grouped_ddf
              .agg({"vp_idx": "min"})
              .rename(columns = {"vp_idx": "min_vp_idx"})
             )

    max_df = (grouped_ddf
              .agg({"vp_idx": "max"})
              .rename(columns = {"vp_idx": "max_vp_idx"})
             )
    
    vp_range = dd.merge(
        min_df,
        max_df,
        left_index = True,
        right_index = True,
        how = "inner"
    )

    vp_range = vp_range.persist()
    
    vp_range["range_diff"] = vp_range.max_vp_idx - vp_range.min_vp_idx
    
    vp_range = vp_range.assign(
        p25_vp_idx = (vp_range.range_diff * 0.25 + vp_range.min_vp_idx
                     ).round(0).astype("int64"),
        p50_vp_idx = (vp_range.range_diff * 0.5 + vp_range.min_vp_idx
                     ).round(0).astype("int64"),
        p75_vp_idx = (vp_range.range_diff * 0.75 + vp_range.min_vp_idx
                     ).round(0).astype("int64"),
    )
    
    vp_idx_cols = [
        "min_vp_idx", 
        "p25_vp_idx",
        "p50_vp_idx", 
        "p75_vp_idx",
        "max_vp_idx"
    ]

    results = vp_range[vp_idx_cols].compute().to_numpy().flatten()    
    
    results = list(results)
    return results

In [None]:
triangulate_muni = triangulate_vp(vp_filtered, 'trip_instance_key')

In [None]:
vp_results = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = [
            "gtfs_dataset_key", "trip_instance_key",
            "location_timestamp_local",
            "x", "y", "vp_idx"],
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                    ('vp_idx', 'in', triangulate_muni)]]).compute()

In [None]:
vp_results.sample(2)

In [None]:
len(vp_results.loc[vp_results.trip_instance_key == "5e067d518df2c7d76b7465f1c7d0901e"])

In [None]:
len(vp_filtered.loc[vp_filtered.trip_instance_key == "5e067d518df2c7d76b7465f1c7d0901e"])

In [None]:
len(vp_results.loc[vp_results.trip_instance_key == "37e64d454f8679b38b2d35ed0d256df0"])

In [None]:
len(vp_filtered.loc[vp_filtered.trip_instance_key == "37e64d454f8679b38b2d35ed0d256df0"])

* vp_results is a pandas df but merge_rt_scheduled requires dask dataframe...it still works!

In [None]:
type(vp_results)

In [None]:
def merge_rt_scheduled_trips(
    rt_trips: dd.DataFrame,
    analysis_date: str,
    group_cols: list = ["trip_instance_key"]) -> dd.DataFrame:
    """
    Merge RT trips (vehicle positions) to scheduled trips 
    to get the shape_array_key.
    Don't pull other scheduled trip columns now, wait until
    after aggregation is done.
    """
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = group_cols + ["shape_array_key"],
        get_pandas = True
    )
        
    df = dd.merge(
        rt_trips,
        trips,
        on = group_cols,
        how = "left",
    )
    
    return df

In [None]:
vp_with_sched = (
        merge_rt_scheduled_trips(
            vp_results, 
            analysis_date, 
            group_cols = ["trip_instance_key"]
        ).sort_values("vp_idx")
        .reset_index(drop=True)
    )

In [None]:
vp_with_sched.head()

### C3
https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C3_trip_route_speed.py

In [None]:
vp_with_sched.sample(2)

In [None]:
c3 = gpd.GeoDataFrame(
        vp_with_sched,
        geometry = gpd.points_from_xy(vp_with_sched.x, vp_with_sched.y, crs=WGS84)
    ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"])

In [None]:
# c3.sample(10).drop(columns = ['location_timestamp_local']).explore('shape_array_key', marker_kwds= {'radius':25})

In [None]:
shapes_list = c3.shape_array_key.unique().tolist()

In [None]:
shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key","geometry"],
        filters = [[("shape_array_key", "in", shapes_list)]],
        get_pandas = True,
        crs = PROJECT_CRS
    )

In [None]:
c3_m = pd.merge(
        c3,
        shapes,
        on = "shape_array_key",
        how = "inner"
    ).rename(columns = {"geometry_x": "vp_geometry", 
                        "geometry_y": "shape_geometry"}
            ).set_geometry("vp_geometry")

In [None]:
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
        c3_m,
        "shape_geometry",
        "vp_geometry",
    )


In [None]:
shape_meters_geoseries.head()

In [None]:
c3_m["shape_meters"] = shape_meters_geoseries

In [None]:
def distance_and_seconds_elapsed(
    df: pd.DataFrame, 
    group_cols: list
) -> pd.DataFrame:
    """
    If every trip has 3 vp, we want the change in time and distance
    between 1st and 2nd, 2nd and 3rd.
    Then, sum up the change in time and change by trip.
    """
    dist_col = "shape_meters"
    time_col = "location_timestamp_local"
    sort_cols = group_cols + ["vp_idx"]
    
    # Get the change in distance, time for each row
    df = df.assign(
        prior_dist = (df.sort_values(sort_cols)
                      .groupby(group_cols, 
                               observed=True, group_keys=False)
                      [dist_col]
                      .apply(lambda x: x.shift(1))
                     ),
        prior_time = (df.sort_values(sort_cols)
                      .groupby(group_cols, 
                               observed=True, group_keys=False)
                      [time_col]
                      .apply(lambda x: x.shift(1))
                     )   
    )
    
    # distance should be positive, but sometimes it's not, 
    # so use absolute value
    df = df.assign(
        change_meters = abs(df[dist_col] - df.prior_dist),
        change_sec = (df[time_col] - df.prior_time).divide(
                       np.timedelta64(1, 's'))
    )
    
    # For a trip, sum up the total change in distance and time 
    # Easier to calculate the speed this way, than
    # taking a weighted average later
    df2 = (df.groupby(group_cols, 
                     observed=True, group_keys=False)
           .agg({"change_meters": "sum", 
                 "change_sec": "sum"})
           .reset_index()
          )
    
    df2 = df2.assign(
        speed_mph = (df2.change_meters.divide(df2.change_sec) * 
                     MPH_PER_MPS)
    )
    
    return df2


In [None]:
speed = distance_and_seconds_elapsed(
        c3_m,
        group_cols = ["gtfs_dataset_key", "trip_instance_key"]
    )
    