# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936

In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

# For speeds c2
from typing import Literal
import numpy as np

# for speeds c3
# also import numpy 
from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = 'Bay Area 511 Muni VehiclePositions'
gtfs_key = '7cc0cb1871dfd558f11a2885c145d144'

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    
### Grab_shape_keys_in_vp

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Build a trip_instance_key -> shape_array_key crosswalk.

    Only trips that satisfy all three conditions are kept:
    * the trip has usable vp on `analysis_date` for the notebook-level
      `operator` / `gtfs_key` selection,
    * the trip is present in the scheduled trips table,
    * the trip's shape_array_key is present in the routelines file.
    """
    # Unique trips present in the usable vp for the selected operator.
    operator_filters = [[
        ("gtfs_dataset_name", "==", operator),
        ("schedule_gtfs_dataset_key", "==", gtfs_key),
    ]]
    vp_trip_keys = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters=operator_filters,
        columns=["trip_instance_key"],
    )
    vp_trip_keys = (
        vp_trip_keys
        .drop_duplicates(subset="trip_instance_key")
        .reset_index(drop=True)
    )

    # Shapes that exist in routelines; a trip without one
    # would make map_partitions throw an error later on.
    shape_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns=["shape_array_key"],
    )
    shape_keys = shape_keys.dropna().drop_duplicates()

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Inner joins: keep scheduled trips with a known shape AND with vp.
    crosswalk = scheduled_trips.merge(
        shape_keys, on="shape_array_key", how="inner"
    )
    crosswalk = crosswalk.merge(
        vp_trip_keys, on="trip_instance_key", how="inner"
    )

    return crosswalk.drop_duplicates().dropna().reset_index(drop=True)

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
trips_with_shape_geom =  vp_spatial_accuracy.buffer_shapes(analysis_date, 
                                trips_with_shape,
                                35)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

In [10]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [11]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame,
    trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.

    Parameters
    ----------
    vp
        Vehicle positions with at least `x`, `y`, `trip_instance_key` and
        `location_timestamp_local` columns. A lazy frame (anything exposing
        `.compute()`) is materialized first; an in-memory frame is used as-is.
    trips_with_shape_geom
        One row per trip with `trip_instance_key` and a `geometry` column
        holding the buffered shape (the merge below relies on both frames
        having a column named `geometry`).

    Returns
    -------
    One row per trip_instance_key with `total_vp` (from
    `vp_spatial_accuracy.total_vp_counts_by_trip`) and `vp_in_shape`
    (rows with a non-null `location_timestamp_local` whose point falls
    within the buffered shape), both as int32. Trips with no vp inside
    the shape get `vp_in_shape = 0`.
    """
    # Materialize the lazy frame before handing it to geopandas so the
    # column labels and dtypes are kept. The previous version passed the
    # dask frame directly and then restored the labels with a positional
    # rename (0 -> 'gtfs_dataset_name', ...), which silently mislabels
    # columns if the parquet column order ever changes.
    if hasattr(vp, "compute"):
        vp = vp.compute()

    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)

    # Both sides carry a `geometry` column, so the merge suffixes them:
    # geometry_x = vp point, geometry_y = buffered shape.
    vp2 = pd.merge(
        vp_gdf,
        trips_with_shape_geom,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)

    # Totals are taken before filtering to points inside the shape.
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    vp2 = vp2.assign(
        is_within = vp2.geometry_x.within(vp2.geometry_y)
    )
    vp2 = vp2.loc[vp2.is_within]

    vps_in_shape = (vp2.groupby("trip_instance_key",
                                observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )

    # Left merge keeps trips that have vp but none inside the shape.
    count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )

    count_df = count_df.assign(
        vp_in_shape = count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp = count_df.total_vp.fillna(0).astype("int32")
    )

    return count_df

In [12]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [13]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [14]:
muni.loc[muni.vp_in_shape == 0]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape
167,04ae6ea9655473bdcaf9b80d443558ae,111,0
351,0a1553b7a738967a422b7b8960560ded,75,0
408,0bc53a1063efae8a32eca4a9034d2a21,61,0
1128,1fe28ce8669a7f946f272faf80d80ce7,70,0
1981,37aed2e5ad125f8aede20922b38bf6b9,78,0
3022,54e54e2839a6945b1005322398a89ae9,132,0
3127,580684a7a6b43ce5130f60a441681713,46,0
3213,5a82f8c02f2036db77b1979f4ae228f5,41,0
3861,6cbdc2d321ca63df78bce01eeb06fe07,47,0
3937,6ee4a9165f47b59cb3e10b0125dd72e8,34,0


In [15]:
vp[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

In [95]:
from segment_speed_utils.project_vars import PREDICTIONS_GCS

In [96]:
PREDICTIONS_GCS

'gs://calitp-analytics-data/data-analyses/rt_predictions/'

In [97]:
COMPILED_CACHED_VIEWS

'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'

In [98]:
scheduled_url = f"{COMPILED_CACHED_VIEWS}trips_{analysis_date}.parquet"

In [99]:
scheduled = pd.read_parquet(scheduled_url)

In [100]:
scheduled.columns

Index(['feed_key', 'gtfs_dataset_key', 'name', 'regional_feed_type',
       'service_date', 'trip_start_date_pacific', 'trip_id',
       'trip_instance_key', 'route_key', 'route_id', 'route_type',
       'route_short_name', 'route_long_name', 'route_desc', 'direction_id',
       'shape_array_key', 'shape_id', 'trip_first_departure_datetime_pacific',
       'trip_last_arrival_datetime_pacific', 'service_hours',
       'trip_start_date_local_tz', 'trip_first_departure_datetime_local_tz',
       'trip_last_arrival_datetime_local_tz'],
      dtype='object')

### Break apart old function

In [101]:
df = pd.read_parquet(
    f"{PREDICTIONS_GCS}rt_sched_stop_times_2023-03-15.parquet")
df._gtfs_dataset_name.unique()

array(['Anaheim Resort TripUpdates',
       'Bay Area 511 Dumbarton Express TripUpdates',
       'Bay Area 511 Fairfield and Suisun Transit TripUpdates',
       'Santa Cruz Trip Updates', 'Bear Trip Updates'], dtype=object)

In [102]:
prediction_cols = set(df.columns.tolist())
scheduled_cols = set(scheduled.columns.tolist())
prediction_cols - scheduled_cols

{'_extract_ts_local',
 '_gtfs_dataset_name',
 '_trip_update_message_age',
 'actual_stop_arrival_pacific',
 'predicted_pacific',
 'schedule_relationship',
 'scheduled_arrival',
 'stop_id',
 'stop_sequence',
 'trip_start_time',
 'trip_update_timestamp_local'}

In [103]:
scheduled.sample()

Unnamed: 0,feed_key,gtfs_dataset_key,name,regional_feed_type,service_date,trip_start_date_pacific,trip_id,trip_instance_key,route_key,route_id,route_type,route_short_name,route_long_name,route_desc,direction_id,shape_array_key,shape_id,trip_first_departure_datetime_pacific,trip_last_arrival_datetime_pacific,service_hours,trip_start_date_local_tz,trip_first_departure_datetime_local_tz,trip_last_arrival_datetime_local_tz
1893,60a215476c5d605e0e6af15765880c03,cc53a0dbf5df90e3009b9cb5d89d80ba,LA DOT Schedule,,2023-10-11,2023-10-11,183-33jj2orr9,5b56a247dcb89cb8a5ba4f0f4cbbc9e2,b93acf2e93d2de4181f75f0c9d302bc9,4447,3,A,DASH A,Little Tokyo/City West,0.0,0f4687a3b01aa3d8be2265732a10c892,18751,2023-10-11 11:08:00,2023-10-11 12:09:00,1.02,2023-10-11,2023-10-11 11:08:00,2023-10-11 12:09:00


In [104]:
df.sample()

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,trip_id,trip_start_time,_trip_update_message_age,stop_id,stop_sequence,schedule_relationship,_extract_ts_local,trip_update_timestamp_local,shape_id,route_id,scheduled_arrival,actual_stop_arrival_pacific,predicted_pacific
1528518,659f9f49a52e3541583fac3749cfbe44,Bear Trip Updates,2023-03-15,230,2023-03-15 11:00:00,55,coryhall,5.0,SCHEDULED,2023-03-15 10:44:20,2023-03-15 10:43:25,peri_loop_1,peri,2023-03-15 11:06:00,2023-03-15 11:07:18,2023-03-15 11:06:18


#### flag_first_trip_update_prediction_for_stop

In [105]:
timestamp_col = "_extract_ts_local"
metric_timestamp_col = "trip_update_timestamp_local"

In [106]:
stop_cols = [
        "gtfs_dataset_key", "_gtfs_dataset_name", 
        "service_date", 
        "shape_id", "route_id",
        "trip_id", 
        "stop_id", "stop_sequence",
        "scheduled_arrival", "actual_stop_arrival_pacific", 
    ]

In [107]:
def exclude_predictions_after_actual_stop_arrival(
    df: pd.DataFrame,
    timestamp_col: str
) -> pd.DataFrame:
    """
    Keep only predictions made at or before the stop's actual arrival.

    A row survives when its `timestamp_col` value is less than or equal
    to `actual_stop_arrival_pacific`. This is stricter than dropping
    predictions made after the trip ended: a prediction for a stop is
    dropped once the vehicle has arrived at that stop.
    The returned frame has a fresh 0..n-1 index.
    """
    made_by_arrival = df[timestamp_col].le(df["actual_stop_arrival_pacific"])

    return df.loc[made_by_arrival].reset_index(drop=True)

In [108]:
df = exclude_predictions_after_actual_stop_arrival(df, timestamp_col)

In [109]:
# For each stop and each distinct trip-update timestamp, keep the row with
# the earliest extract timestamp (sort, then drop_duplicates keeps the first)
# and mark it as a "fresh" update.
fresh_updates = df.sort_values(
    stop_cols + [timestamp_col, metric_timestamp_col]
)
fresh_updates = fresh_updates.drop_duplicates(
    subset=stop_cols + [metric_timestamp_col]
)
fresh_updates = fresh_updates[
    stop_cols + [timestamp_col, metric_timestamp_col, "predicted_pacific"]
].assign(fresh=1)

In [110]:
fresh_updates.head(2)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,shape_id,route_id,trip_id,stop_id,stop_sequence,scheduled_arrival,actual_stop_arrival_pacific,_extract_ts_local,trip_update_timestamp_local,predicted_pacific,fresh
93974,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,063c940e-ae42-4473-b9d8-c36083d3ec23,8f305689-4315-445e-abea-920dbbf0be5e,3662f00e-b8d2-4e8b-8905-c1a5624ce879:1,1100,1.0,2023-03-15 19:00:00,2023-03-15 20:35:37,2023-03-15 19:59:40,2023-03-15 19:59:35,2023-03-15 19:00:00,1
93966,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,063c940e-ae42-4473-b9d8-c36083d3ec23,8f305689-4315-445e-abea-920dbbf0be5e,3662f00e-b8d2-4e8b-8905-c1a5624ce879:1,1100,1.0,2023-03-15 19:00:00,2023-03-15 20:35:37,2023-03-15 20:00:00,2023-03-15 19:59:56,2023-03-15 19:00:00,1


In [111]:
# Left merge so every prediction row is kept; rows that are not the
# first sighting of a trip update come back with fresh = NaN.
df_with_fresh_flag = df.merge(
    fresh_updates,
    on=[*stop_cols, timestamp_col, metric_timestamp_col, "predicted_pacific"],
    how="left",
)

In [112]:
df_with_fresh_flag.head(2)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,trip_id,trip_start_time,_trip_update_message_age,stop_id,stop_sequence,schedule_relationship,_extract_ts_local,trip_update_timestamp_local,shape_id,route_id,scheduled_arrival,actual_stop_arrival_pacific,predicted_pacific,fresh
0,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,6b6e3f4d-3c89-41d7-8d88-bc5668fc8e27:11,2023-03-15 17:30:00,6,c24c75f6-8397-4d8c-a4e5-72444083a9c8,2.0,SCHEDULED,2023-03-15 21:16:20,2023-03-15 21:16:14,cae6cda4-863b-48b5-b64f-554972b9d110,8f305689-4315-445e-abea-920dbbf0be5e,2023-03-15 17:37:00,2023-03-15 21:17:38,2023-03-15 21:16:09,1.0
1,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,6b6e3f4d-3c89-41d7-8d88-bc5668fc8e27:11,2023-03-15 17:30:00,5,c24c75f6-8397-4d8c-a4e5-72444083a9c8,2.0,SCHEDULED,2023-03-15 21:16:00,2023-03-15 21:15:55,cae6cda4-863b-48b5-b64f-554972b9d110,8f305689-4315-445e-abea-920dbbf0be5e,2023-03-15 17:37:00,2023-03-15 21:17:38,2023-03-15 21:16:09,1.0


In [113]:
# Rows without a match in the left merge are not fresh: NaN -> 0.
df_with_fresh_flag = df_with_fresh_flag.assign(
    fresh=lambda d: d["fresh"].fillna(0).astype(int)
)
    

In [114]:
def parse_hour_min(
    df: pd.DataFrame, 
    timestamp_col: list
) -> pd.DataFrame:
    """
    Parse out the hour and minute from one or more
    timestamp columns (header_timestamp, trip_update_timestamp).

    Parameters
    ----------
    df
        Frame holding the datetime columns; it is not modified.
    timestamp_col
        List of datetime column names. A single column name given as a
        string is also accepted.

    Returns
    -------
    A copy of `df` with `<col>_hour` and `<col>_min` appended for each
    requested column.
    """
    # Iterating a bare string would walk its characters, so wrap it.
    if isinstance(timestamp_col, str):
        timestamp_col = [timestamp_col]

    for c in timestamp_col:
        # Assign the final names directly. Going through temporary
        # "hour" / "minute" columns and renaming them would overwrite any
        # existing columns that happen to carry those names.
        df = df.assign(**{
            f"{c}_hour": df[c].dt.hour,
            f"{c}_min": df[c].dt.minute,
        })

    return df


In [115]:
df_with_fresh_flag2 = parse_hour_min(df_with_fresh_flag, [timestamp_col])

In [116]:
df_with_fresh_flag2.head(1)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,trip_id,trip_start_time,_trip_update_message_age,stop_id,stop_sequence,schedule_relationship,_extract_ts_local,trip_update_timestamp_local,shape_id,route_id,scheduled_arrival,actual_stop_arrival_pacific,predicted_pacific,fresh,_extract_ts_local_hour,_extract_ts_local_min
0,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,6b6e3f4d-3c89-41d7-8d88-bc5668fc8e27:11,2023-03-15 17:30:00,6,c24c75f6-8397-4d8c-a4e5-72444083a9c8,2.0,SCHEDULED,2023-03-15 21:16:20,2023-03-15 21:16:14,cae6cda4-863b-48b5-b64f-554972b9d110,8f305689-4315-445e-abea-920dbbf0be5e,2023-03-15 17:37:00,2023-03-15 21:17:38,2023-03-15 21:16:09,1,21,16


In [117]:
df_with_fresh_flag.head(1)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,trip_id,trip_start_time,_trip_update_message_age,stop_id,stop_sequence,schedule_relationship,_extract_ts_local,trip_update_timestamp_local,shape_id,route_id,scheduled_arrival,actual_stop_arrival_pacific,predicted_pacific,fresh
0,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,6b6e3f4d-3c89-41d7-8d88-bc5668fc8e27:11,2023-03-15 17:30:00,6,c24c75f6-8397-4d8c-a4e5-72444083a9c8,2.0,SCHEDULED,2023-03-15 21:16:20,2023-03-15 21:16:14,cae6cda4-863b-48b5-b64f-554972b9d110,8f305689-4315-445e-abea-920dbbf0be5e,2023-03-15 17:37:00,2023-03-15 21:17:38,2023-03-15 21:16:09,1


#### atleast2_updates_by_trip_stop

In [118]:
# One group per stop and per one-minute bin of the extract timestamp.
grouped_df = df_with_fresh_flag2.groupby(
    stop_cols + [pd.Grouper(key=timestamp_col, freq="1Min")]
)


In [119]:
type(grouped_df)

pandas.core.groupby.generic.DataFrameGroupBy

#### "sum" isn't working.

In [120]:
# Sum didn't work.
# Count the non-null trip update timestamps in each stop-minute group
# ("sum" did not work here, so count is used).
df2 = grouped_df[[metric_timestamp_col]].count().reset_index()

In [121]:
type(df2)

pandas.core.frame.DataFrame

In [122]:
# Flag each stop-minute bin that received at least 2 trip updates.
# After the count aggregation, `metric_timestamp_col` holds the number of
# updates in the bin. The previous code read an undefined `metric_col`.
# A vectorized comparison replaces the row-wise apply.
df2 = df2.assign(
    atleast2_trip_updates = (df2[metric_timestamp_col] >= 2).astype(int)
)

In [123]:
stop_cols

['gtfs_dataset_key',
 '_gtfs_dataset_name',
 'service_date',
 'shape_id',
 'route_id',
 'trip_id',
 'stop_id',
 'stop_sequence',
 'scheduled_arrival',
 'actual_stop_arrival_pacific']

In [124]:
# Per stop: number of one-minute bins observed (trip_min_elapsed) and
# how many of those bins had at least 2 trip updates.
df3 = (
    df2.groupby(stop_cols)
    .agg(
        trip_min_elapsed=(timestamp_col, "size"),
        atleast2_trip_updates=("atleast2_trip_updates", "sum"),
    )
    .reset_index()
)
    

In [125]:
# Share of observed minutes that had at least 2 trip updates.
df3 = df3.assign(
    pct_update_complete=df3["atleast2_trip_updates"] / df3["trip_min_elapsed"]
)
    

In [126]:
df2.head(2)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,shape_id,route_id,trip_id,stop_id,stop_sequence,scheduled_arrival,actual_stop_arrival_pacific,_extract_ts_local,trip_update_timestamp_local,atleast2_trip_updates
0,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,063c940e-ae42-4473-b9d8-c36083d3ec23,8f305689-4315-445e-abea-920dbbf0be5e,3662f00e-b8d2-4e8b-8905-c1a5624ce879:1,1100,1.0,2023-03-15 19:00:00,2023-03-15 20:35:37,2023-03-15 19:59:00,1,0
1,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,063c940e-ae42-4473-b9d8-c36083d3ec23,8f305689-4315-445e-abea-920dbbf0be5e,3662f00e-b8d2-4e8b-8905-c1a5624ce879:1,1100,1.0,2023-03-15 19:00:00,2023-03-15 20:35:37,2023-03-15 20:00:00,3,1


In [127]:
df3.head(2)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,service_date,shape_id,route_id,trip_id,stop_id,stop_sequence,scheduled_arrival,actual_stop_arrival_pacific,trip_min_elapsed,atleast2_trip_updates,pct_update_complete
0,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,063c940e-ae42-4473-b9d8-c36083d3ec23,8f305689-4315-445e-abea-920dbbf0be5e,3662f00e-b8d2-4e8b-8905-c1a5624ce879:1,1100,1.0,2023-03-15 19:00:00,2023-03-15 20:35:37,37,36,0.97
1,262d7b27183fa8d174ab8fc83ad5848f,Anaheim Resort TripUpdates,2023-03-15,063c940e-ae42-4473-b9d8-c36083d3ec23,8f305689-4315-445e-abea-920dbbf0be5e,3662f00e-b8d2-4e8b-8905-c1a5624ce879:1,1100,3.0,2023-03-15 19:12:00,2023-03-15 20:35:37,11,11,1.0


In [128]:
cols = [
    "atleast2_trip_updates", 
    "trip_min_elapsed",
    "pct_update_complete"]

In [129]:
def describe_to_df(df, operator: str, metric_cols: list) -> pd.DataFrame:
    """
    Convert df.column.describe() to a 
    horizontally concatted dataframe.
    """
    # Filter for operator
    df = df[df._gtfs_dataset_name == operator].reset_index(drop=True)
    
    operator = operator.replace('TripUpdates','').strip()
    
    final = pd.DataFrame()

    for i in metric_cols:
        df2 = pd.DataFrame({i: df[i].describe()})
        final = pd.concat([final, df2], axis=1)

    final = final.reset_index().rename(columns={"index": "Measure"})


    final.Measure = final.Measure.str.title()
    
    # https://stackoverflow.com/questions/59535426/can-you-change-the-caption-font-size-using-pandas-styling
    final = final.style.set_caption(f"Summary for {operator}").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '16px')
    ]}]).format(precision=1)

    return final


#### Santa Cruz and Bear Trip Updates are different

In [130]:
for i in df3._gtfs_dataset_name.unique():
    display(
       describe_to_df(
            df3,
            i,
            cols,
        )
    )

Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,1800.0,1800.0,1800.0
1,Mean,21.0,21.7,0.9
2,Std,21.9,21.9,0.2
3,Min,0.0,1.0,0.0
4,25%,8.0,9.0,0.9
5,50%,13.0,13.0,1.0
6,75%,29.0,30.0,1.0
7,Max,172.0,173.0,1.0


Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,1490.0,1490.0,1490.0
1,Mean,79.1,80.2,1.0
2,Std,32.1,32.1,0.0
3,Min,1.0,3.0,0.2
4,25%,56.0,58.0,1.0
5,50%,82.0,83.0,1.0
6,75%,101.0,103.0,1.0
7,Max,161.0,161.0,1.0


Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,514.0,514.0,514.0
1,Mean,64.1,65.0,1.0
2,Std,15.2,15.1,0.0
3,Min,0.0,1.0,0.0
4,25%,61.0,61.0,1.0
5,50%,67.0,68.0,1.0
6,75%,72.0,73.0,1.0
7,Max,97.0,97.0,1.0


Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,1289.0,1289.0,1289.0
1,Mean,49.5,50.2,1.0
2,Std,10.4,10.4,0.0
3,Min,2.0,3.0,0.7
4,25%,45.0,46.0,1.0
5,50%,47.0,47.0,1.0
6,75%,50.0,51.0,1.0
7,Max,112.0,113.0,1.0


Unnamed: 0,Measure,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
0,Count,1201.0,1201.0,1201.0
1,Mean,26.9,27.5,1.0
2,Std,9.6,9.5,0.1
3,Min,0.0,1.0,0.0
4,25%,22.0,23.0,1.0
5,50%,29.0,30.0,1.0
6,75%,33.0,34.0,1.0
7,Max,53.0,54.0,1.0


In [131]:
stop

NameError: name 'stop' is not defined

## How many minutes a trip took and the average speeds?
* Temporary place until I move the spatial stuff out

In [None]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

In [None]:
type(muni)

In [None]:
len(relevant_trips)

In [None]:
# Test if I can filter for relevant trips
# Yes, can filter for only relevant trips up here? 
# https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py#L190-L193
#trips_with_shape_trips = set(trips_with_shape.trip_instance_key.unique().tolist())
#muni_trips = set(muni.trip_instance_key.unique().tolist())
#trips_with_shape_trips - muni_trips

In [None]:
# muni_trips - trips_with_shape_trips

In [None]:
vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                    ('trip_instance_key', 'in', relevant_trips)]])

In [None]:
# len(vp_filtered)

In [None]:
# vp_filtered[['trip_instance_key']].compute().nunique()

In [None]:
vp_filtered.columns

### C2 
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C2_triangulate_vp.py
* Break it apart and check it out later to understand what's happening

In [None]:
def triangulate_vp(
    ddf: dd.DataFrame, 
    group_cols: list = ["trip_instance_key"]
) -> list:
    """
    Grab a sample of vehicle positions for each trip to triangulate distance.
    These vp already sjoined onto the shape.
    Roughly pick vp at equally spaced intervals.

    Dask aggregation can't group and use lambda to create list of possible 
    vp_idx, so the positions are derived arithmetically from each group's
    min and max vp_idx instead.

    Parameters
    ----------
    ddf
        Vehicle positions with a `vp_idx` column and the `group_cols`.
    group_cols
        Column(s) identifying a trip. The default list is never mutated.

    Returns
    -------
    A flat list holding five vp_idx values per group: the min, the values
    25% / 50% / 75% of the way from min to max (rounded), and the max.
    The intermediate values are computed, not looked up, so they are only
    guaranteed to be real vp_idx values if vp_idx is contiguous within a
    group -- TODO confirm against how vp_idx is generated. Groups with a
    short vp_idx range can contribute repeated values.
    """        
    grouped_ddf = ddf.groupby(group_cols, observed=True, group_keys=False)

    min_df = (grouped_ddf
              .agg({"vp_idx": "min"})
              .rename(columns = {"vp_idx": "min_vp_idx"})
             )

    max_df = (grouped_ddf
              .agg({"vp_idx": "max"})
              .rename(columns = {"vp_idx": "max_vp_idx"})
             )

    # Both frames are indexed by the group keys, so join on the index.
    vp_range = dd.merge(
        min_df,
        max_df,
        left_index = True,
        right_index = True,
        how = "inner"
    )

    vp_range = vp_range.persist()

    vp_range["range_diff"] = vp_range.max_vp_idx - vp_range.min_vp_idx

    # Quartile positions within [min_vp_idx, max_vp_idx].
    vp_range = vp_range.assign(**{
        f"p{int(share * 100)}_vp_idx": (
            vp_range.range_diff * share + vp_range.min_vp_idx
        ).round(0).astype("int64")
        for share in (0.25, 0.5, 0.75)
    })

    vp_idx_cols = [
        "min_vp_idx", 
        "p25_vp_idx",
        "p50_vp_idx", 
        "p75_vp_idx",
        "max_vp_idx"
    ]

    # Row-major flatten: the five values of a group stay adjacent.
    results = vp_range[vp_idx_cols].compute().to_numpy().flatten()

    return list(results)

In [None]:
triangulate_muni = triangulate_vp(vp_filtered, 'trip_instance_key')

In [None]:
vp_results = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = [
            "gtfs_dataset_key", "trip_instance_key",
            "location_timestamp_local",
            "x", "y", "vp_idx"],
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                    ('vp_idx', 'in', triangulate_muni)]]).compute()

In [None]:
vp_results.sample(2)

In [None]:
len(vp_results.loc[vp_results.trip_instance_key == "5e067d518df2c7d76b7465f1c7d0901e"])

In [None]:
len(vp_filtered.loc[vp_filtered.trip_instance_key == "5e067d518df2c7d76b7465f1c7d0901e"])

In [None]:
len(vp_results.loc[vp_results.trip_instance_key == "37e64d454f8679b38b2d35ed0d256df0"])

In [None]:
len(vp_filtered.loc[vp_filtered.trip_instance_key == "37e64d454f8679b38b2d35ed0d256df0"])

* vp_results is a pandas df but merge_rt_scheduled requires dask dataframe...it still works!

In [None]:
type(vp_results)

In [None]:
def merge_rt_scheduled_trips(
    rt_trips: dd.DataFrame,
    analysis_date: str,
    group_cols: list = ["trip_instance_key"]) -> dd.DataFrame:
    """
    Attach shape_array_key to RT trips (vehicle positions).

    Scheduled trips for `analysis_date` are imported with only the
    `group_cols` plus shape_array_key, then left-merged onto `rt_trips`
    so every RT row is kept. Other scheduled trip columns are
    deliberately not pulled here; add them after aggregation is done.
    """
    scheduled_crosswalk = helpers.import_scheduled_trips(
        analysis_date,
        columns=group_cols + ["shape_array_key"],
        get_pandas=True,
    )

    return dd.merge(
        rt_trips,
        scheduled_crosswalk,
        on=group_cols,
        how="left",
    )

In [None]:
vp_with_sched = (
        merge_rt_scheduled_trips(
            vp_results, 
            analysis_date, 
            group_cols = ["trip_instance_key"]
        ).sort_values("vp_idx")
        .reset_index(drop=True)
    )

In [None]:
vp_with_sched.head()

### C3
https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/C3_trip_route_speed.py

In [None]:
vp_with_sched.sample(2)

In [None]:
# Turn the x/y columns into point geometry (WGS84), project to the
# analysis CRS, then drop the raw coordinates.
c3 = gpd.GeoDataFrame(
    vp_with_sched,
    geometry=gpd.points_from_xy(vp_with_sched.x, vp_with_sched.y, crs=WGS84),
)
c3 = c3.to_crs(PROJECT_CRS).drop(columns=["x", "y"])

In [None]:
# c3.sample(10).drop(columns = ['location_timestamp_local']).explore('shape_array_key', marker_kwds= {'radius':25})

In [None]:
shapes_list = c3.shape_array_key.unique().tolist()

In [None]:
shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key","geometry"],
        filters = [[("shape_array_key", "in", shapes_list)]],
        get_pandas = True,
        crs = PROJECT_CRS
    )

In [None]:
# Pair each vp point with its shape line. Both frames have a `geometry`
# column, so the merge suffixes them; give them explicit names and make
# the vp point the active geometry.
c3_m = pd.merge(c3, shapes, on="shape_array_key", how="inner")
c3_m = c3_m.rename(
    columns={"geometry_x": "vp_geometry", "geometry_y": "shape_geometry"}
)
c3_m = c3_m.set_geometry("vp_geometry")

In [None]:
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
        c3_m,
        "shape_geometry",
        "vp_geometry",
    )


In [None]:
shape_meters_geoseries.head()

In [None]:
c3_m["shape_meters"] = shape_meters_geoseries

In [None]:
def distance_and_seconds_elapsed(
    df: pd.DataFrame, 
    group_cols: list
) -> pd.DataFrame:
    """
    If every trip has 3 vp, we want the change in time and distance
    between 1st and 2nd, 2nd and 3rd.
    Then, sum up the change in time and change by trip.

    Parameters
    ----------
    df
        One row per sampled vp with `shape_meters`,
        `location_timestamp_local`, `vp_idx` and the `group_cols`.
        Rows are matched back by index label, so the index should be unique.
    group_cols
        Columns identifying a trip.

    Returns
    -------
    One row per group with `change_meters`, `change_sec` and `speed_mph`.
    A group whose summed `change_sec` is 0 (e.g. a single vp) gets a
    NaN or inf speed, because the division is not guarded.
    """
    dist_col = "shape_meters"
    time_col = "location_timestamp_local"
    sort_cols = group_cols + ["vp_idx"]

    # Sort and group once; shift(1) within each group gives the previous
    # vp's value, aligned back to `df` by index label.
    ordered_by_trip = (df.sort_values(sort_cols)
                       .groupby(group_cols,
                                observed=True, group_keys=False)
                      )

    # Get the change in distance, time for each row
    df = df.assign(
        prior_dist = ordered_by_trip[dist_col].shift(1),
        prior_time = ordered_by_trip[time_col].shift(1),
    )

    # distance should be positive, but sometimes it's not, 
    # so use absolute value
    df = df.assign(
        change_meters = (df[dist_col] - df.prior_dist).abs(),
        change_sec = (df[time_col] - df.prior_time).divide(
                       np.timedelta64(1, 's'))
    )

    # For a trip, sum up the total change in distance and time 
    # Easier to calculate the speed this way, than
    # taking a weighted average later
    df2 = (df.groupby(group_cols, 
                     observed=True, group_keys=False)
           .agg({"change_meters": "sum", 
                 "change_sec": "sum"})
           .reset_index()
          )

    df2 = df2.assign(
        speed_mph = (df2.change_meters.divide(df2.change_sec) * 
                     MPH_PER_MPS)
    )

    return df2


In [None]:
speed = distance_and_seconds_elapsed(
        c3_m,
        group_cols = ["gtfs_dataset_key", "trip_instance_key"]
    )
    