# Average speeds across entire trip

In [1]:
#import os
#os.environ['USE_PYGEOS'] = '0'
# turning this off makes to_crs really slow

import dask.dataframe as dd
import dask_geopandas as dg
import folium
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely

from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes
from segment_speed_utils.project_vars import (SEGMENT_GCS,
                                              CONFIG_PATH, PROJECT_CRS
                                             )
analysis_date = "2023-05-17"


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
# Load the vp subset parquet for analysis_date from the trip_summary
# folder under SEGMENT_GCS.
df = pd.read_parquet(f"{SEGMENT_GCS}trip_summary/vp_subset_{analysis_date}.parquet")

In [3]:
# Only import the shapes present in the vp subset,
# in case there are fewer shapes to grab
shapes_list = df.shape_array_key.unique().tolist()

# Import the key and geometry of those shapes, in PROJECT_CRS.
# get_pandas = True presumably returns an in-memory (non-dask) frame
# -- TODO confirm against helpers.import_scheduled_shapes.
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns = ["shape_array_key","geometry"],
    filters = [[("shape_array_key", "in", shapes_list)]],
    get_pandas = True,
    crs = PROJECT_CRS
)

In [4]:
# Linear reference the vp against the shapes, matching on shape_array_key.
# .compute() materializes the result (the helper presumably returns a
# lazy dask object -- TODO confirm).
linear_ref = wrangle_shapes.linear_reference_vp_against_segment(
    df,
    shapes,
    segment_identifier_cols = ["shape_array_key"]
).compute()

# Cache the result in a local parquet file so it can be reloaded
# without recomputing.
linear_ref.to_parquet("test.parquet")



In [None]:
# Reload the linear referencing results from the local test.parquet file.
linear_ref = pd.read_parquet("test.parquet")

In [7]:
def distance_and_seconds_elapsed(
    df: pd.DataFrame, 
    group_cols: list
) -> pd.DataFrame:
    """
    Sum up the distance traveled and the seconds elapsed between
    consecutive vp within each group, and derive an average speed.

    If every trip has 3 vp, we want the change in time and distance
    between 1st and 2nd, 2nd and 3rd.
    Then, sum up the change in time and change in distance by trip.

    Args:
        df: one row per vp. Must contain `group_cols`, `vp_idx`,
            `shape_meters` and `location_timestamp_local`.
        group_cols: columns identifying one group
            (e.g. ["gtfs_dataset_key", "trip_id"]).

    Returns:
        One row per group with `group_cols`, `change_meters`,
        `change_sec` and `speed_mph`.

    Note:
        `rt_utils` must already be imported when this is called.
        Groups whose summed `change_sec` is 0 get an inf or NaN speed.
    """
    dist_col = "shape_meters"
    time_col = "location_timestamp_local"
    sort_cols = group_cols + ["vp_idx"]

    # Sort once so that, within each group, rows follow vp_idx order.
    # Keeping the frame sorted means the shifted columns share its index,
    # so no re-alignment is needed when assigning them.
    df = df.sort_values(sort_cols)
    grouped = df.groupby(group_cols, observed=True, group_keys=False)

    # groupby().shift(1) gives each row the previous row's value within
    # its group; the first row of every group gets NaN / NaT.
    df = df.assign(
        prior_dist = grouped[dist_col].shift(1),
        prior_time = grouped[time_col].shift(1),
    )

    df = df.assign(
        change_meters = df[dist_col] - df.prior_dist,
        # dividing a timedelta by 1 second converts it to float seconds
        change_sec = (df[time_col] - df.prior_time).divide(
                       np.timedelta64(1, 's'))
    )

    # "sum" skips the missing value on each group's first row
    df2 = (df.groupby(group_cols, observed=True, group_keys=False)
           .agg({"change_meters": "sum", 
                 "change_sec": "sum"})
           .reset_index()
          )

    # meters per second -> miles per hour
    df2 = df2.assign(
        speed_mph = (df2.change_meters.divide(df2.change_sec) * 
                     rt_utils.MPH_PER_MPS)
    )

    return df2

In [8]:
# rt_utils supplies MPH_PER_MPS, which distance_and_seconds_elapsed
# reads when it is called.
from shared_utils import rt_utils

# Total distance, total seconds and average speed for each
# operator-trip combination.
speed = distance_and_seconds_elapsed(
    linear_ref, 
    group_cols = ["gtfs_dataset_key", "trip_id"]
)

In [9]:
# (rows, columns) of the trip-level speed table
speed.shape

(68556, 5)

In [10]:
# How many trips have an average speed of 70 mph or more
speed[speed.speed_mph>=70].shape

(20, 5)

In [13]:
# How many trips have an average speed of 2 mph or less
speed[speed.speed_mph<=2].shape

(4378, 5)

In [None]:
def aggregate_by_operator_route_time_of_day():
    """
    Placeholder for a function that has not been written yet.

    NOTE(review): going by the name, this is presumably meant to
    aggregate trip speeds by operator, route and time of day --
    confirm the intended inputs and outputs before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError(
        "aggregate_by_operator_route_time_of_day is not implemented yet"
    )

In [None]:
# Identifiers for spot-checking a single case. Only test_shape is active;
# it is used to filter on shape_array_key in the cells that follow.
#test_key = "00accf770009aafd5dc103ff2eeddb37"
#test_trip = "t_1995375_b_33395_tn_0"
test_shape = "70f010e0dba18191937ed4b5bea42e8a"

This trip has a lot of vp that end up not being joined to any segment.
Including those vp that are far away from the shape means the interpolation results show the same thing: essentially, all those points fall closest to one end of the shape, so the difference in `shape_meters` between them is zero.

This is a compelling reason to add the % of segments touched in the sjoin results. Before, we used a time cutoff because it's easier to implement.

At least for calculating trip average speeds, a trip does need to touch at least 50% of its segments (or even 70% of segments, as recommended in the notebook), so that entire-trip averages are only calculated for trips that have enough vp.

The con of using % of segments is that it becomes even more crucial that segments are cut correctly. If we miss a segment (which we currently might), some vp are not joined, and we may throw out too many trips because they fail the % of segments threshold.

For now, let's take the sjoin results and use a couple of points to triangulate the distance. Make an array, and pick either one point every 10 minutes or at least 3 points to calculate the distance.

In [None]:
# Build the vp table for the single test shape.
# NOTE(review): A2, USABLE_FILE, SJOIN_FILE, SEGMENT_IDENTIFIER_COLS and
# GROUPING_COL are not defined in the visible cells -- confirm where they
# are imported / set before running this.
ddf = A2.merge_usable_vp_with_sjoin_vpidx(
    [test_shape],
    USABLE_FILE,
    SJOIN_FILE,
    SEGMENT_IDENTIFIER_COLS,
    GROUPING_COL
)

In [None]:
# Materialize the result in memory (ddf is presumably a lazy dask
# object up to this point -- TODO confirm).
ddf = ddf.compute()

In [None]:
from shared_utils import geography_utils

# Add point geometry built from the "x" and "y" columns.
ddf = geography_utils.create_point_geometry(ddf, "x", "y")

In [None]:
# Crosswalk for analysis_date carrying the listed scheduled trip columns
# (including shape_id). Going by the helper's name it also attaches the
# RT key -- TODO confirm the exact output columns.
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date, 
        ["feed_key", "trip_id", GROUPING_COL, "shape_id"] 
    )

In [None]:
# Import only the test shape's key and geometry, in PROJECT_CRS.
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns = ["shape_array_key", "geometry"],
    filters = [[("shape_array_key", "in", [test_shape])]],
    get_pandas = True,
    crs = PROJECT_CRS
)

In [None]:
# Attach the crosswalk columns to the shape geometry; the inner merge
# keeps only shapes that appear in the crosswalk.
shapes2 = pd.merge(
    shapes,
    crosswalk,
    on = "shape_array_key",
    how = "inner"
)

In [None]:
# Reproject the vp points to PROJECT_CRS and drop the timestamp / date
# columns (presumably because datetime columns get in the way of
# mapping with explore() -- TODO confirm).
ddf2 = ddf.to_crs(PROJECT_CRS).drop(
    columns = ["location_timestamp", "location_timestamp_local", 
               "activity_date"])

In [None]:
# Interactive map: vp points colored by trip_id, with the shape drawn
# on top in yellow as a separate layer.
m = ddf2.explore("trip_id", tiles = "CartoDB Positron")
m = shapes2.explore(m=m, color="yellow", name="shape")
# Layer control lets the layers be toggled on the map
folium.LayerControl().add_to(m)
m

## Triangulate vp based on sjoin results

In [None]:
def list_of_vp_by_trip(
    df: pd.DataFrame, 
    group_cols: list = ["gtfs_dataset_key", "trip_id"]
) -> pd.DataFrame:
    """
    Collect the vp_idx values of each group into a list.

    Args:
        df: one row per vp. Must contain `group_cols` and `vp_idx`.
        group_cols: columns identifying one group. The default list
            is only read, never modified.

    Returns:
        One row per group, with `group_cols` and a `vp_idx` column
        holding the list of vp_idx values for that group.
    """
    # Group by the argument that was passed in, rather than relying on
    # a notebook-level variable being defined.
    df2 = (df.groupby(group_cols, observed=True)
           .agg({"vp_idx": list})
           .reset_index()
          )
    
    return df2

In [None]:
# One table of vp_idx lists per input frame.
# NOTE(review): subset_vp_ddfs is not defined in the visible cells, and
# trip_cols is only defined in a later cell -- confirm the run order.
by_trip_ddfs = [list_of_vp_by_trip(df, trip_cols) for df in subset_vp_ddfs]

In [None]:
# Work with just the first table
one = by_trip_ddfs[0]

In [None]:
# Materialize it; [0] takes the first (only) element of what compute()
# returns.
# NOTE(review): compute is not imported in the visible cells
# (presumably dask.compute) -- confirm.
trip_df = compute(one)[0]

In [None]:
def count_vp_and_get_every_10_min(
    my_list: list,
    every_n: int = 30,
    min_points: int = 3,
    fallback_n: int = 15
) -> list:
    """
    Thin out a list of vp_idx values.

    Keep every `every_n`-th value, starting with the first. If that
    leaves fewer than `min_points` values, keep the first `fallback_n`
    values of the original list instead.

    NOTE(review): the function name refers to 10 minutes; the default
    of 30 presumably assumes roughly one vp every 20 seconds -- confirm.

    Args:
        my_list: vp_idx values for one trip.
        every_n: step between the values that are kept.
        min_points: minimum number of values the stepped selection
            must contain to be used.
        fallback_n: how many leading values to keep when the stepped
            selection is too short.

    Returns:
        The selected values as a list (elements are numpy scalars).
    """
    vp_idx_arr = np.asarray(my_list)
    subset_arr = vp_idx_arr[::every_n]
    
    # Too few points after stepping: fall back to the start of the list
    if len(subset_arr) < min_points:
        subset_arr = vp_idx_arr[:fallback_n]
    
    return list(subset_arr)

In [None]:
# Thin out each trip's list of vp_idx values.
# trip_df is the result of compute(), so it is expected to be a pandas
# DataFrame. pandas apply forwards extra keyword arguments (such as the
# dask-only `meta`) to the applied function, which fails for a function
# that does not accept them, so apply directly on the column instead.
trip_df = trip_df.assign(
    vp_idx2 = trip_df.vp_idx.apply(count_vp_and_get_every_10_min)
)

In [None]:
# Flatten the per-trip lists into one Series with one vp_idx per row
keep_subset_vp = trip_df.vp_idx2.explode()

In [None]:
# Keep only the selected vp, and only the columns needed downstream
ddf_subset = ddf[ddf.vp_idx.isin(keep_subset_vp)][
    ["gtfs_dataset_key", "trip_id",
     "location_timestamp_local",
     "x", "y", "vp_idx"]]

In [None]:
# Rebuild the crosswalk, this time without shape_id.
# NOTE(review): GROUPING_COL is not defined in the visible cells.
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date, 
        ["feed_key", "trip_id", GROUPING_COL]
    )

In [None]:
# Lazily merge the vp subset with the crosswalk on operator + trip,
# then drop duplicate rows.
# NOTE(review): delayed is not imported in the visible cells
# (presumably dask.delayed) -- confirm. The merge also assumes the
# crosswalk carries a gtfs_dataset_key column.
subset_vp_shape = delayed(dd.merge)(
    ddf_subset,
    crosswalk,
    on = ["gtfs_dataset_key", "trip_id"],
    how = "inner"
).drop_duplicates()

In [None]:
# Unique shape_array_key values among the merged vp; persist() triggers
# the computation but leaves the result wrapped as a lazy object.
subset_shapes = subset_vp_shape.shape_array_key.unique().persist()

In [None]:
subset_shapes

In [None]:
# Import key and geometry for the shapes used by the vp subset.
# NOTE(review): subset_shapes is a persisted (still lazy) object here,
# while the earlier call in this notebook passed a plain list from
# .tolist() -- confirm the filter accepts it, or compute it to a list.
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns = ["shape_array_key", "geometry"],
    filters = [[("shape_array_key", "in", subset_shapes)]],
    get_pandas = True,
    crs = PROJECT_CRS
)

In [None]:
# Unique operator keys (gtfs_dataset_key) present in the merged vp
RT_OPERATORS = subset_vp_shape.gtfs_dataset_key.unique().compute()

In [None]:
# Pick the first operator to test with
test_operator = RT_OPERATORS[0]

In [None]:
# Keep only the vp that belong to the test operator
subset_vp_operator = subset_vp_shape[
    subset_vp_shape.gtfs_dataset_key==test_operator]

In [None]:
# Lazily linear reference the test operator's vp against the shapes;
# nothing runs until the delayed object is computed.
# NOTE(review): delayed and GROUPING_COL are not defined in the
# visible cells.
linear_ref_operator = delayed(
    wrangle_shapes.linear_reference_vp_against_segment)(
    subset_vp_operator,
    shapes,
    segment_identifier_cols = [GROUPING_COL]
)

In [None]:
# Same as the per-operator version, but for every vp in the subset.
# This rebinds linear_ref to a lazy delayed object.
linear_ref = delayed(wrangle_shapes.linear_reference_vp_against_segment)(
    subset_vp_shape,
    shapes,
    segment_identifier_cols = [GROUPING_COL]
)

In [None]:
linear_ref

In [None]:
# List of unique operator keys in the input file; only the
# gtfs_dataset_key column is read.
# NOTE(review): INPUT_FILE is not defined in the visible cells.
operators = dd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}", 
    columns = ["gtfs_dataset_key"]
).gtfs_dataset_key.unique().compute().tolist()

In [None]:
# Test on the first two operators only
subset_operators = operators[:2]
subset_operators

In [None]:
# Read the vp for the subset of operators.
# Besides vp_idx, read the columns the following cells rely on: the trip
# identifiers used for grouping and the local timestamp used to derive
# the minute and count pings.
# NOTE(review): column names are taken from how ddf is used further
# down -- confirm they exist in INPUT_FILE.
ddf = dd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}", 
    filters = [[("gtfs_dataset_key", "in", subset_operators)]],
    columns = ["gtfs_dataset_key", "trip_id", 
               "location_timestamp_local", "vp_idx"]
)

In [None]:
# Columns that identify one trip of one operator
trip_cols = ["gtfs_dataset_key", "trip_id"]
# Columns that identify one minute of the day
hour_min_cols = ["hour", "minute"]

## Pings per minute for service hours

In [None]:
ddf = ddf.repartition(npartitions=5)

# Derive both parts of hour_min_cols from the local timestamp, so the
# hour-minute grouping further down has the columns it needs.
ddf = ddf.assign(
    hour = ddf.location_timestamp_local.dt.hour,
    minute = ddf.location_timestamp_local.dt.minute
)

In [None]:
ddf.dtypes

In [None]:
# Number of vp timestamps per trip per hour-minute, stored as num_pings
num_vp_pings = (ddf.groupby(trip_cols + hour_min_cols, observed=True)
                ["location_timestamp_local"]
                .count()
                .dropna()
                .reset_index()
                .rename(columns = {"location_timestamp_local": "num_pings"})
               )

In [None]:
# Flag the minutes that have at least 2 pings (1 = yes, 0 = no).
# A vectorized comparison avoids a row-by-row apply and yields the
# int8 dtype directly.
num_vp_pings = num_vp_pings.assign(
    atleast2 = (num_vp_pings.num_pings >= 2).astype("int8")
)

In [None]:
# Per trip: how many hour-minute rows it has (size of the group, renamed
# to trip_min_elapsed) and how many of those have at least 2 pings
# (sum of the 0/1 atleast2 flag).
vp_pings = (num_vp_pings.groupby(trip_cols)
            .agg({
               "hour": "size",
               "atleast2": "sum"})
            .dropna()
            .reset_index()
           ).rename(columns = {
            "hour": "trip_min_elapsed"})

In [None]:
# Trigger the computation and keep the result around for reuse
vp_pings = vp_pings.persist()

In [None]:
# Materialize and display the per-trip ping summary
vp_pings.compute()