# Average speeds across entire trip

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
import pandas as pd

from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import (analysis_date, SEGMENT_GCS, 
                                              CONFIG_PATH, PROJECT_CRS)

In [2]:
# Fetch the parameters stored under the "stop_segments" key of CONFIG_PATH
# (via the project helper).
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

# File name = the config's "stage2" value + "_" + the analysis date.
INPUT_FILE = f'{STOP_SEG_DICT["stage2"]}_{analysis_date}'
# Bare expression so the notebook displays the resulting name.
INPUT_FILE

'vp_stop_segment_2023-05-17'

In [3]:
#test_key = "00accf770009aafd5dc103ff2eeddb37"
#test_trip = "t_1995375_b_33395_tn_0"

In [10]:
# Open the parquet dataset stored under the vp_sjoin/ prefix as a dask
# dataframe (no column subset, no row filter).
ddf = dd.read_parquet(
    f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE}", 
)

In [12]:
# Total row count of the dask dataframe (displayed as the cell output).
len(ddf.index)

27188771

In [None]:
# Load just the operator key column, then collect its distinct values
# into a plain Python list.
operator_key_ddf = dd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}",
    columns=["gtfs_dataset_key"],
)
operators = operator_key_ddf.gtfs_dataset_key.unique().compute().tolist()

In [None]:
# Work with only the first two operators to keep the exploration small.
subset_operators = operators[:2]
# Bare expression so the notebook displays the chosen operators.
subset_operators

In [None]:
# Re-read the data restricted to the subset of operators.
# The following cells derive time-of-day columns from
# `location_timestamp_local` and group by operator + trip, so those
# columns have to be loaded together with `vp_idx`; reading `vp_idx`
# alone makes the later cells fail on the missing columns.
ddf = dd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}", 
    filters = [[("gtfs_dataset_key", "in", subset_operators)]],
    columns = ["gtfs_dataset_key", "trip_id", 
               "location_timestamp_local", "vp_idx"]
)

In [None]:
# Columns that together identify one trip of one operator.
trip_cols = ["gtfs_dataset_key", "trip_id"]
# Time-of-day columns used, together with trip_cols, as grouping keys below.
hour_min_cols = ["hour", "minute"]

## Pings per minute for service hours

In [None]:
ddf = ddf.repartition(npartitions=5)

# Derive both time-of-day parts from the local timestamp. The grouping in
# the next cells uses `hour_min_cols` (hour AND minute), so `hour` has to
# be created here as well as `minute`.
ddf = ddf.assign(
    hour = ddf.location_timestamp_local.dt.hour,
    minute = ddf.location_timestamp_local.dt.minute,
)

In [None]:
# Inspect the column dtypes after adding the derived column(s).
ddf.dtypes

In [None]:
# Number of non-null timestamps per trip and (hour, minute) bucket.
pings_by_minute = ddf.groupby(
    trip_cols + hour_min_cols, observed=True
)["location_timestamp_local"].count()

# Flatten the group keys back into columns and give the count a clear name.
num_vp_pings = (
    pings_by_minute
    .dropna()
    .reset_index()
    .rename(columns={"location_timestamp_local": "num_pings"})
)

In [None]:
# Flag each (trip, hour, minute) bucket that holds 2 or more pings
# (1 = yes, 0 = no).
# A vectorized comparison replaces the row-wise `apply`: it avoids calling
# a Python lambda per row and needs no hand-written `meta`. The original
# lambda returned Python ints while `meta` declared int8; int64 matches
# what was actually produced and leaves headroom for the per-trip sum.
num_vp_pings = num_vp_pings.assign(
    atleast2 = (num_vp_pings.num_pings >= 2).astype("int64")
)

In [None]:
# Per trip: how many (hour, minute) buckets exist, and how many of those
# buckets were flagged in `atleast2`.
per_trip_summary = num_vp_pings.groupby(trip_cols).agg(
    {"hour": "size", "atleast2": "sum"}
)

# The bucket count was taken on `hour`; rename it to what it represents.
vp_pings = (
    per_trip_summary
    .dropna()
    .reset_index()
    .rename(columns={"hour": "trip_min_elapsed"})
)

In [None]:
# Persist the per-trip summary so later cells reuse the computed result.
vp_pings = vp_pings.persist()

In [None]:
# Materialize the per-trip summary and display it.
vp_pings.compute()

## Triangulate vp for lengths

In [None]:
from dask import delayed, compute

# Wrap the pandas parquet read in `delayed`, so the read (subset operators,
# four columns) only runs when `compute` is called later.
df_orig = delayed(pd.read_parquet)(
    f"{SEGMENT_GCS}{INPUT_FILE}", 
    filters = [[("gtfs_dataset_key", "in", subset_operators)]],
    columns = ["gtfs_dataset_key", "trip_id", "vp_idx", 
               "location_timestamp_local"]
)

In [None]:
# Still lazy: collect every vp_idx of a trip into one list per trip.
# NOTE(review): list order follows the row order of the file; presumably
# rows are already in time order within a trip - confirm.
vp_idx_lists = df_orig.groupby(trip_cols, observed=True).agg({"vp_idx": list})
df_orig2 = vp_idx_lists.reset_index()

In [None]:
# `compute` returns a tuple with one result per argument; unpack the single
# result, which triggers the delayed read + groupby.
(df_orig3,) = compute(df_orig2)

In [None]:
import numpy as np

def count_vp_and_get_every_10_min(
    my_list: list,
    step: int = 30,
    min_fallback: int = 15,
) -> list:
    """
    Thin out a trip's list of vp_idx values by keeping every `step`-th one.

    If striding leaves fewer than 2 values (i.e. the input has at most
    `step` entries), the first `min_fallback` values are returned instead,
    so short trips still contribute more than a single point.

    Args:
        my_list: vp_idx values for one trip, in the order they should be
            sampled.
        step: stride between kept values. Defaults to 30, the value that
            was previously hard-coded.
            NOTE(review): the function name suggests 30 pings is about
            10 minutes (one ping every ~20 seconds) - confirm.
        min_fallback: number of leading values to return when striding
            yields fewer than 2 values. Defaults to 15, the value that
            was previously hard-coded.

    Returns:
        A list of the selected values. Elements are numpy scalars because
        the selection is done on a numpy array.

    Raises:
        ValueError: if `step` is 0 (numpy rejects a zero slice step).
    """
    vp_idx_arr = np.asarray(my_list)
    subset_arr = vp_idx_arr[::step]

    # Too few points after striding: use the head of the trip instead.
    if len(subset_arr) < 2:
        subset_arr = vp_idx_arr[:min_fallback]

    return list(subset_arr)

In [None]:
# Thin each trip's vp_idx list. Applying the function to the `vp_idx`
# column directly avoids building a row Series for every trip, and - unlike
# a whole-frame `apply(axis=1)` - still yields a Series when the frame has
# no rows.
df_orig3 = df_orig3.assign(
    vp_idx2 = df_orig3.vp_idx.apply(count_vp_and_get_every_10_min)
)

In [None]:
# Flatten the per-trip lists into one flat list of vp_idx values to keep.
keep_subset_vp = list(df_orig3.vp_idx2.explode())

In [None]:
# Read only the thinned-out vehicle positions for the subset operators.
# Note: because of the trailing `.compute()`, `ddf` is a materialized
# (in-memory) dataframe here, despite its name.
ddf = dd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}", 
    filters = [[("gtfs_dataset_key", "in", subset_operators), 
               ("vp_idx", "in", keep_subset_vp)]],
    columns = ["gtfs_dataset_key", "trip_id", 
               "location_timestamp_local", 
               "x", "y",
               "vp_idx"]
).compute()

In [None]:
# Extra scheduled-trip column to carry through the crosswalk.
trip_grouping_cols = ["shape_array_key"]
# Build the crosswalk for the analysis date, requesting feed_key, trip_id
# and shape_array_key.
# NOTE(review): the merge that follows joins on gtfs_dataset_key, so this
# helper presumably adds that column to its output - confirm.
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date, 
        ["feed_key", "trip_id"] + trip_grouping_cols
    )

In [None]:
# Attach the crosswalk columns to each vehicle position; the inner join
# keeps only positions whose (operator, trip) appears in the crosswalk.
# Exact duplicate rows produced by the join are dropped.
vp_with_shape = dd.merge(
    ddf,
    crosswalk,
    on = ["gtfs_dataset_key", "trip_id"],
    how = "inner"
).drop_duplicates()

In [None]:
# Distinct shape keys that remain after the merge, as a plain list.
subset_shapes = vp_with_shape.shape_array_key.unique().tolist()

In [None]:
# Import the scheduled shapes for the analysis date, limited to the shape
# keys found above; only the key and geometry columns are requested, with
# PROJECT_CRS passed as the CRS.
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns = ["shape_array_key", "geometry"],
    filters = [[("shape_array_key", "in", subset_shapes)]],
    get_pandas = True,
    crs = PROJECT_CRS
)

In [None]:
from segment_speed_utils import wrangle_shapes
import dask_geopandas as dg
import geopandas as gpd

In [None]:
# Wrap the in-memory dataframe in a single-partition dask dataframe.
triangulated_vp_with_shape = dd.from_pandas(vp_with_shape, npartitions=1)


In [None]:
# Build point geometries from the x / y columns.
triangulated_vp_with_shape["geometry"] = dg.points_from_xy(
    triangulated_vp_with_shape, "x", "y")

# Refer to the geometry column by name
# The points are declared as EPSG:4326 and then reprojected to PROJECT_CRS.
vp_gddf = dg.from_dask_dataframe(
    triangulated_vp_with_shape, 
    geometry="geometry"
).set_crs("EPSG:4326").to_crs(PROJECT_CRS)

In [None]:
# Attach each position's shape geometry. Both inputs have a `geometry`
# column, so the result exposes them as geometry_x (vehicle position,
# left side) and geometry_y (shape, right side), as used in the next cell.
vp_gddf2 = dd.merge(
    vp_gddf,
    shapes,
    on = "shape_array_key",
    how = "inner"
)

In [None]:
# Materialize the merged frame once and take both geometry columns from
# that single result, instead of running the same dask graph twice (once
# per column). Both series then share exactly the same index.
vp_shape_computed = vp_gddf2.compute()
vp_geoseries = gpd.GeoSeries(vp_shape_computed.geometry_x)
shape_geoseries = gpd.GeoSeries(vp_shape_computed.geometry_y)

In [None]:
# Buffer every vehicle position by 25 CRS units
# (presumably meters in PROJECT_CRS - confirm).
vp_geoseries_buff = vp_geoseries.buffer(25)

In [None]:
# Element-wise intersection of each buffered position with its shape
# geometry.
intersecting_line = vp_geoseries_buff.intersection(shape_geoseries)

In [None]:
# Materialize the merged frame and store the intersection result next to
# it (the assignment aligns on the index).
gdf = vp_gddf2.compute()
gdf["area_of_intersection"] = intersecting_line

In [None]:
# Show which rows have an empty intersection (buffer did not touch the shape).
gdf.area_of_intersection.is_empty

In [None]:
# Make the intersection the active geometry and tag it with the project CRS.
gdf = gdf.set_geometry("area_of_intersection").set_crs(PROJECT_CRS)

# Keep only rows whose (now active) intersection geometry is non-empty.
has_intersection = ~gdf.geometry.is_empty
gdf2 = gdf[has_intersection]

In [None]:
# Sample operator key / trip id used for spot checks.
test_key = "00accf770009aafd5dc103ff2eeddb37"
test_trip = "t_1995375_b_33395_tn_0"
# Filter with the variable instead of repeating the trip id literal, so
# changing `test_trip` changes what is displayed.
gdf2[gdf2.trip_id == test_trip]

In [None]:
#points_plot = vp_gddf[
#    vp_gddf.trip_id == "t_1995375_b_33395_tn_0"].compute()[
#    ["vp_idx", "shape_array_key", "geometry"]]


# Read every vehicle position of the one sample trip (not just the
# thinned-out subset) for plotting, straight into pandas.
points_plot = pd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}", 
    filters = [[("gtfs_dataset_key", "in", subset_operators), 
               ("trip_id", "==", "t_1995375_b_33395_tn_0")]],
    columns = ["gtfs_dataset_key", "trip_id", 
               #"location_timestamp_local", 
               "x", "y",
               "vp_idx"]
)#.compute()

In [None]:
# Display the shape key of the first crosswalk row for the sample trip.
crosswalk[crosswalk.trip_id=="t_1995375_b_33395_tn_0"].shape_array_key.iloc[0]

In [None]:
# Build point geometries from x / y, declared as EPSG:4326.
points_plot["geometry"] = gpd.points_from_xy(points_plot.x, points_plot.y, 
                                             crs="EPSG:4326")

In [None]:
# Mark `geometry` as the active geometry column and reproject to PROJECT_CRS.
points_plot = points_plot.set_geometry("geometry").to_crs(PROJECT_CRS)

In [None]:
# `points_plot` is read without a shape_array_key column, so the shape key
# is looked up in the crosswalk via the trip being plotted (first matching
# row), then used to select that trip's shape.
plot_shape_key = crosswalk[
    crosswalk.trip_id == points_plot.trip_id.iloc[0]
].shape_array_key.iloc[0]
shape_layer = shapes[shapes.shape_array_key == plot_shape_key]

In [None]:
import folium

In [None]:
# Interactive map: vehicle positions colored by vp_idx, with the shape
# drawn on the same map in red, plus a layer toggle.
m = points_plot.explore("vp_idx", tiles = "CartoDB Positron", )
m = shape_layer.explore(m=m, color="red", name="shape")
folium.LayerControl().add_to(m)
# Bare expression so the notebook renders the map.
m

In [None]:
# Attach shape geometry to the sample rows.
# NOTE(review): `sample` is not defined in any cell visible here -
# presumably left over from an earlier session; confirm where it comes from.
sample2 = pd.merge(
    sample,
    shapes,
    on = "shape_array_key",
    how = "inner"
)

In [None]:
# Geometry from the right-hand (shapes) side of the merge, first row.
one_shape_geom = sample2.geometry_y.iloc[0]

In [None]:
# NOTE(review): `unary_union` is an attribute of GeoSeries / GeoDataFrame;
# if `one_shape_geom` is a single shapely geometry this presumably raises
# AttributeError - confirm.
one_shape_geom.unary_union

In [None]:
import shapely

In [None]:
# Linear-reference each point (geometry_x) against its shape (geometry_y):
# distance along the shape to the location nearest the point.
# `project` is called on the shape geometry itself; a single shapely
# geometry has no `.unary_union` attribute (that belongs to GeoSeries /
# GeoDataFrame), so the previous chained call could not run.
# NOTE(review): the result is in the units of the geometries' CRS -
# presumably meters, per the column name; confirm.
sample2 = sample2.assign(
    shape_meters = sample2.apply(
        lambda x: x.geometry_y.project(x.geometry_x), 
        axis=1)
)

In [None]:
# Map a single shape, selected by a hard-coded shape key.
shapes[shapes.shape_array_key=="70f010e0dba18191937ed4b5bea42e8a"].explore(
    tiles = "CartoDB Positron")

In [None]:
# Geometry of that same hard-coded shape (first matching row).
one_shape = shapes[
    shapes.shape_array_key=="70f010e0dba18191937ed4b5bea42e8a"].geometry.iloc[0]

In [None]:
# Try the project helper on the single shape with an empty list as the
# second argument (presumably an empty-input check - confirm).
test = wrangle_shapes.project_list_of_coords(one_shape, [], use_shapely_coords=True)

In [None]:
# Display the helper's result.
test

In [None]:
# Run the project helper that linear-references vehicle positions against
# segments, passing whole shapes as the "segments" by identifying them
# with shape_array_key.
df2 = wrangle_shapes.linear_reference_vp_against_segment(
    triangulated_vp_with_shape,
    shapes,
    segment_identifier_cols = ["shape_array_key"]
)

In [None]:
# Materialize and display the result.
df2.compute()