# Average speeds across entire trip

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
import pandas as pd

from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import (analysis_date, SEGMENT_GCS, 
                                              CONFIG_PATH, PROJECT_CRS)

In [2]:
# Load the "stop_segments" entry from the project config file.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

# Input name = the config's "stage1" value, an underscore, then the analysis date.
INPUT_FILE = "{}_{}".format(STOP_SEG_DICT["stage1"], analysis_date)
INPUT_FILE

'vp_usable_2023-05-17'

In [3]:
# Read only the operator key column and collect its distinct values as a list.
operators = (
    dd.read_parquet(f"{SEGMENT_GCS}{INPUT_FILE}", columns=["gtfs_dataset_key"])
    ["gtfs_dataset_key"]
    .unique()
    .compute()
    .tolist()
)

In [4]:
# Keep only the first two operator keys; this list is the row filter used
# when the vehicle positions are read into `ddf`.
subset_operators = operators[:2]

In [5]:
# Read the vehicle positions, restricted to rows whose gtfs_dataset_key is
# one of the subset operators.
ddf = dd.read_parquet(
    f"{SEGMENT_GCS}{INPUT_FILE}",
    filters=[[("gtfs_dataset_key", "in", subset_operators)]],
)

In [6]:
# Grouping keys used for the per-trip aggregations in this notebook.
trip_cols = ["gtfs_dataset_key", "trip_id"]
# Additional grouping keys used when counting pings per hour + minute.
hour_min_cols = ["hour", "minute"]

## Pings per minute for service hours

In [None]:
# Rebalance into 5 partitions before the groupby work.
ddf = ddf.repartition(npartitions=5)

# Add the minute-of-hour taken from each ping's local timestamp.
ddf = ddf.assign(minute=ddf["location_timestamp_local"].dt.minute)

In [None]:
# Display the column dtypes of the vehicle position frame.
ddf.dtypes

In [None]:
# One row per trip + hour + minute, with the count of timestamps that fall in
# that minute stored as `num_pings`.
num_vp_pings = (
    ddf.groupby([*trip_cols, *hour_min_cols], observed=True)
    .location_timestamp_local.count()
    .dropna()
    .reset_index()
    .rename(columns={"location_timestamp_local": "num_pings"})
)

In [None]:
# Flag each trip-minute row: 1 when it has at least 2 pings, otherwise 0.
# A vectorized comparison replaces the row-wise `apply(axis=1)`, which ran a
# Python lambda once per row. The cast makes the real dtype int8, which is
# the dtype the previous `meta` declared.
num_vp_pings = num_vp_pings.assign(
    atleast2 = (num_vp_pings.num_pings >= 2).astype("int8")
)

In [None]:
# Per trip: the number of trip-minute rows (renamed to `trip_min_elapsed`) and
# the sum of the `atleast2` flags across those rows.
vp_pings = (
    num_vp_pings.groupby(trip_cols)
    .agg({"hour": "size", "atleast2": "sum"})
    .dropna()
    .reset_index()
    .rename(columns={"hour": "trip_min_elapsed"})
)

In [None]:
# Persist the per-trip summary so later uses can reuse the computed result.
vp_pings = vp_pings.persist()

In [None]:
# Materialize the per-trip summary for display.
vp_pings.compute()

## Triangulate vp for lengths

In [7]:
def get_aggregation(ddf: dd.DataFrame, trip_cols: list) -> dd.DataFrame:
    """
    Select three `vp_idx` values per trip: the first, a middle one, and the last.

    For every group of `trip_cols`, take the minimum `vp_idx`, the mean
    `vp_idx` rounded to 0 decimals, and the maximum `vp_idx`. The three
    results are stacked row-wise.

    Args:
        ddf: dask DataFrame containing the `trip_cols` columns and `vp_idx`.
        trip_cols: columns to group by (one group per trip).

    Returns:
        Lazy dask DataFrame with the `trip_cols` columns plus `vp_idx`.
        Rows are not de-duplicated, so a group whose min, rounded mean and
        max coincide appears more than once.
    """
    def remove_extra(grouped_result: dd.Series) -> dd.DataFrame:
        """Drop null results and move the group keys from the index to columns."""
        return grouped_result.dropna().reset_index()
    
    first_vp = (ddf.groupby(trip_cols, observed=True)
                .vp_idx.min()
                .pipe(remove_extra)
               )
    last_vp = (ddf.groupby(trip_cols, observed=True)
               .vp_idx.max()
               .pipe(remove_extra)
              )
    # NOTE(review): the rounded mean is only guaranteed to be an existing
    # vp_idx of the same trip if a trip's vp_idx values are consecutive --
    # confirm against how vp_idx is assigned upstream.
    middle_vp = (ddf.groupby(trip_cols, observed=True)
                 .vp_idx.mean().round(0)
                 .pipe(remove_extra)
                )
    # Use the public `dd.concat` entry point rather than reaching into the
    # `dd.multi` submodule, which is an implementation detail of dask.
    triangulate_vp = dd.concat(
        [first_vp, middle_vp, last_vp], axis=0
    )
    
    return triangulate_vp

In [8]:
# Materialize the selected vp_idx rows and cast vp_idx to int
# (the rounded mean used for the middle point is presumably float).
df = get_aggregation(ddf, trip_cols).compute().astype({"vp_idx": "int"})

In [9]:
# Keep only the vehicle positions whose vp_idx was selected in `df`,
# then narrow to the columns used downstream.
triangulated_vp2 = dd.merge(ddf, df[["vp_idx"]], on="vp_idx", how="inner")[
    [
        "gtfs_dataset_key",
        "_gtfs_dataset_name",
        "trip_id",
        "location_timestamp_local",
        "x",
        "y",
        "vp_idx",
    ]
]

In [10]:
# Column that ties a trip to its shape.
trip_grouping_cols = ["shape_array_key"]

# Crosswalk requested with feed_key, trip_id and the trip grouping column(s).
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
    analysis_date,
    ["feed_key", "trip_id", *trip_grouping_cols],
)

In [33]:
# Attach the crosswalk columns to each selected vehicle position by
# operator key + trip_id, then drop exact duplicate rows.
triangulated_vp_with_shape = (
    dd.merge(
        triangulated_vp2,
        crosswalk,
        on=["gtfs_dataset_key", "trip_id"],
        how="inner",
    )
    .drop_duplicates()
)

In [12]:
# Distinct shape keys present in the triangulated vehicle positions.
subset_shapes = (
    triangulated_vp_with_shape["shape_array_key"]
    .unique()
    .compute()
    .tolist()
)

In [14]:
# Load scheduled shapes for the analysis date, limited to the shape keys
# found above and to the key + geometry columns, requested in PROJECT_CRS.
# NOTE(review): `get_pandas = True` presumably returns an in-memory
# (geo)pandas frame rather than a dask one -- confirm in helpers.
shapes = helpers.import_scheduled_shapes(
    analysis_date,
    columns = ["shape_array_key", "geometry"],
    filters = [[("shape_array_key", "in", subset_shapes)]],
    get_pandas = True,
    crs = PROJECT_CRS
)

In [40]:
from segment_speed_utils import wrangle_shapes
import dask_geopandas as dg
import geopandas as gpd

In [35]:
# Collapse the frame into a single partition.
triangulated_vp_with_shape = triangulated_vp_with_shape.repartition(npartitions=1)

In [41]:
# Build point geometries from the x / y columns and store them on the frame.
triangulated_vp_with_shape["geometry"] = dg.points_from_xy(
    triangulated_vp_with_shape, "x", "y"
)

# Wrap as a dask-geopandas frame (geometry column referred to by name),
# declare the points as EPSG:4326, then reproject to PROJECT_CRS.
vp_gddf = (
    dg.from_dask_dataframe(triangulated_vp_with_shape, geometry="geometry")
    .set_crs("EPSG:4326")
    .to_crs(PROJECT_CRS)
)

In [55]:
# Pair every vehicle position with its shape. Both inputs carry a `geometry`
# column, so the result holds `geometry_x` (from vp_gddf) and `geometry_y`
# (from shapes).
vp_gddf2 = dd.merge(vp_gddf, shapes, on="shape_array_key", how="inner")

In [59]:
# Pull both geometry columns of the merged frame into memory as GeoSeries:
# geometry_x comes from the left frame (vehicle position points),
# geometry_y from the right frame (shape geometries).
vp_geoseries = gpd.GeoSeries(vp_gddf2.geometry_x.compute())
shape_geoseries = gpd.GeoSeries(vp_gddf2.geometry_y.compute())

In [60]:
# Project each vehicle position onto its paired shape geometry.
# NOTE(review): with get_dask_array=True the helper presumably returns a dask
# array of distances along the shape -- confirm in wrangle_shapes.
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
    shape_geoseries,
    vp_geoseries,
    get_dask_array=True,
)


In [61]:
# Store the result of project_point_geom_onto_linestring as a new column.
vp_gddf2['shape_meters'] = shape_meters_geoseries


In [65]:
# Spot-check the rows for a single trip.
vp_gddf2[vp_gddf2.trip_id == "t_1995375_b_33395_tn_0"].compute()

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,trip_id,location_timestamp_local,x,y,vp_idx,feed_key,shape_array_key,geometry_x,geometry_y,shape_meters
0,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995375_b_33395_tn_0,2023-05-17 10:58:06,-122.73247,38.459064,0,9c93eedde54ab5f9a25ba51b977358b8,70f010e0dba18191937ed4b5bea42e8a,POINT (-238128.395 52644.088),"LINESTRING (-234388.650 40690.714, -234390.234...",3989.879409
1,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995375_b_33395_tn_0,2023-05-17 11:33:14,-122.72462,38.39505,105,9c93eedde54ab5f9a25ba51b977358b8,70f010e0dba18191937ed4b5bea42e8a,POINT (-237648.944 45511.435),"LINESTRING (-234388.650 40690.714, -234390.234...",3989.879409
2,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995375_b_33395_tn_0,2023-05-17 15:19:09,-122.708565,38.3452,210,9c93eedde54ab5f9a25ba51b977358b8,70f010e0dba18191937ed4b5bea42e8a,POINT (-236407.255 39932.111),"LINESTRING (-234388.650 40690.714, -234390.234...",4874.678103


In [48]:
# Map three specific vehicle positions, colored by vp_idx. The timestamp
# column is dropped before calling explore.
(
    vp_gddf[vp_gddf.vp_idx.isin([0, 105, 210])]
    .compute()
    .drop(columns="location_timestamp_local")
    .explore("vp_idx", tiles="CartoDB Positron")
)

In [50]:
# Map the single shape with this shape_array_key.
(
    shapes[shapes.shape_array_key == "70f010e0dba18191937ed4b5bea42e8a"]
    .explore(tiles="CartoDB Positron")
)

In [52]:
# Geometry of the first row that matches this shape_array_key.
one_shape = (
    shapes[shapes.shape_array_key == "70f010e0dba18191937ed4b5bea42e8a"]
    .geometry
    .iloc[0]
)

In [53]:
# List the coordinate tuples of the shape geometry.
list(one_shape.coords)

[(-234388.6499281881, 40690.71433703462),
 (-234390.23394866896, 40172.53155302256),
 (-234397.25296085997, 39844.66767196357),
 (-234404.5964268675, 39471.44006559346),
 (-234404.07114788066, 39258.79586155666),
 (-234450.95749054762, 39259.23472119542),
 (-234708.86659612018, 39261.987792056985),
 (-234786.95902140392, 39264.53718635766),
 (-234832.70883840334, 39271.173603437375),
 (-234869.07290717046, 39282.32576754643),
 (-234910.25843166377, 39302.06679882575),
 (-235020.8206284654, 39379.048921857495),
 (-235102.61311525982, 39435.75355679914),
 (-235141.66197301535, 39456.88076046249),
 (-235186.98641954607, 39472.29252156941),
 (-235188.9579202701, 39467.455410534516),
 (-235193.45766453695, 39468.917826281395),
 (-235225.34911167898, 39477.60907329479),
 (-235216.09595789693, 39526.61117921909),
 (-235210.70656007362, 39553.37031211937),
 (-235209.94204285313, 39580.26093362132),
 (-235214.07678118767, 39606.84601219278),
 (-235226.08271141475, 39636.10146657331),
 (-235235.

In [27]:
# Run the project helper on the same inputs, identifying shapes by
# shape_array_key.
# NOTE(review): presumably this performs the same point-onto-shape projection
# as the manual steps in the cells above -- confirm in wrangle_shapes.
df2 = wrangle_shapes.linear_reference_vp_against_segment(
    triangulated_vp_with_shape,
    shapes,
    segment_identifier_cols = ["shape_array_key"]
)

In [29]:
# Materialize the helper's result for display.
df2.compute()

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,trip_id,location_timestamp_local,vp_idx,feed_key,shape_array_key,shape_meters
0,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995375_b_33395_tn_0,2023-05-17 10:58:06,0,9c93eedde54ab5f9a25ba51b977358b8,70f010e0dba18191937ed4b5bea42e8a,3989.879409
1,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995375_b_33395_tn_0,2023-05-17 11:33:14,105,9c93eedde54ab5f9a25ba51b977358b8,70f010e0dba18191937ed4b5bea42e8a,3989.879409
2,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995375_b_33395_tn_0,2023-05-17 15:19:09,210,9c93eedde54ab5f9a25ba51b977358b8,70f010e0dba18191937ed4b5bea42e8a,4874.678103
3,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995378_b_33395_tn_0,2023-05-17 15:35:13,211,9c93eedde54ab5f9a25ba51b977358b8,b09dd070bdd68ca007a683a522af4c32,5369.104340
4,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_1995378_b_33395_tn_0,2023-05-17 15:49:31,254,9c93eedde54ab5f9a25ba51b977358b8,b09dd070bdd68ca007a683a522af4c32,12736.932143
...,...,...,...,...,...,...,...,...
433,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_5561800_b_80156_tn_0,2023-05-17 08:06:40,31742,9c93eedde54ab5f9a25ba51b977358b8,1c31b8e8f6cfa9b1edeea8f6c5fcddb2,22384.842522
434,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_5561800_b_80156_tn_0,2023-05-17 08:59:37,31893,9c93eedde54ab5f9a25ba51b977358b8,1c31b8e8f6cfa9b1edeea8f6c5fcddb2,44355.356656
435,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_5562081_b_80156_tn_0,2023-05-17 07:31:16,32887,9c93eedde54ab5f9a25ba51b977358b8,0e69f3b447f85898af234663d28cf1e4,0.000000
436,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,t_5562081_b_80156_tn_0,2023-05-17 07:48:35,32939,9c93eedde54ab5f9a25ba51b977358b8,0e69f3b447f85898af234663d28cf1e4,5667.411467
