In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import datetime
import geopandas as gpd
import pandas as pd

from shared_utils import dask_utils, utils
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import (SEGMENT_GCS, 
                                              CONFIG_PATH, analysis_date)


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
# Fetch the "stop_segments" parameters from CONFIG_PATH via the project helper.
# The returned object is indexed by string key in the following cell.
STOP_SEG_DICT = helpers.get_parameters(
    CONFIG_PATH,
    "stop_segments",
)

In [3]:
# Alias the stop-segments parameters so the lookups below read generically.
dict_inputs = STOP_SEG_DICT

# File-related entries: "stage2" is used as the input file-name prefix when
# vehicle positions are imported; "stage3" is kept as the export name.
INPUT_FILE_PREFIX = dict_inputs["stage2"]
EXPORT_FILE = dict_inputs["stage3"]

# Column-related entries (presumably column names -- confirm against the config).
SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
GROUPING_COL = dict_inputs["grouping_col"]
TIMESTAMP_COL = dict_inputs["timestamp_col"]

In [4]:
# Distinct shape_array_key values among the projected stops whose
# loop_or_inlining flag equals 1. Only the key column is loaded; the
# filter is handed to the parquet reader.
shape_cases = (
    pd.read_parquet(
        f"{SEGMENT_GCS}stops_projected_{analysis_date}/",
        columns=["shape_array_key"],
        filters=[[("loop_or_inlining", "==", 1)]],
    )["shape_array_key"]
    .unique()
    .tolist()
)
    

In [5]:
# Take the first two shape keys; this list is used as the shape_array_key
# filter when vehicle positions are imported in the next cell.
test_shapes = shape_cases[:2]

In [6]:
# Import the vehicle positions joined to segments, restricted to the
# shapes in test_shapes. The result is evaluated later with .compute().
# Reference: https://docs.dask.org/en/stable/delayed-collections.html
vp_joined_to_segments = helpers.import_vehicle_positions(
    f"{SEGMENT_GCS}vp_sjoin/",
    f"{INPUT_FILE_PREFIX}_{analysis_date}",
    file_type="df",
    filters=[[("shape_array_key", "in", test_shapes)]],
    partitioned=True,
)

In [8]:
# Evaluate vp_joined_to_segments with .compute(); the result is what the
# groupby and pd.merge cells below operate on.
df = vp_joined_to_segments.compute()

In [17]:
# Mean location_timestamp_local for every
# (shape_array_key, stop_sequence, trip_id) combination, stored in a
# column called mean_time with the group keys as regular columns.
mean_time = (
    df.groupby(["shape_array_key", "stop_sequence", "trip_id"])
    .agg(mean_time=("location_timestamp_local", "mean"))
    .reset_index()
)

In [18]:
# Attach each group's mean_time back onto the individual rows
# (inner join on the three grouping keys).
df2 = df.merge(
    mean_time,
    how="inner",
    on=["shape_array_key", "stop_sequence", "trip_id"],
)

In [20]:
# Flag each row by which side of its group's mean timestamp it falls on:
#   group = 0 when location_timestamp_local <= mean_time, otherwise 1.
# Computed as a vectorised comparison instead of a row-wise apply, which
# avoids a Python-level call per row and also yields an (empty) Series when
# df2 has no rows. The negated "<=" keeps the original handling of missing
# timestamps: a comparison involving NaT is False, so those rows get 1.
df2 = df2.assign(
    group = (
        ~(df2.location_timestamp_local <= df2.mean_time)
    ).astype("int64")
)

In [23]:
# Pick one trip to inspect: the trip_id at position 10 of the distinct
# values (in order of first appearance). Raises IndexError when df2 holds
# fewer than 11 distinct trips.
one_trip = df2["trip_id"].unique().tolist()[10]

In [27]:
# Show the selected trip's rows in chronological order, limited to the
# columns needed to compare each timestamp with its group mean.
(
    df2.loc[
        df2.trip_id == one_trip,
        ["stop_sequence", "location_timestamp_local", "mean_time", "group"],
    ]
    .sort_values("location_timestamp_local")
)

Unnamed: 0,stop_sequence,location_timestamp_local,mean_time,group
130,1723,2023-03-15 07:30:21,2023-03-15 07:53:42.142857216,0
131,1723,2023-03-15 08:19:24,2023-03-15 07:53:42.142857216,1
132,1723,2023-03-15 07:31:06,2023-03-15 07:53:42.142857216,0
133,1723,2023-03-15 08:18:49,2023-03-15 07:53:42.142857216,1
134,1723,2023-03-15 07:46:14,2023-03-15 07:53:42.142857216,0
...,...,...,...,...
7602,588,2023-03-15 07:59:12,2023-03-15 07:59:28.333333248,0
7734,816,2023-03-15 08:03:00,2023-03-15 08:03:00.000000000,0
7824,379,2023-03-15 07:56:47,2023-03-15 07:56:46.333333504,1
7825,379,2023-03-15 07:56:32,2023-03-15 07:56:46.333333504,0


In [None]:
stop_segments = gpd.read_parquet(
    f"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet",
    filters = [[("shape_array_key", "==", one_shape)]]
)

In [None]:
one_shape = "085811097c87489feebe8206770b3cd7"
stop_seq = 600
one_trip = "t14A-b1B59-sl4"

In [None]:
# Interactive map of the segments with stop_sequence below 3600, coloured
# by stop_sequence as a categorical variable.
(
    stop_segments
    .loc[stop_segments.stop_sequence < 3600]
    .explore(
        column="stop_sequence",
        tiles="CartoDB Positron",
        categorical=True,
    )
)

In [None]:
# Keep only the rows for the chosen shape, stop sequence and trip.
# NOTE(review): vp_joined_to_segments_special is not assigned in any visible
# cell of this notebook -- confirm where it comes from, otherwise this cell
# fails on a fresh kernel.
# This also rebinds df, replacing the frame computed from
# vp_joined_to_segments earlier in the notebook.
df = vp_joined_to_segments_special[
    (vp_joined_to_segments_special.shape_array_key==one_shape) & 
    (vp_joined_to_segments_special.stop_sequence==stop_seq) & 
    (vp_joined_to_segments_special.trip_id==one_trip)
]

In [None]:
# List the filtered rows' timestamps in ascending order.
(
    df[["trip_id", "location_timestamp_local"]]
    .sort_values(by="location_timestamp_local")
)