# Speedmap Segments

* segmentizing is distinct from the nearest neighbor pipeline.
  * segments are cut then left there
  * brought back in only for mapping
  * while we cut segments, we can conveniently create the new proxy stops (which we do need for nearest neighbor), because now there are a couple extra "stops" for longer segments.
  * **todo:** add `stop_sequence1` and `stop_sequence2` that track the float values, and this will enable merging segment geoms for mapping. also need to forward fill of `stop_primary_direction`. save out speedmap stops as an intermediate output.
* nearest neighbor pipeline starting input df resembles `stop_times` combined with `stops` --> `stop_times_with_direction`
   * trip key, shape key
   * stop (starting point is stop_id / stop_sequence)
      * 1,000m segments could use `stop_id` / `stop_sequence1` 
   * stop geometry --> for an actual stop, this is the stop's point geom
      * `stop_sequence1 = 1` (existing stop geom)
      * `stop_sequence1 = 1.5` (origin of segment). `gtfs_segments` cuts from stop to next stop, and `stop_id1` is what is matched against `stop_times`. here, `stop_sequence1` is what we want to grab.

In [1]:
import datetime
import geopandas as gpd
import pandas as pd
import shapely

from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import helpers, neighbor
from shared_utils import rt_dates

from segment_speed_utils.project_vars import GTFS_DATA_DICT, SEGMENT_GCS

#import sys 
#sys.path.append('scripts/')

analysis_date = rt_dates.DATES["mar2024"]

In [2]:
stop_segments = gpd.read_parquet(
    f"{SEGMENT_GCS}segment_options/"
    f"stop_segments_{analysis_date}.parquet",
)

stop_segments = stop_segments.assign(
    segment_length = stop_segments.geometry.length
)

In [3]:
def cut_longer_segments(
    stop_segments: gpd.GeoDataFrame, 
    segment_length: int
) -> gpd.GeoDataFrame:
    """
    """
    gdf = stop_segments.loc[
        stop_segments.segment_length > segment_length
    ]
                
    gdf["segment_geometry"] = gdf.apply(
        lambda x:
        geography_utils.create_segments(x.geometry, int(segment_length)),
        axis=1
    )
    
    gdf2 = geography_utils.explode_segments(
        gdf,
        group_cols = ['trip_instance_key'],
        segment_col = 'segment_geometry'
    )
    
    trip_stop_cols = ["trip_instance_key", "stop_sequence"]

    gdf2 = gdf2.assign(
        segment_sequence2 = gdf2.groupby(trip_stop_cols).cumcount()
    )
    
    # Amend segment_id which has suffix "-1"
    # after we explode, the suffix needs to increase, -1, -2, -3
    gdf2 = gdf2.assign(
        # split off the last hyphen and add new suffix (segment_sequence)
        segment_id = (gdf2.segment_id
               .str.rsplit('-1', n=1, expand=True)[0] +
               "-" + gdf2.segment_sequence2.astype(str)                
              )
    )
    
    # TODO: this might be unnecessarily complicated
    # leave for now, but maybe we can get away with just segment_sequence2
    # although for aggregation, we want to ensure segments have same endpoints
    # if we want to stack it, and maybe segment_sequence2 isn't sufficient?
    # we don't want to stack segment1 with segment1 
    
    # To get a new stop_sequence that is numeric, 
    # would have to calculate cumulative distance in the segment now
    gdf2["seg_length"] = gdf2.geometry.length
    gdf2["prev_seg_length"] = (gdf2.groupby(trip_stop_cols)
                               .seg_length
                               .shift(1)
                              )
    
    gdf2["seg_cumulative"] = (gdf2.groupby(trip_stop_cols)
                              .prev_seg_length
                              .cumsum()
                             )
    
    gdf2["seg_pct"] = gdf2.seg_cumulative.divide(
        gdf2.segment_length).round(2)
    
    keep_cols = stop_segments.columns.tolist()
    
    gdf3 = gdf2.assign(
        stop_sequence1 = (gdf2.stop_sequence + gdf2.seg_pct).fillna(
            gdf2.stop_sequence)
    )[keep_cols + ["seg_length", "stop_sequence1"]] 
    #todo: remove seg_length, leave in temporarily so we can compare lengths

    return gdf3

In [4]:
operator_trip_keys = [
    'fe9adabd35ad48c0bc2cb3f2a8f68376'
]

test_segments = stop_segments.loc[
    stop_segments.trip_instance_key.isin(operator_trip_keys)]

SEGMENT_LENGTH = 1_000
print(test_segments.shape)
test_longer_segments = cut_longer_segments(
    test_segments, SEGMENT_LENGTH)

(45, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [5]:
test_longer_segments[
    (test_longer_segments.trip_instance_key.isin(operator_trip_keys)) & 
    (test_longer_segments.stop_sequence==40)]

Unnamed: 0,trip_instance_key,shape_array_key,stop_id1,stop_sequence,geometry,stop_id2,segment_id,stop_pair,schedule_gtfs_dataset_key,route_id,direction_id,segment_length,seg_length,stop_sequence1
6,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1540,40,"LINESTRING (-158742.944 -120649.717, -158738.2...",1627,1540-1627-0,1540__1627,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1031.062174,1000.0,40.0
7,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1540,40,"LINESTRING (-158220.354 -120984.111, -158220.0...",1627,1540-1627-1,1540__1627,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1031.062174,31.062174,40.97


In [6]:
test_interp_seg = gpd.read_parquet(
    f"{SEGMENT_GCS}segment_options/"
    f"test_interpolated_{analysis_date}.parquet",
    filters = [[("trip_instance_key", "in", operator_trip_keys)]]
)


test_interp_seg[
    (test_interp_seg.next_stop_sequence == 41.00)
    # TODO: have to use float, integer filter will not work
    # will want to double check all the merging downstream will work...
    # round to 2 decimal places?
   ]

Unnamed: 0,trip_instance_key,shape_array_key,stop_id1,stop_sequence,geometry,stop_id2,segment_id,stop_pair,schedule_gtfs_dataset_key,route_id,direction_id,next_stop_sequence,length
45,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1540,40.0,"LINESTRING (-158742.944 -120649.717, -158738.2...",1627,1540-1627-1_0,1540__1627,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,41.0,1000.0
46,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1540,40.969874,"LINESTRING (-158220.354 -120984.111, -158220.0...",1627,1540-1627-1_1,1540__1627,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,41.0,31.062174


## cut for all stop segments

In [7]:
start = datetime.datetime.now()
SEGMENT_LENGTH = 1_000
print(stop_segments.shape)
longer_segments = cut_longer_segments(
    stop_segments, SEGMENT_LENGTH)

end = datetime.datetime.now()
print(f"execution: {end - start}")

(2981612, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


execution: 0:02:44.179563


In [8]:
longer_segments.head()

Unnamed: 0,trip_instance_key,shape_array_key,stop_id1,stop_sequence,geometry,stop_id2,segment_id,stop_pair,schedule_gtfs_dataset_key,route_id,direction_id,segment_length,seg_length,stop_sequence1
0,0000dbb743b258f707687f2dc14ad90f,37bb198ccd3af93e3b7f10bb7602faff,819,1672,"LINESTRING (195533.932 -435807.899, 195532.071...",2304,819-2304-0,819__2304,f74424acf8c41e4c1e9fd42838c4875c,488,1.0,1734.469963,1000.0,1672.0
1,0000dbb743b258f707687f2dc14ad90f,37bb198ccd3af93e3b7f10bb7602faff,819,1672,"LINESTRING (194729.719 -435915.114, 194731.202...",2304,819-2304-1,819__2304,f74424acf8c41e4c1e9fd42838c4875c,488,1.0,1734.469963,734.469963,1672.58
2,000282cd39e32e79bcf3f9de4cada38e,8f166f5ce4cea24c85cd98da8d016081,1255,432,"LINESTRING (190599.280 -429256.139, 190581.236...",1786,1255-1786-0,1255__1786,f74424acf8c41e4c1e9fd42838c4875c,187,1.0,1485.996087,1000.0,432.0
3,000282cd39e32e79bcf3f9de4cada38e,8f166f5ce4cea24c85cd98da8d016081,1255,432,"LINESTRING (189699.094 -428924.361, 189682.302...",1786,1255-1786-1,1255__1786,f74424acf8c41e4c1e9fd42838c4875c,187,1.0,1485.996087,485.996087,432.67
4,0006903347ed5690b444df134045fa8e,14fa6ac9015fb947dbce30d107f89e76,428,2,"LINESTRING (30913.670 -188209.514, 30916.167 -...",115,428-115-0,428__115,4b59b468244e0d5139d91fc698acc9d6,20,0.0,5144.774017,1000.0,2.0


In [10]:
longer_segments[
    longer_segments.trip_instance_key.isin(operator_trip_keys)
].tail()

Unnamed: 0,trip_instance_key,shape_array_key,stop_id1,stop_sequence,geometry,stop_id2,segment_id,stop_pair,schedule_gtfs_dataset_key,route_id,direction_id,segment_length,seg_length,stop_sequence1
572595,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1540,40,"LINESTRING (-158220.354 -120984.111, -158220.0...",1627,1540-1627-1,1540__1627,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1031.062174,31.062174,40.97
572596,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1630,44,"LINESTRING (-157875.135 -121906.895, -157867.4...",1636,1630-1636-0,1630__1636,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1030.23676,1000.0,44.0
572597,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1630,44,"LINESTRING (-157361.276 -122133.001, -157349.0...",1636,1630-1636-1,1630__1636,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1030.23676,30.23676,44.97
572598,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1636,45,"LINESTRING (-157339.582 -122132.350, -157349.0...",1661,1636-1661-0,1636__1661,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1140.223387,1000.0,45.0
572599,fe9adabd35ad48c0bc2cb3f2a8f68376,4aea6bd11ca2d523a12c0f0ed298a27b,1636,45,"LINESTRING (-156702.961 -121585.092, -156701.8...",1661,1636-1661-1,1636__1661,43d8d305ee692724a532f30ea63a1cbe,74S,1.0,1140.223387,140.223387,45.88


At this point, we should create the proxy stops to be added to the existing `stop_times` table. 

The origin of segment would be the proxy stop.

2 files to save out:
- proxy stops, segment origin --> to append to `stop_times` for nearest neighbor
- the longer segments, now cut to 1,000 m, append to existing `stop_segments` and save as `speedmap_segments_{analysis_date}` (can be referenced in `gtfs_analytics_data.yml`)

In [11]:
# todo: update references to shapely.Point(x.geometry.coords[0])
# we can use shapely.get_point()
keep_cols =  ["trip_instance_key", "shape_array_key",
               "stop_sequence", "stop_id", "stop_pair", 
               #"stop_primary_direction",
               "geometry"]

proxy_stops = longer_segments.assign(
    geometry = longer_segments.apply(
        lambda x: shapely.get_point(x.geometry, 0), axis=1)
).rename(
    columns = {"stop_id1": "stop_id"}
)[keep_cols + ["stop_sequence1"]].to_crs("EPSG:4326")

# stop_primary_direction can be populated when it's appended
# with the stop_times, and we can sort by trip-stop_sequence1
# and pd.ffill (forward fill)

In [12]:
def concatenate_new_stops_with_existing(
    new_stops: gpd.GeoDataFrame,
    analysis_date: str
) -> gpd.GeoDataFrame: 
    stop_times = helpers.import_scheduled_stop_times(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key",
                   "stop_sequence", "stop_id", "stop_pair", 
                   "stop_primary_direction",
                   "geometry"],
        with_direction = True,
        get_pandas = True,
        crs = "EPSG:4326",
    )
    
    # need to check whether CRS is 3310 when it goes into nearest neighbor
    # right now, set it at 4326
    
    trip_stop_cols = ["trip_instance_key", "stop_sequence"]
    gdf = pd.concat(
        [stop_times, new_stops], 
        axis=0, ignore_index=True
    ).sort_values(
        trip_stop_cols
    ).reset_index(drop=True)
    
    gdf = gdf.assign(
        stop_primary_direction = (gdf.groupby(trip_stop_cols)
                                  .stop_primary_direction
                                  .ffill()
                                 ),
        stop_sequence1 = gdf.stop_sequence1.fillna(gdf.stop_sequence),
        #TODO: create stop_sequence2 
        # is this needed or will stop_sequence1 be sufficient
        # segments go from current stop to next stop, 
    )
    
    return gdf

In [13]:
new_stop_times = concatenate_new_stops_with_existing(
    proxy_stops,
    analysis_date
)

# save this out somewhere
# similar to road_segments, which would have its proxy stops saved out
# TODO: think about folder structure and how this should be keyed in
# within gtfs_analytics_catalog.yml

In [14]:
utils.geoparquet_gcs_export(
    new_stop_times,
    SEGMENT_GCS,
    f"stop_time_expansion/speedmap_stop_times_{analysis_date}"
)

In [15]:
speedmap_segments = pd.concat([
    stop_segments.loc[stop_segments.segment_length < SEGMENT_LENGTH],
    longer_segments], 
    axis=0
).sort_values(
    ["schedule_gtfs_dataset_key", 
    "trip_instance_key", "stop_sequence"]
).reset_index(drop=True).drop(
    columns = ["segment_length", 
    "seg_length" 
    # drop this column in earlier function
])

In [16]:
utils.geoparquet_gcs_export(
    speedmap_segments,
    SEGMENT_GCS,
    f"segment_options/speedmap_segments_{analysis_date}"
)