In [None]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import pandas as pd
import geopandas as gpd

import dask.dataframe as dd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date
from shared_utils.rt_utils import arrowize_segment
from shared_utils import gtfs_utils_v2
from calitp_data_analysis import geography_utils
from calitp_data_analysis.tables import tbls


import numpy as np
import shapely
from siuba import *

# Add interpolated segments to pipeline

* Additional detail between widely spaced stops is more useful for speed analysis

## Check existing segments

In [None]:
path = 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/segment_options/stop_segments_2024-03-13.parquet'

In [None]:
# read via geopandas so that geometry stays intact
segs = gpd.read_parquet(path)

In [None]:
# segs >> head(3)

In [None]:
#  Less than 6% of segments need to be interpolated...
segs.length.quantile(.94)

In [None]:
# segs >> count(_.segment_id) >> arrange(-_.n)

# Additional Columns Required

Should probably happen upstream in `cut_stop_segments.py`, related scripts...

* `length`: float, `geometry.length`
* `next_stop_sequence`: lead of `stop_sequence`, should include final stop seq (final stop seq unavailable here since shifting from existing df...)
    * alternatively, rename `stop_sequence` -> `stop_sequence1` and add `stop_sequence2` (consistent with existing `stop_id1` and `stop_id2`)

## Adding these after the fact for testing...

In [None]:
# segs_ddf = dd.from_pandas(segs, npartitions=50)

# segs_ddf = segs_ddf.assign(length=lambda x: x.geometry.length)

# next_sequence_ddf = segs_ddf[['trip_instance_key', 'stop_sequence']].groupby('trip_instance_key').shift(-1)
# next_sequence_ddf = next_sequence_ddf.rename(columns={'stop_sequence': 'next_stop_sequence'})

# #  note this relies on the index
# segs_ddf = segs_ddf.join(next_sequence_ddf)

# !mkdir test_segs

# segs_ddf[['next_stop_sequence', 'length']].to_parquet('test_segs/')

# new_cols = segs_ddf.compute()

# segs_ddf = dd.read_parquet('test_segs/')

In [None]:
# segs_ddf

In [None]:
#  restart kernel/separate script

In [None]:
# segs = segs.join(segs_ddf.compute())

In [None]:
# segs.to_parquet('test_interpolated_segs.parquet')

In [None]:
#  local copy of `segs` with `length` and `next_stop_sequence` added;
#  it is written by the commented-out cells above (join + to_parquet), so run those once if the file is missing
segs = gpd.read_parquet('test_interpolated_segs.parquet')

## Find BBBR10

In [None]:
bbb = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name('2024-03-13') >> filter(_.name.str.contains('Big Blue'))

In [None]:
bbb

In [None]:
str10 = tbls.mart_gtfs.dim_stop_times() >> filter(_.feed_key == '4f9888472a8dad0f66bdbbd002312789', _.trip_id == '919600') >> collect()

In [None]:
# segs >> filter(_.trip_instance_key == 'd98e5cc1fb62e6e5ed0030934ef8a396') >> arrange(_.stop_sequence)

In [None]:
str10 >> select(_.trip_id, _.stop_id, _.stop_sequence, _.arrival_time, _.departure_time) >> arrange(_.stop_sequence) >> head (3)

In [None]:
#  scheduled trips on 2024-03-13 for the feeds in `bbb` whose route_short_name contains 'R10'
bbbr10 = (
    tbls.mart_gtfs.fct_scheduled_trips()
    >> filter(
        _.gtfs_dataset_key.isin(bbb.gtfs_dataset_key),
        _.route_short_name.str.contains('R10'),
        _.service_date == '2024-03-13',
    )
    >> collect()
)

In [None]:
# bbbr10

In [None]:
# bbbr10.trip_id

## Work out function

In [None]:
test = segs >> filter(_.trip_instance_key == '3505bf6a20e8d29e83e545784a421bc7')

In [None]:
long = test >> filter(_.stop_sequence == 18)

In [None]:
geom = long.geometry.iloc[0]

In [None]:
geom.length

In [None]:
km_segs = geom.length // 1000
km_segs

In [None]:
def split_distance(geom, dist=1000):
    '''
    geom: shapely.LineString
    dist: positive number, piece length in the units of geom (meters here)

    returns a shapely.MultiLineString split every dist (meters): as many
    full-length pieces as fit, followed by the shorter remainder if there is
    one. Pieces are in order along the line.

    raises ValueError if dist is not positive
    '''
    if dist <= 0:
        raise ValueError('dist must be positive')

    total_length = geom.length
    n_full = int(total_length // dist)

    pieces = [
        shapely.ops.substring(geom, i * dist, (i + 1) * dist)
        for i in range(n_full)
    ]

    #  Only add the remainder when it has length: substring() with equal start
    #  and end distances returns a Point rather than a line, which must not be
    #  handed to MultiLineString (happens when the length is an exact multiple
    #  of dist). `not pieces` keeps the original result for a geometry shorter
    #  than dist (the single piece is the whole line).
    tail_start = n_full * dist
    if total_length > tail_start or not pieces:
        pieces.append(shapely.ops.substring(geom, tail_start, total_length))

    return shapely.MultiLineString(pieces)

In [None]:
new_geom = split_distance(geom)

In [None]:
geom

In [None]:
new_geom

In [None]:
# geom.wkt

In [None]:
# new_geom.wkt

In [None]:
long.geometry = [new_geom]

In [None]:
#  index_parts=False: every piece keeps the original row label (no extra index level),
#  same as the other explode calls in this notebook and independent of the geopandas default
exploded = long.explode(index_parts=False)

In [None]:
#  note order remains from list order passed to MultiLineString constructor
exploded.head(2)

In [None]:
def process_exploded(gdf):
    #  TODO assert gdf is grouped (to all same trip x original seg), handle sorting here!
    '''
    update required cols in exploded gdf

    gdf: rows are the pieces of one original segment, in order along the
    line, with columns geometry, stop_sequence, next_stop_sequence, segment_id

    returns a new gdf (the input is left untouched, so calling this twice on
    the same frame gives the same result) with:

    length: length of each piece's geometry
    stop_sequence: increment proportional to segment distance within arbitrary stop sequence increment
    segment_id: postfix _(int) per segment to maintain uniqueness
    '''
    out = gdf.copy()

    #  plain float array: avoids relying on `.length` resolving to either the
    #  GeoDataFrame property or the column of the same name
    seg_lengths = out.geometry.apply(lambda x: x.length).to_numpy(dtype=float)
    out['length'] = seg_lengths

    min_stop = int(out.stop_sequence.min())
    max_next = out.next_stop_sequence.max()
    next_stop = int(max_next)
    stop_seq_chg = max_next - min_stop

    #  increment stop sequence proportional to distance traveled
    seq_per_meter = stop_seq_chg / seg_lengths.sum()
    seq_changes = seg_lengths * seq_per_meter
    #  count back from next_stop so each piece gets the sequence value at its start:
    #  the first piece lands on min_stop, the last on next_stop minus its own share
    out['stop_sequence'] = np.flip(next_stop - np.flip(seq_changes).cumsum())

    #  postfix to segment_id so that it remains unique
    postfixes = np.array([f'_{i}' for i in range(out.shape[0])], dtype=object)
    out['segment_id'] = out.segment_id + postfixes

    return out

In [None]:
processed = process_exploded(exploded)

In [None]:
arrowize_segment?

In [None]:
processed.geometry = processed.geometry.apply(lambda x: arrowize_segment(x))

In [None]:
processed.head(3)

In [None]:
# processed.explore()

## Abstracting -- drop long geoms from overall, process as array of gdfs, merge?

In [None]:
#  `<=` so that a segment of exactly 1000m is kept here: `longs` only takes > 1000,
#  so with `< 1000` such a segment ended up in neither frame
shorts = (segs[segs['length'] <= 1000]).copy()

In [None]:
longs = (segs[segs['length'] > 1000]).copy()

In [None]:
#  NOTE: rows dropped here are in neither `shorts` nor `longs`, so they never reach `recombined` below
longs = longs >> filter(-_.next_stop_sequence.isna()) #  fix upstream (include final next seq...)

In [None]:
longs.shape

In [None]:
test_longs = longs.iloc[:1000,:]

In [None]:
test_longs.head(3)

## Rowwise apply and accumulate?

* row-wise makes sense (each row has the information we need to split into n rows of interpolated segments)

In [None]:
def interpolate_segments(row):
    '''
    row-wise helper: apply (axis=1) over a gdf of "long" (>1000m) segments

    splits the row's geometry with split_distance, explodes it to one row per
    piece, runs process_exploded on the pieces and adds the resulting gdf to
    the global list interpolated_longs (init empty list outside function)

    returns None
    '''
    global interpolated_longs

    row.geometry = split_distance(row.geometry)

    #  rebuild a one-row gdf from the row so that .explode() can be used
    one_row = gpd.GeoDataFrame(row).transpose()
    one_row = one_row.set_geometry('geometry')
    one_row = one_row.set_crs(geography_utils.CA_NAD83Albers)

    pieces = one_row.explode(column='geometry', index_parts=False)
    interpolated_longs += [process_exploded(pieces)]

    return None

In [None]:
# test1 = test_longs.head(1)

# x = test1.apply(interpolate_segments, axis=1)

# gdf = pd.concat(interpolated_longs)

# #  can't split at endpoints (no next stop seq, must calculate upstream instead of shift!)
# #  drop for now to test
# test_longs = test_longs >> filter(-_.next_stop_sequence.isna())

# %%timeit

# interpolated_longs = []

# _ = test_longs.apply(interpolate_segments, axis=1)

# (14 * 177) / 60 #  41min to interpolate all -- not entirely ideal

# (interpolated >> distinct(_.segment_id, _keep_all=True)).shape

In [None]:
# interpolated = pd.concat(interpolated_longs)

##  much of this compute is going to the same segments... better to do once and join to trips?

* compute/join on `shape_array_key, segment_id`
* accumulate geoms only in dict; lookup/replace geoms in full df, then use `gdf.explode()`?
    * elegantly deal with repeated columns...
    * refactor process_exploded to go here, could groupby/dask that

In [None]:
longs.head(1)

In [None]:
#  segments missing from interpolation because we're missing the last stop sequence in testing
#  should be fixed in prod!

# (longs >> distinct(_.shape_array_key, _.segment_id, _keep_all=True)
#                      >> filter(_.next_stop_sequence.isna())).explore()

In [None]:
def store_new_geoms(row):
    '''
    row-wise helper: apply (axis=1) over a gdf of "long" (>1000m) segments

    stores the split geometry of the row in the global dict segment_geoms
    (init empty dict outside function), keyed by (shape_array_key, segment_id)

    returns None
    '''
    #  right-hand side runs first, so the geometry is split before the key is built
    segment_geoms[(row.shape_array_key, row.segment_id)] = split_distance(row.geometry)

    return None

In [None]:
def lookup_geom(row):
    '''
    row-wise helper: apply (axis=1) over a gdf of all segments, after
    store_new_geoms has been run on the unique segments

    replaces the row's geometry with the one stored in segment_geoms under
    (shape_array_key, segment_id) and returns the row
    '''
    geom_key = (row.shape_array_key, row.segment_id)
    row.geometry = segment_geoms[geom_key]
    return row

In [None]:
to_interpolate = (longs >> distinct(_.shape_array_key, _.segment_id, _keep_all=True)
                 )

In [None]:
to_interpolate >> head(1)

In [None]:
%%time

segment_geoms = {}

_ = to_interpolate.apply(store_new_geoms, axis=1)

In [None]:
%%time
#  now, add geometries to long list...
interpolated = longs.apply(lookup_geom, axis = 1)

In [None]:
interpolated = interpolated.explode(index_parts=False).reset_index(drop=True)

In [None]:
interpolated

In [None]:
interpolated.shape

In [None]:
recombined = pd.concat([shorts, interpolated]).reset_index(drop=True)

In [None]:
recombined.shape

# Taking a look: Big Blue Bus R10

* split and merge with untouched segments looks good!

In [None]:
# processed.shape_array_key

In [None]:
from siuba import * #  re-import to fix bug? TODO report...

In [None]:
#  .copy() so that the geometry assignment in the next cell works on an independent frame
#  rather than on a slice of `recombined` (same pattern as `shorts` / `longs`)
test = recombined[recombined.trip_instance_key == '3505bf6a20e8d29e83e545784a421bc7'].copy()

In [None]:
test.geometry = test.geometry.apply(lambda x: arrowize_segment(x))

In [None]:
test.explore()

In [None]:
##  TODO fix process_exploded first...

In [None]:
bigtest = (recombined >> filter(_.trip_instance_key.isin(longs.trip_instance_key[:500])))

In [None]:
bigtest.shape

In [None]:
bigtest.geometry = bigtest.geometry.apply(lambda x: arrowize_segment(x))

In [None]:
bigtest.explore()