In [1]:
import os
# Must be set before the calitp/shared_utils imports below run.
# NOTE(review): presumably caps the bytes a BigQuery query may scan — confirm against calitp docs.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) # 10**12 bytes = 1 TB (decimal)

import pandas as pd
import geopandas as gpd
from siuba import *
import json

In [444]:
import shared_utils
import warnings

# Conveyal Transit Paths

* GH Issue: https://github.com/cal-itp/data-analyses/issues/1098

## Conveyal SOP

* Prepare a CSV with `lat`, `lon`, and `od` columns, where `od` is 0 for the origin and 1 for the destination
* Run Conveyal Analysis: 8-10am departure window, standard transit parameters, add the JSON `feed_id` param
* Run Regional Analysis: 120 min max travel time; 5th, 50th, and 95th percentiles; request paths and travel times

## Metrics

* Modal trip by `nIterations` (the most available trip): how fast is it, how many transfers, what fare?
* Fastest trip: how available is it, what fare, how many transfers?
* Fewest-transfer trip: how fast is it, how available, what fare?

## Visuals??

# Basic Paths Parse

In [2]:
# Columns of the Conveyal PATHS export that hold one '|'-separated value per transit leg.
array_cols = ['routes', 'boardStops', 'alightStops',
       'rideTimes', 'waitTimes', 'feedIds']

def unpack_conveyal_path_df(df, array_cols = array_cols):
    """Split the '|'-delimited per-leg columns of a Conveyal paths table into lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Paths table as read from the Conveyal ``*_PATHS.csv`` export.
    array_cols : list of str
        Columns to split; defaults to the module-level ``array_cols``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every string cell of ``array_cols`` has been
        replaced by ``cell.split('|')``. The list items stay strings.

    Notes
    -----
    * The input frame is not modified, so re-running the calling cell is safe.
    * Non-string cells (NaN, or lists from a previous call) are returned
      unchanged, which makes the function idempotent and tolerant of missing
      values instead of raising ``AttributeError``.
    """
    unpacked = df.copy()
    for col in array_cols:
        unpacked[col] = unpacked[col].map(
            lambda cell: cell.split('|') if isinstance(cell, str) else cell
        )
    return unpacked

In [3]:
# Read the Conveyal PATHS export; the CSV row position becomes the path's trip_group_id.
df = (
    pd.read_csv('./la/6643b1ec46ade8368e2cb698_PATHS.csv')
    .rename_axis('trip_group_id')
    .reset_index()
)
# Keep only paths from origin 0 to destination 1, drop the `group` column,
# then split the per-leg columns into lists.
df = unpack_conveyal_path_df(
    df >> filter(_.origin == 0, _.destination == 1) >> select(-_.group)
)
# df = df.dropna() #  remove same o/d
df >> head(3)

Unnamed: 0,trip_group_id,origin,destination,routes,boardStops,alightStops,feedIds,rideTimes,accessTime,egressTime,transferTime,waitTimes,totalTime,nIterations
11,11,0,1,"[209-13168, 804, 805, 90-13168, 96-13168]","[141078, 80128, 80211, 12102, 3611]","[2489, 80122, 80213, 3611, 1574]","[654404a5dad1a760f30e340d, 654404b9dad1a760f30...","[14.0, 18.0, 2.0, 3.0, 46.0]",8.6,7.2,2.3,"[1.4, 4.3, 1.5, 1.9, 2.0]",112.2,2
12,12,0,1,"[108-13168, 910-13168, 802, 94-13168]","[7962, 2322, 80211, 30001]","[1539, 65300042, 80201, 11812]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30...","[15.0, 12.0, 26.0, 20.0]",1.4,3.8,9.8,"[1.6, 5.3, 2.4, 5.4]",102.8,56
13,13,0,1,"[108-13168, 910-13168, 802, 3163, 164-13168]","[7963, 2322, 80211, 3068903, 16428]","[1539, 65300042, 80201, 3068909, 11812]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30...","[16.0, 12.0, 26.0, 12.0, 8.0]",1.3,3.8,10.5,"[1.7, 5.3, 4.4, 4.6, 2.1]",107.8,18


# Conveyal bundle-feed matching

In [10]:
socal_conveyal_joined = pd.read_parquet('../../conveyal_update/socal_conveyal_joined.parquet')
#  temporary version pending global solution

In [25]:
socal_conveyal_joined.columns

Index(['feedId', 'name', 'bundleScopedFeedId', 'serviceStart', 'serviceEnd',
       'checksum', 'span', 'feed_key', 'region', 'gtfs_dataset_name',
       'base64_url', 'date', 'span_x', 'span_y'],
      dtype='object')

In [17]:
unique_feeds = df.feedIds.explode().unique()

In [24]:
# #  all example feeds present!
# socal_conveyal_joined >> filter(_.feedId.isin(unique_feeds))

In [26]:
socal_conveyal_joined = socal_conveyal_joined >> select(_.feedId, _.feed_key, _.gtfs_dataset_name, _.base64_url, _.date)

In [32]:
as_dict = socal_conveyal_joined.set_index('feedId').to_dict()
# feed_key_dict = as_dict['feed_key']

In [36]:
# Fail early, naming the offending ids, if any path references a Conveyal feed
# that is absent from the bundle match (otherwise the lookups below raise a bare KeyError).
missing_feed_ids = set(df.feedIds.explode()) - set(as_dict['feed_key'])
assert not missing_feed_ids, f'feedIds missing from socal_conveyal_joined: {missing_feed_ids}'

# Per-leg lists of warehouse feed_key / dataset name, aligned with df.feedIds.
df['feed_keys'] = df.feedIds.apply(lambda x: [as_dict['feed_key'][item] for item in x])
df['gtfs_dataset_names'] = df.feedIds.apply(lambda x: [as_dict['gtfs_dataset_name'][item] for item in x])
# Takes the date from the first row of the match table and applies it to every path.
df['date'] = socal_conveyal_joined.date.iloc[0]

In [446]:
with pd.option_context('display.max_colwidth', 100):
    display(df)

Unnamed: 0,trip_group_id,origin,destination,routes,boardStops,alightStops,feedIds,rideTimes,accessTime,egressTime,transferTime,waitTimes,totalTime,nIterations,feed_keys,gtfs_dataset_names,date
11,11,0,1,"[209-13168, 804, 805, 90-13168, 96-13168]","[141078, 80128, 80211, 12102, 3611]","[2489, 80122, 80213, 3611, 1574]","[654404a5dad1a760f30e340d, 654404b9dad1a760f30e340e, 654404b9dad1a760f30e340e, 654404a5dad1a760f...","[14.0, 18.0, 2.0, 3.0, 46.0]",8.6,7.2,2.3,"[1.4, 4.3, 1.5, 1.9, 2.0]",112.2,2,"[edb0fd72b98a7f739407ceca3fae9034, beaaee89e671fea3d48124dc23335bed, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, LA Metro Rail Schedule, LA Metro Rail Schedule, LA Metro Bus Schedule, L...",2023-10-18
12,12,0,1,"[108-13168, 910-13168, 802, 94-13168]","[7962, 2322, 80211, 30001]","[1539, 65300042, 80201, 11812]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30e340d, 654404b9dad1a760f30e340e, 654404a5dad1a760f...","[15.0, 12.0, 26.0, 20.0]",1.4,3.8,9.8,"[1.6, 5.3, 2.4, 5.4]",102.8,56,"[edb0fd72b98a7f739407ceca3fae9034, edb0fd72b98a7f739407ceca3fae9034, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, LA Metro Bus Schedule, LA Metro Rail Schedule, LA Metro Bus Schedule]",2023-10-18
13,13,0,1,"[108-13168, 910-13168, 802, 3163, 164-13168]","[7963, 2322, 80211, 3068903, 16428]","[1539, 65300042, 80201, 3068909, 11812]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30e340d, 654404b9dad1a760f30e340e, 6544048cdad1a760f...","[16.0, 12.0, 26.0, 12.0, 8.0]",1.3,3.8,10.5,"[1.7, 5.3, 4.4, 4.6, 2.1]",107.8,18,"[edb0fd72b98a7f739407ceca3fae9034, edb0fd72b98a7f739407ceca3fae9034, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, LA Metro Bus Schedule, LA Metro Rail Schedule, Burbank Schedule, LA Metr...",2023-10-18
14,14,0,1,"[807, 804, 802, 94-13168]","[80706, 80128, 80211, 30001]","[80709, 80122, 80201, 11812]","[654404b9dad1a760f30e340e, 654404b9dad1a760f30e340e, 654404b9dad1a760f30e340e, 654404a5dad1a760f...","[9.0, 18.0, 26.0, 20.0]",14.5,3.8,9.2,"[1.5, 1.9, 8.4, 1.5]",113.8,2,"[beaaee89e671fea3d48124dc23335bed, beaaee89e671fea3d48124dc23335bed, beaaee89e671fea3d48124dc233...","[LA Metro Rail Schedule, LA Metro Rail Schedule, LA Metro Rail Schedule, LA Metro Bus Schedule]",2023-10-18
15,15,0,1,"[108-13168, 910-13168, 802, Antelope Valley Line, 154-13168]","[7962, 2322, 80211, 107, 30003]","[1539, 65300042, 80214, 102, 1326]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30e340d, 654404b9dad1a760f30e340e, 654404c3dad1a760f...","[15.0, 12.0, 8.0, 19.0, 4.0]",1.4,3.5,10.7,"[1.6, 5.3, 3.4, 4.7, 1.9]",90.5,20,"[edb0fd72b98a7f739407ceca3fae9034, edb0fd72b98a7f739407ceca3fae9034, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, LA Metro Bus Schedule, LA Metro Rail Schedule, Metrolink Schedule, LA Me...",2023-10-18
16,16,0,1,"[108-13168, 4X, 802, 94-13168]","[7962, 373, 80211, 30001]","[1539, 380, 80201, 11812]","[654404a5dad1a760f30e340d, 654404e2dad1a760f30e3431, 654404b9dad1a760f30e340e, 654404a5dad1a760f...","[15.0, 12.4, 26.0, 20.0]",1.4,3.8,17.5,"[1.6, 4.1, 3.5, 2.5]",107.8,20,"[edb0fd72b98a7f739407ceca3fae9034, 24487472b6cd8d19984eb6496780bcad, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, Torrance Schedule, LA Metro Rail Schedule, LA Metro Bus Schedule]",2023-10-18
17,17,0,1,"[108-13168, 910-13168, 802, 3163, 164-13168]","[7962, 2322, 80211, 3068903, 16428]","[1539, 65300042, 80201, 3068909, 11812]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30e340d, 654404b9dad1a760f30e340e, 6544048cdad1a760f...","[15.0, 12.0, 26.0, 12.0, 8.0]",1.4,3.8,10.5,"[1.6, 5.3, 4.4, 4.6, 1.1]",105.8,20,"[edb0fd72b98a7f739407ceca3fae9034, edb0fd72b98a7f739407ceca3fae9034, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, LA Metro Bus Schedule, LA Metro Rail Schedule, Burbank Schedule, LA Metr...",2023-10-18
18,18,0,1,"[40-13168, Antelope Valley Line, 154-13168]","[141013, 107, 30003]","[652, 102, 1326]","[654404a5dad1a760f30e340d, 654404c3dad1a760f30e3418, 654404a5dad1a760f30e340d]","[53.0, 19.0, 4.0]",13.3,3.5,6.4,"[1.7, 1.7, 1.9]",104.5,12,"[edb0fd72b98a7f739407ceca3fae9034, 1ac8ee536d38b537e2cf55383222d379, edb0fd72b98a7f739407ceca3fa...","[LA Metro Bus Schedule, Metrolink Schedule, LA Metro Bus Schedule]",2023-10-18
19,19,0,1,"[108-13168, 4X, 802, 3163, 164-13168]","[7962, 373, 80211, 3068903, 16428]","[1539, 380, 80201, 3068909, 11812]","[654404a5dad1a760f30e340d, 654404e2dad1a760f30e3431, 654404b9dad1a760f30e340e, 6544048cdad1a760f...","[15.0, 12.4, 26.0, 12.0, 8.0]",1.4,3.8,9.4,"[1.6, 2.1, 8.4, 4.6, 1.1]",105.8,2,"[edb0fd72b98a7f739407ceca3fae9034, 24487472b6cd8d19984eb6496780bcad, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, Torrance Schedule, LA Metro Rail Schedule, Burbank Schedule, LA Metro Bu...",2023-10-18
20,20,0,1,"[108-13168, 460-13168, 802, 94-13168]","[7962, 2322, 80211, 30001]","[1539, 5019, 80201, 11812]","[654404a5dad1a760f30e340d, 654404a5dad1a760f30e340d, 654404b9dad1a760f30e340e, 654404a5dad1a760f...","[15.0, 13.0, 26.0, 20.0]",1.4,3.8,9.6,"[1.6, 1.4, 3.5, 2.5]",97.8,20,"[edb0fd72b98a7f739407ceca3fae9034, edb0fd72b98a7f739407ceca3fae9034, beaaee89e671fea3d48124dc233...","[LA Metro Bus Schedule, LA Metro Bus Schedule, LA Metro Rail Schedule, LA Metro Bus Schedule]",2023-10-18


In [28]:
# works, but is it useful?
df_exploded = df.explode(array_cols).reset_index(drop=True)

## Metric Development

In [50]:
row = df.iloc[0,:]
row

trip_group_id                                                        11
origin                                                                0
destination                                                           1
routes                        [209-13168, 804, 805, 90-13168, 96-13168]
boardStops                          [141078, 80128, 80211, 12102, 3611]
alightStops                            [2489, 80122, 80213, 3611, 1574]
feedIds               [654404a5dad1a760f30e340d, 654404b9dad1a760f30...
rideTimes                                  [14.0, 18.0, 2.0, 3.0, 46.0]
accessTime                                                          8.6
egressTime                                                          7.2
transferTime                                                        2.3
waitTimes                                     [1.4, 4.3, 1.5, 1.9, 2.0]
totalTime                                                         112.2
nIterations                                                     

In [54]:
row.feed_keys

['edb0fd72b98a7f739407ceca3fae9034',
 'beaaee89e671fea3d48124dc23335bed',
 'beaaee89e671fea3d48124dc23335bed',
 'edb0fd72b98a7f739407ceca3fae9034',
 'edb0fd72b98a7f739407ceca3fae9034']

### Get warehouse data

In [420]:
# LineString lives in shapely.geometry; shapely.ops only exposes it incidentally.
from shapely.geometry import LineString
from shapely.ops import split, substring
from calitp_data_analysis import geography_utils

In [252]:
# Unique identifiers referenced by any path; used to narrow the warehouse queries below.
all_feed_keys = list(df.feed_keys.explode().unique())
all_route_ids = list(df.routes.explode().unique())
# De-duplicate across board AND alight stops: a transfer stop is both an alight
# and a board stop and would otherwise be listed twice in the filters.
all_stops = list(pd.concat([df.boardStops, df.alightStops]).explode().unique())

In [256]:
warehouse_data = {}

In [257]:
analysis_date = df.date.iloc[0].date()

In [259]:
warehouse_data['shapes'] = shared_utils.gtfs_utils_v2.get_shapes(selected_date=analysis_date, operator_feeds=all_feed_keys,
                                                  shape_cols = ['feed_key', 'shape_id'])
warehouse_data['shapes'] = warehouse_data['shapes'].to_crs(geography_utils.CA_NAD83Albers)

  sqlalchemy.util.warn(


In [439]:
warehouse_data['trips'] = shared_utils.gtfs_utils_v2.get_trips(selected_date=analysis_date, operator_feeds=all_feed_keys, trip_cols = ['feed_key', 'trip_id', 'route_id',
                                                                                                                                       'shape_id', 'trip_first_departure_ts'])

In [440]:
warehouse_data['trips'] = warehouse_data['trips'] >> filter(_.route_id.isin(all_route_ids))

In [302]:
# Stop times for the trips already stored in warehouse_data['trips'] (that cell must have run first).
warehouse_data['st'] = shared_utils.gtfs_utils_v2.get_stop_times(selected_date=analysis_date, operator_feeds=all_feed_keys, trip_df=warehouse_data['trips'])

# Keep only stops used by some path, then collect() the result.
# NOTE(review): presumably get_stop_times returns a lazy table and collect() yields a pandas DataFrame — confirm.
warehouse_data['st'] = warehouse_data['st'] >> filter(_.stop_id.isin(all_stops)) >> collect()

# Stop locations for the same stop ids, reprojected to the CRS used for the shapes.
warehouse_data['stops'] = shared_utils.gtfs_utils_v2.get_stops(selected_date=analysis_date, operator_feeds=all_feed_keys, custom_filtering={'stop_id': all_stops})
warehouse_data['stops'] = warehouse_data['stops'].to_crs(geography_utils.CA_NAD83Albers)

  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(


### Map?

In [452]:
def shape_segments_from_row(row, warehouse_data):
    """Build one mappable line segment per transit leg of a Conveyal path.

    For every (board stop, alight stop) pair in ``row`` a trip that serves both
    stops is looked up in the warehouse stop_times, and that trip's shape is cut
    between the two stops. When the trip has no shape, a straight line between
    the two stops is used instead.

    Parameters
    ----------
    row : pandas.Series
        One row of the parsed paths table. Reads ``boardStops`` and
        ``alightStops`` (paired positionally), ``trip_group_id`` and
        ``nIterations``.
    warehouse_data : dict
        Must contain ``'st'`` (feed_key, trip_id, stop_id, stop_sequence),
        ``'stops'`` (feed_key, stop_id, geometry), ``'trips'`` (feed_key,
        trip_id, shape_id) and ``'shapes'`` (feed_key, shape_id, geometry).
        Geometries are treated as being in ``geography_utils.CA_NAD83Albers``.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per leg with ``segment_geom`` as the active geometry, plus
        ``stop_geom``, ``stop_pair``, ``trip_group_id`` and ``nIterations``.

    Raises
    ------
    ValueError
        If no trip serves both stops of a pair, if the chosen trip cannot be
        joined to the stops/trips tables, or (from ``pd.concat``) if the row
        has no legs at all.

    Notes
    -----
    NOTE(review): the feed and route Conveyal reported for each leg
    (``row.feedIds`` / ``row.routes``) are not used to choose the trip; the
    first trip serving both stops is taken. Consider constraining on them.
    """
    stop_pairs = list(zip(row.boardStops, row.alightStops))

    row_shape_segments = []
    for stop_pair in stop_pairs:
        first_filter = warehouse_data['st'] >> filter(_.stop_id.isin(stop_pair))
        # A usable trip must serve BOTH stops. Count distinct stops per
        # (feed_key, trip_id) so a trip that visits one stop twice, or a trip_id
        # reused by another feed, is not mistaken for a match.
        good_trips = (first_filter
                      >> distinct(_.feed_key, _.trip_id, _.stop_id)
                      >> count(_.feed_key, _.trip_id)
                      >> filter(_.n > 1))
        if good_trips.empty:
            raise ValueError(f'no trip in stop_times serves both stops of {stop_pair}')
        chosen_feed_key = good_trips.feed_key.iloc[0]
        chosen_trip_id = good_trips.trip_id.iloc[0]

        trip_with_pair = (first_filter
                          >> filter(_.feed_key == chosen_feed_key, _.trip_id == chosen_trip_id)
                          >> arrange(_.stop_sequence)
                          >> select(_.feed_key, _.trip_id, _.stop_id, _.stop_sequence))
        trip_with_pair = trip_with_pair >> inner_join(_, warehouse_data['stops'] >> select(_.feed_key, _.stop_id, _.geometry),
                                                      on = ['feed_key', 'stop_id'])
        trip_with_pair = trip_with_pair >> inner_join(_, warehouse_data['trips'] >> select(_.feed_key, _.trip_id, _.shape_id),
                                                      on = ['feed_key', 'trip_id'])
        if trip_with_pair.empty:
            raise ValueError(f'trip {chosen_trip_id} for {stop_pair} has no matching stops/trips rows')
        paired_shape = warehouse_data['shapes'] >> filter(_.feed_key == trip_with_pair.feed_key.iloc[0], _.shape_id == trip_with_pair.shape_id.iloc[0])

        if not trip_with_pair.stop_id.is_unique:
            print('warning, trip has duplicate stops at a single stop')
            trip_with_pair = trip_with_pair >> distinct(_.stop_id, _keep_all=True)
        # Earliest and latest of the two stops along the trip.
        stop0 =  (trip_with_pair >> filter(_.stop_sequence == _.stop_sequence.min())).geometry.iloc[0]
        stop1 =  (trip_with_pair >> filter(_.stop_sequence == _.stop_sequence.max())).geometry.iloc[0]
        if paired_shape.empty:
            print('warning, trip has no shape')
            # Fall back to a straight line between the two stops.
            paired_segment = LineString([stop0, stop1])
        else:
            shape_geom = paired_shape.geometry.iloc[0]
            # Order the projected distances so the cut works regardless of which
            # stop lies further along the shape (be resilient to looping).
            stops_proj = [shape_geom.project(stop0), shape_geom.project(stop1)]
            paired_segment = substring(shape_geom, min(stops_proj), max(stops_proj))

        trip_with_pair['segment_geom'] = paired_segment
        trip_with_pair = trip_with_pair >> rename(stop_geom = _.geometry)
        # Collapse to a single row per leg (both stop rows share the same shape_id).
        trip_with_pair = trip_with_pair >> distinct(_.shape_id, _keep_all=True)
        trip_with_pair['stop_pair'] = [stop_pair]
        trip_with_pair['trip_group_id'] = row.trip_group_id
        trip_with_pair['nIterations'] = row.nIterations
        trip_with_seg = gpd.GeoDataFrame(trip_with_pair, geometry='segment_geom', crs=geography_utils.CA_NAD83Albers)
        row_shape_segments.append(trip_with_seg)
    return pd.concat(row_shape_segments)

In [453]:
def compile_all_spatial_routes(df):
    """Build leg segments for every path in ``df`` and stack them into one frame.

    Calls ``shape_segments_from_row`` for each row, using the module-level
    ``warehouse_data`` dict. Rows that raise are reported and skipped so one
    bad path does not abort the whole run.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed paths table, one row per Conveyal path.

    Returns
    -------
    pandas.DataFrame
        Concatenation (via ``pd.concat``) of the per-row results.

    Raises
    ------
    ValueError
        If no row could be converted.
    """
    spatial_routes = []
    for _ix, row in df.iterrows():
        try:
            spatial_routes.append(shape_segments_from_row(row, warehouse_data))
        # Catch Exception only, so KeyboardInterrupt / SystemExit still stop the
        # loop, and report what failed instead of dumping the whole row.
        except Exception as err:
            print(f'failed for trip_group_id {row.get("trip_group_id", _ix)}: {err!r}')
    if not spatial_routes:
        raise ValueError('no path could be converted to shape segments')
    return pd.concat(spatial_routes)

In [454]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    spatial_routes = compile_all_spatial_routes(df)



In [456]:
spatial_routes >> head(3)

Unnamed: 0,feed_key,trip_id,stop_id,stop_sequence,stop_geom,shape_id,segment_geom,stop_pair,trip_group_id,nIterations
0,edb0fd72b98a7f739407ceca3fae9034,10209000510529-JUNE23,141078,32,POINT (155462.061 -445965.161),2090051_JUNE23,"LINESTRING (155451.791 -445961.441, 155451.900...","(141078, 2489)",11,2
0,beaaee89e671fea3d48124dc23335bed,59052614,80122,11,POINT (160750.294 -439298.745),804WB_RC_221121,"LINESTRING (160750.580 -439298.986, 160748.045...","(80128, 80122)",11,2
0,beaaee89e671fea3d48124dc23335bed,58926335,80211,11,POINT (160763.169 -439295.846),802EB_190513,"LINESTRING (160759.382 -439300.931, 160839.511...","(80211, 80213)",11,2


In [457]:
# Widen each segment in proportion to nIterations so more frequently chosen paths draw thicker.
# NOTE(review): buffer distance is in CRS units — presumably metres for CA_NAD83Albers; confirm.
# This cell overwrites segment_geom in place, so re-running it buffers the polygons again.
buffer_per_iteration = 1.3
spatial_routes['segment_geom'] = spatial_routes.apply(
    lambda seg: seg['segment_geom'].buffer(seg['nIterations'] * buffer_per_iteration),
    axis=1,
)

In [None]:
spatial_routes = spatial_routes >> arrange(-_.nIterations)
# spatial_routes.explore(column = 'shape_id', cmap='Accent', tiles="CartoDB positron")
spatial_routes.explore(column = 'trip_group_id', cmap='tab20', tiles="CartoDB positron")

## Storytelling - infrequent regional rail

* only two usable trips for 8-10am departures from origin, at 10:41 (Metrolink) and 11:01 (Amtrak)

In [449]:
(warehouse_data['trips'] >> filter(_.route_id == 'Antelope Valley Line') >> arrange(_.trip_first_departure_ts))[:3]

Unnamed: 0,feed_key,trip_id,route_id,shape_id,trip_first_departure_ts
16332,1ac8ee536d38b537e2cf55383222d379,294300161,Antelope Valley Line,AVin,2023-10-18 10:41:00+00:00
16336,1ac8ee536d38b537e2cf55383222d379,294300162,Antelope Valley Line,AVin,2023-10-18 11:41:00+00:00
16230,1ac8ee536d38b537e2cf55383222d379,294300163,Antelope Valley Line,AVin,2023-10-18 12:11:00+00:00


In [450]:
(warehouse_data['trips'] >> filter(_.route_id == '78') >> arrange(_.trip_first_departure_ts))[:3]

Unnamed: 0,feed_key,trip_id,route_id,shape_id,trip_first_departure_ts
321,b4970d6cc7e206d9e667796130394790,26944,78,102,2023-10-18 04:01:00+00:00
8410,b4970d6cc7e206d9e667796130394790,17148,78,148,2023-10-18 11:01:00+00:00
334,b4970d6cc7e206d9e667796130394790,19891,78,149,2023-10-18 13:01:00+00:00
