In [None]:
import os
# Set before the project imports below. 1_000_000_000_000 bytes = 1 TB (decimal).
# NOTE(review): presumably read by the calitp BigQuery helpers as a per-query byte cap -- confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000)

import pandas as pd
import numpy as np
import geopandas as gpd
# Star import: the cells below rely on it for the siuba verbs and `_`
# (filter, select, count, arrange, ...). Note that this `filter` replaces the builtin.
from siuba import *
import json

import shared_utils
import warnings
from path_example_vars import GCS_PATH

# GCS folder holding the Conveyal/warehouse feed lookup read by add_warehouse_identifiers
CONVEYAL_GCS_PATH = 'gs://calitp-analytics-data/data-analyses/conveyal_update/'

# Conveyal Transit Paths

* GH Issue: https://github.com/cal-itp/data-analyses/issues/1098

## Conveyal SOP

* Prepare a CSV with `lat`, `lon`, and `od` columns, where `od` is 0 for the origin and 1 for the destination
    * allow freeform, and use the `od` column as the id in the upload
* Run a Conveyal Analysis: 8-10am, standard transit parameters, add the JSON `feed_id` param
* Run a Regional Analysis: 120 min max time; 5th, 50th, and 95th percentiles; get paths and travel times

## Metrics

* modal trip by `nIterations` (the most available trip): how fast is it, how many transfers, what fares?
* fastest trip: how available is it, what fares, how many transfers?
* fewest-transfer trip: how fast is it, how available is it, what fares?

## Visuals??

# Basic Paths Parse

In [None]:
def read_conveyal_path_df(path):
    '''
    Read a Conveyal PATHS csv and return one row per origin -> destination path option.

    path: anything pd.read_csv accepts (local path, URL or file-like object).

    Returns a DataFrame holding only the rows with origin == 0 and destination == 1, where
    * trip_group_id is the row's position in the raw csv,
    * total_iterations is nIterations of the first origin == 0, destination == 0 row,
      repeated on every row,
    * the group column is dropped,
    * every column in array_cols is split on '|' into a list of strings.

    Raises IndexError if the csv has no origin == 0, destination == 0 row.
    A missing value in one of the array columns of a kept row still raises
    AttributeError, as before.
    '''
    array_cols = ['routes', 'boardStops', 'alightStops',
           'rideTimes', 'waitTimes', 'feedIds']

    # Read the pipe-delimited columns as text. Left to type inference, a file in which
    # every path has a single leg (no '|' anywhere, e.g. route "78" / ride time "30")
    # comes back numeric and the split below fails on int/float values.
    raw = pd.read_csv(path, dtype={col: str for col in array_cols})
    raw = raw.rename_axis('trip_group_id').reset_index()

    is_from_origin = raw.origin == 0
    baseline = raw.loc[is_from_origin & (raw.destination == 0), 'nIterations']
    if baseline.empty:
        raise IndexError('no origin == 0, destination == 0 row in paths csv; '
                         'cannot determine total_iterations')
    raw['total_iterations'] = baseline.iloc[0]

    paths = raw.loc[is_from_origin & (raw.destination == 1)].drop(columns='group')
    # assign() builds a new frame, so nothing is written into the filtered slice in place
    unpacked = {col: paths[col].map(lambda cell: cell.split('|')) for col in array_cols}
    return paths.assign(**unpacked)

In [None]:
# Parse the example PATHS csv under ./la/ (path is relative to the notebook's working directory)
df = read_conveyal_path_df('./la/6643b1ec46ade8368e2cb698_PATHS.csv')
# preview the first three parsed path rows
df >> head(3)

# Conveyal bundle-feed matching

In [None]:
def add_warehouse_identifiers(conveyal_df):
    '''
    Attach warehouse identifiers to parsed Conveyal paths.

    conveyal_df: frame with a list-valued feedIds column.

    Reads the warehouse/Conveyal lookup parquet under CONVEYAL_GCS_PATH, asserts that
    every feed id used by the paths is present in it, then adds three columns to
    conveyal_df in place:
    * feed_keys: the lookup's feed_key for each entry of feedIds (same order),
    * gtfs_dataset_names: the lookup's gtfs_dataset_name for each entry of feedIds,
    * date: the date of the first lookup row, used for every path.

    Returns the same (mutated) frame.
    '''
    lookup_path = f'{CONVEYAL_GCS_PATH}warehouse_conveyal_simple_2023-10-18'
    feed_lookup = pd.read_parquet(lookup_path)
    first_timestamp = feed_lookup.date.iloc[0]
    analysis_date = first_timestamp.date()

    # all example feeds present?
    feeds_in_paths = conveyal_df.feedIds.explode().unique()
    assert np.isin(feeds_in_paths, feed_lookup.feedId).all()

    feed_lookup = feed_lookup >> select(_.feedId, _.feed_key, _.gtfs_dataset_name, _.base64_url, _.date)

    # {column name: {feedId: value}}
    by_column = feed_lookup.set_index('feedId').to_dict()

    def _translate(column_name):
        # map every feed id in a row's list through the requested lookup column
        return lambda feed_ids: [by_column[column_name][feed_id] for feed_id in feed_ids]

    conveyal_df['feed_keys'] = conveyal_df.feedIds.apply(_translate('feed_key'))
    conveyal_df['gtfs_dataset_names'] = conveyal_df.feedIds.apply(_translate('gtfs_dataset_name'))
    conveyal_df['date'] = analysis_date

    return conveyal_df

In [None]:
# Adds the feed_keys, gtfs_dataset_names and date columns. The function writes them onto
# the frame it is given and returns that same frame.
df = add_warehouse_identifiers(df)

In [None]:
# Optional: display the frame with a wider column width so the list-valued cells are readable
# with pd.option_context('display.max_colwidth', 100):
#     display(df)

In [None]:
# works, but is it useful?
# NOTE(review): array_cols is local to read_conveyal_path_df, so it would have to be
# defined in this scope before the line below is uncommented.
# df_exploded = df.explode(array_cols).reset_index(drop=True)

## Metric Development

### Get warehouse data

In [None]:
from shapely.ops import split, substring, LineString
from calitp_data_analysis import geography_utils

In [None]:
def get_warehouse_data(path_df):
    '''
    get relevant data from warehouse for all trips in Conveyal path output

    path_df: parsed Conveyal paths with list-valued routes, boardStops and alightStops
        columns, plus the feed_keys and date columns added by add_warehouse_identifiers.
        The date of the first row is used for every query.

    Returns a dict with keys
    * 'shapes': shapes (feed_key, shape_id) for every feed in path_df,
      reprojected to geography_utils.CA_NAD83Albers
    * 'trips': trips for those feeds, kept only where route_id appears in path_df.routes
      (the route_id match is not feed-specific)
    * 'st': stop_times for those trips, kept only at boarding/alighting stops, collected
    * 'stops': the boarding/alighting stops, reprojected to geography_utils.CA_NAD83Albers
    '''
    gtfs = shared_utils.gtfs_utils_v2

    analysis_date = path_df.date.iloc[0]
    # Build every filter from the path_df argument. These used to read the notebook-global
    # `df`, so calling the function with any other frame silently queried the wrong feeds,
    # routes and stops.
    all_feed_keys = list(path_df.feed_keys.explode().unique())
    all_route_ids = list(path_df.routes.explode().unique())
    # a transfer stop can be both an alighting and a boarding stop; list each id once
    all_stops = list(pd.concat([path_df.boardStops.explode(),
                                path_df.alightStops.explode()]).unique())

    warehouse_data = {}
    warehouse_data['shapes'] = gtfs.get_shapes(selected_date=analysis_date, operator_feeds=all_feed_keys,
                                               shape_cols = ['feed_key', 'shape_id'])
    warehouse_data['shapes'] = warehouse_data['shapes'].to_crs(geography_utils.CA_NAD83Albers)
    warehouse_data['trips'] = gtfs.get_trips(selected_date=analysis_date,
                                             operator_feeds=all_feed_keys,
                                             trip_cols = ['feed_key', 'trip_id', 'route_id',
                                                          'shape_id', 'trip_first_departure_ts']
                                            )
    warehouse_data['trips'] = warehouse_data['trips'] >> filter(_.route_id.isin(all_route_ids))
    warehouse_data['st'] = gtfs.get_stop_times(selected_date=analysis_date, operator_feeds=all_feed_keys, trip_df=warehouse_data['trips'])
    # stop_times is filtered before collect() materialises it
    warehouse_data['st'] = warehouse_data['st'] >> filter(_.stop_id.isin(all_stops)) >> collect()
    warehouse_data['stops'] = gtfs.get_stops(selected_date=analysis_date, operator_feeds=all_feed_keys, custom_filtering={'stop_id': all_stops})
    warehouse_data['stops'] = warehouse_data['stops'].to_crs(geography_utils.CA_NAD83Albers)

    return warehouse_data

In [None]:
# Fetch shapes, trips, stop_times and stops for the parsed paths.
# compile_all_spatial_routes below reads this module-level name.
warehouse_data = get_warehouse_data(df)

### Map?

In [None]:
def shape_segments_from_row(row, warehouse_data):
    '''
    Build one geometry per transit leg of a single Conveyal path.

    row: one row of the parsed paths frame; reads boardStops, alightStops (parallel lists,
        one entry per leg), trip_group_id, nIterations and totalTime.
    warehouse_data: dict with 'st', 'stops', 'trips' and 'shapes' frames, as returned by
        get_warehouse_data.

    For each (board stop, alight stop) pair, one warehouse trip that serves both stops is
    picked as a representative. The leg geometry is the part of that trip's shape between
    the two stops, or a straight line between the stops when the trip has no shape.

    Returns a GeoDataFrame (geometry column segment_geom, CRS geography_utils.CA_NAD83Albers)
    with one row per leg, carrying stop_pair, trip_group_id, nIterations, totalTime and
    xfer_count (number of legs minus one).

    Raises AssertionError when no trip serves both stops of a leg, and ValueError when the
    row has no legs at all.
    '''
    stop_pairs = list(zip(row.boardStops, row.alightStops))
    if not stop_pairs:
        # pd.concat([]) would raise the same exception type with a less helpful message
        raise ValueError('path row has no board/alight stop pairs')
    xfer_count = len(stop_pairs) - 1

    row_shape_segments = []
    for stop_pair in stop_pairs:
        # stop_times at either stop of this leg
        pair_stop_times = warehouse_data['st'] >> filter(_.stop_id.isin(stop_pair))
        # Keep trips that serve BOTH stops. Counting distinct stops per (feed_key, trip_id)
        # avoids two false matches that counting raw rows per trip_id allowed: a trip that
        # visits one of the stops twice, and equal trip_ids coming from different feeds.
        good_trips = (pair_stop_times
                      >> distinct(_.feed_key, _.trip_id, _.stop_id)
                      >> count(_.feed_key, _.trip_id)
                      >> filter(_.n >= len(set(stop_pair)))
                     )
        assert good_trips.shape[0] > 0, f'no trip in stop_times serves both stops of {stop_pair}'
        # any qualifying trip will do; take the first as the representative
        chosen_trip = good_trips.iloc[0]
        trip_with_pair = (pair_stop_times
                          >> filter(_.feed_key == chosen_trip.feed_key, _.trip_id == chosen_trip.trip_id)
                          >> arrange(_.stop_sequence)
                          >> select(_.feed_key, _.trip_id, _.stop_id, _.stop_sequence)
                         )
        # attach stop point geometry and the trip's shape_id
        trip_with_pair = trip_with_pair >> inner_join(_, warehouse_data['stops'] >> select(_.feed_key, _.stop_id, _.geometry),
                                                      on = ['feed_key', 'stop_id'])
        trip_with_pair = trip_with_pair >> inner_join(_, warehouse_data['trips'] >> select(_.feed_key, _.trip_id, _.shape_id),
                                                      on = ['feed_key', 'trip_id'])
        paired_shape = warehouse_data['shapes'] >> filter(_.feed_key == trip_with_pair.feed_key.iloc[0], _.shape_id == trip_with_pair.shape_id.iloc[0])

        if not trip_with_pair.stop_id.is_unique:
            print('warning, trip has duplicate stops at a single stop')
            trip_with_pair = trip_with_pair >> distinct(_.stop_id, _keep_all=True)
        stop0 = (trip_with_pair >> filter(_.stop_sequence == _.stop_sequence.min())).geometry.iloc[0]
        stop1 = (trip_with_pair >> filter(_.stop_sequence == _.stop_sequence.max())).geometry.iloc[0]
        if paired_shape.empty:
            print('warning, trip has no shape')
            # fall back to a straight line between the two stops
            paired_segment = LineString([stop0, stop1])
        else:
            shape_geom = paired_shape.geometry.iloc[0]
            # order the projected distances so the cut works even if the later stop
            # projects earlier along the shape (be resilient to looping)
            stops_proj = [shape_geom.project(stop0), shape_geom.project(stop1)]
            paired_segment = substring(shape_geom, min(stops_proj), max(stops_proj))

        trip_with_pair['segment_geom'] = paired_segment
        # (the old set_geometry('segment_geom') call discarded its result; the active
        # geometry is set once, when the GeoDataFrame is built below)
        trip_with_pair = trip_with_pair >> rename(stop_geom = _.geometry)
        # both stop rows share one shape_id, so this collapses the leg to a single row
        trip_with_pair = trip_with_pair >> distinct(_.shape_id, _keep_all=True)
        trip_with_pair['stop_pair'] = [stop_pair]
        trip_with_pair['trip_group_id'] = row.trip_group_id
        trip_with_pair['nIterations'] = row.nIterations
        trip_with_pair['totalTime'] = row.totalTime
        trip_with_pair['xfer_count'] = xfer_count
        trip_with_seg = gpd.GeoDataFrame(trip_with_pair, geometry='segment_geom', crs=geography_utils.CA_NAD83Albers)
        row_shape_segments.append(trip_with_seg)
    return pd.concat(row_shape_segments)

In [None]:
def compile_all_spatial_routes(df, warehouse=None):
    '''
    Build leg geometries for every path row and stack them into one frame.

    df: parsed Conveyal paths, one row per path option.
    warehouse: dict of warehouse frames passed on to shape_segments_from_row. When omitted,
        the module-level warehouse_data is used, which keeps existing one-argument calls
        working.

    Rows for which shape_segments_from_row raises are reported with print (including the
    error) and skipped, so one bad path does not abort the whole run.

    Returns the concatenation of the per-row results.
    Raises ValueError when no row could be converted (including an empty df).
    '''
    if warehouse is None:
        warehouse = warehouse_data

    spatial_routes = []
    for _ix, row in df.iterrows():
        try:
            spatial_routes.append(shape_segments_from_row(row, warehouse))
        # Exception rather than a bare except: KeyboardInterrupt/SystemExit still stop the loop
        except Exception as err:
            print(f'failed for row {row}: {type(err).__name__}: {err}')
    if not spatial_routes:
        # same exception type pd.concat([]) raised, with the actual cause spelled out
        raise ValueError('no path row could be converted to shape segments')
    return pd.concat(spatial_routes)

In [None]:
# Build the per-leg geometries for every path row.
with warnings.catch_warnings():
    # silence every warning raised while compiling; rows that fail are still reported,
    # because compile_all_spatial_routes uses print for those
    warnings.simplefilter("ignore")
    spatial_routes = compile_all_spatial_routes(df)

In [None]:
# preview the first three leg segments
spatial_routes >> head(3)

In [None]:
# work on a copy so the buffered geometries below do not overwrite spatial_routes
map_routes = spatial_routes.copy()

In [None]:
# Replace each leg line with a buffer of radius 2 x nIterations (in CRS units), so legs
# of paths found in more iterations are drawn wider.
map_routes.segment_geom = map_routes.apply(lambda x: x.segment_geom.buffer(x.nIterations * 2), axis=1)
# sort by nIterations, largest first
# NOTE(review): presumably so narrower buffers are drawn on top of wider ones -- confirm
map_routes = map_routes >> arrange(-_.nIterations)

In [None]:
# interactive map, coloured by path option (trip_group_id)
map_routes.explore(column = 'trip_group_id', cmap='tab20', tiles="CartoDB positron")

In [None]:
# map_routes.explore(column = 'shape_id', cmap='Accent', tiles="CartoDB positron")

In [None]:
# same map, coloured by xfer_count (legs in the path minus one)
map_routes.explore(column = 'xfer_count', cmap='coolwarm', tiles="CartoDB positron")

In [None]:
# Reference time that totalTime is divided by.
# NOTE(review): presumably the median (p50) driving time, in the same units as totalTime,
# for this origin-destination pair -- confirm the source of the value.
CAR_P50_TIME = 46
map_routes['car_p50_ratio'] = map_routes['totalTime'] / CAR_P50_TIME

In [None]:
# map_routes.explore(column = 'totalTime', cmap='coolwarm', tiles="CartoDB positron")
#  TODO fixed scale @1.5, 2, 2.5, 3
# interactive map, coloured by the car_p50_ratio column added in the previous cell
map_routes.explore(column = 'car_p50_ratio', cmap='coolwarm', tiles="CartoDB positron")

## Storytelling - infrequent regional rail

* only two usable trips for 8-10am departures from origin, at 10:41 (Metrolink) and 11:01 (Amtrak)

In [None]:
# three earliest 'Antelope Valley Line' trips, ordered by trip_first_departure_ts
(warehouse_data['trips'] >> filter(_.route_id == 'Antelope Valley Line') >> arrange(_.trip_first_departure_ts))[:3]

In [None]:
# three earliest route '78' trips, ordered by trip_first_departure_ts
(warehouse_data['trips'] >> filter(_.route_id == '78') >> arrange(_.trip_first_departure_ts))[:3]

## full shape map

In [None]:
# One row per distinct (shape_id, segment geometry), summing nIterations over every
# path leg that uses that segment.
shape_grouped = spatial_routes >> group_by(_.shape_id, _.segment_geom) >> summarize(total_iterations = _.nIterations.sum())

In [None]:
# Replace each segment with a buffer of radius 2 x total_iterations (in CRS units)
shape_grouped.segment_geom = shape_grouped.apply(lambda x: x.segment_geom.buffer(x.total_iterations * 2), axis=1)
# sort by total_iterations, largest first
shape_grouped = shape_grouped >> arrange(-_.total_iterations)

In [None]:
# Rebuild a GeoDataFrame with segment_geom as the active geometry and the projected CRS.
# NOTE(review): presumably needed because summarize returns a plain DataFrame -- confirm
shape_grouped = gpd.GeoDataFrame(shape_grouped, geometry='segment_geom', crs=geography_utils.CA_NAD83Albers)

In [None]:
# interactive map of the aggregated segments, coloured by shape_id
shape_grouped.explore(column = 'shape_id', cmap='Accent', tiles="CartoDB positron")