# Path Finder

## Setup Spark environment

In [64]:
%%local
import os
username = os.environ['JUPYTERHUB_USER']
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final-path-finder", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5869,application_1618324153128_5225,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5837,application_1618324153128_5181,pyspark,busy,Link,Link,,
5843,application_1618324153128_5187,pyspark,idle,Link,Link,,
5846,application_1618324153128_5190,pyspark,idle,Link,Link,,
5850,application_1618324153128_5195,pyspark,idle,Link,Link,,
5852,application_1618324153128_5197,pyspark,idle,Link,Link,,
5853,application_1618324153128_5199,pyspark,idle,Link,Link,,
5855,application_1618324153128_5202,pyspark,idle,Link,Link,,
5858,application_1618324153128_5205,pyspark,busy,Link,Link,,
5860,application_1618324153128_5207,pyspark,idle,Link,Link,,
5861,application_1618324153128_5215,pyspark,busy,Link,Link,,


In [65]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

In [66]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Import data from HDFS

In [98]:
routes = spark.read.csv('/data/sbb/csv/timetable/routes/2019/05/07/routes.csv', header=True )
stops = spark.read.orc('/data/sbb/orc/geostops')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [104]:
walking_edges = spark.read.parquet('/user/%s/final/parquet/walking_edges' % username)
transport_edges = spark.read.parquet('/user/%s/final/parquet/transport_edges' % username)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [100]:
transport_edges.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['start_vertex', 'end_vertex', 'start_time', 'duration', 'route_id', 'weekday']

## Shortest Path

In [105]:
walking_edges = walking_edges.withColumn('value', F.struct(F.col('end_vertex'), F.col('duration')))\
                             .groupBy('start_vertex').agg({'value': 'collect_set'})\
                             .toPandas()\
                             .set_index('start_vertex')\
                             .to_dict()['collect_set(value)']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [71]:
edges = transport_edges.filter('weekday == "monday"')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [107]:
# This function assumes a (spark) dataframe with columns 'start_vertex', 'end_vertex', 'start_time', 'duration', 'route_id' 
# edges is a dataframe that represents the edges of the graph, 
# edges should be sorted by order of descending starting time
# inward_walking_edges is a map route_id -> list(route_id), representing the nodes from which we can walk to a given node
# walking_edge_duration is a map (route_id, route_id) -> float, representing the duration of the walk
# end_time should be in the format 'HH:MM'
def latest_departure_paths(edges, end_stop, end_time, walking_edges):

    def edge_valid(u, v, time, dur, route_id): 
        next_edges = all_roads.get(v, [])
        
        # Can we reach the destination ?
        if v == end_stop and time + dur <= end_time: 
             return (time, v, route_id, dur)
        
        # We don't add edges starting at the destination
        if u != end_stop:
            for edge in next_edges[::-1]: # Traverse the list in reverse order because it's sorted by descending starting time
                time_v, next_v, route_id_v, dur_v = edge
                if (route_id == route_id_v and time + dur <= time_v) or (time + dur + transfer_time <= time_v): 
                    return (time, v, route_id, dur)            
            
            # Can we walk from v instead?
            for end_vertex, walking_duration in walking_edges.get(v, []):
                if end_vertex != u: # do not loop back to u
                    for next_next_edge in all_roads.get(end_vertex, []):
                        time_next = next_next_edge[0]
                        if time + dur + transfer_time <= time_next - walking_duration : # u -> v counts as a transfer
                            return (time, v, route_id, dur)
                
        return None
                
    def time_to_minutes(timestamp):
        return int(timestamp[:2]) * 60 + int(timestamp[3:5])
    
    
    transfer_time = 2 # at least 2 minutes to transfer
    max_walking_time = 10
    
    all_roads = {}
    end_time = time_to_minutes(end_time)
    start_time = end_time - 120 # look only at edges departing at most 2 hours before end_time
        
    edges = edges.filter(F.col('start_time').between(start_time, end_time)).toPandas().sort_values(by=['start_time'], ascending=False).to_numpy()

    for row in edges:
        start_vertex, end_vertex, start, duration, route_id = row[:5]
        
        update = edge_valid(start_vertex, end_vertex, start, duration, route_id)
        if update:
            if start_vertex in all_roads:    
                all_roads[start_vertex].append(update)
            else:
                all_roads[start_vertex] = [update]
                        
    return all_roads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [108]:
import time
start = time.time()
all_roads = latest_departure_paths(edges, '8591221', '20:00', walking_edges)
end = time.time() 
print(end - start)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1.30478000641

In [86]:
stop_id_to_stop = stops.select('stop_id', 'stop_name')
stop_id_to_stop_dict = stop_id_to_stop.toPandas().set_index('stop_id').to_dict()['stop_name']

route_id_to_route_name = routes.withColumn('route_name', F.concat(F.col('route_desc'), F.lit(' '), F.col('route_short_name')))\
                               .select('route_id', 'route_desc', 'route_short_name', 'route_name')
route_id_to_route_name_dict = route_id_to_route_name.toPandas().set_index('route_id').to_dict()['route_name']
route_id_to_route_name_dict['walking'] = 'Walk'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [59]:
# Journey reconstruction

from itertools import groupby
import numpy as np

def minutes_to_timestamp(minutes):
    return str(minutes // 60) + ':' + str(minutes % 60).zfill(2)

def select_next(prev_time, prev_duration, prev_route_id, node, all_roads, transfer_time=2, max_walking_time=10):
    next_edges = all_roads.get(node, [])
    possible_edges = []
    for edge in next_edges:
        t, v, route_id, dur = edge
        if prev_route_id != route_id:
            if prev_time + prev_duration + transfer_time <= t:
                possible_edges.append(edge)
        elif prev_route_id == 'Walking' and prev_time + prev_duration <= t and prev_duration + dur <= max_walking_time:
            possible_edges.append(edge)
        elif prev_time + prev_duration <= t:
            possible_edges.append(edge)
    
    return possible_edges
    
def iterate_all_edges(start_edges, end_stop, all_roads, cur_list):
    for edge in start_edges:
        t, v, route_id, dur = edge
        if v == end_stop:
            yield cur_list + [edge]
        else:
            for journey in iterate_all_edges(select_next(t, dur, route_id, v,  all_roads), end_stop, all_roads, cur_list + [edge]):
                yield journey

def get_journey_attributes(journey):
    walking_distance = sum([t[-1] for t in journey if t[-1] == 'Walking'])
    transfers = len(set([t[2] for t in journey if t[2] != 'Walking']))
    return [transfers, walking_distance]
    
def get_walking_edges(all_roads, node, walking_edges, walking_edge_duration):
    # Compute viable walks starting at node u:
    walking_edges = []
    
def generate_best_paths(all_roads, start_stop, end_stop, walking_edges, walking_edge_duration):
    
    
    start_edges 
    start_edges = sorted(all_roads.get(start_stop, []), key=lambda x:x[0], reverse=True)
    start_edges = groupby(start_edges, key=lambda x:x[0])
    for start_time , edges in start_edges:
        all_journeys = list(iterate_all_edges(list(edges), end_stop, all_roads, []))
        journey_attribs = np.array([get_journey_attributes(journey) for journey in all_journeys])
        sorted_indices = np.lexsort(journey_attribs.T)
        for i in sorted_indices:
            yield all_journeys[i]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [60]:
all_roads.keys()[334]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

u'8573723'

In [61]:
iterator = generate_best_paths(all_roads, '8573723',  '8591221')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [62]:
journey = next(iterator)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
maximum recursion depth exceeded in cmp
Traceback (most recent call last):
  File "<stdin>", line 41, in generate_best_paths
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in iterate_all_edges
  File "<stdin>", line 29, in ite

In [25]:
def journey_to_segments (journey, start_stop):
    segments = [(route_id, list(edges)) for (route_id, edges) in  groupby(journey, lambda x:x[2])]

    segments = [(i + 1, edges[0][1], edges[-1][1], route_id, edges[0][0], edges[-1][0] + edges[-1][3] - edges[0][0], [t[1] for t in edges]) \
                for i, (route_id, edges) in enumerate(segments)]
    
    segments[0] = (segments[0][0], start_stop, segments[0][2], segments[0][3], segments[0][4], segments[0][5], [start_stop] + segments[0][6])
    
    return segments

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
def journey_to_df(journey_list):
    arr = []
    journey_number = 0
    for journey in journey_list:
        journey_number += 1
        arr = arr + [(journey_number,) + seg for seg in journey]
    
    schema = StructType([
        StructField("journey_number", IntegerType(), True),
        StructField("segment_number", IntegerType(), True),
        StructField("start_vertex", StringType(), True),
        StructField("end_vertex", StringType(), True),
        StructField("route_id", StringType(), True),
        StructField("departure_time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("stop_seq", ArrayType(StringType()), True)
    ])
    
    journey_df = spark.createDataFrame(arr, schema)
    
    return journey_df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
journey_df = journey_to_df([journey_to_segments(journey, '8580437')])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
def pretty_print_journey(journey):
    journey_str = '\n'.join(["%s : %s\n|\n| %s (%s min)\n| %s\n|\n%s : %s\n" % (minutes_to_timestamp(time), 
                                                                             stop_id_to_stop_dict[start],
                                                                             route_id_to_route_name_dict[route_id],
                                                                             minutes_to_timestamp(duration),
                                                                             ' -> '.join([stop_id_to_stop_dict[s] for s in stops]) if route_id != 'walking' else '',
                                                                             minutes_to_timestamp(time + duration),
                                                                             stop_id_to_stop_dict[end]) for _, start, end, route_id, time, duration, stops in journey])
    return journey_str

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [32]:
journey = next(iterator)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [37]:
journey = next(iterator)
print(pretty_print_journey(journey_to_segments(journey, '8595075')))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18:47 : Dietlikon, In Lampitzäckern
|
| Bus 749 (0:02 min)
| Dietlikon, In Lampitzäckern -> Dietlikon, Dornenstrasse -> Dietlikon, Bahnhof
|
18:49 : Dietlikon, Bahnhof

18:55 : Dietlikon, Zentrum
|
| Bus 748 (0:10 min)
| Dietlikon, Zentrum -> Dietlikon, Hofwiesen -> Dietlikon, Zentrum -> Dietlikon, Bahnhof -> Dietlikon, Dornenstrasse -> Dietlikon, Dübendorferstrasse
|
19:05 : Dietlikon, Dübendorferstrasse

19:10 : Dietlikon, Hörnligraben
|
| Bus 787 (0:07 min)
| Dietlikon, Hörnligraben -> Wallisellen, Ifang -> Wallisellen, Glatt (Bus) -> Zürich, Altried -> Zürich, Luchswiesen
|
19:17 : Zürich, Luchswiesen

19:19 : Zürich, Altried
|
| Tram 9 (0:33 min)
| Zürich, Altried -> Zürich, Luchswiesen -> Zürich, Luegisland -> Zürich, Heerenwiesen -> Zürich, Schwamendingerplatz -> Zürich, Schörlistrasse -> Zürich, Waldgarten -> Zürich, Tierspital -> Zürich, Milchbuck -> Zürich, Universität Irchel -> Zürich, Langmauerstrasse -> Zürich, Letzistrasse -> Zürich, Kinkelstrasse -> Zürich, Seilbahn Rigi

In [42]:
print(pretty_print_journey(journey_to_segments(journey, '8580301')))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

19:06 : Zürich Flughafen, OPC
|
| Bus 520 (0:01 min)
| Zürich Flughafen, OPC -> Zürich Flughafen, Fracht
|
19:07 : Zürich Flughafen, Fracht

19:10 : Zürich Flughafen, Bahnhof
|
| Tram 10 (0:12 min)
| Zürich Flughafen, Bahnhof -> Kloten Balsberg, Bahnhof -> Glattbrugg, Unterriet -> Rümlang, Bäuler -> Glattbrugg, Bahnhof -> Glattbrugg, Lindberghplatz -> Glattpark, Glattpark -> Zürich, Oerlikerhus -> Zürich, Leutschenbach
|
19:22 : Zürich, Leutschenbach

19:24 : Zürich, Messe/Hallenstadion
|
| Tram 11 (0:19 min)
| Zürich, Messe/Hallenstadion -> Zürich, Sternen Oerlikon -> Zürich Oerlikon, Bahnhof -> Zürich, Regensbergbrücke -> Zürich, Bad Allenmoos -> Zürich, Radiostudio -> Zürich, Bucheggplatz -> Zürich, Laubiweg -> Zürich, Schaffhauserplatz -> Zürich, Röslistrasse -> Zürich, Ottikerstrasse -> Zürich, Sonneggstrasse -> Zürich, Haldenegg -> Zürich, Central
|
19:43 : Zürich, Central

19:45 : Zürich, Neumarkt
|
| Bus 31 (0:11 min)
| Zürich, Neumarkt -> Zürich, Kunsthaus -> Zürich, Sprechers

## Send to local

Send transfer dataframe to local

In [None]:
%%spark -o next_transfer_df -n -1

In [None]:
%%local
next_transfer_df.head()

Reconstruct a dict for transfers to keep old code

In [None]:
%%local
next_transfer = {e[0]: tuple(e[1:]) for e in next_transfer_df.to_numpy()}

Send route id to route name mapping to local

In [None]:
%%spark -o route_id_to_route_name -n -1
route_id_to_route_name = routes.withColumn('route_name', F.concat(F.col('route_desc'), F.lit(' '), F.col('route_short_name')))\
                               .select('route_id', 'route_desc', 'route_short_name', 'route_name')

In [None]:
%%local
route_id_to_route_name_dict = route_id_to_route_name.set_index('route_id').to_dict()['route_name']
route_id_to_route_name_dict['walking'] = 'walk'

Send stop id to stop name mapping to local

In [None]:
%%spark -o stop_id_to_stop -n -1
stop_id_to_stop = stops.select('stop_id', 'stop_name')

In [None]:
%%local
stop_id_to_stop_dict = stop_id_to_stop.set_index('stop_id').to_dict()['stop_name']

## Reconstruct path and pretty print

In [None]:
%%local
def minutes_to_timestamp(minutes):
    return str(minutes // 60) + ':' + str(minutes % 60).zfill(2)
    
    
def reconstruct_journey(next_transfer, start_stop):
    time, next_stop, route_id, duration, stop_seq = next_transfer[start_stop]
    journey = []
    while next_stop != start_stop:
        journey.append(
            (minutes_to_timestamp(time),
             stop_id_to_stop_dict[start_stop], 
             route_id_to_route_name_dict[route_id],
             duration,
             'Ride %d stops' % (len(stop_seq)-1),
             minutes_to_timestamp(time + duration),
             stop_id_to_stop_dict[next_stop])
        )
        
        start_stop = next_stop
        time, next_stop, route_id, duration, stop_seq = next_transfer[next_stop]
    return journey


def pretty_print_journey(journey):
    journey_str = '\n'.join(["%s : %s\n|\n| %s (%s min) %s\n|\n%s : %s\n" % segment for segment in journey])
    return journey_str

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8590883'))) # almost same as sbb

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8591353'))) # almost same as sbb

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8590788'))) # not in sbb

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8591368'))) # Correct