# Path Finder

## Setup Spark environment

In [95]:
%%local
import os
username = os.environ['JUPYTERHUB_USER']
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final-path-finder", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5499,application_1618324153128_4780,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5397,application_1618324153128_4671,pyspark,idle,Link,Link,,
5445,application_1618324153128_4723,pyspark,idle,Link,Link,,
5461,application_1618324153128_4741,pyspark,idle,Link,Link,,
5465,application_1618324153128_4745,pyspark,idle,Link,Link,,
5471,application_1618324153128_4751,pyspark,idle,Link,Link,,
5480,application_1618324153128_4761,pyspark,idle,Link,Link,,
5482,application_1618324153128_4763,pyspark,idle,Link,Link,,
5483,application_1618324153128_4764,pyspark,idle,Link,Link,,
5484,application_1618324153128_4765,pyspark,busy,Link,Link,,
5485,application_1618324153128_4766,pyspark,busy,Link,Link,,


In [96]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

In [97]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Import data from HDFS

In [98]:
routes = spark.read.csv('/data/sbb/csv/timetable/routes/2019/05/07/routes.csv', header=True )
stops = spark.read.orc('/data/sbb/orc/geostops')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [99]:
walking_edges = spark.read.parquet('/user/%s/final/parquet/walking_edges' % username)
transport_edges = spark.read.parquet('/user/%s/final/parquet/transport_edges' % username)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [100]:
transport_edges.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['start_vertex', 'end_vertex', 'start_time', 'duration', 'route_id', 'weekday']

## Shortest Path

In [104]:
inward_walking_edges = walking_edges.groupby('end_vertex').agg({'start_vertex': 'collect_set'})\
                                                              .withColumnRenamed('collect_set(start_vertex)', 'start_vertices')\
                                                              .toPandas().set_index('end_vertex').to_dict()['start_vertices']
walking_edge_duration = walking_edges.toPandas()
walking_edge_duration['key'] = walking_edge_duration.apply(lambda x: (x['start_vertex'], x['end_vertex']), axis=1)
walking_edge_duration = walking_edge_duration.set_index('key').to_dict()['duration']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [105]:
edges = transport_edges.filter('weekday == "monday"')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [125]:
# This function assumes a (spark) dataframe with columns 'start_vertex', 'end_vertex', 'start_time', 'duration', 'route_id' 
# edges is a dataframe that represents the edges of the graph, 
# edges should be sorted by order of descending starting time
# inward_walking_edges is a map route_id -> list(route_id), representing the nodes from which we can walk to a given node
# walking_edge_duration is a map (route_id, route_id) -> float, representing the duration of the walk
# end_time should be in the format 'HH:MM'
def latest_departure_paths(edges, end_stop, end_time, inward_walking_edges, walking_edge_duration):
    
    # This function computes whether edge (u, v, t, dur, route_id) can be taken
    # returns update to node u if possible, and None otherwise
    def edge_valid(u, v, time, dur, route_id):
        
        next_edges = all_roads.get(v, [])
        
        for edge in next_edges[::-1]: # find first (earliest) edge that makes route possible
            
            time_v, next_v, route_id_v, dur_v, seq_v = edge

            if route_id == 'walking':
                if  route_id_v == 'walking' and dur_v + dur <= max_walking_time:
                    time = time_v - dur - transfer_time
                    return (time, next_v, route_id, dur + dur_v, [u] + seq_v)
                elif route_id_v != 'walking':
                    time = time_v - dur - transfer_time
                    return (time, v, route_id, dur, [u, v])

            if (route_id == route_id_v or v == end_stop) and time + duration <= time_v: # same route or last_stop
                return (time, next_v, route_id, dur + dur_v, [u] + seq_v)

            elif time + duration + transfer_time <= time_v: # transfer at node_v
                return (time, v, route_id, dur, [u, v]) 

        # In any other case no update can be made
        return None
                
    def time_to_minutes(timestamp):
        return int(timestamp[:2]) * 60 + int(timestamp[3:5])
    
    transfer_time = 2 # at least 2 minutes to transfer
    max_walking_time = 10

    all_roads = {}
    end_time = time_to_minutes(end_time)
    start_time = end_time - 120 # look only at edges departing at most 2 hours before end_time
    
    all_roads[end_stop] = [(end_time, end_stop, None, 0, [end_stop])]

    edges = edges.filter(F.col('start_time').between(start_time, end_time)).toPandas().sort_values(by=['start_time'], ascending=False).to_numpy()

    for row in edges:
        
        start_vertex, end_vertex, start, duration, route_id = row[:5]
        
        update = edge_valid(start_vertex, end_vertex, start, duration, route_id)
        if update:
            if start_vertex in all_roads:    
                all_roads[start_vertex].append(update)
            else:
                all_roads[start_vertex] = [update]
                
            
        w_start_vertex = start_vertex    
        for w_end_vertex in inward_walking_edges.get(w_start_vertex, []):
            duration = walking_edge_duration[(w_start_vertex, w_end_vertex)]
            w_start = start 
            update = edge_valid(w_start_vertex, w_end_vertex, w_start, duration, 'walking')
            if update:
                if w_start_vertex in all_roads and (update not in all_roads[w_start_vertex]):
                    all_roads[w_start_vertex].append(update)
#                     print(all_roads[w_start_vertex][0])
                    all_roads[w_start_vertex].sort(lambda x,y : x[0], reverse=True)
                elif w_start_vertex not in all_roads:
                    all_roads[w_start_vertex] = [update]
                
                    
            
    
    schema = StructType([
        StructField("start_vertex", StringType(), True),
        StructField("departure_time", StringType(), True),
        StructField("end_vertex", StringType(), True),
        StructField("route_id", StringType(), True),
        StructField("duration", IntegerType(), True),
        StructField("stop_seq", ArrayType(StringType()), True)
    ])
    

    return all_roads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [126]:
import time
start = time.time()
all_roads = latest_departure_paths(edges, '8591221', '20:00', inward_walking_edges, walking_edge_duration)
end = time.time() 
print(end - start)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

21.6278610229

In [124]:
all_roads.values()[0][0]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(1149, u'8503016:0:4', 'walking', 7, [u'8580301', u'8503016:0:4'])

In [56]:
all([len(all_roads.values()[i][0]) == 5 for i in range(len(all_roads))])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

True

In [127]:
stop_id_to_stop = stops.select('stop_id', 'stop_name')
stop_id_to_stop_dict = stop_id_to_stop.toPandas().set_index('stop_id').to_dict()['stop_name']

route_id_to_route_name = routes.withColumn('route_name', F.concat(F.col('route_desc'), F.lit(' '), F.col('route_short_name')))\
                               .select('route_id', 'route_desc', 'route_short_name', 'route_name')
route_id_to_route_name_dict = route_id_to_route_name.toPandas().set_index('route_id').to_dict()['route_name']
route_id_to_route_name_dict['walking'] = 'Walk'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [128]:
# This cell is for generating best k paths
def minutes_to_timestamp(minutes):
    return str(minutes // 60) + ':' + str(minutes % 60).zfill(2)

def select_next(time, paths_from_next_stop):
    rev = paths_from_next_stop[::-1]
    
    for i in rev:
        if (i[0] >= time):
            return i
    
    return (None, None, None, None , [])

def generate_best_k_paths(k_paths, start_stop, k):
    l = k_paths[start_stop]
    nbre_paths = k
    if (l == None):
        print("Sorry bro, no paths available")
        return
    if(len(l) < k):
        nbre_paths = len(l)
    
    possible = l[::-1][0:nbre_paths]
    roads = []
    for i in possible:
        start_stop_ = start_stop
        time, next_stop, route_id, duration, stops = i
        journey = []
        while next_stop != start_stop_:
            journey.append(
                (minutes_to_timestamp(time),
                 stop_id_to_stop_dict[start_stop_], 
                 route_id_to_route_name_dict[route_id],
                 duration,
                 '' if route_id == 'walking' else 'Ride %d stops' % len(stops),
                 minutes_to_timestamp(time + duration),
                 stop_id_to_stop_dict[next_stop])
            )
        
            start_stop_ = next_stop
            paths_from_next_stop = k_paths[next_stop]
            
            m = select_next(time + duration, paths_from_next_stop)
            
            if (m[0] != None):
                time, next_stop, route_id, duration, stops = m
        
        roads.append(journey)
        
    return roads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [129]:
g = generate_best_k_paths(all_roads, '8591250', 4 )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [62]:
len(g)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

4

In [130]:
def pretty_print_journey(journey):
    if journey == None:
        return "No possible second best path"
    journey_str = '\n'.join(["%s : %s\n|\n| %s (%s min) %s\n|\n%s : %s\n" % segment for segment in journey])
    return journey_str

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [131]:
for i in g:
    print(pretty_print_journey(i))
    print("**************************")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

17:55 : Zürich, Lerchenrain
|
| Walk (6 min) 
|
18:01 : Zürich, Schauenberg

18:03 : Zürich, Schauenberg
|
| Walk (37 min) 
|
18:40 : Zürich, Einfangstrasse

**************************
17:54 : Zürich, Lerchenrain
|
| Walk (7 min) 
|
18:01 : Zürich, Einfangstrasse

18:01 : Zürich, Einfangstrasse
|
| Bus 62 (62 min) Ride 53 stops
|
19:03 : Zürich, Schwamendingerplatz

19:04 : Zürich, Schwamendingerplatz
|
| Tram 7 (19 min) Ride 18 stops
|
19:23 : Stettbach, Bahnhof

19:28 : Stettbach, Bahnhof
|
| Walk (2 min) 
|
19:30 : Stettbach

19:34 : Stettbach
|
| S-Bahn 12 (4 min) Ride 2 stops
|
19:38 : Zürich Stadelhofen

19:41 : Zürich Stadelhofen
|
| Walk (1 min) 
|
19:42 : Zürich Stadelhofen FB

19:43 : Zürich Stadelhofen FB
|
| Tram 8 (8 min) Ride 8 stops
|
19:51 : Zürich, Klusplatz

19:52 : Zürich, Klusplatz
|
| Bus 31 (5 min) Ride 6 stops
|
19:57 : Zürich, Kapfstrasse

**************************
17:54 : Zürich, Lerchenrain
|
| Walk (6 min) 
|
18:00 : Zürich, Lerchenhalde

18:01 : Zürich, Ler

In [30]:
m = all_roads['8591250']
time, next_stop, route_id, duration, stops = select_next(4, m)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
stops

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[u'8591250', u'8591338', u'8576262', u'8591054']

## Send to local

Send transfer dataframe to local

In [None]:
%%spark -o next_transfer_df -n -1

In [None]:
%%local
next_transfer_df.head()

Reconstruct a dict for transfers to keep old code

In [None]:
%%local
next_transfer = {e[0]: tuple(e[1:]) for e in next_transfer_df.to_numpy()}

Send route id to route name mapping to local

In [None]:
%%spark -o route_id_to_route_name -n -1
route_id_to_route_name = routes.withColumn('route_name', F.concat(F.col('route_desc'), F.lit(' '), F.col('route_short_name')))\
                               .select('route_id', 'route_desc', 'route_short_name', 'route_name')

In [None]:
%%local
route_id_to_route_name_dict = route_id_to_route_name.set_index('route_id').to_dict()['route_name']
route_id_to_route_name_dict['walking'] = 'walk'

Send stop id to stop name mapping to local

In [None]:
%%spark -o stop_id_to_stop -n -1
stop_id_to_stop = stops.select('stop_id', 'stop_name')

In [None]:
%%local
stop_id_to_stop_dict = stop_id_to_stop.set_index('stop_id').to_dict()['stop_name']

## Reconstruct path and pretty print

In [None]:
%%local
def minutes_to_timestamp(minutes):
    return str(minutes // 60) + ':' + str(minutes % 60).zfill(2)
    
    
def reconstruct_journey(next_transfer, start_stop):
    time, next_stop, route_id, duration, stop_seq = next_transfer[start_stop]
    journey = []
    while next_stop != start_stop:
        journey.append(
            (minutes_to_timestamp(time),
             stop_id_to_stop_dict[start_stop], 
             route_id_to_route_name_dict[route_id],
             duration,
             'Ride %d stops' % (len(stop_seq)-1),
             minutes_to_timestamp(time + duration),
             stop_id_to_stop_dict[next_stop])
        )
        
        start_stop = next_stop
        time, next_stop, route_id, duration, stop_seq = next_transfer[next_stop]
    return journey


def pretty_print_journey(journey):
    journey_str = '\n'.join(["%s : %s\n|\n| %s (%s min) %s\n|\n%s : %s\n" % segment for segment in journey])
    return journey_str

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8590883'))) # almost same as sbb

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8591353'))) # almost same as sbb

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8590788'))) # not in sbb

In [None]:
%%local
print(pretty_print_journey(reconstruct_journey(next_transfer, '8591368'))) # Correct