# Path Finder

## Setup Spark environment

In [236]:
%%local
import os
username = os.environ['JUPYTERHUB_USER']
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final-path-finder", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5633,application_1618324153128_4926,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5611,application_1618324153128_4902,pyspark,idle,Link,Link,,
5618,application_1618324153128_4911,pyspark,idle,Link,Link,,
5619,application_1618324153128_4912,pyspark,idle,Link,Link,,
5621,application_1618324153128_4914,pyspark,idle,Link,Link,,
5627,application_1618324153128_4920,pyspark,idle,Link,Link,,
5630,application_1618324153128_4923,pyspark,idle,Link,Link,,
5631,application_1618324153128_4924,pyspark,idle,Link,Link,,
5633,application_1618324153128_4926,pyspark,idle,Link,Link,,✔


In [237]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

In [238]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Import data from HDFS

In [239]:
routes = spark.read.csv('/data/sbb/csv/timetable/routes/2019/05/07/routes.csv', header=True )
stops = spark.read.orc('/data/sbb/orc/geostops')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [240]:
walking_edges = spark.read.parquet('/user/%s/final/parquet/walking_edges' % username)
transport_edges = spark.read.parquet('/user/%s/final/parquet/transport_edges' % username)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [241]:
transport_edges.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['start_vertex', 'end_vertex', 'start_time', 'duration', 'route_id', 'weekday']

## Shortest paths

In [242]:
inward_walking_edges = walking_edges.groupby('end_vertex').agg({'start_vertex': 'collect_set'})\
                                                              .withColumnRenamed('collect_set(start_vertex)', 'start_vertices')\
                                                              .toPandas().set_index('end_vertex').to_dict()['start_vertices']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
walking_edge_duration = walking_edges.withColumn('key', F.struct(F.col('start_vertex'), F.col('end_vertex')))\
                                     .toPandas()\
                                     .set_index('key')\
                                     .to_dict()['duration']

In [243]:
edges = transport_edges.filter('weekday == "monday"')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [244]:
# This function assumes a (spark) dataframe with columns 'start_vertex', 'end_vertex', 'start_time', 'duration', 'route_id' 
# edges is a dataframe that represents the edges of the graph, 
# edges should be sorted by order of descending starting time
# inward_walking_edges is a map route_id -> list(route_id), representing the nodes from which we can walk to a given node
# walking_edge_duration is a map (route_id, route_id) -> float, representing the duration of the walk
# end_time should be in the format 'HH:MM'
def latest_departure_paths(edges, end_stop, end_time, walking_edges, walking_edge_duration):
    
    def add_walking_edges_for_node(start_vertex, min_time, exclude_nodes):
        
        if start_vertex in nodes_w_processed:
            return
        
        next_edges = all_roads.get(start_vertex, [])
        
        visit_nodes = [node for node in walking_edges.get(start_vertex, []) if node not in exclude_nodes]
        
        exclude_nodes = exclude_nodes | {start_vertex}
        
        
        for end_vertex in visit_nodes:
            
            dur = walking_edge_duration[(start_vertex, end_vertex)]
            
            add_walking_edges_for_node(end_vertex, min_time, exclude_nodes)
            
            for edge in all_roads.get(end_vertex, []):
                time_v, next_v, route_id_v, dur_v = edge
                if route_id_v == 'Walking' and dur_v + dur <= max_walking_time and time_v - dur >= min_time:
                    next_edges.append((time_v - dur, end_vertex, 'Walking', dur))
                elif route_id != 'Walking' and time_v - dur - transfer_time >= min_time:
                    next_edges.append((time_v - dur - transfer_time, end_vertex, 'Walking', dur))
                    
        
        nodes_w_processed.add(start_vertex)
        all_roads[start_vertex] = list(set(next_edges))
        return
    
    # This function computes whether edge (u, v, t, dur, route_id) can be taken
    # returns update to node u if possible, and None otherwise
    def edge_valid(u, v, time, dur, route_id):
        
        add_walking_edges_for_node(v, time, {u})
        next_edges = all_roads.get(v, [])
        
        for edge in next_edges: 
            time_v, next_v, route_id_v, dur_v = edge
            if ((route_id == route_id_v or v == end_stop) and time + duration <= time_v) or (time + duration + transfer_time <= time_v): 
                return (time, v, route_id, dur)

        return None
                
    def time_to_minutes(timestamp):
        return int(timestamp[:2]) * 60 + int(timestamp[3:5])
    
    
    transfer_time = 2 # at least 2 minutes to transfer
    max_walking_time = 10

    nodes_w_processed = set(end_stop)
    
    all_roads = {}
    end_time = time_to_minutes(end_time)
    start_time = end_time - 120 # look only at edges departing at most 2 hours before end_time
    
    all_roads[end_stop] = [(end_time, end_stop, None, 0)]

    edges = edges.filter(F.col('start_time').between(start_time, end_time)).toPandas().sort_values(by=['start_time'], ascending=False).to_numpy()

    for row in edges:
        start_vertex, end_vertex, start, duration, route_id = row[:5]
        
        update = edge_valid(start_vertex, end_vertex, start, duration, route_id)
        if update:
#             nodes_w_processed.difference_update(walking_edges.get(start_vertex, []))

            if start_vertex in all_roads:    
                all_roads[start_vertex].append(update)
            else:
                all_roads[start_vertex] = [update]
                
            next_transfer[start_vertex] = update
                    
            w_start_vertex = start_vertex
            for w_end_vertex in inward_walking_edges.get(w_start_vertex, []):
                duration = walking_edge_duration[(w_start_vertex, w_end_vertex)]
                w_start = start 
                update = edge_valid(w_start_vertex, w_end_vertex, w_start, duration, 'walking')
                if update:
                    if(all_roads.get(w_start_vertex) != None):
                        all_roads[w_start_vertex].append(update)
                    elif (all_roads.get(start_vertex) == None):
                        all_roads[w_start_vertex] = [update]
                    
                    next_transfer[w_start_vertex] = update

    return all_roads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [245]:
import time
start = time.time()
all_roads = latest_departure_paths(edges, '8591221', '20:00', inward_walking_edges, walking_edge_duration)
end = time.time() 
print(end - start)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1.56233310699

In [249]:
all_roads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{u'8580301': [(1146, u'8588553', u'26-520-j19-1', 1), (1143, u'8588553', u'26-731-j19-1', 1), (1143, u'8588553', u'26-733-j19-1', 1), (1141, u'8588553', u'26-765-j19-1', 1), (1141, u'8588553', u'79-736-j19-1', 1), (1139, u'8588553', u'26-734-j19-1', 1), (1138, u'8573205:0:F', u'26-765-j19-1', 1), (1137, u'8573205:0:L', u'79-736-j19-1', 1), (1136, u'8573205:0:D', u'26-733-j19-1', 1), (1136, u'8573205:0:E', u'26-731-j19-1', 1), (1133, u'8588553', u'26-530-j19-1', 1), (1130, u'8588553', u'26-524-j19-1', 1), (1128, u'8588553', u'26-733-j19-1', 1), (1128, u'8573205:0:C', u'26-531-j19-1', 2), (1128, u'8588553', u'26-521-j19-1', 1), (1126, u'8588553', u'79-736-j19-1', 1), (1125, u'8573205:0:F', u'26-765-j19-1', 1), (1123, u'8588553', u'26-765-j19-1', 2), (1122, u'8573205:0:L', u'79-736-j19-1', 1), (1121, u'8573205:0:D', u'26-733-j19-1', 1), (1120, u'8573205:0:G', u'26-521-j19-1', 2), (1120, u'8588553', u'26-531-j19-1', 1), (1118, u'8588553', u'26-530-j19-1', 1), (1116, u'8588553', u'26-520-j1

In [246]:
# This cell is for generating best k paths
def select_next(time, paths_from_next_stop):
    rev = paths_from_next_stop[::-1]
    
    for i in rev:
        if (i[0] >= time):
            return i
    
    return (None, None, None, None , [])

def generate_best_k_paths(k_paths, start_stop, k):
    l = k_paths[start_stop]
    nbre_paths = k
    
    if (l == None):
        return None
    
    if(len(l) < k):
        nbre_paths = len(l)
    
    possible = l[::-1][0:nbre_paths]
    
    roads = []
    for p in possible:
        start_stop_ = start_stop
        time, next_stop, route_id, duration, stops = p
        journey = []
        i = 1
        while next_stop != start_stop_:
            journey.append(
                (i, start_stop_, next_stop, route_id, time, duration, stops)
            )
            
            start_stop_ = next_stop
            i += 1
            
            paths_from_next_stop = k_paths[next_stop]
            m = select_next(time + duration, paths_from_next_stop)
            
            if (m[0] != None):
                time, next_stop, route_id, duration, stops = m
        
        roads.append(journey)
        
    return roads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
list index out of range
Traceback (most recent call last):
IndexError: list index out of range



In [174]:
best_k_paths = generate_best_k_paths(all_roads, '8591250', 4 )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [175]:
def journey_to_df(journey_list):
    arr = []
    journey_number = 0
    for journey in journey_list:
        journey_number += 1
        arr = arr + [(journey_number,) + seg for seg in journey]
    
    schema = StructType([
        StructField("journey_number", IntegerType(), True),
        StructField("segment_number", IntegerType(), True),
        StructField("start_vertex", StringType(), True),
        StructField("end_vertex", StringType(), True),
        StructField("route_id", StringType(), True),
        StructField("departure_time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("stop_seq", ArrayType(StringType()), True)
    ])
    
    journey_df = spark.createDataFrame(arr, schema)
    
    return journey_df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

4

In [176]:
journey_df = journey_to_df(best_k_paths)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Send to local

In [30]:
%%spark -o journey_df -n -1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
%%local
journey_df['start_vertex'] = journey_df.start_vertex.astype(str)
journey_df['end_vertex'] = journey_df.end_vertex.astype(str)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[u'8591250', u'8591338', u'8576262', u'8591054']

In [None]:
%%spark -o stop_id_to_stop -n -1
stop_id_to_stop = stops.select('stop_id', 'stop_name')

In [None]:
%%spark -o stop_id_to_lon -n -1
stop_id_to_lon = stops.select('stop_id', 'stop_lon')

In [None]:
%%spark -o stop_id_to_lat -n -1
stop_id_to_lat = stops.select('stop_id', 'stop_lat')

In [None]:
%%spark -o route_id_to_route_name -n -1
route_id_to_route_name = routes.withColumn('route_name', F.concat(F.col('route_desc'), F.lit(' '), F.col('route_short_name')))\
                               .select('route_id', 'route_desc', 'route_short_name', 'route_name')

## Display results

In [None]:
%%local
stop_id_to_stop_dict = stop_id_to_stop.set_index('stop_id').to_dict()['stop_name']

stop_id_to_lon_dict = stop_id_to_lon.set_index('stop_id').to_dict()['stop_lon']
stop_id_to_lat_dict = stop_id_to_lat.set_index('stop_id').to_dict()['stop_lat']

route_id_to_route_name_dict = route_id_to_route_name.set_index('route_id').to_dict()['route_name']
route_id_to_route_name_dict['walking'] = 'Walk'

In [None]:
%%local
def minutes_to_timestamp(minutes):
    if minutes < 60:
        return str(minutes)
    return str(minutes // 60) + ':' + str(minutes % 60).zfill(2)

### Print paths

In [None]:
%%local
def pretty_print_journey(journey):
    journey_str = '\n'.join(["%s : %s\n|\n| %s (%s min)\n| %s\n|\n%s : %s\n" % (minutes_to_timestamp(time), 
                                                                             stop_id_to_stop_dict[start],
                                                                             route_id_to_route_name_dict[route_id],
                                                                             minutes_to_timestamp(duration),
                                                                             ' -> '.join([stop_id_to_stop_dict[s] for s in stops]) if route_id != 'walking' else '',
                                                                             minutes_to_timestamp(time + duration),
                                                                             stop_id_to_stop_dict[end]) for _, start, end, route_id, time, duration, stops in journey])
    return journey_str

In [None]:
%%local
def journeys_to_str(df):
    if len(df) == 0:
        return "No possible path"
    
    n = df['journey_number'].unique()

    res = []
    for i in sorted(n):
        segment_list = sorted(df[df.journey_number == i].values[:,1:], key = lambda e: e[0])
        res.append(pretty_print_journey(segment_list))
    return '\n******************************\n'.join(res)

In [None]:
%%local
print(journeys_to_str(journey_df))

### Map visualization

In [None]:
%%local
import matplotlib.pyplot as plt

def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    return plt.cm.get_cmap(name, n)

In [None]:
%%local
import plotly.graph_objects as go
from matplotlib.colors import rgb2hex

def map_plot(df):
    if len(df) == 0:
        return
    
    n = df['journey_number'].unique()

    cmap = get_cmap(max(n)+1)

    fig = go.Figure()
    
    for i in sorted(n):
        color = rgb2hex(cmap(i-1))
        journey = sorted(df[df.journey_number == i].values[:,1:], key = lambda e: e[0])
        
        stops_per_segment = [seg[-1] for seg in journey]
        
        stops_per_segment = [stops_per_segment[0]] + [ss[1:] for ss in stops_per_segment[1:]]
        
        stops = [item for sublist in res for item in sublist]
        
        fig.add_trace(go.Scattermapbox(
            lat=[stop_id_to_lat_dict[s] for s in stops],
            lon=[stop_id_to_lon_dict[s] for s in stops],
            mode='lines+markers',
            marker=go.scattermapbox.Marker(
                size=6, color=color
            ),
            opacity=0.7,
            text=[stop_id_to_stop_dict[s] for s in stops]
        ))

    fig.update_layout(
        autosize=True,
        hovermode='closest',
        mapbox=dict(
            style='open-street-map',
            bearing=0,
            center=dict(
                lat=47.378177,
                lon=8.540192
            ),
            pitch=0,
            zoom=12
        ),
        height=900,
        showlegend=True
    )

    fig.show()

In [None]:
%%local
map_plot(journey_df)