In [None]:
# Install pyarrow into the notebook environment before the imports below.
# NOTE(review): presumably needed as the parquet engine for pd.read_parquet — confirm.
!pip install pyarrow

In [None]:
import pandas as pd
from hdfs3 import HDFileSystem
import os
# JupyterHub user name taken from the environment; it is interpolated into the
# per-user HDFS paths read further down. Raises KeyError if the variable is unset.
username = os.environ['JUPYTERHUB_USER']

## Read data from HDFS

In [None]:
# Module-level HDFS client shared by read_hdfs below.
# NOTE(review): the impersonated user name is hard-coded — confirm this is intended.
hdfs = HDFileSystem(user='ebouille') # impersonate ebouille to read the file
def read_hdfs(path):
    """Load every parquet file matching an HDFS glob pattern into one DataFrame.

    Parameters
    ----------
    path : str
        Glob pattern resolved with the module-level ``hdfs`` client.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all matching files (original row indices
        are kept), or an empty DataFrame when nothing matches.
    """
    frames = []
    for file in hdfs.glob(path):
        with hdfs.open(file) as f:
            frames.append(pd.read_parquet(f))

    # pd.concat raises on an empty list, so keep the "empty frame" result explicit.
    if not frames:
        return pd.DataFrame()

    # One concat instead of DataFrame.append in a loop: append re-copied the whole
    # frame on every iteration and no longer exists in pandas >= 2.0.
    return pd.concat(frames)

In [None]:
# Load the walking edges written under the current user's HDFS directory.
walking_edges = read_hdfs('/user/%s/final/parquet/walking_edges/*.parquet' % username)

In [None]:
# Preview the first rows to sanity-check the load.
walking_edges.head()

In [None]:
# Load the transport edges written under the current user's HDFS directory.
transport_edges = read_hdfs('/user/%s/final/parquet/transport_edges/*.parquet' % username)

In [None]:
# Preview the first rows to sanity-check the load.
transport_edges.head()

In [None]:
# Load the reachable stops; the lookup dictionaries further down read
# stop_id, stop_name, stop_lon and stop_lat from this table.
reachable_stops = read_hdfs('/user/%s/final/parquet/reachable_stops/*.parquet' % username)

In [None]:
# Preview the first rows to sanity-check the load.
reachable_stops.head()

In [None]:
# Load the routes table; route_id, route_desc and route_short_name are read from it below.
routes = read_hdfs('/user/%s/final/parquet/routes/*.parquet' % username)

In [None]:
# Preview the first rows to sanity-check the load.
routes.head()

In [None]:
# Turn the walking-edge table into an adjacency map:
#     start_vertex -> {(end_vertex, duration), ...}
# The name `walking_edges` is rebound from a DataFrame to a dict here, so the
# isinstance guard makes re-running this cell a no-op instead of a failure
# caused by applying DataFrame operations to the already-converted dict.
if isinstance(walking_edges, pd.DataFrame):
    walking_edges['value'] = list(zip(walking_edges['end_vertex'], walking_edges['duration']))
    walking_edges = walking_edges.groupby('start_vertex').agg({'value': set})\
                                 .to_dict()['value']

## Shortest Path

In [None]:
def time_to_minutes(timestamp):
    """Convert a zero-padded 'HH:MM...' string into minutes since midnight.

    Characters 0-1 are read as the hour and characters 3-4 as the minute;
    anything after position 5 (e.g. seconds) is ignored.
    """
    hours = int(timestamp[:2])
    minutes = int(timestamp[3:5])
    return hours * 60 + minutes

In [None]:
def latest_departure_paths(edges, end_stop, end_time, walking_edges,
                           transfer_time=2, max_lookback=120):
    """Keep only the transport edges that can still lead to ``end_stop`` in time.

    Edges are scanned from the latest departure to the earliest. An edge
    ``u -> v`` is kept when one of the following holds:

    * ``v`` is the destination and the edge arrives no later than ``end_time``;
    * an already kept edge leaves ``v`` late enough to be caught (immediately
      on the same route, otherwise after ``transfer_time`` minutes);
    * a stop reachable on foot from ``v`` (other than ``u``) has an already kept
      departure that can be caught after walking and transferring.

    Parameters
    ----------
    edges : pandas.DataFrame
        Must have a 'start_time' column (minutes since midnight). Its first five
        columns are read positionally as start_vertex, end_vertex, start_time,
        duration, route_id.
    end_stop : hashable
        Destination stop id.
    end_time : str
        Latest acceptable arrival time, formatted 'HH:MM'.
    walking_edges : dict
        Maps a stop id to an iterable of ``(end_vertex, walking_duration)`` pairs.
    transfer_time : int, optional
        Minimum number of minutes needed to change route (default 2).
    max_lookback : int, optional
        Only edges departing at most this many minutes before ``end_time`` are
        considered (default 120, i.e. two hours).

    Returns
    -------
    dict
        ``start_vertex -> [(start_time, end_vertex, route_id, duration), ...]``,
        each list ordered by descending start time.
    """

    # Kept local so the function does not depend on notebook-level helpers.
    def time_to_minutes(timestamp):
        return int(timestamp[:2]) * 60 + int(timestamp[3:5])

    def edge_valid(u, v, time, dur, route_id):
        """Return the edge tuple if taking u -> v can still reach the destination, else None."""
        arrival = time + dur
        kept_edge = (time, v, route_id, dur)

        # Did we reach the destination?
        if v == end_stop and arrival <= end_time:
            return kept_edge

        if u != end_stop and v != end_stop:
            # Any catchable onward departure from v is enough. The list is stored
            # by descending start time and walked backwards, as in the original scan.
            for time_v, _next_v, route_id_v, _dur_v in reversed(all_roads.get(v, [])):
                if (route_id == route_id_v and arrival <= time_v) or (arrival + transfer_time <= time_v):
                    return kept_edge

            # Can we walk from v instead?
            for end_vertex, walking_duration in walking_edges.get(v, []):
                if end_vertex != u:  # do not loop back to u
                    for next_next_edge in all_roads.get(end_vertex, []):
                        time_next = next_next_edge[0]
                        # Arrive at v early enough to walk and still make the transfer.
                        if arrival <= time_next - walking_duration - transfer_time:
                            return kept_edge

        return None

    all_roads = {}
    end_time = time_to_minutes(end_time)
    start_time = end_time - max_lookback

    in_window = (edges['start_time'] >= start_time) & (edges['start_time'] <= end_time)
    rows = edges[in_window].sort_values(by=['start_time'], ascending=False).to_numpy()

    for row in rows:
        start_vertex, end_vertex, start, duration, route_id = row[:5]
        update = edge_valid(start_vertex, end_vertex, start, duration, route_id)
        if update is not None:
            all_roads.setdefault(start_vertex, []).append(update)

    return all_roads

In [None]:
from itertools import groupby
import numpy as np

# Journey reconstruction


def select_next(all_roads, walking_edges, edge, transfer_time=2):
    """List every edge that may be taken right after ``edge``.

    ``edge`` is ``(start_time, end_vertex, route_id, duration, ...)``. A walking
    edge stores, as its last element, the index in ``all_roads[end_vertex]`` of
    the single trip it connects to, so exactly that trip is returned. After a
    transport edge, both the kept departures of the reached stop and the walks
    starting there are candidates: staying on the same route or starting a walk
    only requires arriving in time, changing route additionally requires
    ``transfer_time`` minutes.
    """
    prev_time, node, prev_route_id, prev_duration = edge[:4]
    departures = all_roads.get(node, [])

    if prev_route_id == 'Walking':
        return [departures[edge[-1]]]

    arrival = prev_time + prev_duration
    candidates = departures + get_walking_edges(all_roads, walking_edges, node)

    def is_reachable(candidate):
        t, _v, route_id, _dur = candidate[:4]
        if (prev_route_id == route_id or route_id == 'Walking') and arrival <= t:
            return True
        return prev_route_id != route_id and arrival + transfer_time <= t

    return [candidate for candidate in candidates if is_reachable(candidate)]

def iterate_all_edges(all_roads, walking_edges, start_edges, end_stop, end_time, cur_journey):
    """Depth-first enumeration of journeys, yielded as lists of edges.

    Each edge in ``start_edges`` is appended to ``cur_journey``. If it ends at
    ``end_stop`` the journey is yielded only when it arrives by ``end_time``
    (minutes); otherwise the search continues with the edges returned by
    ``select_next``.
    """
    for edge in start_edges:
        departure, stop, _route_id, duration = edge[:4]
        if stop != end_stop:
            followers = select_next(all_roads, walking_edges, edge)
            yield from iterate_all_edges(all_roads, walking_edges, followers,
                                         end_stop, end_time, cur_journey + [edge])
        elif departure + duration <= end_time:
            # Destination reached in time; late arrivals are silently dropped.
            yield cur_journey + [edge]
                
def get_journey_attributes(journey):
    """Summarise a journey as ``[transfers, walking_distance]``.

    ``transfers`` is the number of distinct non-walking route ids in the journey;
    ``walking_distance`` is the sum of the second-to-last field (the duration) of
    every edge whose route id is 'Walking'.
    """
    walking_total = 0
    routes_used = set()
    for leg in journey:
        if leg[2] == 'Walking':
            walking_total += leg[-2]
        else:
            routes_used.add(leg[2])
    return [len(routes_used), walking_total]

def get_walking_edges(all_roads, walking_edges, start_stop, transfer_time=2):
    """Build the walking edges that can be taken from ``start_stop``.

    One edge is produced for every kept departure of every stop reachable on
    foot, except departures heading straight back to ``start_stop``. Each edge is
    ``(latest_start, end_vertex, 'Walking', duration, index)`` where
    ``latest_start`` is the departure time minus the walk and ``transfer_time``,
    and ``index`` locates that departure in ``all_roads[end_vertex]``. Binding a
    walk to a single onward trip avoids unnecessary path computations.
    """
    return [
        (edge[0] - duration - transfer_time, end_vertex, 'Walking', duration, index)
        for end_vertex, duration in walking_edges.get(start_stop, [])
        for index, edge in enumerate(all_roads.get(end_vertex, []))
        if edge[1] != start_stop
    ]
    
def generate_best_paths(all_roads, walking_edges, start_stop, end_stop, end_time):
    """Yield journeys from ``start_stop`` to ``end_stop``, latest departures first.

    The first edges (kept transport departures plus possible initial walks) are
    grouped by start time in descending order. Within one departure time the
    journeys are ordered with ``np.lexsort`` over ``[transfers, walking]``;
    lexsort uses the last key as the primary one, so total walking duration
    ranks first and the number of routes second.

    When no first edge exists at all, a single empty list is yielded.
    ``end_time`` is expressed in minutes.
    """
    # Add possible paths that start by walking.
    start_edges = all_roads.get(start_stop, []) + get_walking_edges(all_roads, walking_edges, start_stop)
    if not start_edges:  # no paths were found
        yield []
        return

    # Sort edges and group them by start time, latest first.
    start_edges = sorted(start_edges, key=lambda x: x[0], reverse=True)
    for _start_time, edges in groupby(start_edges, key=lambda x: x[0]):
        # Find all possible journeys that start at this time.
        all_journeys = list(iterate_all_edges(all_roads, walking_edges, list(edges), end_stop, end_time, []))
        if not all_journeys:
            # np.lexsort raises on an empty key sequence; nothing to rank here.
            continue

        journey_attribs = np.array([get_journey_attributes(journey) for journey in all_journeys])
        for i in np.lexsort(journey_attribs.T):
            yield all_journeys[i]

In [None]:
def journey_to_segments(journey, start_stop):
    """Collapse a journey (list of edges) into one segment per consecutive route.

    Each segment is a tuple
    ``(number, start_stop, end_stop, route_id, start_time, duration, stops)``
    with ``number`` counting from 1 and ``stops`` listing the segment's start
    stop followed by the end vertex of each of its edges.
    """
    segments = []
    for index, (route_id, group) in enumerate(groupby(journey, lambda x: x[2])):
        legs = list(group)
        first_leg, last_leg = legs[0], legs[-1]

        if segments:
            previous = segments[-1]
            start_stop_s = previous[2]
        else:
            previous = None
            start_stop_s = start_stop

        if route_id == 'Walking' and previous is not None:
            # Start walking as soon as the previous segment arrives.
            start_time_s = previous[4] + previous[5]
        else:
            start_time_s = first_leg[0]

        # The duration is measured from the first edge's own start time, even
        # when the displayed start time of a walk was moved earlier above.
        duration_s = last_leg[0] + last_leg[3] - first_leg[0]
        stops_s = [start_stop_s] + [leg[1] for leg in legs]

        segments.append((index + 1, start_stop_s, last_leg[1], route_id,
                         start_time_s, duration_s, stops_s))

    return segments

## Generate N paths 

In [None]:
def generate_n_journeys(dep, dest, day, arrival_time, n):
    """Return up to ``n`` journeys from ``dep`` to ``dest`` arriving by ``arrival_time``.

    Parameters
    ----------
    dep, dest : stop ids of the departure and destination stops.
    day : value compared against the ``weekday`` column of ``transport_edges``.
    arrival_time : str
        Latest arrival time, formatted 'HH:MM'.
    n : int
        Maximum number of journeys to return.

    Returns
    -------
    dict
        ``{1: segments, 2: segments, ...}`` in the order produced by
        ``generate_best_paths``. The dict holds fewer than ``n`` entries when
        fewer journeys exist, and is empty when there is no path at all.
    """
    day_edges = transport_edges[transport_edges.weekday == day]
    all_roads = latest_departure_paths(day_edges, dest, arrival_time, walking_edges)
    generator = generate_best_paths(all_roads, walking_edges, dep, dest, time_to_minutes(arrival_time))

    journey_list = {}
    if n <= 0:
        return journey_list

    # Iterate instead of calling next() n times: next() raised StopIteration as
    # soon as fewer than n journeys were available.
    for journey in generator:
        if not journey:
            # Empty "no path" sentinel from generate_best_paths: not a journey.
            continue
        journey_list[len(journey_list) + 1] = journey_to_segments(journey, dep)
        if len(journey_list) >= n:
            break

    return journey_list

In [None]:
# Example query: up to 5 journeys between two stop ids, arriving by 20:00 on a Monday.
journey_list = generate_n_journeys(
    dep='8580301',
    dest='8591221',
    day='monday',
    arrival_time='20:00',
    n=5,
)

## Lookup dicts for human readable information

In [None]:
# stop_id -> name / longitude / latitude lookups, all built from one stop_id-indexed view.
stops_by_id = reachable_stops.set_index('stop_id')
stop_id_to_stop_dict = stops_by_id['stop_name'].to_dict()
stop_id_to_lon_dict = stops_by_id['stop_lon'].to_dict()
stop_id_to_lat_dict = stops_by_id['stop_lat'].to_dict()

In [None]:
# Display label for a route: its description followed by its short name.
routes['route_name'] = routes['route_desc'] + ' ' + routes['route_short_name']
route_id_to_route_name_dict = routes.set_index('route_id')['route_name'].to_dict()
# Walking edges use the pseudo route id 'Walking'.
route_id_to_route_name_dict['Walking'] = 'Walk'

## Pretty print

In [None]:
def minutes_to_timestamp(minutes):
    """Format a number of minutes as 'H:MM'.

    Values below 60 are returned as the bare number of minutes, without any
    hour part or zero padding.
    """
    if minutes < 60:
        return str(minutes)
    hours, remainder = divmod(minutes, 60)
    return ':'.join((str(hours), str(remainder).zfill(2)))

In [None]:
def minutes_to_duration(minutes):
    """Format a duration in minutes as 'M min' or, from one hour up, 'H hr M min'."""
    if minutes < 60:
        return '{} min'.format(minutes)
    hours, remainder = divmod(minutes, 60)
    return '{} hr {} min'.format(hours, remainder)

In [None]:
def pretty_print_journey(journey):
    """Render a journey (list of segments) as a multi-line text itinerary.

    Each segment ``(number, start, end, route_id, time, duration, stops)``
    becomes a block showing the departure time and stop, the route name with
    its duration, the stops served (left empty for walks) and the arrival time
    and stop. Blocks are joined with a newline.
    """
    template = "%s : %s\n|\n| %s (%s min)\n| %s\n|\n%s : %s\n"
    blocks = []
    for _, start, end, route_id, time, duration, stops in journey:
        departure_label = minutes_to_timestamp(time)
        departure_stop = stop_id_to_stop_dict[start]
        route_name = route_id_to_route_name_dict[route_id]
        duration_label = minutes_to_timestamp(duration)
        if route_id != 'Walking':
            stop_line = ' -> '.join([stop_id_to_stop_dict[s] for s in stops])
        else:
            stop_line = ''
        arrival_label = minutes_to_timestamp(time + duration)
        arrival_stop = stop_id_to_stop_dict[end]
        blocks.append(template % (departure_label, departure_stop, route_name,
                                  duration_label, stop_line, arrival_label, arrival_stop))
    return '\n'.join(blocks)

In [None]:
def journeys_to_str(journey_list):
    """Render every journey of ``journey_list`` (a dict of segment lists) as text.

    Segments are ordered by their number before printing, and journeys are
    separated by a line of asterisks. An empty input gives "No possible path".
    """
    if not len(journey_list):
        return "No possible path"

    separator = '\n******************************\n'
    rendered = [
        pretty_print_journey(sorted(segments, key=lambda e: e[0]))
        for segments in journey_list.values()
    ]
    return separator.join(rendered)

## Visualization

In [None]:
import plotly.graph_objects as go
from ipywidgets import interact

def map_plot():
    """Create the base map figure: one empty Scattermapbox trace on a 1200x900 mapbox layout."""
    mapbox_settings = dict(
        style='carto-positron',  # alternative: 'open-street-map'
        bearing=0,
        pitch=0,
        zoom=11,
    )

    fig = go.Figure()
    # Placeholder trace; the figure starts with no data points.
    fig.add_trace(go.Scattermapbox())
    fig.update_layout(
        autosize=True,
        hovermode='closest',
        mapbox=mapbox_settings,
        height=900,
        width=1200,
    )
    return fig

In [None]:
# Build the base map once and wrap it in a FigureWidget; show_path below
# updates this widget in place.
fig = map_plot()
stream_fig = go.FigureWidget(fig)
# NOTE(review): stream_data is not referenced anywhere else in the visible notebook.
stream_data = stream_fig.data

def show_path(journey_segments):
    """Draw one journey on the shared ``stream_fig`` map widget.

    ``journey_segments`` is a list of segment tuples
    ``(number, start_stop, end_stop, route_id, start_time, duration, stops)``.
    The widget is reset to its first trace, centred on the journey, titled with
    the departure/arrival stops and times, and given one line trace per segment.

    Returns ``stream_fig``, or ``None`` when ``journey_segments`` is empty.
    """
    if len(journey_segments) == 0:
        return

    # Order the segments by their number (first tuple field).
    journey = sorted(journey_segments, key = lambda e: e[0])        
    
    departure_time = journey[0][4]
    arrival_time = journey[-1][4] + journey[-1][5]
    
    stops_per_segment = [seg[-1] for seg in journey]
    
    # Drop the first stop of every segment but the first one: the code treats it
    # as a repeat of the previous segment's last stop.
    stops_per_segment_no_duplicates = [stops_per_segment[0]] + [ss[1:] for ss in stops_per_segment[1:]]

    # Flatten into the ordered list of stops visited by the whole journey.
    stops = [item for sublist in stops_per_segment_no_duplicates for item in sublist]
    
    lats = [stop_id_to_lat_dict[s] for s in stops]
    lons = [stop_id_to_lon_dict[s] for s in stops]
    
    departure_stop = stops[0]
    arrival_stop = stops[-1]
    
    # Reset figure and center figure
    with stream_fig.batch_update():
        # Keep only the first trace, discarding traces added by earlier calls.
        stream_fig.data = stream_fig.data[:1]
        # Centre on the midpoint of the journey's latitude/longitude extent.
        stream_fig.update_layout(mapbox_center=dict(lat = (max(lats)+min(lats))/2, lon = (max(lons)+min(lons))/2))
        stream_fig.update_layout(legend_traceorder="reversed", legend_valign='top')
        stream_fig.update_layout(title_text='%s (%s) -> %s (%s)' % (stop_id_to_stop_dict[departure_stop],
                                                                    minutes_to_timestamp(departure_time),
                                                                    stop_id_to_stop_dict[arrival_stop],
                                                                    minutes_to_timestamp(arrival_time)))
    
    
    # (route_id, start_time, duration, stops) for each segment.
    stops_per_route = [(seg[3], seg[4], seg[5], seg[-1]) for seg in journey]
    
    # Traces are added last segment first; the legend trace order is set to
    # "reversed" above.
    for route_id, time, duration, stops in reversed(stops_per_route):
        stop_names = [stop_id_to_stop_dict[s] for s in stops]
        
        # Legend entry: route name, departure stop and time, duration, then
        # (for non-walking segments) the stop count, and the arrival stop and time.
        trace_name = '<b>' + route_id_to_route_name_dict[route_id] + '</b><br>%s %s<br>%s' % (stop_names[0],
                                                                                              minutes_to_timestamp(time),
                                                                                              minutes_to_duration(duration))
        
        if route_id != 'Walking':
            trace_name += ' (%d stops)' % (len(stops)-1)
        trace_name += '<br>%s %s' % (stop_names[-1], minutes_to_timestamp(time + duration))
        
        with stream_fig.batch_update():
            stream_fig.add_trace(go.Scattermapbox(
                lat=[stop_id_to_lat_dict[s] for s in stops],
                lon=[stop_id_to_lon_dict[s] for s in stops],
                mode='lines+markers',
                marker=go.scattermapbox.Marker(
                    size=6,
                ),
                opacity=1,
                hovertext=stop_names,
                name=trace_name,
                hoverinfo="text+name",
                hoverlabel_namelength=-1
            ))
    return stream_fig

In [None]:
from ipywidgets import interact

# Interactive selector over the keys of journey_list; picking one redraws the map.
@interact(i=journey_list.keys())
def f(i):
    """Show the map for the journey stored under key ``i`` of ``journey_list``."""
    return show_path(journey_list[i])