## Load stored data

In [1]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-4.0.1-cp37-cp37m-manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 6.5 MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully installed pyarrow-4.0.1


In [1]:
from hdfs3 import HDFileSystem
import pandas as pd

hdfs = HDFileSystem(host='hdfs://iccluster040.iccluster.epfl.ch', port=8020, user='ebouille') # impersonate ebouille to read the file

# HDFS home directory under which every stored dataset lives.
HDFS_OWNER = 'benhaj'


def load_parquet_dataset(name, owner=HDFS_OWNER):
    """Load the dataset stored at /user/<owner>/<name>.parquet/ into one DataFrame.

    Every ``*.parquet`` part file found in that directory is read and the
    parts are concatenated in the order returned by ``hdfs.glob``.

    Parameters
    ----------
    name : str
        Dataset name, i.e. the directory name without the ``.parquet`` suffix.
    owner : str
        HDFS user whose home directory contains the dataset.

    Returns
    -------
    pandas.DataFrame
        All parts concatenated; an empty DataFrame when no part file matches.
    """
    part_paths = hdfs.glob('/user/{0}/{1}.parquet/*.parquet'.format(owner, name))
    parts = []
    for part_path in part_paths:
        with hdfs.open(part_path) as f:
            parts.append(pd.read_parquet(f))
    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append is deprecated and was removed in pandas 2.0).
    # pd.concat raises on an empty list, so keep the "no files -> empty frame" result.
    return pd.concat(parts) if parts else pd.DataFrame()


## Read final_df
final_df = load_parquet_dataset('final_df')

## Read relevant_nodes
relevant_nodes = load_parquet_dataset('relevant_nodes')

## Read route_type
route_type = load_parquet_dataset('route_type')

## Read walk_edges
walk_edges = load_parquet_dataset('walk_edges')

## Read df_delays
df_delays = load_parquet_dataset('df_delays')

In [2]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from heapq import heappush, heappop
from itertools import count

from fastest_paths_algorithm import get_shortest_paths_DFs

## Visualize paths

In [3]:
import plotly.graph_objects as go


def extract_nodes_infos(G, path_df):
    """Collect coordinates and names of the stops visited along one path.

    The first stop is the ``source`` of the first row of ``path_df``; every
    following stop is the ``target`` of each row, in row order. Node
    attributes are read from ``G.nodes``.

    Returns
    -------
    tuple of three lists
        ``(longitudes, latitudes, stop_names)``, each with one entry per
        visited stop (number of rows + 1).
    """
    # Resolve the visited nodes first, then project out each attribute.
    visited = [G.nodes[path_df.iloc[0]['source']]]
    visited.extend(G.nodes[hop['target']] for _, hop in path_df.iterrows())

    longitudes = [stop['longitude'] for stop in visited]
    latitudes = [stop['latitude'] for stop in visited]
    names = [stop['stop_name'] for stop in visited]
    return longitudes, latitudes, names


def visualize_paths(G, paths_df, paths_edges_df, s, t):
    """Draw the whole network and the computed paths on an interactive map.

    Parameters
    ----------
    G : graph
        Graph whose nodes carry ``longitude``, ``latitude`` and ``stop_name``
        attributes; all of its edges are drawn as a grey background layer.
    paths_df : pandas.DataFrame
        One row per path, with at least ``path_id`` and ``ta_path_id``.
    paths_edges_df : pandas.DataFrame
        Edges of all paths, with ``path_id``, ``source`` and ``target``.
    s, t
        Not used by the drawing code; kept so existing callers keep working.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = go.Figure()

    COLORS = [
        "#8260c9",
        "#77b341",
        "#cb57b3",
        "#51a671",
        "#d44360",
        "#4ac0cd",
        "#cd5a2b",
        "#7085ca",
        "#cca242",
        "#ba658c",
        "#7a7a32",
        "#c6795c"
    ]

    # Key the palette by the actual ta_path_id values. It used to be keyed by
    # the enumeration position, which only matched the later
    # COLOR_MAP[ta_path_id] lookups when the ids happened to be 0..n-1.
    COLOR_MAP = {
        ta_path_id: COLORS[position % len(COLORS)]
        for position, ta_path_id in enumerate(paths_df['ta_path_id'].drop_duplicates().values)
    }

    # Background layer: every edge as a 2-point segment; the None entries
    # break the line between consecutive segments.
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['longitude'], G.nodes[edge[0]]['latitude']
        x1, y1 = G.nodes[edge[1]]['longitude'], G.nodes[edge[1]]['latitude']
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scattermapbox(
        mode = "markers+lines",
        lon=edge_x,
        lat=edge_y,
        line=dict(width=0.5, color='#888'),
        marker={'size': 5},
        name='Others',
        showlegend = False,
        hoverinfo='none'
        )

    fig.add_trace(edge_trace)

    # One trace per path, coloured (and legend-grouped) by its ta_path_id.
    for idx, path_row in paths_df.iterrows():

        path_df = paths_edges_df[paths_edges_df['path_id'] == path_row['path_id']]

        lon_array, lat_array, stop_names = extract_nodes_infos(G, path_df)

        path_color = COLOR_MAP[path_row['ta_path_id']]

        fig.add_trace(go.Scattermapbox(
            mode = "markers+lines",
            lon = lon_array,
            lat = lat_array,
            name = f"Path {idx + 1}",
            customdata = stop_names,
            marker = {
                'size': 10,
                'color': path_color,
            },
            line = {
                'color': path_color
            },
            hovertemplate =
                '<b>Station name</b>: %{customdata}'+
                '<br><b>Longitude</b>: %{lon}'+
                '<br><b>Latitude</b>: %{lat}',
            legendgroup = path_row['ta_path_id']
        ))

    fig.update_layout(
        margin ={'l':0,'t':0,'b':0,'r':0},
        mapbox = {
            'center': {'lon': 8.540192, 'lat': 47.378177},
            'style': "carto-positron",
            'zoom': 10},
        legend = {
            'itemclick': 'toggleothers'
        },
        height= 600
        )

    fig.show()

In [4]:
#filtering by arrival time
def filter_data(arrival_time, df=None):
    """Keep the edges usable for a trip that must arrive by ``arrival_time``.

    A row is kept when its ``arrival`` lies in the two-hour window ending at
    ``arrival_time`` (string comparison, both bounds inclusive), or when its
    ``type`` is ``"walking"``.

    Parameters
    ----------
    arrival_time : str
        Latest arrival as ``"HH:MM"``.
    df : pandas.DataFrame, optional
        Edge table with ``arrival`` and ``type`` columns. Defaults to the
        notebook-level ``final_df``.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows.
    """
    if df is None:
        df = final_df

    hour_text, minute_text = arrival_time.split(':')
    # Work in minutes since midnight and clamp at 00:00 so that an early
    # arrival time cannot produce a negative hour.
    window_start = max(int(hour_text) * 60 + int(minute_text) - 120, 0)
    # Zero-pad exactly once: the bound is compared as a string, so "10:05"
    # must not turn into "10:005" (which sorts like "10:00").
    earliest = f"{window_start // 60:02d}:{window_start % 60:02d}"

    in_window = (df.arrival >= earliest) & (df.arrival <= arrival_time)
    is_walk = df["type"] == "walking"
    return df[in_window | is_walk].copy()

In [5]:
def to_seconds(string):
    """Convert a clock string to seconds since midnight.

    Only the first five characters (``"HH:MM"``) are used, so any seconds
    part of the input is ignored.
    """
    hh, mm = string[:5].split(':')
    return int(hh)*60*60 + int(mm)*60

In [6]:
def change_time_to_unixTimestamp(df1):
    """Return a copy of ``df1`` with ``arrival``/``departure`` as seconds.

    For every row whose ``type`` is not ``'walking'`` the two time columns are
    converted with ``to_seconds``; for walking rows they are set to ``None``.
    The input frame is left untouched.
    """
    converted = df1.copy()

    def _column_in_seconds(column):
        # Row-wise conversion of one time column, skipping walking rows.
        return converted.apply(
            lambda row: None if row.type == 'walking' else to_seconds(row[column]),
            axis=1,
        )

    converted['arrival'] = _column_in_seconds('arrival')
    converted['departure'] = _column_in_seconds('departure')
    return converted

In [7]:
def create_graph(df):
    """Build a ``MultiDiGraph`` from an edge table and attach stop attributes.

    Edges come from the ``source``/``target`` columns of ``df``, with all
    remaining columns stored as edge attributes. Node attributes are taken
    from the notebook-level ``relevant_nodes`` table, indexed by ``stop_id``.
    """
    graph = nx.convert_matrix.from_pandas_edgelist(
        df,
        source='source',
        target='target',
        edge_attr=True,
        create_using=nx.MultiDiGraph(),
    )
    attributes_by_stop = relevant_nodes.set_index('stop_id').to_dict('index')
    nx.set_node_attributes(graph, attributes_by_stop)
    return graph

In [66]:
def categorize(time):
    """Map a clock string to a time-of-day category.

    Returns ``None`` for the empty string, ``1`` for times in
    ``["07:00", "11:00")``, ``2`` for ``["11:00", "15:00")`` and ``3`` for
    everything else. Times are compared as strings.
    """
    if time == '':
        return None
    if '07:00' <= time < '11:00':
        return 1
    if '11:00' <= time < '15:00':
        return 2
    return 3

def from_timestamp_to_string(timestamp):
    """Format a number of seconds since midnight as ``"HH:MM"``.

    Parameters
    ----------
    timestamp : int or float
        Seconds since midnight; leftover seconds are truncated.

    Returns
    -------
    str or None
        Zero-padded ``"HH:MM"``, or ``None`` when ``timestamp`` is missing
        (``None`` / NaN).
    """
    # NaN never compares equal to anything, itself included, so the previous
    # `timestamp != np.nan` test was always True and NaN reached int().
    if pd.isna(timestamp):
        return None
    hours = int(timestamp / (60 * 60))
    minutes = int((timestamp - hours * 60 * 60) / 60)
    return f"{hours:02d}:{minutes:02d}"

def process_summary(df1,category):
    """Return a printable copy of a path summary.

    ``arrival`` and ``departure`` are converted with
    ``from_timestamp_to_string`` for every row whose ``type`` is not
    ``'walking'`` (walking rows get ``None``), and a constant ``category``
    column is added. The input frame is not modified.
    """
    summary = df1.copy()

    def _as_clock(column):
        # Row-wise formatting of one time column, skipping walking rows.
        return summary.apply(
            lambda row: None if row.type == 'walking' else from_timestamp_to_string(row[column]),
            axis=1,
        )

    summary['arrival'] = _as_clock('arrival')
    summary.departure = _as_clock('departure')
    summary['category'] = category
    return summary
    
def add_seconds(string,seconds):
    """Add a duration in seconds to an ``"HH:MM"`` clock string.

    Parameters
    ----------
    string : str
        Start time as ``"HH:MM"``.
    seconds : int, float or numeric str
        Duration to add; converted with ``int()``.

    Returns
    -------
    str
        Zero-padded ``"HH:MM"``. The duration is rounded UP to whole minutes
        so the result is never earlier than the real arrival. Hours are not
        wrapped at 24.
    """
    # The input is hours:minutes (it is fed back into to_seconds by the
    # callers), so do the arithmetic in minutes and carry into the hours
    # instead of treating the two fields as minutes:seconds.
    hours_text, minutes_text = string.split(':')
    start_minutes = int(hours_text) * 60 + int(minutes_text)
    extra_minutes = -(-int(seconds) // 60)  # ceiling division
    total_minutes = start_minutes + extra_minutes
    return f"{total_minutes // 60:02d}:{total_minutes % 60:02d}"

def difference_in_seconds(prev_arrival,departure):
    """Seconds from ``prev_arrival`` to ``departure`` (both clock strings).

    Both times are parsed with ``to_seconds``; the result is negative when
    ``departure`` is earlier than ``prev_arrival``.
    """
    arrived_at = to_seconds(prev_arrival)
    leaves_at = to_seconds(departure)
    return leaves_at - arrived_at
    

def get_avg_max(type_, category, stop):
    """Look up the ``avg`` and ``max`` delay values for one stop.

    Selects the rows of the notebook-level ``df_delays`` matching the given
    ``type``, ``category`` and ``stop_id`` (compared as ``str(stop)``), drops
    rows containing missing values and returns ``(avg, max)`` of the first
    remaining row. Raises ``IndexError`` when no row is left.
    """
    selector = (
        (df_delays.type == type_)
        & (df_delays.category == category)
        & (df_delays.stop_id == str(stop))
    )
    first_match = df_delays.where(selector).dropna()[['avg', 'max']].values[0]
    return first_match[0], first_match[1]

def compute_proba_exp(lambda_ , x):
    """Evaluate ``1 - exp(-lambda_ * x)``."""
    decay = np.exp(-lambda_*x)
    return 1 - decay

def print_results(df, departure_time='12:00'):
    """Print every path leg by leg and compute each path's success probability.

    Parameters
    ----------
    df : pandas.DataFrame
        Path summary, one row per leg, grouped by ``path_id``. Rows are read
        for ``path_id``, ``type``, ``source``, ``target``, ``departure``,
        ``arrival``, ``category``, ``type_next`` and (walking legs)
        ``duration``.
    departure_time : str
        ``"HH:MM"`` time at which a path's first walking leg is assumed to
        start. Defaults to the previously hard-coded ``'12:00'``.

    Returns
    -------
    list of [path_id, probability]
        One entry per printed path, in print order. The probability is the
        product of the per-leg probabilities.

    Notes
    -----
    The first leg of a path always gets probability 1. For a later
    train/bus leg, the transfer time (departure minus previous recorded
    arrival) is compared with the delay statistics from ``get_avg_max``:
    probability 1 above the maximum delay, otherwise an exponential model
    with rate ``1 / avg_delay``. Walking legs always get probability 1.
    """
    separator = 100*'='
    paths_proba = []
    path_i = 0
    print('These are all the possible paths :')
    print(separator)
    print(separator)
    print(f'path {path_i+1}')
    print(separator)
    walking_departure = departure_time
    probabilities = []
    first_travel = True
    for _, leg in df.iterrows():
        path_id = leg.path_id
        if path_id != path_i:
            # A new path starts: close the one just printed. The probability
            # belongs to the path that ENDED (path_i), not the one starting.
            proba_path = np.prod(probabilities)
            paths_proba.append([path_i, proba_path])
            print(f'This path has a probability {proba_path} of success')
            print(separator)
            print(separator)
            print(f"path {path_id+1}")
            print(separator)
            path_i = path_id
            walking_departure = departure_time
            probabilities = []
            first_travel = True

        if ((leg['type']=='train') or (leg['type']=='bus')):
            s = leg.source
            t = leg.target
            type_ = leg['type']
            dep = leg.departure
            arr = leg.arrival
            cat = leg.category

            ## dealing with delay
            if first_travel:
                proba = 1
                previous_arrival = arr
                first_travel = False
            else:
                avg_delay, max_delay = get_avg_max(type_, cat, t)
                time_to_change = difference_in_seconds(previous_arrival, dep)
                if time_to_change > max_delay:
                    proba = 1
                elif time_to_change < 0:
                    # Departure before the previous arrival: the exponential
                    # formula would give a negative "probability".
                    proba = 0
                else:
                    proba = compute_proba_exp(lambda_ = 1/avg_delay , x=time_to_change)
                previous_arrival = arr

            next_type = leg.type_next
            if next_type == 'walking':
                walking_departure = arr
            print(f'Take the {type_} at {dep} from {s} to {t}')
            print(f"You'll get there at {arr} with probability {proba}")
            probabilities.append(proba)

        else:
            s = leg.source
            t = leg.target
            dep = walking_departure
            duration = leg.duration
            arr = add_seconds(dep, duration)

            ## dealing with delay
            if first_travel:
                previous_arrival = arr
                first_travel = False

            proba = 1
            next_type = leg.type_next
            if next_type == 'walking':
                walking_departure = arr
            print(f'Now, walk from {s} to {t}')
            print(f"You'll get there at {arr} with probability {proba}")
            probabilities.append(proba)

        print(separator)

    # Close the last path too, in the same [path_id, probability] form as the
    # others (it used to be dropped, or stored as a bare number for path 0).
    last_proba = np.prod(probabilities)
    print(f'This path has a probability {last_proba} of success')
    print(separator)
    print(separator)
    paths_proba.append([path_i, last_proba])
    return paths_proba

def get_stop_id(source):
    """Return the ``stop_id`` of the first ``relevant_nodes`` row named ``source``.

    Rows containing missing values are dropped before the lookup; raises
    ``IndexError`` when no row is left.
    """
    name_matches = relevant_nodes.stop_name == source
    candidates = relevant_nodes.where(name_matches).dropna()
    return candidates['stop_id'].values[0]

In [71]:
def run_all(source,target,arrival_time,q_value):
    """Plan a trip between two stops and report the sufficiently reliable paths.

    Parameters
    ----------
    source, target : str
        Stop names, resolved to ids with ``get_stop_id``.
    arrival_time : str
        Latest arrival as ``"HH:MM"``.
    q_value : float
        Minimum success probability a path must reach to be reported.

    The function prints every path (via ``print_results``) followed by the
    list of 1-based path numbers whose probability is at least ``q_value``.
    Nothing is returned.
    """
    timetable = filter_data(arrival_time)
    timetable = change_time_to_unixTimestamp(timetable)
    timetable = timetable.astype({'source': int,'target':int})

    Graph = create_graph(timetable)

    s = int(get_stop_id(source))
    t = int(get_stop_id(target))

    path_gen, paths_df, paths_edges_df, summary_df = get_shortest_paths_DFs(Graph, s, t)
    category = categorize(arrival_time)
    summary_df = process_summary(summary_df,category)
    paths_proba = print_results(summary_df)

    print('The paths that satisfy your confidence value are :')
    chosen_paths = []
    for entry in paths_proba:
        # Entries are normally [path_id, probability]; a bare probability
        # (single-path result) is treated as belonging to path 0.
        if isinstance(entry, (list, tuple)):
            path_id, proba = entry
        else:
            path_id, proba = 0, entry
        # Apply the threshold to every path, including a lone one, which used
        # to be reported unconditionally.
        if proba >= q_value:
            chosen_paths.append(int(path_id)+1)
    print(chosen_paths)
    #visualize_paths(Graph, paths_df, paths_edges_df, source, target)

In [72]:
import ipywidgets as widgets

# Flex layout shared by the form inputs.
form_item_layout = widgets.Layout(display='flex', flex_flow='row', justify_content='space-between')

# Arrival time, entered as separate hour and minute fields.
arrival_hour = widgets.BoundedIntText(
    value=12,
    min=7,
    max=19,
    step=1,
    disabled=False,
    layout=form_item_layout,
)
arrival_minute = widgets.BoundedIntText(
    value=0,
    min=0,
    max=59,
    step=1,
    disabled=False,
    layout=form_item_layout,
)

# Origin and destination, both chosen among the known stop names.
start_stop = widgets.Dropdown(options=relevant_nodes['stop_name'], layout=form_item_layout)
end_stop = widgets.Dropdown(options=relevant_nodes["stop_name"], layout=form_item_layout)

# Minimum success probability required from a path.
conf = widgets.FloatSlider(
    value=0.1,
    min=0,
    max=1,
    step=0.1,
    description='',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='.1f',
)

valid = widgets.Button(
    description='Search',
    disabled=False,
    button_style='primary',
    tooltip='Go ahead and click',
    icon='search',
)

# Area in which the search results are printed.
out = widgets.Output()

def search(b):
    """Button callback: run the planner with the current form values.

    Clears the output area, builds the zero-padded ``"HH:MM"`` arrival time
    from the hour/minute fields and calls ``run_all`` with everything it
    prints captured in ``out``. ``b`` (the clicked button) is not used.
    """
    out.clear_output()
    with out:
        hours = str(arrival_hour.value).zfill(2)
        minutes = str(arrival_minute.value).zfill(2)
        arrival = hours+':'+minutes

        run_all(start_stop.value,end_stop.value,arrival,conf.value)



# Attributes initialised on the callback; nothing in this cell reads them.
search.start_hour = 0
search.start_minute = 0
search.start_stop = 0
search.end_stop = 0

# Run the search whenever the button is clicked.
valid.on_click(search)

# Form rows: stops, arrival time, confidence + search button.
hbox1 = widgets.HBox([
    widgets.Label(value="From:", layout=form_item_layout),
    start_stop,
    widgets.Label(value="To:", layout=form_item_layout),
    end_stop,
])
hbox2 = widgets.HBox([
    widgets.Label(value="Arrives at:",),
    arrival_hour,
    arrival_minute,
])
hbox3 = widgets.HBox([
    widgets.Label(value='Confidence interval :'),
    conf,
    valid,
])

ui = widgets.VBox([hbox1, hbox2, hbox3])
display(ui, out)

VBox(children=(HBox(children=(Label(value='From:', layout=Layout(display='flex', flex_flow='row', justify_cont…

Output()