In [1]:
# load the library
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import math
from geopy.distance import vincenty
import urllib2, requests
import pandas as pd
from tqdm import tqdm
import cPickle as pickle
import sys
sys.setrecursionlimit(10000)
from tsp_solver.greedy import solve_tsp

import googlemaps

In [2]:
# Location of the text file that stores the Google Maps API key.
fn = '../../../../Browser/Documents/mapapikey.txt'
# read_csv treats the first line of the file as the header row, so the first
# column label is used as the key (presumably the file holds only the key).
api = pd.read_csv(fn).columns[0]
gmaps = googlemaps.Client(key=api)

In [3]:
import mechanize
import cookielib

# Browser: one mechanize instance, used by get_route() to fetch result pages.
br = mechanize.Browser()

# Cookie Jar: keep cookies between requests made through `br`.
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)  # robots.txt is not consulted

# Follow refreshes that fire within max_time (1 s); do not hang on longer ones.
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

# User-Agent setup
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1',
          'Referer': 'http://www.google.com'}
# mechanize expects `addheaders` to be a list of (name, value) tuples.
# Assigning a list that contains the dict itself makes mechanize unpack the
# dict's two keys as if they were one (name, value) pair, so neither header
# is sent correctly. Convert the dict to the tuple form instead.
br.addheaders = list(header.items())



In [4]:
# pickle save data
def save_data(db, filename):
    """Pickle ``db`` to ``assets/datasets/<filename>.p``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. The target directory is not created here.
    """
    target = 'assets/datasets/{}.p'.format(filename)
    with open(target, 'wb') as handle:
        pickle.dump(db, handle)

In [5]:
# # pickle load data
def load_data(filename):
    """Return the object unpickled from ``assets/datasets/<filename>.p``."""
    source = 'assets/datasets/{}.p'.format(filename)
    with open(source, 'rb') as handle:
        # NOTE: unpickling can run arbitrary code; only load files that
        # were written by this notebook (see save_data).
        return pickle.load(handle)

In [6]:
# Restore the pickled results of earlier runs.
# `edges` is a dict indexed by lat long.
edges, locations, location_details = [
    load_data(name)
    for name in ('edges', 'attraction_locations', 'location_details')
]

In [7]:
# Read the attraction spreadsheet; the place names are taken from its
# 'Locations' column.
attractions = pd.read_excel('assets/datasets/ShortAttractions.xlsx')
# attractions.columns = ['index', 'Locations']
attractions_list = list(attractions['Locations'])

In [8]:
# get lat long from google maps api
def get_coordinates(attractions, add_hk=True):
    """Geocode a single place name with the module-level ``gmaps`` client.

    Parameters
    ----------
    attractions : text
        Name of one place; ``.encode('utf-8')`` is applied before use.
    add_hk : bool
        When true, ``', Hong Kong'`` is appended to the geocoding query.

    Returns
    -------
    tuple
        ``(encoded_name, latitude, longitude, geocode_result)`` built from
        the first geocoding hit, or ``(None, None, None, None)`` when the
        API returns an empty list. ``encoded_name`` never includes the
        ``', Hong Kong'`` suffix.
    """
    attractions = attractions.encode('utf-8')
    query = (attractions + ', Hong Kong') if add_hk else attractions
    geocode_result = gmaps.geocode(query)
    if geocode_result == []:
        return None, None, None, None
    # Only the first (top-ranked) hit is used for the coordinates.
    point = geocode_result[0]['geometry']['location']
    return attractions, point['lat'], point['lng'], geocode_result

In [9]:
# attractions mapping to map the attractions to the geo location details
def location_mapping(attractions):
    """Geocode every place name and collect the results in two dicts.

    Parameters
    ----------
    attractions : iterable
        Place names; each one is decoded from UTF-8 before being passed to
        ``get_coordinates``.

    Returns
    -------
    tuple of dict
        ``(locations, location_details)`` where ``locations`` maps the name
        returned by ``get_coordinates`` to ``(lat, lon)`` and
        ``location_details`` maps the same name to the raw geocode result.
        Names that could not be geocoded are left out of both dicts.
    """
    locations = {}
    location_details = {}
    for name in tqdm(attractions):
        if name in locations:
            # Already geocoded; avoid a second API call.
            continue
        loc, lat, lon, loc_details = get_coordinates(name.decode(encoding='UTF-8'))
        if loc is None:
            continue
        locations[loc] = (lat, lon)
        location_details[loc] = loc_details
    return locations, location_details

In [10]:
# convert location dictionary to location dataframe
def loc_dict2df(locations):
    """Turn a ``{name: (lat, lon)}`` dict into a three-column DataFrame.

    The returned frame has the columns ``Location``, ``Latitude`` and
    ``Longitude``, with one row per dictionary entry.
    """
    rows = [[name, coords[0], coords[1]] for name, coords in locations.items()]
    return pd.DataFrame(rows, columns=['Location', 'Latitude', 'Longitude'])

In [11]:
# Geocode every attraction name. This rebinds `locations` and
# `location_details`, replacing the pickled copies loaded earlier.
locations, location_details = location_mapping(attractions_list)

100%|██████████| 12/12 [00:02<00:00,  5.10it/s]


In [12]:
# One row per geocoded place: its name and the formatted address of the
# first geocoding hit.
clear_places = pd.DataFrame(
    [[name, details[0]['formatted_address']]
     for name, details in location_details.items()],
    columns=['Location', 'Address']
)

In [13]:
# Keep only the places whose formatted address is not just "Hong Kong".
attractions_list = list(
    clear_places.loc[clear_places['Address'] != "Hong Kong", 'Location']
)

In [26]:
# locations, location_details = location_mapping(attractions_list)

100%|██████████| 12/12 [00:01<00:00,  5.98it/s]


In [16]:
# Export each location with its lat/long coordinates to CSV for plotting in
# Tableau. to_csv is called with its defaults, so the DataFrame index is
# written as the first column.

location_coord = loc_dict2df(locations)
location_coord.to_csv('assets/datasets/location_lat_lon.csv')

In [17]:
# save_data(locations, 'attraction_locations')
# save_data(location_details, 'location_details')

In [18]:
# pair up 'nodes' (locations) to create 'edges' (routes) of the graphs
def node_pairing(locations):
    """Pair up nodes (locations) to create directed edges (routes).

    Parameters
    ----------
    locations : dict
        Maps a location name to its ``(lat, lon)`` coordinates.

    Returns
    -------
    list
        For every unordered pair of distinct entries ``a`` and ``b`` the
        list contains ``[{a: coord_a}, {b: coord_b}]`` immediately followed
        by the reverse direction ``[{b: coord_b}, {a: coord_a}]``. An input
        with fewer than two entries gives an empty list.
    """
    # Materialize the items once. Indexing ``locations.keys()`` inside the
    # loops rebuilt the key list on every access under Python 2 and is not
    # possible on Python 3 dict views.
    nodes = list(locations.items())
    location_pair = []
    for i, (name_a, coord_a) in enumerate(nodes):
        for name_b, coord_b in nodes[i + 1:]:
            location_pair.append([{name_a: coord_a}, {name_b: coord_b}])
            location_pair.append([{name_b: coord_b}, {name_a: coord_a}])
    return location_pair
# Every ordered (from, to) combination of the geocoded locations.
location_pair = node_pairing(locations)

In [19]:
# get the suggested route from the government webpage given lat lon...
def get_route(start_lat, start_lon, end_lat, end_lon):
    """Fetch the route-search result page for a start and end coordinate.

    The four coordinates are substituted into the query string of the
    m.hketransport.gov.hk route search URL, the page is opened with the
    module-level mechanize browser ``br``, and the response body is
    returned parsed as a ``BeautifulSoup`` document (lxml parser).
    """
    url_template = 'http://m.hketransport.gov.hk/getRouteSearchResult.php?ctl26=&ctl30=&ctl36=&ctl40=&ctl41=&ctl42=&slat={}&slon={}&elat={}&elon={}&DDL_BUF_O=400&DDL_BUF_D=400&RB_MODE=RB_MODE1&Btn_RS=Route%20Search&lang=0'
    response = br.open(url_template.format(start_lat, start_lon, end_lat, end_lon))
    return BeautifulSoup(response.read(), "lxml")

In [20]:
def route_finder_hk(location_pair_index):
    """Extract the start and end coordinates from one location pair.

    Parameters
    ----------
    location_pair_index : sequence
        ``[start, end]`` where each element is a one-entry dict mapping a
        location name to its ``(lat, lon)`` coordinates (the shape produced
        by ``node_pairing``).

    Returns
    -------
    tuple
        ``(start_lat, start_lon, end_lat, end_lon)``.
    """
    # Each node dict holds a single entry, so its first value is the
    # coordinate pair. ``list(...)[0]`` works on Python 2 and 3 alike.
    start_coord = list(location_pair_index[0].values())[0]
    end_coord = list(location_pair_index[1].values())[0]
    return start_coord[0], start_coord[1], end_coord[0], end_coord[1]

In [22]:
# convert direction result into list from page 
def parse_table(table):
    """Get the useful rows of a result table as lists of cell text.

    Every ``tr`` row is reduced to the stripped text of its ``th``/``td``
    cells. Rows are dropped when they have no cells, consist of a single
    empty cell, are one of the section headings ('Route Search',
    'Route Info', 'Other Info'), or contain more than two cells.
    """
    ignored = ([], ['Route Search'], ['Route Info'], ['Other Info'], [''])
    result = []
    for row in table.find_all('tr'):
        texts = [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
        if texts in ignored or len(texts) > 2:
            continue
        result.append(texts)
    return result

In [23]:
# convert direction list into dict from list
def build_dict(tbl_list):
    """Group the parsed table rows by route choice.

    A row whose first cell starts with ``'Choice '`` opens a new group that
    runs up to (but not including) the next such row, or to the end of the
    list. Rows ahead of the first choice row are discarded.

    Returns a dict mapping the choice label (e.g. ``'Choice 1'``) to the
    list of rows in that group, the choice row itself included.
    """
    starts = [
        idx for idx, row in enumerate(tbl_list)
        if row != [] and row[0].startswith('Choice ')
    ]
    bounds = starts + [len(tbl_list)]
    return {
        tbl_list[lo][0]: tbl_list[lo:hi]
        for lo, hi in zip(bounds[:-1], bounds[1:])
    }

In [24]:
# convert direction dict into pd
def build_table(loc_dict):
    """Summarise the route choices as a DataFrame.

    For every ``'Choice N'`` key the number ``N`` is parsed from the label,
    and the first two cells of the group's last row are taken as ``cost``
    and ``time`` (presumably the fare / journey-time summary row - confirm
    against the page layout).

    Returns a DataFrame with the columns ``choice``, ``cost`` and ``time``,
    sorted by ``choice`` and re-indexed from 0.
    """
    summary_rows = []
    for label, rows in loc_dict.items():
        last_row = rows[-1]
        summary_rows.append([int(label[7:]), last_row[0], last_row[1]])
    loc_table = pd.DataFrame(summary_rows, columns=['choice', 'cost', 'time'])
    loc_table = loc_table.sort_values('choice')
    loc_table.index = range(len(loc_table))
    return loc_table

In [29]:
# route details
def route_builder(location_pair, saver = 50, renew=False):
    """Collect the suggested routes for every origin/destination pair.

    Parameters
    ----------
    location_pair : list
        Pairs of ``[{name: (lat, lon)}, {name: (lat, lon)}]`` as produced
        by ``node_pairing``.
    saver : int
        Checkpoint interval: the ``edges`` pickle is rewritten each time
        this many new edges have been fetched.
    renew : bool
        When true, start from an empty dict; otherwise continue from the
        previously pickled ``edges``.

    Returns
    -------
    dict
        Maps ``str([origin_coord, destination_coord])`` to a dict with the
        keys ``'detail'`` (rows grouped by choice), ``'result'`` (summary
        DataFrame), ``'from'`` and ``'to'`` (location names).

    Notes
    -----
    Edges fetched after the last checkpoint are not written to disk here;
    the caller has to call ``save_data`` on the returned dict.
    """
    edges = {} if renew else load_data('edges')
    unsaved = 0
    for pair in tqdm(location_pair):
        # Each node is a one-entry dict, so its first item is (name, coord).
        origin_name, origin_coord = list(pair[0].items())[0]
        dest_name, dest_coord = list(pair[1].items())[0]
        edge_key = str([origin_coord, dest_coord])
        if edge_key in edges:
            # Already known; skip the web request.
            continue

        start_lat, start_lon, end_lat, end_lon = route_finder_hk(pair)
        table = get_route(start_lat, start_lon, end_lat, end_lon)
        tbl_list = parse_table(table)  # list of table rows from the html
        loc_dict = build_dict(tbl_list)  # rows grouped by route choice
        edges[edge_key] = {
            'detail': loc_dict,
            'result': build_table(loc_dict),  # per-choice summary
            'from': origin_name,
            'to': dest_name,
        }

        unsaved += 1
        if unsaved == saver:
            save_data(edges, 'edges')
            unsaved = 0
    return edges

In [30]:
def find_distance_approx(start_lat, start_lon, end_lat, end_lon):
    """Return the geopy ``vincenty`` distance between two points, in kilometers.

    NOTE(review): ``vincenty`` appears to be deprecated in newer geopy
    releases in favour of ``geodesic`` - confirm against the installed
    version before upgrading.
    """
    origin = (start_lat, start_lon)
    destination = (end_lat, end_lon)
    return vincenty(origin, destination).km

In [31]:
# The location pairs contain many duplicates, because places with different
# names can share the same lat-lon coordinates.
keys_location_pair = []
edge_keys = {}
for pair in location_pair:
    # Each node is a one-entry dict; its only value is the (lat, lon) tuple.
    s_coord = list(pair[0].values())[0]
    e_coord = list(pair[1].values())[0]
    key = str([s_coord, e_coord])
    keys_location_pair.append(key)
    # A later pair with the same coordinates replaces the earlier one.
    edge_keys[key] = pair

# make unique location pair: one representative per distinct coordinate key
unique_location_pair = list(edge_keys.values())

In [32]:
# Fetch routes for every unique pair. renew=True starts from an empty dict
# instead of the pickled edges, and the periodic checkpoint inside
# route_builder overwrites the 'edges' pickle.
edges = route_builder(unique_location_pair, renew=True)

100%|██████████| 132/132 [19:04<00:00,  6.71s/it]


In [33]:
# Final save: route_builder only writes at its checkpoint interval, so edges
# fetched after the last checkpoint are persisted here.
save_data(edges, 'edges')