# Extract Landmarks Data in Melbourne from Wikipedia Interactively

<a id="toc"></a>

Extract landmarks data:
- category, 
- name, 
- (latitude, longitude)

from Wikipedia page [landmarks in Melbourne](https://en.wikipedia.org/wiki/Template:Melbourne_landmarks) in an interactive way.

In [1]:
%matplotlib inline

import requests, re, os
from bs4 import BeautifulSoup
from bs4.element import Tag
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lxml
from fastkml import kml, styles
from shapely.geometry import Point

URL for the landmarks in the Melbourne city centre.

In [2]:
# Where the extracted POI table is written.
data_dir = '../data'
fpoi = os.path.join(data_dir, 'poi-Melb-0.csv')

# Landmark template page. The '?action=render' query is used because it
# yields cleaner HTML; drop it to fetch the regular page instead.
url = 'https://en.wikipedia.org/wiki/Template:Melbourne_landmarks?action=render'

In [3]:
# Download the landmark template page. Fail fast on an HTTP error so that an
# error page is never parsed as if it were the landmark table.
response = requests.get(url, timeout=10)
response.raise_for_status()

In [4]:
# Parse the downloaded page with Python's built-in HTML parser; `soup` is
# searched for the landmark table in a later cell.
html = response.text
soup = BeautifulSoup(html, 'html.parser')

In [5]:
#print(soup.prettify())

### Extract POI coordinates from its Wikipedia page

**NOTE**: a page may contain more than one coordinate pair, e.g. [Yarra River](https://en.wikipedia.org/wiki/Yarra_River).

In [6]:
def _parse_geo_dec(text):
    """Convert a decimal geo string such as '37.82167°S 144.96778°E' to (lat, lon).

    Southern latitudes and western longitudes are returned as negative numbers.
    Raises ValueError when the text is not two '<number>°<hemisphere>' tokens.
    """
    parts = re.split(r'[\s,]+', text.strip())
    if len(parts) != 2:
        raise ValueError('Unexpected coordinate format: %r' % text)

    values = []
    # First token is the latitude (N/S), second the longitude (E/W).
    for part, positive, negative in zip(parts, 'NE', 'SW'):
        magnitude, sep, hemisphere = part.partition('°')
        if not sep or hemisphere not in (positive, negative):
            raise ValueError('Unexpected coordinate format: %r' % text)
        value = float(magnitude)
        values.append(value if hemisphere == positive else -1 * value)
    return values[0], values[1]


def _prompt_index(n_choices):
    """Ask the user for an index in [0, n_choices) until a valid one is entered."""
    while True:
        answer = input('Input the index of the correct coordinates... ')
        try:
            idx = int(answer)
        except ValueError:
            idx = -1  # forces the range check below to fail and re-prompt
        if 0 <= idx < n_choices:
            return idx
        print('Invalid index %r, expected an integer between 0 and %d'
              % (answer, n_choices - 1))


def extract_coord(url):
    """
    Extract the decimal geo-coordinates from the Wikipedia page at `url`.

    The page is requested with `action=render` (cleaner HTML) and every
    <span class="geo-dec"> element is treated as a candidate. If the page
    holds several candidates (other than exactly two identical ones), they
    are printed and the user is asked interactively which one to use.

    Returns a tuple (lat, lon, url), or None if the page has no geo-dec span.
    Raises ValueError if the chosen span cannot be parsed as coordinates.
    """
    # Pass the query via `params` so URLs that already carry a query string
    # or a fragment are still combined correctly.
    response = requests.get(url, params={'action': 'render'}, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    coords = soup.find_all('span', {'class': 'geo-dec'})
    if not coords:
        print('No Geo-coordinates found')
        return None

    idx = 0
    same_pair = len(coords) == 2 and coords[0].string == coords[1].string
    if len(coords) > 1 and not same_pair:
        print('WARN: more than one geo-coordinates detected!')
        print('please check the actual page', url)
        for i, c in enumerate(coords):
            print('%d: %s' % (i, c.string))
        idx = _prompt_index(len(coords))

    contents = coords[idx].contents
    if not contents:
        raise ValueError('Empty geo-coordinates element on page %s' % url)

    # Only the first child of the span is parsed, e.g. '37.82167°S 144.96778°E'.
    lat, lon = _parse_geo_dec(str(contents[0]))

    print(lat, lon)
    return (lat, lon, url)

In [7]:
# Demo call: in the recorded run below this page exposed several coordinates,
# so the function asked which index to use.
extract_coord('https://en.wikipedia.org/wiki/Yarra_River')

WARN: more than one geo-coordinates detected!
please check the actual page https://en.wikipedia.org/wiki/Yarra_River
0: 37.74917°S 146.14056°E
1: 37.85194°S 144.90833°E
2: 37.85194°S 144.90833°E
Input the index of the correct coordinates... 1
-37.85194 144.90833


(-37.85194, 144.90833, 'https://en.wikipedia.org/wiki/Yarra_River')

Extract POI data (e.g. category, name, coordinates) from an HTML string retrieved from Wikipedia.

In [8]:
def _node_text(node):
    """Return the text of a BeautifulSoup node as a plain str (tags are flattened)."""
    return node.get_text() if isinstance(node, Tag) else str(node)


def extract_poi(html):
    """
    Extract the POIs of one category from an HTML fragment.

    Assumes the POI category is the first child of the first <th>, and that
    each POI name and hyperlink sit in an <li> of the first unordered list <ul>.
    The coordinates of every POI are fetched from its own page with
    extract_coord(); POIs without coordinates are left out.

    Returns a list of (name, cat, lat, lon, url) tuples. The list is empty
    when no category or no list is found, so callers can always iterate it.
    """
    # Function-scope import keeps this cell independent of the import cell.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    th = soup.find('th')
    if th is None or not th.contents:
        print('NO POI category found')
        return []
    cat = _node_text(th.contents[0])
    print('CAT:', cat)

    ul = soup.find('ul')
    if ul is None:
        print('NO POI found')
        return []

    poi_data = []  # (name, cat, lat, lon, url)

    for li in ul.children:
        if not isinstance(li, Tag):
            continue  # whitespace / text between list items
        link = li.a
        if link is None or not link.get('href') or not link.contents:
            # e.g. self-links, which carry no href
            print('SKIP: list item without a usable hyperlink')
            continue
        # Handles protocol-relative ('//en.wikipedia.org/...'), site-relative
        # and already-absolute hrefs alike.
        addr = urljoin('https://en.wikipedia.org/', link['href'])
        name = _node_text(link.contents[0])
        print(addr, name)
        ret = extract_coord(addr)
        if ret is not None:
            poi_data.append((name, cat, ret[0], ret[1], ret[2]))
    return poi_data

Extract POI data from landmarks in Melbourne recorded in [this Wikipedia page](https://en.wikipedia.org/wiki/Template:Melbourne_landmarks).

In [9]:
# Column layout of the POI table, in the order extract_poi() emits its tuples.
columns = [
    'poiName',
    'poiTheme',
    'poiLat',
    'poiLon',
    'poiURL',
]
# Start empty; rows are appended interactively in a later cell.
poi_df = pd.DataFrame(columns=columns)

In [10]:
# The 'navbox-inner' class was identified by inspecting the raw HTML text.
table = soup.find('table', class_='navbox-inner')

In [17]:
# Walk through the children of the landmark table one by one, show each node,
# and let the user decide whether POIs should be extracted from it.
hline = '-' * 90
for cnt, c in enumerate(table.children):
    print(hline)
    print('NODE %d BEGIN' % cnt)
    print(c)
    print('NODE %d END' % cnt)
    print(hline)
    k = input('Press [Y] or [y] to extract POI, press any other key to ignore ')
    if k in ('Y', 'y'):
        print('Extracting POI...')
        # extract_poi() may yield nothing for nodes without a category or
        # list; treat that as "no rows" instead of iterating over None.
        poi_data = extract_poi(str(c)) or []
        for t in poi_data:
            # append as a new row labelled with the current row count
            poi_df.loc[poi_df.shape[0]] = list(t)
    else:
        print('IGNORED.')
    print('\n\n')

------------------------------------------------------------------------------------------
NODE 0 BEGIN
<tbody><tr><th class="navbox-title" colspan="2" scope="col"><div class="plainlinks hlist navbar mini"><ul><li class="nv-view"><a class="mw-selflink selflink"><abbr style=";;background:none transparent;border:none;-moz-box-shadow:none;-webkit-box-shadow:none;box-shadow:none; padding:0;" title="View this template">v</abbr></a></li><li class="nv-talk"><a href="//en.wikipedia.org/wiki/Template_talk:Melbourne_City_Centre_landmarks" title="Template talk:Melbourne City Centre landmarks"><abbr style=";;background:none transparent;border:none;-moz-box-shadow:none;-webkit-box-shadow:none;box-shadow:none; padding:0;" title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Template:Melbourne_City_Centre_landmarks&amp;action=edit"><abbr style=";;background:none transparent;border:none;-moz-box-shadow:none;-webkit

Press [Y] or [y] to extract POI, press any other key to ignore  
IGNORED.





Latitude/Longitude statistics.

In [12]:
# Preview the first rows of the collected POI table.
poi_df.head()

Unnamed: 0,poiName,poiTheme,poiLat,poiLon,poiURL


In [13]:
# Number of rows collected so far.
print('#POIs:', len(poi_df))

#POIs: 0


In [14]:
# Spread (max - min) and summary statistics of the latitude column.
lat_col = poi_df['poiLat']
print('Latitude Range:', lat_col.max() - lat_col.min())
lat_col.describe()

Latitude Range: nan


count     0
unique    0
Name: poiLat, dtype: int64

In [15]:
# Spread (max - min) and summary statistics of the longitude column.
lon_col = poi_df['poiLon']
print('Longitude Range:', lon_col.max() - lon_col.min())
lon_col.describe()

Longitude Range: nan


count     0
unique    0
Name: poiLon, dtype: int64

Scatter plot.

In [None]:
# Scatter of all collected POIs. Latitude is on the x-axis and longitude on
# the y-axis, so the picture is transposed relative to a north-up map; the
# axis labels make that explicit.
plt.figure(figsize=[10, 10])
plt.xlabel('poiLat')
plt.ylabel('poiLon')
plt.scatter(poi_df['poiLat'], poi_df['poiLon'])

### Filtering out the outliers

In [None]:
# Bounding box (degrees) used by the next cell; POIs outside it are treated as outliers.
lat_range, lon_range = [-39, -36], [143, 147]

In [None]:
# Keep only the POIs that lie strictly inside the bounding box.
lat_lo, lat_hi = min(lat_range), max(lat_range)
lon_lo, lon_hi = min(lon_range), max(lon_range)
inside_box = (
    (poi_df['poiLat'] > lat_lo) & (poi_df['poiLat'] < lat_hi)
    & (poi_df['poiLon'] > lon_lo) & (poi_df['poiLon'] < lon_hi)
)
poi_df = poi_df[inside_box]

Latitude/Longitude statistics.

In [None]:
# Number of POIs left after the bounding-box filter.
print('#POIs:', len(poi_df))

In [None]:
# Latitude spread and summary statistics after filtering.
lat_col = poi_df['poiLat']
print('Latitude Range:', lat_col.max() - lat_col.min())
lat_col.describe()

In [None]:
# Longitude spread and summary statistics after filtering.
lon_col = poi_df['poiLon']
print('Longitude Range:', lon_col.max() - lon_col.min())
lon_col.describe()

Scatter plot.

In [None]:
# Scatter of the filtered POIs. Latitude is on the x-axis and longitude on
# the y-axis, so the picture is transposed relative to a north-up map; the
# axis labels make that explicit.
plt.figure(figsize=[10, 10])
plt.xlabel('poiLat')
plt.ylabel('poiLon')
plt.scatter(poi_df['poiLat'], poi_df['poiLon'])

### Filtering POIs with the same wikipage and coordinates but associated with several names and categories

In [None]:
# Fewer distinct URLs than rows means some pages are listed more than once.
print('#POIs:', len(poi_df))
print('#URLs:', len(poi_df['poiURL'].unique()))

In [None]:
# Flag every repeat occurrence of a URL (the first occurrence is not flagged)
# and show only the flagged entries.
duplicated = poi_df['poiURL'].duplicated()
duplicated[duplicated]

In [None]:
# Show every row sharing the URL of row 15.
# NOTE(review): label 15 comes from one particular interactive run - confirm before re-running.
dup_url = poi_df.loc[15, 'poiURL']
print(dup_url)
poi_df[poi_df['poiURL'] == dup_url]

This place is located in the Melbourne CBD; let's keep the second item, which has category 'Shopping'.

In [None]:
# Remove the unwanted duplicate row (label 4) in place.
# NOTE(review): the label is specific to one interactive run - confirm before re-running.
poi_df.drop(index=4, inplace=True)

In [None]:
# Preview the table after removing the duplicate row.
poi_df.head()

In [None]:
# Show every row sharing the URL of row 37.
# NOTE(review): label 37 comes from one particular interactive run - confirm before re-running.
dup_url = poi_df.loc[37, 'poiURL']
print(dup_url)
poi_df[poi_df['poiURL'] == dup_url]

For a post office, let's keep the second item, which has category 'Institutions'.

In [None]:
# Remove the unwanted duplicate row (label 19) in place.
# NOTE(review): the label is specific to one interactive run - confirm before re-running.
poi_df.drop(index=19, inplace=True)

In [None]:
# Preview the first 20 rows after removing the duplicates.
poi_df.head(20)

### Check distance between POIs

In [None]:
def calc_dist_vec(longitudes1, latitudes1, longitudes2, latitudes2):
    """Great-circle distance in km between two sets of places, vectorised.

    All four arguments are given in degrees and are combined with NumPy
    arithmetic, so scalars and arrays may be mixed. The distance is computed
    with the haversine formula (en.wikipedia.org/wiki/Great-circle_distance).
    """
    # mean earth radius, en.wikipedia.org/wiki/Earth_radius#Mean_radius
    radius = 6371.0088

    # work in radians
    phi1, phi2 = np.radians(latitudes1), np.radians(latitudes2)
    lam1, lam2 = np.radians(longitudes1), np.radians(longitudes2)

    # half of the absolute angular differences
    half_dphi = 0.5 * np.fabs(phi1 - phi2)
    half_dlam = 0.5 * np.fabs(lam1 - lam2)

    hav = (np.sin(half_dphi))**2 + np.cos(phi1) * np.cos(phi2) * (np.sin(half_dlam))**2
    return 2 * radius * np.arcsin(np.sqrt(hav))

In [None]:
# Pairwise distance matrix (km) between all POIs, labelled by the POI index
# on both axes.
# `np.float` was removed from NumPy (1.24); the builtin `float` is the same dtype.
n_poi = poi_df.shape[0]
poi_dist_df = pd.DataFrame(data=np.zeros((n_poi, n_poi), dtype=float),
                           index=poi_df.index, columns=poi_df.index)

# The table started out without dtypes, so the coordinate columns may be
# object-typed; coerce them once so the NumPy maths below works.
poi_lons = poi_df['poiLon'].astype(float)
poi_lats = poi_df['poiLat'].astype(float)
for ix in poi_df.index:
    # distance from POI `ix` to every POI (including itself) in one call
    dists = calc_dist_vec(poi_lons.loc[ix], poi_lats.loc[ix], poi_lons, poi_lats)
    poi_dist_df.loc[ix] = dists

POI pairs that are less than 50 metres apart.

In [None]:
# Scan the upper triangle of the distance matrix and collect the index labels
# of every POI pair that is less than 0.05 km (50 m) apart.
check_ix = []
labels = poi_df.index
n_labels = labels.shape[0]
for i in range(n_labels):
    for j in range(i + 1, n_labels):
        if poi_dist_df.iloc[i, j] < 0.05:
            check_ix.extend([labels[i], labels[j]])
            print(labels[i], labels[j])

In [None]:
# Show the rows of all POIs that take part in a close pair; a POI that
# belongs to several pairs appears once per pair.
poi_df.loc[check_ix]

In [None]:
# Print the page URL of selected POIs.
# NOTE(review): labels 33, 35 and 76 come from one particular interactive run - confirm before re-running.
for close_ix in (33, 35, 76):
    print(poi_df.loc[close_ix, 'poiURL'])

In [None]:
# Remove rows 33 and 35 in place.
# NOTE(review): the labels are specific to one interactive run - confirm before re-running.
poi_df.drop(index=33, inplace=True)
poi_df.drop(index=35, inplace=True)

In [None]:
# Preview the first 35 rows after the removals above.
poi_df.head(35)

### Save POI data to file

In [None]:
# Build the frame that is saved: a copy of all POI columns with the old row
# labels discarded and a fresh 0..n-1 index named 'poiID'.
poi_ = poi_df.reset_index(drop=True)
poi_.index.name = 'poiID'
poi_

In [None]:
# Write the POI table to `fpoi`; index=True keeps the 'poiID' index as a column.
poi_.to_csv(fpoi, index=True)

In [None]:
#poi_df.to_csv(fpoi, index=False)

### Visualise POIs on map

This is a [shared Google map](https://drive.google.com/open?id=1ywC8Wm1QAPa5x89bMk-CcAJKPWE&usp=sharing).

In [None]:
def generate_kml(fname, poi_df):
    """Write every POI in `poi_df` as a placemark to the KML file `fname`.

    `poi_df` must provide the columns 'poiName', 'poiTheme', 'poiLat' and
    'poiLon'. Each row becomes one placemark whose id is the row label and
    whose description lists name, category and coordinates. The file is
    written as UTF-8 to match the XML declaration emitted at its top.
    """
    k = kml.KML()
    ns = '{http://www.opengis.net/kml/2.2}'
    styid = 'style1'
    # colors in KML: aabbggrr, aa=00 is fully transparent
    # '9f0000ff' is therefore a semi-transparent red
    sty = styles.Style(id=styid, styles=[styles.LineStyle(color='9f0000ff', width=2)])
    doc = kml.Document(ns, '1', 'POIs', 'POIs visualization', styles=[sty])
    k.append(doc)

    # Placemark for POIs
    for ix in poi_df.index:
        name = poi_df.loc[ix, 'poiName']
        cat = poi_df.loc[ix, 'poiTheme']
        lat = poi_df.loc[ix, 'poiLat']
        lon = poi_df.loc[ix, 'poiLon']
        desc = 'POI Name: %s<br/>Category: %s<br/>Coordinates: (%f, %f)' % (name, cat, lat, lon)
        pm = kml.Placemark(ns, str(ix), name, desc, styleUrl='#' + styid)
        pm.geometry = Point(lon, lat)  # point order is (x=longitude, y=latitude)
        doc.append(pm)

    # save to file; the encoding is explicit so non-ASCII POI names are
    # written as UTF-8 regardless of the platform's default encoding
    kmlstr = k.to_string(prettyprint=True)
    with open(fname, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write(kmlstr)

In [1]:
# Export the POIs to a KML file in the current directory.
# NOTE(review): the recorded output below is a NameError, i.e. this cell was
# run before the cell that defines generate_kml - run that cell first.
generate_kml('./poiAulick.kml', poi_df)

NameError: name 'generate_kml' is not defined