# Hong Kong Neighborhood mining

This notebook combines data sources (wikipedia and Foursquare API) to assemble a list of neighborhoods and map them to their coordinates for futher processing.

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

### Scrape wikipedia to compile Hong Kong neighborhood list

In [3]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_places_in_Hong_Kong"
r  = requests.get(wiki_url)
data = r.text

soup = BeautifulSoup(data)

areas = ['Hong Kong Island', 'Kowloon', 'New Territories']
results = []
# exclude (this) and [not-this]
for h2 in soup('h2'):
    area = h2.find('a')
    if area and area.contents and area.contents[0] in areas:
        sib = h2.find_next_sibling('ul')
        for district in sib.findAll('li'):
            district_name = district.find('a')
            if district_name and district_name.contents:
                district_neighborhoods = district.find('ul')
                if district_neighborhoods and district_neighborhoods.findAll:
                    for li in district_neighborhoods.findAll('li'):
                        c = li.find('a') and li.find('a').contents[0]
                        if c: 
                            results.append(c)

print(results[0:20])

['Central District', 'Admiralty', 'Mid-Levels', 'Soho', 'The Peak', 'Sai Wan', 'Kennedy Town', 'Sai Ying Pun', 'Shek Tong Tsui', 'Sheung Wan', 'Admiralty', 'Soho', 'Kennedy Town', 'Sai Ying Pun', 'Shek Tong Tsui', 'Causeway Bay', 'Fortress Hill', 'Tin Hau', 'Chai Wan', 'Heng Fa Chuen']


In [4]:
print(len(results))

369


#### Drop duplicates

In [5]:
results_set = set(results)
neighborhoods_list = list(results_set)
print(len(neighborhoods_list))

237


In [9]:
df = pd.DataFrame(neighborhoods_list)
df.columns = ['Neighborhood']
df.head()

Unnamed: 0,Neighborhood
0,Ngau Tau Kok
1,Tsim Sha Tsui East
2,Marina Cove
3,Yau Ma Tei
4,Clear Water Bay


### Combine wikipedia data with geocoder data

Since Wikipedia dataset does not include zip code coordinates, we should hydrate dataset with longitude and latitude from Geocoder in order to access Foursquare data.

In [10]:
enrich_neighborhoods_with_geocoder(df, "Hong Kong, China")
df.head()

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Wah Fu, Hong Kong, China',), **{}).
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 1317, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1244, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1290, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1239, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1026, in _send_output
    self.send(msg)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 966, in send
    self.connect()
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1414, in connect
    server_hostname=server_hostn

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Ngau Tau Kok,22.315512,114.219092
1,Tsim Sha Tsui East,22.29718,114.172205
2,Marina Cove,22.357656,114.257546
3,Yau Ma Tei,22.312365,114.170779
4,Clear Water Bay,22.284671,114.296271


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 3 columns):
Neighborhood    237 non-null object
Latitude        233 non-null float64
Longitude       233 non-null float64
dtypes: float64(2), object(1)
memory usage: 5.7+ KB


In [13]:
# drop missing values
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233 entries, 0 to 236
Data columns (total 3 columns):
Neighborhood    233 non-null object
Latitude        233 non-null float64
Longitude       233 non-null float64
dtypes: float64(2), object(1)
memory usage: 7.3+ KB


### Save neighborhood coordinates dataset

In [14]:
df.to_csv('data/hong_kong_neighborhood_coords.csv')

### Visualize Hong Kong neighborhoods
<a id="vis-neighborhoods"></a>

In [15]:
m = map_neighborhoods(df, "Hong Kong, China")
m