# Singapore Neighborhood mining

This notebook combines data sources (wikipedia and Foursquare API) to assemble a list of neighborhoods and map them to their coordinates for futher processing.

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

from foursquare import fetch_venues, rank_venues_by_frequency
from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

### Scrape wikipedia to compile Singapore neighborhood list

In [2]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_places_in_Singapore"
r  = requests.get(wiki_url)
data = r.text

soup = BeautifulSoup(data)

areas = ['Central Region', 'Central Region', 'North Region', 'North-East Region', 'West Region']
results = []

for h2 in soup('h2'):
    area = h2.findNext('ul')
    if area and area.contents and area.contents[0] and h2.getText()[:-6] in areas:
        sib = h2.find_next_sibling('ul')
        for district in sib.findAll('li'):
            district_name = district.find('a')
            if district_name and district_name.contents:
                results.append(district_name.contents[0])
                district_neighborhoods = district.find('ul')
                if district_neighborhoods and district_neighborhoods.findAll:
                    for li in district_neighborhoods.findAll('li'):
                        c = li.find('a') and li.find('a').contents[0]
                        if c: 
                            results.append(c)

                            
print(len(results))
print(results[0:20])

275
['Bishan', 'Bishan East', 'Marymount', 'Upper Thomson', 'Bishan East', 'Marymount', 'Upper Thomson', 'Bukit Merah', 'Alexandra Hill', 'Alexandra North', 'Bukit Ho Swee', 'Bukit Merah', 'Depot Road', 'Everton Park', 'Henderson Hill', 'Kampong Tiong Bahru', 'Maritime Square', 'Singapore General Hospital', 'Tiong Bahru', 'Alexandra Hill']


#### Drop duplicates

In [3]:
results_set = set(results)
neighborhoods_list = list(results_set)
print(len(neighborhoods_list))

157


In [4]:
neighborhoods_list[0:5]

['Central Water Catchment',
 'Kampong Bugis',
 'Benoi Sector',
 'Bidadari',
 'Bukit Timah']

In [5]:
df = pd.DataFrame(neighborhoods_list)
df.columns = ['Neighborhood']
df.head()

Unnamed: 0,Neighborhood
0,Central Water Catchment
1,Kampong Bugis
2,Benoi Sector
3,Bidadari
4,Bukit Timah


### Combine wikipedia data with geocoder data

Since Wikipedia dataset does not include zip code coordinates, we should hydrate dataset with longitude and latitude from Geocoder in order to access Foursquare data.

In [6]:
address = 'Singapore'

In [7]:
enrich_neighborhoods_with_geocoder(df, address)
df.head()

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Geylang, Singapore',), **{}).
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 1317, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1244, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1290, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1239, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1026, in _send_output
    self.send(msg)
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 966, in send
    self.connect()
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1414, in connect
    server_hostname=server_hostname)
 

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Central Water Catchment,,
1,Kampong Bugis,1.30853,103.865871
2,Benoi Sector,1.319582,103.680792
3,Bidadari,1.335262,103.872167
4,Bukit Timah,1.35469,103.776372


In [8]:
# Geylang, Singapore
# 1.3201, 103.8918

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 3 columns):
Neighborhood    157 non-null object
Latitude        152 non-null float64
Longitude       152 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.8+ KB


In [11]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 1 to 156
Data columns (total 3 columns):
Neighborhood    152 non-null object
Latitude        152 non-null float64
Longitude       152 non-null float64
dtypes: float64(2), object(1)
memory usage: 4.8+ KB


### Visualize Singapore neighborhoods
<a id="vis-neighborhoods"></a>

In [12]:
m = map_neighborhoods(df, address)
m

In [13]:
df.sort_values('Longitude')

Unnamed: 0,Neighborhood,Latitude,Longitude
150,Tuas,1.329372,103.648236
72,Tuas View,1.329372,103.648236
105,Gul Circle,1.313587,103.664740
24,Jurong Island,1.259617,103.670471
136,Joo Koon,1.327735,103.678222
...,...,...,...
155,Punggol,1.398033,103.907331
20,Marine Parade,1.302689,103.907395
70,Lorong Halus,1.381749,103.919980
130,Coney Island,1.409418,103.921470


In [14]:
# drop geospatial outlier whose neighborhood is already represented
df.drop(128, inplace=True)

### Save neighborhood coordinates dataset

In [16]:
df.to_csv('data/singapore_neighborhood_coords.csv')

### Fetch neighborhood venues from Foursquare

In [17]:
venues = fetch_venues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )
ranked = rank_venues_by_frequency(venues)

      Neighborhood  Accessories Store  African Restaurant  Airport Terminal  \
0        Admiralty                0.0            0.000000               0.0   
1   Alexandra Hill                0.0            0.000000               0.0   
2  Alexandra North                0.0            0.000000               0.0   
3         Aljunied                0.0            0.083333               0.0   
4       Anchorvale                0.0            0.000000               0.0   

   American Restaurant  Arcade  Art Gallery  Art Museum  Arts & Crafts Store  \
0                  0.0     0.0     0.000000         0.0                  0.0   
1                  0.0     0.0     0.083333         0.0                  0.0   
2                  0.0     0.0     0.083333         0.0                  0.0   
3                  0.0     0.0     0.000000         0.0                  0.0   
4                  0.0     0.0     0.000000         0.0                  0.0   

   Asian Restaurant  ...  Vegetarian / Vegan

In [None]:
ranked

### Save ranked venues dataset

In [18]:
ranked.to_csv('data/singapore_neighborhood_venues_ranked.csv')