### Packages

In [44]:
import json # library to handle JSON files
import os
import time
import urllib.request

import folium # map rendering library
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans

### Scraping data from Wikipedia

In [26]:
url = 'https://en.wikipedia.org/wiki/Planning_Areas_of_Singapore'

# Fetch the article and parse it while the HTTP response is still open; the
# context manager closes the connection as soon as the HTML has been consumed
# instead of leaving the response object open for the rest of the session.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'lxml')



In [33]:
# Locate the table carrying the 'wikitable sortable' class on the parsed page.
wikitable = soup.find('table', class_ = 'wikitable sortable')

# Keep the first text node of the leading <td> for every row that has exactly
# 9 data cells; rows with any other cell count are skipped.
neighborhoodList = [
    cells[0].find(text = True)
    for cells in (row.find_all('td') for row in wikitable.find_all('tr'))
    if len(cells) == 9
]

### Getting the districts in Singapore

In [38]:
# Single-column frame holding the scraped names under 'Districts'.
df_SG = pd.DataFrame(data=neighborhoodList, columns=['Districts'])
df_SG.head()

Unnamed: 0,Districts
0,Ang Mo Kio
1,Bedok
2,Bishan
3,Boon Lay
4,Bukit Batok


### Getting the coordinates

In [45]:
def get_latlng(neighborhood, max_attempts=5, retry_delay=1.0):
    """Geocode a district name through ArcGIS and return its coordinates.

    The query sent to the geocoder is ``'<neighborhood>, Singapore, Singapore'``.

    Parameters
    ----------
    neighborhood : str
        Name of the district to look up.
    max_attempts : int, optional
        Maximum number of geocoding requests before giving up (default 5).
    retry_delay : float, optional
        Seconds to wait between two consecutive attempts (default 1.0).

    Returns
    -------
    The ``latlng`` value reported by the geocoder (a latitude/longitude pair).

    Raises
    ------
    RuntimeError
        If no attempt produced coordinates. The previous implementation looped
        forever in that situation, hanging the notebook.
    """
    query = '{}, Singapore, Singapore'.format(neighborhood)
    for attempt in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause before retrying so the service is not hit in a tight loop;
        # there is nothing to wait for after the final attempt.
        if attempt + 1 < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'Could not geocode {!r} after {} attempt(s)'.format(neighborhood, max_attempts))

In [46]:
# Geocode every district, preserving the row order of df_SG.
# NOTE(review): the recorded output of the next cell contains identical pairs
# for consecutive districts (1.36996, 103.99311 appears twice) — possibly the
# geocoder resolving different names to the same place; verify before relying
# on these coordinates.
coords = list(map(get_latlng, df_SG["Districts"].tolist()))

In [47]:
# Checking the coordinates: displays the whole list of [latitude, longitude]
# pairs returned by get_latlng, in the same order as df_SG["Districts"].
coords

[[1.3716100000000324, 103.84546000000006],
 [1.3242500000000632, 103.95297000000005],
 [1.3507900000000745, 103.85110000000009],
 [1.3469578206992554, 103.71275696250261],
 [1.349520000000041, 103.75277000000006],
 [1.2832199522478398, 103.81675993782109],
 [1.3787700000000314, 103.76977000000005],
 [1.3404100000000199, 103.77221000000009],
 [1.2904100000000653, 103.85211000000004],
 [1.3699600000000487, 103.99311000000006],
 [1.3699600000000487, 103.99311000000006],
 [1.3861600000000749, 103.74618000000004],
 [1.3143800000000283, 103.76537000000008],
 [1.3771599483526997, 103.95552993392594],
 [1.3114700000000425, 103.88218000000006],
 [1.371240000000057, 103.89162000000005],
 [1.3343700000000354, 103.74367000000007],
 [1.339490000000069, 103.70739000000003],
 [1.3094147802789686, 103.86673041726073],
 [1.4196700000000533, 103.70232000000004],
 [1.4125896854022786, 103.78968650996946],
 [1.2957900000000677, 103.89544000000006],
 [1.2785700000000588, 103.85762000000005],
 [1.3214799816

### New dataframe to store coordinates

In [48]:
# Each pair in `coords` becomes one row: first value -> Latitude, second -> Longitude.
df_coordinates = pd.DataFrame(data=coords, columns=['Latitude', 'Longitude'])

In [49]:
# Copy both coordinate columns onto the districts frame; the assignment aligns
# on the row index shared by df_SG and df_coordinates.
for column in ('Latitude', 'Longitude'):
    df_SG[column] = df_coordinates[column]

df_SG.head()

Unnamed: 0,Districts,Latitude,Longitude
0,Ang Mo Kio,1.37161,103.84546
1,Bedok,1.32425,103.95297
2,Bishan,1.35079,103.8511
3,Boon Lay,1.346958,103.712757
4,Bukit Batok,1.34952,103.75277


### Saving data for future use 

In [50]:
# Persist districts and coordinates; index=False leaves the row index out of the file.
df_SG.to_csv(path_or_buf="df_SG.csv", index=False)

## Creating a map of Singapore with the districts on top

In [51]:
address = 'Singapore, Singapore'

# Resolve the map centre with Nominatim.
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches the query; fail with a clear
# message instead of an AttributeError on the attribute accesses below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Singapore, Singapore {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Singapore, Singapore 1.357107, 103.8194992.


In [53]:
# Base map centred on the geocoded Singapore coordinates
map_SG = folium.Map(location=[latitude, longitude], zoom_start=11)

# Styling shared by every district marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# One circle marker per district, with the district name as its popup
district_rows = df_SG[['Latitude', 'Longitude', 'Districts']].itertuples(index=False, name=None)
for lat, lng, neighborhood in district_rows:
    popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_SG)

map_SG

### Using the Foursquare API

In [54]:
# Foursquare credentials are read from the environment so the secrets never
# have to be typed into (and saved with) the notebook. Each falls back to ''
# when its variable is unset, which matches the previous placeholder value.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [57]:
radius = 2000
LIMIT = 100

venues = []

# Endpoint and the query parameters that are identical for every district.
# Passing them through `params=` lets requests do the URL encoding instead of
# formatting the credentials into the URL string by hand.
url = 'https://api.foursquare.com/v2/venues/explore'
base_params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'radius': radius,
    'limit': LIMIT,
}

for lat, lng, neighborhood in zip(df_SG['Latitude'], df_SG['Longitude'], df_SG['Districts']):

    # make the GET request; the timeout stops a stalled connection from hanging
    # the notebook, and raise_for_status() surfaces HTTP errors (bad
    # credentials, quota exceeded) here rather than as a KeyError further down
    response = requests.get(
        url,
        params=dict(base_params, ll='{},{}'.format(lat, lng)),
        timeout=30)
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        categories = venue['categories']
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            # a venue with an empty category list is kept with category None
            # instead of aborting the whole download with an IndexError
            categories[0]['name'] if categories else None))

In [58]:
# Convert the venues list into a DataFrame, naming the columns at construction.
# Assigning the seven names afterwards raised a length-mismatch ValueError when
# `venues` was empty, because the empty frame has no columns to rename.
venues_df = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'])

venues_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Ang Mo Kio,1.37161,103.84546,Bishan - Ang Mo Kio Park,1.362219,103.84625,Park
1,Ang Mo Kio,1.37161,103.84546,NTUC FairPrice,1.371507,103.847082,Supermarket
2,Ang Mo Kio,1.37161,103.84546,Kam Jia Zhuang Restaurant,1.368167,103.844118,Asian Restaurant
3,Ang Mo Kio,1.37161,103.84546,Face Ban Mian 非板面 (Ang Mo Kio),1.372031,103.847504,Noodle House
4,Ang Mo Kio,1.37161,103.84546,Aramsa ~ The Garden Spa,1.362292,103.847602,Spa


In [59]:
# Non-null entries per column for each neighbourhood.
venues_df.groupby("Neighborhood").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ang Mo Kio,100,100,100,100,100,100
Bedok,100,100,100,100,100,100
Bishan,100,100,100,100,100,100
Boon Lay,100,100,100,100,100,100
Bukit Batok,98,98,98,98,98,98
Bukit Merah,100,100,100,100,100,100
Bukit Panjang,72,72,72,72,72,72
Bukit Timah,95,95,95,95,95,95
Central Water Catchment,100,100,100,100,100,100
Changi,57,57,57,57,57,57


### Checking the unique values

In [60]:
# Number of distinct values in the VenueCategory column
category_count = len(venues_df['VenueCategory'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 296 uniques categories.


In [61]:
# Show the first 25 distinct venue categories, in order of first appearance
venues_df['VenueCategory'].unique()[:25]

array(['Park', 'Supermarket', 'Asian Restaurant', 'Noodle House', 'Spa',
       'Snack Place', 'Burger Joint', 'College Cafeteria', 'Bakery',
       'Coffee Shop', 'BBQ Joint', 'Dog Run', 'Chinese Restaurant',
       'Sandwich Place', 'Gym', 'Hobby Shop', 'Seafood Restaurant', 'Bar',
       'Miscellaneous Shop', 'Fast Food Restaurant', 'Halal Restaurant',
       'Food Court', 'Ramen Restaurant', 'Modern European Restaurant',
       'Pool'], dtype=object)

In [63]:
# Check if the results contain "Hotel": exact-match membership test against the
# distinct VenueCategory values (the clustering below needs a "Hotel" column).
"Hotel" in venues_df['VenueCategory'].unique()

True

### Analyse the districts

In [66]:
# One indicator column per venue category (empty prefix keeps the bare category name)
SG_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Re-attach the neighbourhood of each venue
SG_onehot['Neighborhoods'] = venues_df['Neighborhood']

# Reorder so that the last column comes first, followed by all the others
last_column = SG_onehot.columns[-1]
SG_onehot = SG_onehot[[last_column, *SG_onehot.columns[:-1]]]

SG_onehot.head()

Unnamed: 0,Neighborhoods,ATM,Accessories Store,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,Aquarium,...,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zhejiang Restaurant,Zoo,Zoo Exhibit
0,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Grouping the districts and finding the frequency of each category

In [68]:
# Mean of every indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venues falling in each category.
neighborhood_groups = SG_onehot.groupby("Neighborhoods")
SG_grouped = neighborhood_groups.mean().reset_index()

print(SG_grouped.shape)
SG_grouped.head()

(55, 297)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,Aquarium,...,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zhejiang Restaurant,Zoo,Zoo Exhibit
0,Ang Mo Kio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
1,Bedok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bishan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Boon Lay,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01
4,Bukit Batok,0.0,0.010204,0.0,0.0,0.0,0.0,0.010204,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create new dataframe just for hotels

In [70]:
# Keep only the neighbourhood name and its "Hotel" frequency.
hotel_columns = ["Neighborhoods", "Hotel"]
SG_Hotel = SG_grouped.loc[:, hotel_columns]
SG_Hotel.head()

Unnamed: 0,Neighborhoods,Hotel
0,Ang Mo Kio,0.0
1,Bedok,0.02
2,Bishan,0.0
3,Boon Lay,0.0
4,Bukit Batok,0.0


## Clustering the neighborhoods

In [74]:
# set number of clusters
kclusters = 3

# Feature matrix: everything except the neighbourhood name. `columns=` is used
# because pandas 2.0 no longer accepts the axis as a positional argument
# (`drop([...], 1)` raises a TypeError there).
SG_clustering = SG_Hotel.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is set explicitly so the result does not
# depend on the scikit-learn version's default number of initialisations
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(SG_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int32)

In [76]:
# New frame: hotel frequencies plus the k-means label of each row, with the
# name column renamed to "Neighborhood". assign() returns a copy, so SG_Hotel
# itself is left untouched.
SG_merged = (
    SG_Hotel
    .assign(**{"Cluster Labels": kmeans.labels_})
    .rename(columns={"Neighborhoods": "Neighborhood"})
)
SG_merged.head()

Unnamed: 0,Neighborhood,Hotel,Cluster Labels
0,Ang Mo Kio,0.0,0
1,Bedok,0.02,0
2,Bishan,0.0,0
3,Boon Lay,0.0,0
4,Bukit Batok,0.0,0


### Merge the two datasets together

In [79]:

# Look up each neighbourhood's coordinates by matching "Neighborhood" against
# the district names used as the index of the right-hand frame.
district_coordinates = df_SG.set_index("Districts")
SG_merged = SG_merged.join(district_coordinates, on="Neighborhood")

print(SG_merged.shape)
SG_merged.head() # check the last columns!

(55, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Ang Mo Kio,0.0,0,1.37161,103.84546
1,Bedok,0.02,0,1.32425,103.95297
2,Bishan,0.0,0,1.35079,103.8511
3,Boon Lay,0.0,0,1.346958,103.712757
4,Bukit Batok,0.0,0,1.34952,103.75277


### Visualise the map

In [80]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster (the former `x`/`ys` arrays were only ever used for their length,
# which equals kclusters)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(SG_merged['Latitude'], SG_merged['Longitude'], SG_merged['Neighborhood'], SG_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # cluster-1 is kept from the original colouring: label 0 wraps around to
    # the last palette entry, label 1 takes the first, and so on
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Cluster Analysis

### Cluster 1

In [81]:
# Rows whose k-means label is 0 (presented under the "Cluster 1" heading)
SG_merged.loc[SG_merged['Cluster Labels'] == 0]


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Ang Mo Kio,0.0,0,1.37161,103.84546
1,Bedok,0.02,0,1.32425,103.95297
2,Bishan,0.0,0,1.35079,103.8511
3,Boon Lay,0.0,0,1.346958,103.712757
4,Bukit Batok,0.0,0,1.34952,103.75277
5,Bukit Merah,0.02,0,1.28322,103.81676
6,Bukit Panjang,0.0,0,1.37877,103.76977
7,Bukit Timah,0.0,0,1.34041,103.77221
9,Changi,0.017544,0,1.36996,103.99311
10,Changi Bay,0.017544,0,1.36996,103.99311


### Cluster 2

In [82]:
# Rows whose k-means label is 1 (presented under the "Cluster 2" heading).
# NOTE(review): in the recorded output, Central Water Catchment and Simpang
# share the exact coordinates 1.29041, 103.85211 and the same Hotel value —
# looks like both names were geocoded to the same point; confirm.
SG_merged.loc[SG_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
8,Central Water Catchment,0.16,1,1.29041,103.85211
22,Marina South,0.12,1,1.27857,103.85762
25,Newton,0.15,1,1.31218,103.83912
27,Novena,0.1,1,1.3191,103.84372
28,Orchard,0.11,1,1.30109,103.83965
35,River Valley,0.11,1,1.296855,103.834348
36,Rochor,0.12,1,1.30413,103.85029
41,Simpang,0.16,1,1.29041,103.85211
42,Singapore River,0.13,1,1.28971,103.84964
44,Straits View,0.1,1,1.279863,103.853595


### Cluster 3

In [83]:
# Rows whose k-means label is 2 (presented under the "Cluster 3" heading)
SG_merged.loc[SG_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
18,Kallang,0.07,2,1.309415,103.86673
21,Marina East,0.05,2,1.29579,103.89544
29,Outram,0.07,2,1.289241,103.835002
49,Toa Payoh,0.05,2,1.33448,103.85108
