<h1>Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto - Andrew Lau</h1>
<h2>Part 1 - Getting data from Wikipedia</h2>

In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
website_url = requests.get(wiki).text
soup = BeautifulSoup(website_url,'lxml')

toronto = soup.find_all('table', class_ = 'wikitable sortable')

In [37]:
Col1=[]
Col2=[]
Col3=[]

for tr in soup.find_all('tr'):
    td = tr.find_all('td')
    if len(td)==3:
        Col1.append(td[0].find(text=True))
        Col2.append(td[1].find(text=True))
        Col3.append(td[2].find(text=True))

In [38]:
df = pd.DataFrame()
df['PostalCode'] = Col1
df['Borough'] = Col2
df['Neighbourhood'] = Col3
print('The shape of dataframe ', df.shape)
df.head()

The shape of dataframe  (289, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Borough with not assigned are dropped and dataframe is reset index

In [39]:
df1 = df[df.Borough != 'Not assigned']
df1 = df1.sort_values(by=['PostalCode','Borough'])

df1.reset_index(inplace=True)
df1.drop('index',axis=1,inplace=True)

df1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


Rows with same Postal Code are grouped the neighbourhood separated by comma

In [40]:
df2 = df1.groupby('PostalCode', as_index=False).agg(lambda x: ', '.join(set(x.dropna())))

Rows with Neighbourhood not assigned will be replaced with the Borough name, e.g. Queen's Park

In [41]:
df2['Neighbourhood'] = np.where(df2['Neighbourhood'] == 'Not assigned', df2['Borough'], df2['Neighbourhood'])

In [42]:
print('The shape of dataframe ', df2.shape)
df2.head()

The shape of dataframe  (103, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood\n, West Hill, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h2>Part 2 - Get geographical coordinates into dataframe

In [43]:
Geo_df = pd.read_csv('http://cocl.us/Geospatial_data')
Geo_df.columns = ['PostalCode', 'Latitude', 'Longitude']

In [44]:
df3 = pd.merge(df2, Geo_df, on=['PostalCode'], how='inner')

In [45]:
print('The shape of dataframe ', df3.shape)
df3.head()

The shape of dataframe  (103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, West Hill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<h2>Part 3 - Explore and cluster the neighborhoods in Toronto</h2>

In [46]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Libraries imported.


In [47]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [48]:
df4 = df3.copy()
df4 = df4[df2.Borough.str.contains("Toronto")]
df4.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West\n, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West\n, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District\n,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [49]:
# set number of clusters
kclusters = 4

X = df4['Latitude']
Y = df4['Longitude']
Z = np.stack((X, Y), axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Z)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype=int32)

In [50]:
# create map
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

colors = ['red', 'orange', 'yellow', 'green']
df4['Cluster'] = kmeans.labels_

for lat, long, borough, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)  

toronto_map

4 clusters are formed as 4 distinct districts in Toronto. The central district (orange color) contains more postal codes and thus more condensed. It is expected that this district should have higher population than other 3 districts.