# Segmenting and Clustering Neighborhoods in Toronto

#### create a BeautifulSoup object

In [8]:
# parse the locally saved HTML copy of the Wikipedia page
from bs4 import BeautifulSoup

# a context manager closes the file handle even if reading fails
with open("List of postal codes of Canada_ M - Wikipedia.html", "rb") as readf:
    contents = readf.read()

# the raw bytes are handed to BeautifulSoup, which parses them with lxml
soup = BeautifulSoup(contents, "lxml")

#### retrieve the postal-code table from the parsed BeautifulSoup object

In [9]:
# look up the <table> whose class attribute is "wikitable sortable"
table_attrs = {'class': 'wikitable sortable'}
mytable = soup.find('table', attrs=table_attrs)
print(mytable)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

#### extract all `td` cells and save the data to CSV

In [70]:
# Walk the table's <td> cells three at a time (postcode, borough, neighbourhood),
# merge the neighbourhoods that share a postcode/borough pair, and write one
# ';'-separated line per pair to neighborhoods.csv.
items = mytable.find_all('td')
results = {}   # "postcode-borough" -> comma-joined neighbourhood names
temp = []      # cells of the row currently being assembled
for item in items:
    # get_text() yields the visible text whether or not the cell wraps it in a
    # link, so the markup no longer has to be split apart by hand
    text = item.get_text().strip()
    temp.append(text)
    if len(temp) == 3:
        # rows without an assigned borough are skipped
        if temp[1] != 'Not assigned':
            name = "%s-%s" % (temp[0], temp[1])
            if name not in results:
                results[name] = temp[2]
            else:
                results[name] += ',%s' % temp[2]
        temp = []

# The with-block flushes and closes the file, so the CSV is complete on disk
# before any later cell reads it back; the encoding is fixed so the output does
# not depend on the platform default.
with open('neighborhoods.csv', 'w', encoding='utf-8') as outfile:
    for key, val in results.items():
        # split only on the first '-' so a hyphen inside a borough name is kept
        post, boro = key.split('-', 1)
        # a lone 'Not assigned' neighbourhood falls back to the borough name
        if val == 'Not assigned':
            val = boro
        outfile.write("%s;%s;%s\n" % (post, boro, val))

#### load data from csv

In [73]:
# import libraries
import numpy as np
import pandas as pd

# Read the ';'-separated neighborhoods.csv; the file has no header row, so the
# column names are supplied explicitly.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.read_csv('neighborhoods.csv', sep=';', header=None, names=column_names)
print('df shape is', df.shape)

df shape is (103, 3)


In [76]:
# preview the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


#### get coordinates

In [79]:
# load the postal code -> latitude/longitude lookup table
geo_data = pd.read_csv('Geospatial_Coordinates.csv')
# preview the first rows instead of rendering the whole frame
geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [117]:
# Attach coordinates to each neighbourhood row by matching df's 'PostalCode'
# against geo_data's 'Postal Code'. join() returns a new frame, so no
# intermediate alias of df is needed and df itself is left unchanged.
merged_data = df.join(geo_data.set_index('Postal Code'), on='PostalCode')
# preview the first rows instead of rendering the whole frame
merged_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [97]:
import folium

In [118]:
# Centre the map on the first neighbourhood. The coordinates are looked up by
# column label rather than by integer position, so the result does not depend
# on the column order of merged_data.
latitude = merged_data['Latitude'].iloc[0]
longitude = merged_data['Longitude'].iloc[0]

# create map for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per neighbourhood row
for lat, lng, label in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['Neighborhood']):
    # parse_html is a Popup argument, so it is passed only there
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

#### cluster neighborhoods

In [120]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# cluster on the geographic coordinates only
toronto_grouped_clustering = merged_data[['Latitude', 'Longitude']]

# run k-means clustering; random_state fixes the seed, and n_init is spelled
# out because its default differs between scikit-learn releases, which would
# otherwise make the labels depend on the installed version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2])

In [123]:
# add clustering labels
# Work on a copy: insert() modifies the frame in place and raises ValueError
# when the column already exists, so inserting into merged_data through an
# alias would change merged_data and fail on a second run of this cell.
toronto_data = merged_data.copy()

toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_data.head(10)  # 'Cluster Labels' is inserted as the first column

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,1,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,4,M3B,North York,Don Mills North,43.745906,-79.352188
8,4,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,2,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [124]:
# visualize the resulting clusters

import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_data['Latitude'],
                                  toronto_data['Longitude'],
                                  toronto_data['Neighborhood'],
                                  toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index the palette directly by the cluster label; the previous
    # `cluster-1` mapped label 0 to the last colour via negative indexing
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters