<H1>Segmenting and Clustering Neighborhoods in Toronto</H1>

## Import Libraries

In [25]:
from io import StringIO

import folium
import geocoder # import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

## 1. Download and Explore Dataset

We will scrape the postal-code table from the Wikipedia page using the BeautifulSoup package with its built-in HTML parser.

In [26]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse its HTML so the table can be extracted.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(wiki_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [27]:
# Take the first <table> element found in the parsed page.
table = soup.find(name='table')

In [28]:
# Convert the scraped <table> markup into DataFrames. read_html returns a list
# of DataFrames; the first one is used. The markup is wrapped in StringIO
# because newer pandas releases deprecate passing a literal HTML string.
tables = pd.read_html(StringIO(str(table)))
df = tables[0]

In [29]:
# Keep only rows whose borough is assigned. reset_index(drop=True) renumbers the
# remaining rows 0..n-1 and returns a new frame, so later cells that attach
# columns built from plain lists line up row-for-row with this table.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)

The HTML table is converted to a DataFrame; below is a summary of the final data after removing all rows whose borough is "Not assigned".

In [30]:
# Rows whose postal code repeats that of an earlier row (first occurrence excluded).
dupl = df.loc[df.duplicated(subset='Postal Code')]
# Summary statistics of the table.
df.describe()

Unnamed: 0,Postal Code,Borough,Neighborhood
count,103,103,103
unique,103,10,98
top,M4W,North York,Downsview
freq,1,24,4


In [7]:
!pip install geocoder



In [40]:
!pip install folium

Collecting folium
  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


### Using geocoder to fetch the latitude and longitude of each postal code

In [16]:
def get_latlng(neighborhood, max_attempts=10, retry_delay=1.0):
    """Geocode a Toronto location with the ArcGIS provider.

    Parameters
    ----------
    neighborhood : str
        Text placed in front of ", Toronto, Ontario" to build the query
        (this notebook passes postal codes).
    max_attempts : int, optional
        How many times to query the service before giving up.
    retry_delay : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    list
        The ``latlng`` value of the geocoder result, i.e. ``[latitude, longitude]``.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained within ``max_attempts`` tries.
    """
    import time  # local import: only needed for the pause between retries

    query = '{}, Toronto, Ontario'.format(neighborhood)
    # Bounded retry: the service can return no result for a query, and an
    # unbounded loop would then hammer it forever and hang the notebook.
    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis(query)
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        if attempt < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )
# Geocode every postal code; results stay in the same order as the rows of df.
coords = list(map(get_latlng, df["Postal Code"].tolist()))
coords

[[43.75293455500008, -79.33564142299997],
 [43.72810248500008, -79.31188987099995],
 [43.65096410900003, -79.35304116399999],
 [43.723265465000054, -79.45121077799996],
 [43.66179000000005, -79.38938999999993],
 [43.66748067300006, -79.52895286499995],
 [43.80862623100006, -79.18991284599997],
 [43.74890000000005, -79.35721999999998],
 [43.70719267700008, -79.31152927299996],
 [43.65749059800004, -79.37752923699998],
 [43.70727872700007, -79.44750009299997],
 [43.65002250300006, -79.55408903099999],
 [43.78577865700004, -79.15736763799998],
 [43.72214339800007, -79.35202341799999],
 [43.68974004200004, -79.30850701899999],
 [43.65173364700007, -79.37555358799995],
 [43.69172991700003, -79.43001279899994],
 [43.637813150000056, -79.57648363299995],
 [43.76580607300008, -79.18528434099994],
 [43.67814827600006, -79.29534930999995],
 [43.645195888000046, -79.37385548899994],
 [43.68911756600005, -79.45065043699998],
 [43.77154467100007, -79.21813521299998],
 [43.70941386000004, -79.363099

In [38]:
# Turn the list of [lat, lng] pairs into a two-column frame.
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
assert len(df_coords) == len(df), "expected one coordinate pair per row of df"
# Assign by position (.values) rather than by index label: df_coords has a fresh
# 0..n-1 index, and label alignment would misplace coordinates or leave NaN
# whenever df's own index is not exactly 0..n-1 (e.g. after row filtering).
df['Latitude'] = df_coords['Latitude'].values
df['Longitude'] = df_coords['Longitude'].values
print(df.shape)

(103, 6)


### Glimpse of the final Dataset 

In [37]:
# Show the table without the helper 'index' column. errors='ignore' keeps the
# cell working when that column is absent (no visible cell creates it).
df.drop(columns=['index'], errors='ignore')

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.311890
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.661790,-79.389390
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653340,-79.509766
99,M4Y,Downtown Toronto,Church and Wellesley,43.666659,-79.381472
100,M7Y,East Toronto,Business reply mail Processing Centre,43.648700,-79.385450
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.632798,-79.493017


In [47]:
# Centre the map on the midpoint of the data's bounding box. Pairing the
# maximum latitude with the minimum longitude would instead pick the
# north-west corner of the data, leaving the markers off-centre.
# The Series min()/max() methods also skip NaN, unlike the builtins.
latitude = (df['Latitude'].min() + df['Latitude'].max()) / 2
longitude = (df['Longitude'].min() + df['Longitude'].max()) / 2

In [48]:
# Base map centred on the previously computed latitude/longitude.
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# Shared appearance of every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row of df, with the neighborhood name as its popup.
for lat, lng, name in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(name, parse_html=True),
        **marker_style
    ).add_to(map_manhattan)

map_manhattan

In [62]:
# New frame without the helper 'index' column. errors='ignore' keeps the cell
# runnable when that column does not exist (no visible cell creates it).
df_toronto = df.drop(columns=['index'], errors='ignore')

In [64]:
# A bare `df_toronto.shape` on a non-final line is evaluated but never shown,
# so print it explicitly before previewing the first rows.
print(df_toronto.shape)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939


In [65]:
# Average the coordinates of rows that share a neighborhood name. The numeric
# columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns (postal code, borough) in mean() and raises instead.
toronto_grouped = (
    df_toronto
    .groupby('Neighborhood')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Agincourt,43.793930,-79.265694
1,"Alderwood, Long Branch",43.600895,-79.540387
2,"Bathurst Manor, Wilson Heights, Downsview North",43.757394,-79.442394
3,Bayview Village,43.780607,-79.376921
4,"Bedford Park, Lawrence Manor East",43.735447,-79.417944
...,...,...,...
93,"Willowdale, Newtonbrook",43.791800,-79.406428
94,Woburn,43.771545,-79.218135
95,Woodbine Heights,43.689740,-79.308507
96,York Mills West,43.750260,-79.398355


## 2. K means clustering and plotting the final segmented neighborhoods

In [67]:

# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns by name (rather
# than dropping 'Neighborhood') keeps the feature matrix unchanged even after a
# 'Cluster Labels' column has been added to toronto_grouped, and avoids the
# positional `axis` argument of DataFrame.drop that pandas 2.0 removed.
toronto_grouped_clustering = toronto_grouped[['Latitude', 'Longitude']]

# run k-means clustering; n_init is pinned because scikit-learn's default
# changed between releases, and random_state makes the labels repeatable
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([3, 2, 4, 4, 4, 0, 1, 0, 0, 0])

In [73]:
# toronto_merged is another name for the same DataFrame object as
# toronto_grouped (no copy is made), so the column added below appears in both.
toronto_merged = toronto_grouped

# Attach the k-means label of each row as a new 'Cluster Labels' column.
toronto_merged['Cluster Labels'] = kmeans.labels_

toronto_merged.head()

Unnamed: 0,Cluster Labels,Neighborhood,Latitude,Longitude
0,3,Agincourt,43.79393,-79.265694
1,2,"Alderwood, Long Branch",43.600895,-79.540387
2,4,"Bathurst Manor, Wilson Heights, Downsview North",43.757394,-79.442394
3,4,Bayview Village,43.780607,-79.376921
4,4,"Bedford Park, Lawrence Manor East",43.735447,-79.417944


In [77]:

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(
    toronto_merged['Latitude'],
    toronto_merged['Longitude'],
    toronto_merged['Neighborhood'],
    toronto_merged['Cluster Labels'],
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself (label k -> k-th colour) instead
    # of `cluster - 1`, which only worked for label 0 through negative-index
    # wraparound to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters