In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd 
import numpy as np


#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#!pip install pgeocode
import pgeocode

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

# Remove pandas' display limits so frames are rendered without row/column
# truncation (None disables the cap for each option).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Wikipedia page that is scraped below for the "M" postal codes of Canada.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error (4xx/5xx)
# into an exception here instead of letting an error page reach the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Decoded body of the HTTP response (the page's HTML as a string).
# Note: the name `table` is rebound to a parsed <table> tag in a later cell.
table = response.text

In [5]:
# Parse the downloaded HTML. The parser is named explicitly: the generic
# 'html' feature lets Beautiful Soup pick whichever HTML parser happens to be
# installed, so the resulting tree could differ between environments.
soup = BeautifulSoup(table, 'html.parser')

In [6]:
# Every <a> element in the document. In the visible notebook only the
# commented-out inspection loop in the next cell refers to `tags`.
tags = soup.find_all('a')

In [7]:
#for tag in tags:
   # print(tag.get('href'))

In [8]:
# Accumulator for the parsed table cells; the parsing loop appends one dict
# per kept cell.
table_contents=[]

In [9]:
# First <table> element of the parsed document. From here on `table` is a
# parsed tag, no longer the raw HTML string assigned earlier.
# NOTE(review): presumably the postal-code grid is the first table on the
# page -- confirm against the live markup.
table = soup.find('table')

In [10]:
# Turn every <td> of the postal-code table into a
# {PostalCode, Borough, Neighborhood} record and build the DataFrame.
#
# The list is (re)created here so that re-running this cell does not append
# a second copy of every row.
table_contents = []

for row in table.find_all('td'):
    # Cells lacking the <span>/<p> children that are read below cannot be
    # parsed; skip them instead of failing with AttributeError on None.
    if row.span is None or row.p is None:
        continue

    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    # Text before the first '(' is used as the borough; the segment right
    # after it holds the neighbourhood list.
    pieces = span_text.split('(')
    borough = pieces[0]
    if len(pieces) > 1:
        neighborhood = (
            pieces[1]
            .strip(')')
            .replace('/', ',')
            .replace(')', '')
            .strip()  # the original .strip('') removed nothing
        )
    else:
        # No parenthesised part: fall back to the borough name rather than
        # raising IndexError.
        neighborhood = borough

    table_contents.append({
        'PostalCode': row.p.text[:3],
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

# Passing the columns explicitly keeps the 'Borough' lookup below valid even
# when no cell was parsed.
df = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])

# Some cells carry extra text fused onto the borough name; map those exact
# strings to readable labels.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

In [11]:
# Drop exact duplicate rows and renumber the index 0..n-1.
# The fresh index matters: a later cell joins `df` to the coordinate frame
# with pd.concat(axis=1), which aligns on index labels, so gaps left by
# dropped rows would shift or NaN-pad that join.
df = df.drop_duplicates().reset_index(drop=True)

In [12]:
# Report the (rows, columns) shape and preview the first five records.
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [13]:
# Query pgeocode's Canadian ('ca') dataset with all postal codes at once and
# keep the latitude / longitude columns of the answer.
postal_code = list(df['PostalCode'])

nomi = pgeocode.Nominatim('ca')
location = nomi.query_postal_code(postal_code)
latitude, longitude = location.latitude, location.longitude

In [14]:
# Place the two Series side by side as columns and coerce them to float.
coordinates = pd.concat([latitude, longitude], axis=1).astype(float)
print(coordinates.shape)
coordinates.head()

(103, 2)


Unnamed: 0,latitude,longitude
0,43.7545,-79.33
1,43.7276,-79.3148
2,43.6555,-79.3626
3,43.7223,-79.4504
4,43.6641,-79.3889


In [15]:
# Attach the coordinates to the postal-code table column-wise.
# pd.concat(axis=1) aligns on index labels, whereas the two frames are meant
# to correspond row by row (the lookup was fed df['PostalCode'] as a list).
# Resetting both indexes makes the join positional, and the length check
# fails loudly if the lookup did not return one row per postal code.
assert len(df) == len(coordinates), 'expected one coordinate row per postal code'
new_df = pd.concat(
    [df.reset_index(drop=True), coordinates.reset_index(drop=True)],
    axis=1,
)
print(new_df.shape)
new_df.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


In [16]:
# Discard every row that has a missing value in any column and renumber the
# index (presumably these are postal codes the lookup could not resolve --
# confirm). The last line displays the whole frame.
new_df = new_df.dropna().reset_index(drop=True)
print(new_df.shape)
new_df

(102, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
5,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282
6,M1B,Scarborough,"Malvern , Rouge",43.8113,-79.193
7,M3B,North York,Don MillsNorth,43.745,-79.359
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


In [17]:
# Number of distinct borough labels in the cleaned table.
n_boroughs = new_df['Borough'].nunique()
print('It looks like the Borough column has {} unique values'.format(n_boroughs))

It looks like the Borough column has 14 unique values


In [18]:
# Rows per borough, kept as a single count column (taken from PostalCode).
neighborhood_count = (
    new_df.groupby('Borough')[['PostalCode']]
    .count()
    .rename(columns={'PostalCode': 'Neighborhoods per Borough'})
)
neighborhood_count

Unnamed: 0_level_0,Neighborhoods per Borough
Borough,Unnamed: 1_level_1
Central Toronto,9
Downtown Toronto,17
Downtown Toronto Stn A,1
East Toronto,4
East Toronto Business,1
East York,4
East York/East Toronto,1
Etobicoke,11
Etobicoke Northwest,1
North York,24


In [19]:
# One-hot encode the borough label: one indicator column per borough.
# dtype=int pins the indicators to 0/1 integers; without it the result dtype
# depends on the installed pandas version (newer releases return booleans).
toronto_onehot = pd.get_dummies(new_df[['Borough']], prefix="", prefix_sep="", dtype=int)
print(toronto_onehot.shape)
# Preview only the first rows, as the later one-hot cell does, instead of
# rendering the whole frame.
toronto_onehot.head()

(102, 14)


Unnamed: 0,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,North York,Queen's Park,Scarborough,West Toronto,York
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Number of clusters; matches the 14 distinct borough labels reported above.
kclusters = 14

# n_init is pinned to 10 because its default changed between scikit-learn
# releases; with the value fixed, the fit (together with random_state=0) does
# not depend on the installed version's default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_onehot)

# Cluster label assigned to each row of toronto_onehot.
kmeans.labels_

array([ 1,  1,  2,  1, 13,  3,  0,  1,  8,  2,  1,  3,  0,  1,  8,  2,  6,
        3,  0,  7,  2,  6,  0,  8,  2,  2,  0,  1,  1,  8,  2,  5,  0,  1,
        1,  9,  2,  5,  0,  1,  1,  7,  2,  5,  0,  1,  1,  7,  2,  1,  1,
        0,  1,  1,  7,  1,  6,  1,  0,  1,  1,  4,  4,  6,  6,  0,  1,  4,
        4,  5,  3,  0,  1,  4,  4,  5,  3,  0,  4,  2,  5,  0,  4,  2,  0,
        4,  2,  3,  3,  0,  2, 10,  3, 12,  0,  2,  2,  3,  2, 11,  3,  3])

In [21]:
# Copy new_df and put the KMeans labels in front as the first column, so
# new_df itself does not gain the label column.
toronto_clustered = new_df.copy()
toronto_clustered.insert(loc=0, column='KmeanLabels', value=kmeans.labels_)

In [22]:
# Geocode the city itself to get a centre point for the maps below.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude_tor = location.latitude
longitude_tor = location.longitude
print('The geograpical coordinates for the city of Toronto are {}, {}.'.format(latitude_tor, longitude_tor))

The geograpical coordinates for the city of Toronto are 43.6534817, -79.3839347.


In [23]:
# Folium map centred on Toronto with one circle marker per row of
# toronto_clustered, coloured by its KMeans label.
map_clusters = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=11)

# `x` is not needed for the palette, but a later cell still reads it, so it
# stays defined here.
x = np.arange(kclusters)

# One evenly spaced rainbow colour per cluster, as hex strings. The previous
# `ys` list was only ever used for its length, which equals kclusters.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, neighborhood, borough, cluster in zip(
        toronto_clustered['latitude'],
        toronto_clustered['longitude'],
        toronto_clustered['Neighborhood'],
        toronto_clustered['Borough'],
        toronto_clustered['KmeanLabels']):
    label = folium.Popup(
        'Neighborhood: ' + str(neighborhood) + ' - Borough: ' + str(borough) + ' - Cluster: ' + str(cluster),
        parse_html=True)
    # Labels run 0..kclusters-1, so they index the palette directly; the old
    # `cluster - 1` sent label 0 to rainbow[-1] via negative indexing.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [24]:
# Separate copy of new_df for the borough-merging steps below, so new_df
# keeps its original borough labels.
mod_borough_df = new_df.copy()

In [25]:
# Display the raw borough labels as an array (inspection only; nothing is
# assigned).
mod_borough_df.Borough.values

array(['North York', 'North York', 'Downtown Toronto', 'North York',
       "Queen's Park", 'Etobicoke', 'Scarborough', 'North York',
       'East York', 'Downtown Toronto', 'North York', 'Etobicoke',
       'Scarborough', 'North York', 'East York', 'Downtown Toronto',
       'York', 'Etobicoke', 'Scarborough', 'East Toronto',
       'Downtown Toronto', 'York', 'Scarborough', 'East York',
       'Downtown Toronto', 'Downtown Toronto', 'Scarborough',
       'North York', 'North York', 'East York', 'Downtown Toronto',
       'West Toronto', 'Scarborough', 'North York', 'North York',
       'East York/East Toronto', 'Downtown Toronto', 'West Toronto',
       'Scarborough', 'North York', 'North York', 'East Toronto',
       'Downtown Toronto', 'West Toronto', 'Scarborough', 'North York',
       'North York', 'East Toronto', 'Downtown Toronto', 'North York',
       'North York', 'Scarborough', 'North York', 'North York',
       'East Toronto', 'North York', 'York', 'North York', 'Scarboroug

In [26]:
# Fold the one-off borough labels into the main boroughs.
# NOTE(review): 'East Toronto Business' is merged into 'Scarborough' rather
# than 'East Toronto' -- confirm this is intended.
borough_merge_map = {
    "Queen's Park": 'Downtown Toronto',
    'Downtown Toronto Stn A': 'Downtown Toronto',
    'Etobicoke Northwest': 'Etobicoke',
    'East Toronto Business': 'Scarborough',
    'East York/East Toronto': 'East York',
}
mod_borough_df['Borough'] = mod_borough_df['Borough'].replace(borough_merge_map)

In [27]:
# Preview the first five rows after the borough labels were merged.
mod_borough_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,Ontario Provincial Government,43.6641,-79.3889


In [28]:
# Number of distinct borough labels left after merging.
n_mod_boroughs = mod_borough_df['Borough'].nunique()
print('It looks like the Borough column has {} unique values'.format(n_mod_boroughs))

It looks like the Borough column has 9 unique values


In [29]:
# Rows per merged borough, kept as a single count column (taken from
# PostalCode).
new_neighborhood_count = (
    mod_borough_df.groupby('Borough')[['PostalCode']]
    .count()
    .rename(columns={'PostalCode': 'Neighborhoods per Borough'})
)
new_neighborhood_count

Unnamed: 0_level_0,Neighborhoods per Borough
Borough,Unnamed: 1_level_1
Central Toronto,9
Downtown Toronto,19
East Toronto,4
East York,5
Etobicoke,12
North York,24
Scarborough,18
West Toronto,6
York,5


In [30]:
# One-hot encode the merged borough label: one indicator column per borough.
# dtype=int pins the indicators to 0/1 integers; without it the result dtype
# depends on the installed pandas version (newer releases return booleans).
toronto_mod_onehot = pd.get_dummies(mod_borough_df[['Borough']], prefix="", prefix_sep="", dtype=int)
print(toronto_mod_onehot.shape)
toronto_mod_onehot.head()

(102, 9)


Unnamed: 0,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,North York,Scarborough,West Toronto,York
0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0


In [31]:
# Number of clusters; matches the 9 distinct borough labels left after the
# merge.
mod_kclusters = 9

# n_init is pinned to 10 because its default changed between scikit-learn
# releases; with the value fixed, the fit (together with random_state=0) does
# not depend on the installed version's default.
mod_kmeans = KMeans(n_clusters=mod_kclusters, n_init=10, random_state=0).fit(toronto_mod_onehot)

# Cluster label assigned to each row of toronto_mod_onehot.
mod_kmeans.labels_

array([1, 1, 2, 1, 2, 4, 0, 1, 6, 2, 1, 4, 0, 1, 6, 2, 7, 4, 0, 8, 2, 7,
       0, 6, 2, 2, 0, 1, 1, 6, 2, 5, 0, 1, 1, 6, 2, 5, 0, 1, 1, 8, 2, 5,
       0, 1, 1, 8, 2, 1, 1, 0, 1, 1, 8, 1, 7, 1, 0, 1, 1, 3, 3, 7, 7, 0,
       1, 3, 3, 5, 4, 0, 1, 3, 3, 5, 4, 0, 3, 2, 5, 0, 3, 2, 0, 3, 2, 4,
       4, 0, 2, 2, 4, 4, 0, 2, 2, 4, 2, 0, 4, 4])

In [32]:
# Copy mod_borough_df and put the KMeans labels in front as the first
# column, so mod_borough_df itself does not gain the label column.
mod_toronto_clustered = mod_borough_df.copy()
mod_toronto_clustered.insert(loc=0, column='KmeanLabels', value=mod_kmeans.labels_)

In [33]:
# Preview the first five rows, with KmeanLabels as the leading column.
mod_toronto_clustered.head()

Unnamed: 0,KmeanLabels,PostalCode,Borough,Neighborhood,latitude,longitude
0,1,M3A,North York,Parkwoods,43.7545,-79.33
1,1,M4A,North York,Victoria Village,43.7276,-79.3148
2,2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6555,-79.3626
3,1,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7223,-79.4504
4,2,M7A,Downtown Toronto,Ontario Provincial Government,43.6641,-79.3889


In [34]:
# Draw the merged-borough clustering on its own map.
# Previously this cell added its markers to `map_clusters`, which already
# holds the markers of the first clustering, so both results were overlaid.
# It also sized the palette from `kclusters` (14) rather than
# `mod_kclusters` (9) and relied on `x` left over from an earlier cell.
mod_map_clusters = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=11)

# One evenly spaced rainbow colour per cluster of this clustering, as hex
# strings.
colors_array = cm.rainbow(np.linspace(0, 1, mod_kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, neighborhood, borough, cluster in zip(
        mod_toronto_clustered['latitude'],
        mod_toronto_clustered['longitude'],
        mod_toronto_clustered['Neighborhood'],
        mod_toronto_clustered['Borough'],
        mod_toronto_clustered['KmeanLabels']):
    label = folium.Popup(
        'Neighborhood: ' + str(neighborhood) + ' - Borough: ' + str(borough) + ' - Cluster: ' + str(cluster),
        parse_html=True)
    # Labels run 0..mod_kclusters-1, so they index the palette directly; the
    # old `cluster - 1` sent label 0 to rainbow[-1] via negative indexing.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(mod_map_clusters)

mod_map_clusters