### Part 1 - Create Dataframe

In [86]:
#Import libraries
import pandas as pd
import numpy as np
import requests

In [87]:
#Save Wikipedia page as a variable "dfs"
dfs = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
#First table on page is saved as dataframe "df"
df = dfs[0]
#Get number of rows
df.shape

(289, 3)

In [88]:
#Find the number of "Not assigned" Neighbourhoods
(df['Neighbourhood']=='Not assigned').value_counts()

False    211
True      78
Name: Neighbourhood, dtype: int64

In [89]:
#Find the number of "Not assigned" Boroughs
(df['Borough']=='Not assigned').value_counts()

False    212
True      77
Name: Borough, dtype: int64

In [90]:
#Remove all "Boroughs" that are "Not assigned"
df.drop(df[df.Borough=='Not assigned'].index,inplace=True)
(df['Borough']=='Not assigned').value_counts()

False    212
Name: Borough, dtype: int64

In [91]:
#However, we stil have one Neighbourhoods which is "Not assigned"
(df['Neighbourhood']=='Not assigned').value_counts()

False    211
True       1
Name: Neighbourhood, dtype: int64

In [92]:
#Let's copy the values from Boroughs for all Neighborhoods that are "Not assigned"
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])
(df['Neighbourhood']=='Not assigned').value_counts()

False    212
Name: Neighbourhood, dtype: int64

In [93]:
#Group by "Postcode" and "Borough" and aggregate
df = df.groupby(['Postcode','Borough'], sort = False).agg(lambda x: ', '.join(x))
df.shape

(103, 1)

### Part 2 - Import location

In [94]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront, Regent Park"
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Queen's Park


In [95]:
#Let's reset the index
df = df.reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [96]:
#Let's load the csv file with the geographical coordinates of each postal code:
dfg = pd.read_csv("https://cocl.us/Geospatial_data")
dfg.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [97]:
#Let's merge both dataframes using the "Postal Code" column
dfm = pd.merge(df, dfg, how='outer', left_on="Postcode", right_on="Postal Code", validate='m:1')
dfm.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


In [98]:
#We have to columns for the Postal Code, let's drop the "Postcode" column
dfm.drop(['Postcode'], axis=1, inplace=True)
dfm.head(15)

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
3,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,Queen's Park,Queen's Park,M7A,43.662301,-79.389494
5,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
6,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
7,North York,Don Mills North,M3B,43.745906,-79.352188
8,East York,"Woodbine Gardens, Parkview Hill",M4B,43.706397,-79.309937
9,Downtown Toronto,"Ryerson, Garden District",M5B,43.657162,-79.378937


### Part 3 - Clustering

In [99]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [100]:
dfc = dfm

In [101]:
dfc.drop(['Borough', 'Neighbourhood', 'Postal Code'], axis=1, inplace=True)
dfc.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


In [102]:
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 
from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

In [103]:
dfc.shape

(103, 2)

In [104]:
X = dfc.values[:,1:]
X = np.nan_to_num(X)
cluster_dataset = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

In [105]:
num_clusters = 3
k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_
print(labels)

[0 2 0 1 0 1 2 0 2 0 0 1 2 0 2 0 0 1 2 2 0 0 2 0 0 0 2 0 0 0 0 0 2 0 1 0 0
 0 2 0 1 0 0 0 2 0 1 2 0 1 1 2 0 1 0 0 1 1 2 0 1 0 0 1 1 2 0 0 0 1 1 2 0 0
 0 1 1 1 2 0 0 1 2 0 0 2 0 0 1 1 2 0 0 1 1 2 0 0 1 0 2 1 1]


In [106]:
dfc["Labels"] = labels
dfc.head(5)

Unnamed: 0,Latitude,Longitude,Labels
0,43.753259,-79.329656,0
1,43.725882,-79.315572,2
2,43.65426,-79.360636,0
3,43.718518,-79.464763,1
4,43.662301,-79.389494,0


In [107]:
#Let's load the folium library
!conda install -c conda-forge folium=0.5.0 --yes
import folium
print('Folium installed and imported!')

Collecting package metadata: done
Solving environment: done

# All requested packages already installed.

Folium installed and imported!


In [108]:
#Let's define the center point for our Toronto Map:
latitude = 43.731136
longitude = -79.371932
tdot_map = folium.Map(location=[latitude, longitude], zoom_start=11)

In [109]:
# instantiate a feature group for the labels in "dfc"
labels = folium.map.FeatureGroup()

#define a color for each label:
colordict = {0: 'lightblue', 1: 'lightgreen', 2: 'orange'}

# add all points to map
for lat, lng, label in zip(dfc.Latitude, dfc.Longitude, dfc.Labels):
    labels.add_child(
        folium.features.CircleMarker(
            [lat, lng, label],
            radius=5, # define how big you want the circle markers to be
            color='grey',
            fill=True,
            fill_color=colordict[label],
            fill_opacity=1
        )
    )
tdot_map.add_child(labels)