In [1]:
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Wikipedia page listing the "M" postal codes with their borough and neighbourhood.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
res.raise_for_status()

Using BeautifulSoup and requests to scrape the table into a pandas dataframe

In [3]:
# Parse the downloaded page bytes with the 'html.parser' backend.
soup = BeautifulSoup(res.content, 'html.parser')

In [4]:
# Take the first <table> element found on the page.
table = soup.find_all('table')[0]
# read_html returns a list of DataFrames.  Wrapping the markup in StringIO
# avoids the deprecated "literal HTML string" input form.
start = pd.read_html(StringIO(str(table)))

In [5]:
# read_html produced a list; its first entry is the scraped table.
first_table = start[0]
df = pd.DataFrame(first_table)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


There is only one neighbourhood that is not assigned but has an assigned borough.
Instead of just changing it, this code is designed to change all values if more values are ever added.

In [6]:
# Rows with a known borough but no neighbourhood inherit the borough name.
borough_known = df['Borough'].ne('Not assigned')
neighbourhood_missing = df['Neighbourhood'].eq('Not assigned')
needs_fill = neighbourhood_missing & borough_known
df.loc[needs_fill, 'Neighbourhood'] = df.loc[needs_fill, 'Borough']



Now we remove all rows whose neighbourhood is still not assigned

In [7]:
# Keep only the rows that have a neighbourhood value.
has_neighbourhood = df['Neighbourhood'].ne('Not assigned')
df = df.loc[has_neighbourhood]
df.shape

(210, 3)

Group all rows that share a postcode and borough, joining their neighbourhood names into a single comma-separated string

In [8]:
# Collapse rows sharing a postcode/borough into one row whose neighbourhood
# names are joined with ", ".
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

In [9]:
# Row and column counts of the grouped frame.
df.shape

(103, 3)

In [10]:
# Preview the first rows of the grouped frame.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
# Latitude/longitude lookup table, one row per postal code.
geo_csv_url = 'http://cocl.us/Geospatial_data'
df_latlng = pd.read_csv(geo_csv_url)
df_latlng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Give the key column the same name as in df so the two frames can be merged.
df_latlng = df_latlng.rename(columns={'Postal Code': 'Postcode'})
df_latlng.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach coordinates by postcode.  merge defaults to an inner join, so a
# postcode missing from either frame is dropped.
df = pd.merge(df, df_latlng, on='Postcode')

In [14]:
# Preview the merged frame, now including Latitude and Longitude.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Next we use folium to draw a map of Toronto; the city's latitude and longitude come from the Nominatim geocoder

In [15]:
# Geocode the city name to get a centre point for the maps.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; raise a clear error here rather
# than an AttributeError on the attribute lookups below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
tor_lat = location.latitude
tor_long = location.longitude


In [16]:
# Base map centred on Toronto with one circle marker per row of df.
toronto_map = folium.Map(location=[tor_lat, tor_long], zoom_start=10)
for lat, lng, borough, neighbourhood in zip(
    df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']
):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is passed
    # only to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)

In [17]:
# Render the map as the cell output.
toronto_map

One-hot encoding of the borough column

In [18]:
# One indicator column per borough; the empty prefix keeps the bare borough
# names as column headers.
borough_dummies = pd.get_dummies(df[['Borough']], prefix="", prefix_sep="")

# Place the neighbourhood name in front of the indicator columns.
ohe_df = pd.concat([df[['Neighbourhood']], borough_dummies], axis=1)



In [19]:
# Row and column counts of the one-hot frame.
ohe_df.shape

(103, 12)

In [20]:
# Work on a copy so that inserting columns into df_grouped later does not
# also modify ohe_df.
df_grouped = ohe_df.copy()
# Show a preview rather than the whole frame.
df_grouped.head(10)

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,1,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,1,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,1,0,0
3,Woburn,0,0,0,0,0,0,0,0,1,0,0
4,Cedarbrae,0,0,0,0,0,0,0,0,1,0,0
5,Scarborough Village,0,0,0,0,0,0,0,0,1,0,0
6,"East Birchmount Park, Ionview, Kennedy Park",0,0,0,0,0,0,0,0,1,0,0
7,"Clairlea, Golden Mile, Oakridge",0,0,0,0,0,0,0,0,1,0,0
8,"Cliffcrest, Cliffside, Scarborough Village West",0,0,0,0,0,0,0,0,1,0,0
9,"Birch Cliff, Cliffside West",0,0,0,0,0,0,0,0,1,0,0


In [21]:
# Feature matrix for clustering: everything except the neighbourhood name.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
ohe_cluster = df_grouped.drop(columns='Neighbourhood')
ohe_cluster.shape

(103, 11)

In [22]:
# Number of clusters to fit.
kclusters = 4

# random_state fixes the centroid initialisation.  n_init is pinned because
# scikit-learn's default changed from 10 to 'auto' in version 1.4, which would
# otherwise make the fit depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(ohe_cluster)

In [23]:
# Cluster labels assigned to the first ten rows.
kmeans.labels_[0:10]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [24]:
# Note: plain assignment binds a second name to the same DataFrame object;
# it does not copy the data.
new_df = df



In [25]:
# DataFrame.insert raises if the column already exists, so drop any label
# column left by a previous run; this keeps the cell re-runnable.
if 'Cluster Labels' in df_grouped.columns:
    df_grouped = df_grouped.drop(columns='Cluster Labels')
df_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

In [26]:
# df_grouped was derived from df, so the two frames share a row index.  Joining
# on that index attaches each row's own cluster label; joining on the
# neighbourhood name would duplicate rows whenever two postcodes share a name.
# Starting from df (rather than new_df) also keeps the cell re-runnable.
new_df = df.join(df_grouped.drop(columns='Neighbourhood'))

new_df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,3,0,0,0,0,0,0,0,0,1,0,0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,3,0,0,0,0,0,0,0,0,1,0,0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,3,0,0,0,0,0,0,0,0,1,0,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,3,0,0,0,0,0,0,0,0,1,0,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3,0,0,0,0,0,0,0,0,1,0,0


Let's map our clusters.

In [27]:
map_clusters = folium.Map(location=[tor_lat, tor_long], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(
    new_df['Latitude'],
    new_df['Longitude'],
    new_df['Neighbourhood'],
    new_df['Cluster Labels'],
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the palette
    # directly; no offset (and no negative-index wraparound) is needed.
    cluster_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

It looks like our neighbourhoods are properly separated. We have north, east, south and west neighbourhoods.