In [8]:
from io import StringIO

import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [11]:
# Download the raw HTML of the Wikipedia page that lists the "M" postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the kernel from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of handing an error page to the parser.
response.raise_for_status()
website_url = response.text

In [None]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the lxml parser.
html_soup = BeautifulSoup(website_url, 'lxml')

In [None]:
# Take the first <table> element of the page and serialise it back to HTML text.
first_table = html_soup.table
if first_table is None:
    # Without this guard str(None) would pass the text 'None' downstream and
    # the failure would only surface later as a confusing parser error.
    raise ValueError('No <table> element found in the downloaded page')
postal_code_table = str(first_table)

In [None]:
# Parse the table HTML into DataFrames and keep the first one.
# The markup is wrapped in StringIO because passing literal HTML text straight
# to pd.read_html is deprecated in recent pandas releases.
dfs = pd.read_html(StringIO(postal_code_table))
df = dfs[0]
df.head()

In [None]:
# Give the postal-code column the single-word name that later cells refer to.
column_aliases = {'Postal code': 'Postalcode'}
df.rename(columns=column_aliases, inplace=True)
df.head()

In [None]:
# Remove every row whose borough is the placeholder value 'Not assigned'.
unassigned_rows = df.index[df['Borough'] == 'Not assigned']
df.drop(index=unassigned_rows, inplace=True)
df.head()

In [None]:
# Group the neighbourhoods that share a postal code and borough, joining the
# remaining text columns of each group with ', '. sort=False keeps the groups
# in their order of first appearance.
df = df.groupby(['Postalcode', 'Borough'], sort=False).agg(', '.join)
# Replace the ' / ' separators with commas. regex=False states explicitly that
# the pattern is literal text, because the default of `regex` differs between
# pandas versions.
df['Neighborhood'] = df['Neighborhood'].str.replace(' / ', ', ', regex=False)
# Turn the group keys back into ordinary columns.
df = df.reset_index()
df.head()

In [None]:
# Where the neighbourhood is the placeholder 'Not assigned', use the borough
# name instead; every other neighbourhood value is left as it is.
is_unassigned = df['Neighborhood'] == 'Not assigned'
df['Neighborhood'] = df['Neighborhood'].mask(is_unassigned, df['Borough'])
df

In [None]:
# Show the (rows, columns) dimensions of df.
df.shape

In [None]:
# Associating a latitude and longitude with each postal code in Toronto

In [9]:
# Read the postal-code coordinate table straight from its URL and preview it.
geodata_url = 'https://cocl.us/Geospatial_data'
geodata = pd.read_csv(geodata_url)
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Align the key column name with df, then attach the coordinates to each row.
# NOTE: this cell needs `df` from the scraping/cleaning cells above; running it
# on a fresh kernel before those cells raises NameError.
key_aliases = {'Postal Code': 'Postalcode'}
geodata.rename(columns=key_aliases, inplace=True)
# Default (inner) join on the shared 'Postalcode' column.
df_geo = df.merge(geodata, on='Postalcode')
df_geo.head()

NameError: name 'df' is not defined

In [None]:
# Keep only the rows whose borough name contains the literal text 'Toronto'.
# na=False treats a missing borough as "no match" so the boolean mask never
# contains NaN, and .copy() makes df_to an independent frame that later cells
# can add columns to without touching df_geo.
in_toronto = df_geo['Borough'].str.contains('Toronto', regex=False, na=False)
df_to = df_geo[in_toronto].copy()
df_to.head()

In [None]:
# Base map centred on the hard-coded coordinates below
# (presumably central Toronto -- confirm if the study area changes).
To_map = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# Draw one circle marker per row of df_to, with a "neighbourhood, borough" popup.
for lat, lng, borough, neighborhood in zip(
        df_to['Latitude'], df_to['Longitude'], df_to['Borough'], df_to['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True belongs to Popup; the stray parse_html=False that used to
    # be passed to CircleMarker is not one of its parameters and was removed.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='#003153',
        fill=True,
        fill_color='#003153',
        fill_opacity=0.7,
    ).add_to(To_map)

To_map

In [None]:
# Cluster the rows of df_to into k groups using only their coordinates.
k = 5
to_cluster = df_to[['Latitude', 'Longitude']]
# n_init is pinned because its default differs between scikit-learn versions;
# random_state=0 keeps the labelling reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(to_cluster)
# Drop any label column left by an earlier run so the cell can be re-executed;
# DataFrame.insert raises ValueError when the column already exists.
df_to = df_to.drop(columns='Cluster Labels', errors='ignore')
df_to.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# Create the map, centred on the same hard-coded coordinates as the marker map.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# Colour scheme: k evenly spaced colours from the rainbow colormap, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per row, coloured by its cluster label.
for lat, lon, cluster in zip(df_to['Latitude'], df_to['Longitude'], df_to['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so `cluster - 1` gives cluster 0 the last colour
    # (index -1) and shifts the others down by one. Each cluster still gets its
    # own colour; the offset is kept so the colours match the written analysis.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters


Here we can see how the model clustered the data: there are some points in the center of the city, two different clusters to the west, one to the east, and one to the north. The difference between the two western groups may be that the second group of points (blue on the map) is farther from the city center than the green points.