### Scrape Wikipedia Page
Scrape the list of Canadian postal codes beginning with "M" (Toronto) from Wikipedia, then join each postal code with its latitude and longitude coordinates.

In [27]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on HTTP errors instead of parsing an error page; the timeout keeps
# an unresponsive server from hanging the kernel.
response = requests.get( WIKI_URL, timeout = 30 )
response.raise_for_status()
source = response.text
soup = BeautifulSoup( source, 'lxml' )

# Find the first table on the Wikipedia page and collect its <td> cells
table_info = soup.find( 'table' )
if table_info is None:
    raise ValueError( 'No <table> element found at {}'.format( WIKI_URL ) )
rows = table_info.find_all( 'td' )

# The cells are read as consecutive (postcode, borough, neighborhood) triples,
# so the cell count must be a multiple of three.
cell_text = [cell.text.strip() for cell in rows]
if len( cell_text ) % 3 != 0:
    raise ValueError(
        'Expected the table cells to come in groups of 3, got {} cells'.format( len( cell_text ) ) )

postcode = cell_text[0::3]
borough = cell_text[1::3]
neighborhood = cell_text[2::3]

# Build the dataframe directly with the columns Postcode, Borough, and Neighborhood
df = pd.DataFrame( { 'Postcode': postcode,
                     'Borough': borough,
                     'Neighborhood': neighborhood } )

# Drop postcodes without a borough; .copy() so the assignment below works on
# an independent frame rather than a slice of the unfiltered one.
df = df[df['Borough'] != 'Not assigned'].copy()

# A row with a borough but no neighborhood takes the borough name as its neighborhood
df.loc[df.Neighborhood == 'Not assigned', 'Neighborhood'] = df.Borough

# One row per (Postcode, Borough), with its neighborhoods joined by ', '
df_group = df.groupby( ['Postcode', 'Borough'] )['Neighborhood'].apply( ', '.join ).reset_index()

In [28]:
# Load the per-postcode coordinates and relabel the columns positionally so the
# key column is named 'Postcode', matching df_group.
# NOTE(review): assumes the CSV has exactly three columns in the order
# postcode, latitude, longitude - confirm against the file.
coordinate_columns = ['Postcode', 'Latitude', 'Longitude']
df_coor = pd.read_csv( 'http://cocl.us/Geospatial_data' )
df_coor.columns = coordinate_columns

# Inner join: only postcodes present in both frames are kept
df_join = df_group.merge( df_coor, how = 'inner', on = 'Postcode' )
df_join.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Explore and cluster neighborhoods in Toronto

In [29]:
# Work on an independent copy that holds only the location columns
location_columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
df2 = df_join.loc[:, location_columns].copy()

# The row count of df2 is what gets reported as the number of neighborhoods
borough_count = df2['Borough'].nunique( dropna = False )
row_count = len( df2 )
summary = 'There are {} boroughs and {} neighborhoods.'.format( borough_count, row_count )
print( summary )
df2.head()

There are 11 boroughs and 103 neighborhoods.


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


Keep only the boroughs whose name contains "Toronto".

In [30]:
# Keep the rows whose borough name contains the literal text "Toronto".
# regex = False matches the text as-is, and na = False drops rows with a missing
# borough instead of failing on a non-boolean mask.
# .copy() gives df2 its own data, so adding columns to it later does not raise
# SettingWithCopyWarning.
df2 = df2[df2.Borough.str.contains( "Toronto", regex = False, na = False )].copy()
df2.shape

(38, 4)

Run KMeans to group the neighborhoods into 4 clusters based on their coordinates. Chances are the clusters will follow the boroughs (Downtown, East, West, Central).

In [31]:
import folium
import numpy as np
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

address = 'Toronto, Canada'

# Nominatim needs an identifying user agent; geopy 2.x raises
# ConfigurationError when none is supplied.
geolocator = Nominatim( user_agent = 'toronto_neighborhood_clustering' )
location = geolocator.geocode( address )
if location is None:
    # geocode() returns None when the address cannot be resolved
    raise ValueError( 'Could not geocode {!r}'.format( address ) )
latitude = location.latitude
longitude = location.longitude
print( 'The geograpical coordinate of Toronto are {}, {}.'.format( latitude, longitude ) )

# Base map centred on the geocoded city coordinates
toronto_map = folium.Map( location = [latitude, longitude], zoom_start = 11 )

X = df2['Latitude']
Y = df2['Longitude']

# Stack the coordinates into an (n_rows, 2) array of [latitude, longitude]
x = np.asarray( X )
y = np.asarray( Y )
Z = np.stack( ( x, y ), axis = 1 )

# A fixed random_state makes the cluster labels, and therefore the marker
# colours, reproducible across kernel restarts.
k_means = KMeans( init = "k-means++", n_clusters = 4, n_init = 5, random_state = 0 )
k_means.fit( Z )
clusterindex = k_means.labels_

# assign() returns a new frame, so this is safe even when df2 is a filtered
# slice of another frame, and the cell can be re-run without side effects.
df2 = df2.assign( cluster = clusterindex )

# One circle marker per row: the popup shows the borough and the fill colour
# encodes the cluster label.
colors = ['red','green','blue','yellow','pink','gray','magenta','cyan']
for lat, lng, label, cl_ind in zip( df2['Latitude'], df2['Longitude'], df2['Borough'], df2['cluster'] ):
    popup = folium.Popup( label, parse_html = True )
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = popup,
        color = 'black',
        fill = True,
        fill_color = colors[cl_ind], # set color according to clusterindex
        fill_opacity = 0.7 ).add_to( toronto_map )

toronto_map



The geograpical coordinate of Toronto are 43.653963, -79.387207.
