In [5]:
import pandas as pd
import urllib.request

In [6]:
# Wikipedia page listing the Canadian postal codes that begin with "M";
# this is the page fetched and parsed by the following cells.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [7]:
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

# Open the page inside a context manager so the HTTP response is closed as
# soon as parsing has consumed it, instead of being left open for the
# lifetime of the kernel.
with urllib.request.urlopen(url) as page:
    # parse the HTML from our URL into the BeautifulSoup parse tree format
    soup = BeautifulSoup(page, "lxml")

In [8]:
# Find the postal-code table, collect its rows and build a cleaned dataframe.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' was found in the parsed page")

# One [postal code, borough, neighborhood] record per table row; rows that do
# not contain exactly three <td> cells are skipped.
records = []
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 3:
        # First text node of each cell (None when the cell has no text).
        records.append([cell.find(text=True) for cell in cells])

df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])

# Collapse whitespace runs (including newlines) and literal "\n" text to a
# single space, then trim every column so no value keeps a stray leading or
# trailing blank.
df = df.replace(r'\s+|\\n', ' ', regex=True)
for column in df.columns:
    df[column] = df[column].str.strip()

# drop all rows that don't have a borough assigned
df = df[df['Borough'] != 'Not assigned']

# Reset the index after dropping the rows above. reset_index returns a new
# frame, so the result has to be assigned back for df itself to be renumbered.
df = df.reset_index(drop=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
# Show the dimensions of df as a (rows, columns) tuple.
df.shape

(103, 3)

In [10]:
# Read the geospatial coordinates CSV hosted on GitHub; the file's first
# column is used as the dataframe index.
csv_url = "https://raw.githubusercontent.com/bgarrido46/Coursera_Capstone/master/Geospatial_Coordinates.csv"
df_geo = pd.read_csv(filepath_or_buffer=csv_url, index_col=0)
# Print the first 100 rows as plain text.
print(df_geo.iloc[:100])

      Postal Code   Latitude  Longitude
Index                                  
1             M1B  43.806686 -79.194353
2             M1C  43.784535 -79.160497
3             M1E  43.763573 -79.188711
4             M1G  43.770992 -79.216917
5             M1H  43.773136 -79.239476
...           ...        ...        ...
96            M9C  43.643515 -79.577201
97            M9L  43.756303 -79.565963
98            M9M  43.724766 -79.532242
99            M9N  43.706876 -79.518188
100           M9P  43.696319 -79.532242

[100 rows x 3 columns]


In [11]:
# Attach the coordinates to each scraped row by joining on the postal code.
# The inner join keeps only postal codes that appear in both frames.
df_final = df.merge(df_geo, how='inner', on='Postal Code')
df_final

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [3]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize is exposed at the pandas top level in current releases; the
# pandas.io.json location is kept only as a fallback for older installs that
# do not provide the top-level name.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\ProgramData\Anaconda3

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    certifi-2019.11.28         |           py37_0         148 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
         

In [29]:
# set number of clusters
kclusters = 5

# Coordinates of every row, as a frame and as a list of [lat, lon] pairs
# (the list is used by the map cell).
locations = df_final[['Latitude', 'Longitude']]
locationlist = locations.values.tolist()

# Cluster on the two coordinate columns only. Selecting them by name (rather
# than dropping the other columns, with the axis passed positionally) works on
# pandas versions that reject positional drop() arguments, and keeps the
# feature set the same if this cell is re-run after a 'Cluster Labels' column
# has been added to df_final.
toronto_grouped_clustering3 = locations

# run k-means clustering
# n_init is set explicitly so the number of initialisations does not depend on
# the installed scikit-learn's default; random_state makes the run repeatable.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering3)

In [34]:
# create map
map_clusters = folium.Map(location=[43.654260, -79.360636], zoom_start=11)

# Add one marker per row, with the borough as popup text. Coordinates and
# boroughs are paired by position, so the loop does not rely on df_final
# having a 0..n-1 index the way a label lookup with a running counter would.
for coordinates, borough in zip(locationlist, df_final['Borough']):
    folium.Marker(coordinates, popup=borough).add_to(map_clusters)
map_clusters


In [35]:
# Store the k-means label of every row as the first column of df_final.
# DataFrame.insert raises ValueError if the column already exists, so when
# this cell is re-run the existing column is overwritten in place instead.
if 'Cluster Labels' in df_final.columns:
    df_final['Cluster Labels'] = kmeans.labels_
else:
    df_final.insert(0, 'Cluster Labels', kmeans.labels_)
df_final

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,2,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,4,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,1,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [37]:
import numpy as np

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map (they are added to the existing map_clusters object)
for lat, lon, poi, cluster in zip(df_final['Latitude'], df_final['Longitude'], df_final['Neighborhood'], df_final['Cluster Labels']):
    # Index the palette with the label itself. Using `cluster - 1` mapped
    # label 0 to index -1, which only worked through negative-index
    # wrap-around.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters