## Importing and Installing required packages

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
!pip install wikipedia
import wikipedia as wp

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


## Reading the data from Wikipedia

In [3]:
# Fetch the rendered HTML of the Wikipedia page and hand it to pandas as UTF-8 bytes.
html = wp.page("List_of_postal_codes_of_Canada:_M").html().encode("UTF-8")

# The postal-code table is the first table parsed from the page (index 0).
# The previous try/except read index 0 in both branches, so the fallback was
# dead code and its "try 2nd table first" comment did not match what ran.
tables = pd.read_html(html)
df = tables[0]

# Preview the first rows instead of printing the entire frame as text.
df.head(12)

    Postcode           Borough                                      Neighbourhood
0        M1A      Not assigned                                       Not assigned
1        M2A      Not assigned                                       Not assigned
2        M3A        North York                                          Parkwoods
3        M4A        North York                                   Victoria Village
4        M5A  Downtown Toronto                                       Harbourfront
5        M6A        North York                                   Lawrence Heights
6        M6A        North York                                     Lawrence Manor
7        M7A      Queen's Park                                       Not assigned
8        M8A      Not assigned                                       Not assigned
9        M9A  Downtown Toronto                                       Queen's Park
10       M1B       Scarborough                                              Rouge
11       M1B    

## Cleaning data

In [4]:
# Index labels of every row whose borough is the placeholder "Not assigned".
unassigned = df['Borough'] == "Not assigned"
drops = df.loc[unassigned].index

# Remove those rows from df in place.
df.drop(index=drops, inplace=True)

In [5]:
      
# Work on a copy so the frame produced by the previous cell is left untouched.
df_copy = df.copy()

# Where the neighbourhood is the placeholder "Not assigned", reuse the borough name.
no_neighbourhood = df_copy['Neighbourhood'] == 'Not assigned'
df_copy.loc[no_neighbourhood, 'Neighbourhood'] = df_copy.loc[no_neighbourhood, 'Borough']

In [6]:
# Continue with the cleaned copy under the name df.
df = df_copy

In [7]:
# Sanity check: (rows, columns) of the cleaned frame.
df.shape

(210, 3)

In [8]:
# Preview the cleaned frame; the first rows are enough to verify the clean-up.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Queen's Park
9,M9A,Downtown Toronto,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## Adding GPS coordinates to the postal codes

In [11]:
!pip install geocoder



In [13]:
# CSV with one row per postal code and its coordinates; its key column is
# renamed so it matches the 'Postcode' column of df.
url = "http://cocl.us/Geospatial_data"
coords = pd.read_csv(url).rename(columns={'Postal Code': 'Postcode'})

# Attach the coordinates to every row of df by postcode (pd.merge defaults to an inner join).
df2 = pd.merge(df, coords, on='Postcode')

In [14]:
# First rows of the merged frame (postcode data plus coordinates).
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


## Clustering and Mapping the Boroughs

In [17]:
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.0.0               |             py_0         606 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         704 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.0-py_0 conda-forge
    branca:  0.3.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Down

In [30]:
# Free-text place name intended for geocoding (spelling fixed: "Canada").
# It is not used in this cell because the coordinates are set by hand below.
address = 'Toronto, Canada'
import geocoder # import geocoder

# The location service was not available / not functioning, so the map centre
# for Toronto is set manually.
latitude = 43.653963
longitude = -79.387207

In [86]:
# From here on df refers to the merged frame that includes Latitude/Longitude.
df = df2
# Preview the first rows rather than displaying the whole frame.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
6,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188


In [36]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row, with a popup reading "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    # Use the row's own neighbourhood name. Formatting `df` here would put the
    # text of the entire DataFrame into every popup.
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [46]:

# Keep only the coordinate columns as clustering features.
# The columns are named through the `columns=` keyword: passing the axis
# positionally (`df.drop(name, 1)`) is deprecated in pandas 1.3 and rejected by
# pandas 2.x.
df_clust = df.drop(columns=['Neighbourhood', 'Borough', 'Postcode'])

In [47]:
# Preview the clustering features rather than displaying the whole frame.
df_clust.head(10)

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.654260,-79.360636
3,43.718518,-79.464763
4,43.718518,-79.464763
5,43.662301,-79.389494
6,43.667856,-79.532242
7,43.806686,-79.194353
8,43.806686,-79.194353
9,43.745906,-79.352188


In [48]:
# set number of clusters
kclusters = 5

# run k-means clustering on the coordinates only.
# Selecting the two feature columns explicitly keeps this cell re-runnable:
# a later cell adds a 'Cluster' column to df_clust, and fitting on the whole
# frame after that would feed the previous labels back in as a feature.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_clust[['Latitude', 'Longitude']])

In [57]:
# Peek at the cluster label assigned to each of the first 120 rows.
kmeans.labels_[:120]

array([3, 3, 1, 2, 2, 1, 4, 3, 3, 2, 3, 3, 1, 1, 2, 0, 0, 0, 0, 0, 3, 3,
       3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 3, 1, 4, 3, 1, 1, 1, 3, 2,
       2, 2, 2, 1, 1, 1, 1, 4, 4, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 3,
       3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 0, 1, 1, 1, 1,
       4, 4, 4, 0, 3, 3, 3, 2, 2, 0, 1, 2, 2, 4, 4, 4, 4, 0, 0, 3, 3, 2,
       0, 2, 2, 4, 4, 0, 3, 3, 3, 2], dtype=int32)

In [63]:
# One label per fitted row. Taking the full array avoids hard-coding the row
# count (210), which would break as soon as the data has a different length.
cluster = kmeans.labels_

In [68]:
# Store each row's cluster label alongside its coordinates.
df_clust['Cluster'] = cluster

In [88]:
# Preview the coordinates together with their cluster labels.
df_clust.head(10)

Unnamed: 0,Latitude,Longitude,CLuster,Cluster
0,43.753259,-79.329656,3,3
1,43.725882,-79.315572,3,3
2,43.654260,-79.360636,1,1
3,43.718518,-79.464763,2,2
4,43.718518,-79.464763,2,2
5,43.662301,-79.389494,1,1
6,43.667856,-79.532242,4,4
7,43.806686,-79.194353,3,3
8,43.806686,-79.194353,3,3
9,43.745906,-79.352188,2,2


In [89]:
# Combine the cluster labels with the descriptive columns row by row.
# df_clust was derived from df and shares its index, so an index join keeps
# exactly one output row per input row. Merging on the Latitude/Longitude
# values instead multiplies rows, because several rows share the same
# coordinates (e.g. the two M6A neighbourhoods).
df_merged = df_clust.join(df.drop(columns=['Latitude', 'Longitude']))

In [90]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# The loop variable is named cluster_id so that it does not overwrite the
# notebook-level `cluster` array of labels created in an earlier cell.
for lat, lon, poi, cluster_id in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighbourhood'], df_merged['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster_id), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the palette
    # directly; `cluster - 1` only worked through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster_id],
        fill=True,
        fill_color=rainbow[cluster_id],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters