In [1]:
import pandas as pd
import numpy as np
import json
import requests
from pandas.io.json import json_normalize

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries Imported')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [40]:
# Parse every HTML table found on the Wikipedia page of "M" postal codes.
wiki = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)

# The first table on the page is the postal code / borough / neighborhood list.
df = pd.DataFrame(data=wiki[0])
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [41]:
# Keep only the postal codes that list a neighborhood, then renumber the
# remaining rows from 0 so the index has no gaps.
df = (
    df
    .dropna(subset=['Neighborhood'], axis=0)
    .reset_index(drop=True)
)
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### The cells above scrape the Toronto postal code table from Wikipedia, drop the rows that have no neighborhood, and store the result in a dataframe for further analysis.

In [42]:
# Load the CSV that lists a latitude / longitude pair for each postal code.
geo_df = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
print(geo_df.shape)
geo_df.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [43]:
# Attach coordinates to each borough/neighborhood row; the inner join keeps
# only the postal codes that appear in both frames.
toronto_df = df.merge(geo_df, how='inner', on='Postal Code')
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### The cells above load the latitude and longitude of each postal code and merge them into the dataframe that holds the borough and neighborhood information.

In [44]:
# Look up the coordinates of Toronto; they are used to center the maps below.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent='on_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match. Fail here with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}'.format(latitude, longitude))

Coordinates of Toronto are 43.6534817, -79.3839347


In [45]:
# Base map centered on the Toronto coordinates obtained from the geocoder.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11.25)

# Add one circle marker per row of toronto_df, labelled "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True escapes the label text so names containing quotes or
    # other special characters render safely inside the popup.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill=True,
        # The keyword is `fill_color`; the original `fill_colors` was a typo.
        # The stray `parse_html=False` is dropped: it is a Popup option, not a
        # CircleMarker option.
        fill_color='blue',
        fill_opacity=0.6
        ).add_to(map_toronto)

In [46]:
# Leaving the map object as the last expression makes the notebook render it.
map_toronto

In [47]:
# One indicator column per borough; the empty prefix keeps the bare borough
# names as the column labels.
toronto_onehot = pd.get_dummies(toronto_df['Borough'], prefix="", prefix_sep="")

# Append the neighborhood name so each encoded row stays identifiable.
toronto_onehot['Neighborhood'] = toronto_df['Neighborhood']

# Rotate the column order so the just-appended last column comes first.
*leading_cols, last_col = toronto_onehot.columns
fixed_col = [last_col, *leading_cols]
toronto_onehot = toronto_onehot[fixed_col]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,Parkwoods,0,0,0,0,0,0,1,0,0,0
1,Victoria Village,0,0,0,0,0,0,1,0,0,0
2,"Regent Park, Harbourfront",0,1,0,0,0,0,0,0,0,0
3,"Lawrence Manor, Lawrence Heights",0,0,0,0,0,0,1,0,0,0
4,"Queen's Park, Ontario Provincial Government",0,1,0,0,0,0,0,0,0,0


In [48]:
# Number of clusters to fit.
k = 10

# Cluster on the one-hot borough indicators only. `columns=` replaces the
# positional axis argument (`drop('Neighborhood', 1)`), which is deprecated
# and no longer accepted by newer pandas releases.
toronto_clustering = toronto_onehot.drop(columns='Neighborhood')

# A fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=k, random_state=2).fit(toronto_clustering)

# One integer label per row of toronto_clustering.
kmeans.labels_

array([0, 0, 1, 0, 1, 3, 2, 0, 6, 1, 0, 3, 2, 0, 6, 1, 8, 3, 2, 5, 1, 8,
       2, 6, 1, 1, 2, 0, 0, 6, 1, 7, 2, 0, 0, 6, 1, 7, 2, 0, 0, 5, 1, 7,
       2, 0, 0, 5, 1, 0, 0, 2, 0, 0, 5, 0, 8, 0, 2, 0, 0, 4, 4, 8, 8, 2,
       0, 4, 4, 7, 3, 2, 0, 4, 4, 7, 9, 3, 2, 4, 1, 7, 2, 4, 1, 2, 4, 1,
       3, 3, 2, 1, 1, 3, 3, 2, 1, 1, 3, 1, 5, 3, 3], dtype=int32)

In [49]:
# Put the cluster label in the first column of toronto_df. DataFrame.insert
# raises ValueError if the column already exists, so overwrite it in place
# when this cell is re-run instead of failing.
if 'Cluster Labels' in toronto_df.columns:
    toronto_df['Cluster Labels'] = kmeans.labels_
else:
    toronto_df.insert(0, 'Cluster Labels', kmeans.labels_)

In [50]:
# Preview the first rows of toronto_df, which now include 'Cluster Labels'.
toronto_df.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,0,M4A,North York,Victoria Village,43.725882,-79.315572
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [53]:
# Base map centered on the Toronto coordinates obtained from the geocoder.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11.25)

# Build one hex color per cluster by sampling k evenly spaced points of the
# rainbow colormap. The unused `x`, `ys` and `markers_colors` scaffolding is
# removed; only the number of clusters was ever needed here.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per row of toronto_df, colored by its cluster label.
for lat, lng, poi, cluster in zip(toronto_df.Latitude, toronto_df.Longitude, toronto_df.Neighborhood, toronto_df['Cluster Labels']):
    labels = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        # Was misspelled `raidus`, so the requested size was never applied.
        radius=5,
        popup=labels,
        # KMeans labels run from 0 to k-1, so index the palette directly;
        # `cluster-1` sent cluster 0 to the last color via negative indexing.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7
        ).add_to(map_clusters)

map_clusters

### The notebook clusters the neighborhoods on their one-hot encoded borough, and each cluster is drawn in its own color on the map above.