In [2]:
import pandas as pd 
import numpy as np


#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim


import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

# Lift pandas' display limits so DataFrames render every row and column.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
)

In [3]:
# Read the Toronto table from the CSV file in the working directory and
# preview its first five rows.
new_toronto_df = pd.read_csv(filepath_or_buffer='new_toronto.csv')
new_toronto_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


In [4]:
# Report how many distinct labels appear in the Borough column.
borough_label_count = new_toronto_df['Borough'].nunique()
print('It looks like the Borough column has {} unique values'.format(borough_label_count))

It looks like the Borough column has 14 unique values


In [5]:
# Number of non-null postal codes per borough.
# The column to count is selected before aggregating, so counts are not
# computed for every column and then discarded through a hard-coded drop list
# (which would raise KeyError if one of those columns were ever missing).
neighborhood_count = (
    new_toronto_df
    .groupby('Borough')[['PostalCode']]
    .count()
    .rename(columns={'PostalCode': 'Neighborhoods per Borough'})
)
neighborhood_count

Unnamed: 0_level_0,Neighborhoods per Borough
Borough,Unnamed: 1_level_1
Central Toronto,9
Downtown Toronto,17
Downtown Toronto Stn A,1
East Toronto,4
East Toronto Business,1
East York,4
East York/East Toronto,1
Etobicoke,11
Etobicoke Northwest,1
North York,24


In [6]:
# One-hot encode the Borough column; the empty prefix and separator keep the
# borough names themselves as the indicator column names.
borough_only = new_toronto_df[['Borough']]
toronto_onehot = pd.get_dummies(borough_only, prefix="", prefix_sep="")
print(toronto_onehot.shape)
toronto_onehot.head(n=5)

(102, 14)


Unnamed: 0,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,North York,Queen's Park,Scarborough,West Toronto,York
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
# One cluster per borough indicator column. Reading the count from the one-hot
# frame keeps it in sync with the data instead of repeating a magic number.
kclusters = toronto_onehot.shape[1]

# random_state is fixed so the cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_onehot)

# Cluster label assigned to each row of toronto_onehot, in row order.
kmeans.labels_

array([ 3,  3,  2,  3, 10,  4,  1,  3,  9,  2,  3,  4,  1,  3,  9,  2,  8,
        4,  1,  7,  2,  8,  1,  9,  2,  2,  1,  3,  3,  9,  2,  6,  1,  3,
        3, 12,  2,  6,  1,  3,  3,  7,  2,  6,  1,  3,  3,  7,  2,  3,  3,
        1,  3,  3,  7,  3,  8,  3,  1,  3,  3,  5,  5,  8,  8,  1,  3,  5,
        5,  6,  4,  1,  3,  5,  5,  6,  4,  1,  5,  2,  6,  1,  5,  2,  1,
        5,  2,  4,  4,  1,  2, 11,  4, 13,  1,  2,  2,  4,  2,  0,  4,  4])

In [8]:
# Work on a copy so the original table is left untouched, then place the
# k-means labels in the first column position.
toronto_clustered = new_toronto_df.copy()
toronto_clustered.insert(loc=0, column='KmeanLabels', value=kmeans.labels_)

In [9]:
# Preview the first five rows of the labelled table.
toronto_clustered.head(n=5)

Unnamed: 0,KmeanLabels,PostalCode,Borough,Neighborhood,latitude,longitude
0,3,M3A,North York,Parkwoods,43.7545,-79.33
1,3,M4A,North York,Victoria Village,43.7276,-79.3148
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,10,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


In [10]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude_tor = location.latitude
longitude_tor = location.longitude
print('The geograpical coordinates for the city of Toronto are {}, {}.'.format(latitude_tor, longitude_tor))

The geograpical coordinates for the city of Toronto are 43.6534817, -79.3839347.


In [11]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=11)

# `x` plays no part in building the palette; it is retained only so that any
# later cell referencing it keeps working.
x = np.arange(kclusters)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
# The palette length comes straight from kclusters rather than from the length
# of a throwaway list.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row, coloured by its k-means label.
for lat, lon, neighborhood, borough, cluster in zip(
        toronto_clustered['latitude'],
        toronto_clustered['longitude'],
        toronto_clustered['Neighborhood'],
        toronto_clustered['Borough'],
        toronto_clustered['KmeanLabels']):
    label = folium.Popup(
        'Neighborhood: ' + str(neighborhood) + ' - Borough: ' + str(borough) + ' - Cluster: ' + str(cluster),
        parse_html=True)
    # Labels are 0-based, so index the palette directly; `cluster - 1` only
    # worked for label 0 through negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [12]:
# Independent copy of the table, used below for the merged-borough variant.
mod_borough_df = new_toronto_df.copy(deep=True)

In [13]:
# Show the Borough value of every row as an array.
mod_borough_df['Borough'].values

array(['North York', 'North York', 'Downtown Toronto', 'North York',
       "Queen's Park", 'Etobicoke', 'Scarborough', 'North York',
       'East York', 'Downtown Toronto', 'North York', 'Etobicoke',
       'Scarborough', 'North York', 'East York', 'Downtown Toronto',
       'York', 'Etobicoke', 'Scarborough', 'East Toronto',
       'Downtown Toronto', 'York', 'Scarborough', 'East York',
       'Downtown Toronto', 'Downtown Toronto', 'Scarborough',
       'North York', 'North York', 'East York', 'Downtown Toronto',
       'West Toronto', 'Scarborough', 'North York', 'North York',
       'East York/East Toronto', 'Downtown Toronto', 'West Toronto',
       'Scarborough', 'North York', 'North York', 'East Toronto',
       'Downtown Toronto', 'West Toronto', 'Scarborough', 'North York',
       'North York', 'East Toronto', 'Downtown Toronto', 'North York',
       'North York', 'Scarborough', 'North York', 'North York',
       'East Toronto', 'North York', 'York', 'North York', 'Scarboroug

In [14]:
# Fold the listed borough labels into the borough each one is merged with.
# NOTE(review): 'East Toronto Business' is mapped to 'Scarborough' rather than
# 'East Toronto' — confirm this is intended.
borough_merges = {
    "Queen's Park": 'Downtown Toronto',
    'Downtown Toronto Stn A': 'Downtown Toronto',
    'Etobicoke Northwest': 'Etobicoke',
    'East Toronto Business': 'Scarborough',
    'East York/East Toronto': 'East York',
}
mod_borough_df['Borough'] = mod_borough_df['Borough'].replace(borough_merges)

In [15]:
# Preview the first five rows after the borough labels were merged.
mod_borough_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,Ontario Provincial Government,43.6641,-79.3889


In [16]:
# Report how many distinct borough labels remain after merging.
merged_borough_label_count = mod_borough_df['Borough'].nunique()
print('It looks like the Borough column has {} unique values'.format(merged_borough_label_count))

It looks like the Borough column has 9 unique values


In [17]:
# Number of non-null postal codes per merged borough.
# The column to count is selected before aggregating, so counts are not
# computed for every column and then discarded through a hard-coded drop list
# (which would raise KeyError if one of those columns were ever missing).
new_neighborhood_count = (
    mod_borough_df
    .groupby('Borough')[['PostalCode']]
    .count()
    .rename(columns={'PostalCode': 'Neighborhoods per Borough'})
)
new_neighborhood_count

Unnamed: 0_level_0,Neighborhoods per Borough
Borough,Unnamed: 1_level_1
Central Toronto,9
Downtown Toronto,19
East Toronto,4
East York,5
Etobicoke,12
North York,24
Scarborough,18
West Toronto,6
York,5


In [18]:
# One-hot encode the merged Borough column; the empty prefix and separator
# keep the borough names themselves as the indicator column names.
merged_borough_only = mod_borough_df[['Borough']]
toronto_mod_onehot = pd.get_dummies(merged_borough_only, prefix="", prefix_sep="")
print(toronto_mod_onehot.shape)
toronto_mod_onehot.head(n=5)

(102, 9)


Unnamed: 0,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,North York,Scarborough,West Toronto,York
0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0


In [19]:
# One cluster per merged-borough indicator column. Reading the count from the
# one-hot frame keeps it in sync with the data instead of repeating a magic
# number.
mod_kclusters = toronto_mod_onehot.shape[1]

# random_state is fixed so the cluster labels are reproducible between runs.
mod_kmeans = KMeans(n_clusters=mod_kclusters, random_state=0).fit(toronto_mod_onehot)

# Cluster label assigned to each row of toronto_mod_onehot, in row order.
mod_kmeans.labels_

array([1, 1, 2, 1, 2, 3, 0, 1, 7, 2, 1, 3, 0, 1, 7, 2, 6, 3, 0, 8, 2, 6,
       0, 7, 2, 2, 0, 1, 1, 7, 2, 5, 0, 1, 1, 7, 2, 5, 0, 1, 1, 8, 2, 5,
       0, 1, 1, 8, 2, 1, 1, 0, 1, 1, 8, 1, 6, 1, 0, 1, 1, 4, 4, 6, 6, 0,
       1, 4, 4, 5, 3, 0, 1, 4, 4, 5, 3, 0, 4, 2, 5, 0, 4, 2, 0, 4, 2, 3,
       3, 0, 2, 2, 3, 3, 0, 2, 2, 3, 2, 0, 3, 3])

In [20]:
# Work on a copy of the merged-borough table, then place the k-means labels in
# the first column position.
mod_toronto_clustered = mod_borough_df.copy()
mod_toronto_clustered.insert(loc=0, column='KmeanLabels', value=mod_kmeans.labels_)

In [21]:
# Preview the first five rows of the labelled merged-borough table.
mod_toronto_clustered.head(n=5)

Unnamed: 0,KmeanLabels,PostalCode,Borough,Neighborhood,latitude,longitude
0,1,M3A,North York,Parkwoods,43.7545,-79.33
1,1,M4A,North York,Victoria Village,43.7276,-79.3148
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,1,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,2,M7A,Downtown Toronto,Ontario Provincial Government,43.6641,-79.3889


In [22]:
# Draw the merged-borough clusters on a fresh map. Adding them to
# `map_clusters` would stack these markers on top of the markers that an
# earlier cell already placed on that map, making the result unreadable.
mod_map_clusters = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=11)

# One evenly spaced rainbow colour per cluster of the merged model. The palette
# is sized with mod_kclusters (the model plotted here), not kclusters, and no
# longer depends on the `x` array left over from an earlier cell.
colors_array = cm.rainbow(np.linspace(0, 1, mod_kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row, coloured by its k-means label.
for lat, lon, neighborhood, borough, cluster in zip(
        mod_toronto_clustered['latitude'],
        mod_toronto_clustered['longitude'],
        mod_toronto_clustered['Neighborhood'],
        mod_toronto_clustered['Borough'],
        mod_toronto_clustered['KmeanLabels']):
    label = folium.Popup(
        'Neighborhood: ' + str(neighborhood) + ' - Borough: ' + str(borough) + ' - Cluster: ' + str(cluster),
        parse_html=True)
    # Labels are 0-based, so index the palette directly; `cluster - 1` only
    # worked for label 0 through negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(mod_map_clusters)

mod_map_clusters