## Segmenting and Clustering Neighborhoods in Toronto Part3
1. Leverage sklearn library to build Clustering Model
2. Leverage folium library to draw the result of Clustering on world map
3. Examine Clusters

In [78]:
import pandas as pd
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [52]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
Origin_df = pd.read_html(url)[0]
Origin_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [53]:
mask1 = Origin_df["Borough"] != "Not assigned"
mask2 = Origin_df["Neighborhood"] != "Not assigned"
Mask_df = Origin_df[(mask1 & mask2)]
df = Mask_df.sort_values(by=['Postal Code']).reset_index(drop=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [54]:
df.shape

(103, 3)

In [55]:
GEO = pd.read_csv('Geospatial_Coordinates.csv')
GEO

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [68]:
C_df = pd.concat([df, GEO.iloc[:,1:3]], axis=1)   # "Latitude", "Longitude"
C_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [59]:
# set number of clusters
kclusters = 5

df_for_clustering = C_df.drop(['Postal Code', 'Borough', 'Neighborhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_for_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[:]

array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 4, 1, 4, 4, 4,
       4, 4, 4, 1, 1, 1, 4, 4, 4, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3,
       3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [69]:
# add clustering labels
C_df.insert(0, 'Cluster Labels', kmeans.labels_)
C_df

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,2,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,2,M1G,Scarborough,Woburn,43.770992,-79.216917
4,2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...,...
98,0,M9N,York,Weston,43.706876,-79.518188
99,0,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,0,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,0,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [71]:
latitude, longitude = 43.6703, -79.3867
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(C_df['Latitude'], C_df['Longitude'], C_df['Neighborhood'], C_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [73]:
C_df.loc[C_df['Cluster Labels'] == 0, C_df.columns[[1] + list(range(5, C_df.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
31,M3L,-79.506944
32,M3M,-79.495697
33,M3N,-79.520999
79,M6L,-79.490074
80,M6M,-79.476013
81,M6N,-79.487262
84,M6S,-79.48445
86,M7R,-79.615819
88,M8V,-79.501321
89,M8W,-79.543484


In [74]:
C_df.loc[C_df['Cluster Labels'] == 1, C_df.columns[[1] + list(range(5, C_df.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
7,M1L,-79.284577
9,M1N,-79.264848
11,M1R,-79.295849
18,M2J,-79.346556
25,M3A,-79.329656
26,M3B,-79.352188
27,M3C,-79.340923
34,M4A,-79.315572
35,M4B,-79.309937
36,M4C,-79.318389


In [75]:
C_df.loc[C_df['Cluster Labels'] == 2, C_df.columns[[1] + list(range(5, C_df.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
0,M1B,-79.194353
1,M1C,-79.160497
2,M1E,-79.188711
3,M1G,-79.216917
4,M1H,-79.239476
5,M1J,-79.239476
6,M1K,-79.262029
8,M1M,-79.239476
10,M1P,-79.273304
12,M1S,-79.262029


In [76]:
C_df.loc[C_df['Cluster Labels'] == 3, C_df.columns[[1] + list(range(5, C_df.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
47,M4S,-79.38879
48,M4T,-79.38316
49,M4V,-79.400049
50,M4W,-79.377529
51,M4X,-79.367675
52,M4Y,-79.38316
53,M5A,-79.360636
54,M5B,-79.378937
55,M5C,-79.375418
56,M5E,-79.373306


In [77]:
C_df.loc[C_df['Cluster Labels'] == 4, C_df.columns[[1] + list(range(5, C_df.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
17,M2H,-79.363452
19,M2K,-79.385975
20,M2L,-79.374714
21,M2M,-79.408493
22,M2N,-79.408493
23,M2P,-79.400049
24,M2R,-79.442259
28,M3H,-79.442259
29,M3J,-79.487262
30,M3K,-79.464763
