## Segmenting & Clustering Neighbourhoods in Toronto ##

### PART 1: Scraping the Wikipedia page, wrangling and cleaning the data, and reading it into a pandas DataFrame

In [None]:
pip install wikipedia

In [29]:
import numpy as np
import pandas as pd
import wikipedia as wp

# Source: Wikipedia's table of Canadian postal codes beginning with "M".
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns every table found on the page; the first one is used here.
df = pd.read_html(link, header=0)[0]

# Keep only the rows whose Borough is something other than 'Not assigned'.
df_assigned = df.loc[df['Borough'].ne('Not assigned')]

# One row per (Postal Code, Borough): the remaining text columns of rows that
# share a postal code are joined with ', ', with groups kept in first-seen order.
df_sameZIP = (
    df_assigned
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Where the neighbourhood is still 'Not assigned', fall back to the borough name.
neighbourhood_missing = df_sameZIP['Neighbourhood'] == 'Not assigned'
df_sameZIP['Neighbourhood'] = np.where(
    neighbourhood_missing,
    df_sameZIP['Borough'],
    df_sameZIP['Neighbourhood'],
)

# Display the (rows, columns) shape of the cleaned frame.
df_sameZIP.shape

(103, 3)

### PART 2: Getting the latitude and longitude and creating a dataframe of all neighbourhoods with their coordinates

In [84]:
# from assignment
#import geocoder # import geocoder

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

# Alternative offered by the assignment: read the coordinates from a CSV file.
Coordinates = pd.read_csv('https://cocl.us/Geospatial_data')

# Attach the coordinates to each postal code. With the default inner join only
# postal codes present in both frames are kept.
df_Coordinates = df_sameZIP.merge(Coordinates, on='Postal Code')
df_Coordinates.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### PART 3: Exploring and clustering the neighbourhoods in Toronto and plotting them on a map

In [77]:
# Keep only the rows whose Borough contains the literal text 'Toronto'
# (regex=False: plain substring match, not a regular expression).
in_toronto = df_Coordinates['Borough'].str.contains('Toronto', regex=False)
df_TORONTO = df_Coordinates.loc[in_toronto]
df_TORONTO

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [35]:
! pip install folium==0.5.0

Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.1 MB/s eta 0:00:011
[?25hCollecting branca
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=ac42d7903098e3050b11834faa00b710b4cb983651abb4fcb3177c518ad5f533
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.5.0


In [68]:
import folium # plotting library
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# Plot the Toronto neighbourhoods with folium.
# Map centre: latitude 43.651070, longitude -79.347015.
TorontoMap = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in df_TORONTO[marker_columns].itertuples(index=False, name=None):
    # Popup text reads "<neighbourhood>, <borough>".
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='orange',
        fill=True,
        fill_color='black',
        fill_opacity=1,
        parse_html=False,
    )
    marker.add_to(TorontoMap)

# Last expression of the cell: renders the map.
TorontoMap

In [79]:
# K-means clustering of the Toronto neighbourhoods by geographic position.

# Number of clusters to fit.
kcluster = 3

# Cluster on the coordinates only. Selecting the columns by name avoids the
# positional `drop(labels, 1)` form, which current pandas no longer accepts, and
# keeps a previously inserted 'CLUSTER LABEL' column out of the features when
# the cell is re-run.
toronto_clustering = df_TORONTO[['Latitude', 'Longitude']]

# Run k-means. random_state and n_init are fixed so the labels are reproducible
# across runs and scikit-learn versions (the default n_init changed in later
# releases).
kmeans = KMeans(n_clusters=kcluster, random_state=0, n_init=10).fit(toronto_clustering)

# Store the labels as the first column. Dropping a stale label column first makes
# the cell safe to re-run; `insert` alone raises if the column already exists.
df_TORONTO = df_TORONTO.drop(columns='CLUSTER LABEL', errors='ignore')
df_TORONTO.insert(0, 'CLUSTER LABEL', kmeans.labels_)
df_TORONTO


Unnamed: 0,CLUSTER LABEL,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,1,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [83]:
# Draw the clustered neighbourhoods on a folium map, one colour per cluster.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# One evenly spaced colour from the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kcluster))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per neighbourhood.
for lat, lon, neighbourhood, cluster in zip(df_TORONTO['Latitude'], df_TORONTO['Longitude'], df_TORONTO['Neighbourhood'], df_TORONTO['CLUSTER LABEL']):
    # The popup names the neighbourhood as well as its cluster, so a marker can
    # be identified on the map.
    label = folium.Popup(str(neighbourhood) + ' Cluster ' + str(cluster), parse_html=True)
    # `cluster - 1` sends label 0 to index -1, i.e. the last colour in the list;
    # with labels 0..kcluster-1 every cluster still gets its own colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.9).add_to(map_clusters)
map_clusters