# Clustering Toronto neighborhoods

## Import of the dataframe

### Package Imports

In [4]:
from bs4 import BeautifulSoup as bs
import requests
import lxml

import pandas as pd
import numpy as np

### Html Import

#### Option 1: Import a file

In [5]:
#with open('Can_Postal_Codes.htm',encoding='utf') as html_file:
#    soup = bs(html_file,'lxml')



#### Option 2: Import a url

In [6]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source=requests.get(url).text
soup=bs(source,'lxml')

#### Fetch the contents

In [7]:
table=soup.table.contents[1]

In [8]:
Titles = table.contents[0].text.strip().split()
Titles

['Postcode', 'Borough', 'Neighbourhood']

In [9]:
cont=[]
for i in np.arange(2,len(table)-10,2):
   cont.append( table.contents[i].text.strip().split('\n')) 

### Create a dataframe

In [10]:
df= pd.DataFrame(cont)

In [11]:
df.columns=Titles

In [12]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop "Not Assigned"

In [13]:
# Both "Not assigned", whole row should be dropped
valtodrop=df.loc[(df.Borough=='Not assigned') 
                 & (df.Neighbourhood=='Not assigned')].index.values

In [14]:
df.drop(valtodrop,inplace=True)
df.reset_index(inplace=True,drop=True)

In [15]:
# Only Neighbourhood is "not assigned", 
# so it takes the borough's name

for i in df.index.values:
    if df.Neighbourhood[i]=='Not assigned':
       df.Neighbourhood[i]=df.Borough[i]

### Dataframe manipulation

In [16]:
# We create 3 lists, one for each column

neighlist=[]
POlist=[]
boroughlist=[]
for PO in df.Postcode.unique():
# Postcode
    POlist.append(PO)
# Borough for each postcode. We have to avoid repeating the value 
    boroughlist.append(df.loc[df.Postcode==PO].Borough.unique()[0])
# List of neighborhoods that belong to each postcode
    neighlist.append(df.loc[df.Postcode==PO].Neighbourhood.tolist())


In [17]:
neighlist[:7]

[['Parkwoods'],
 ['Victoria Village'],
 ['Harbourfront', 'Regent Park'],
 ['Lawrence Heights', 'Lawrence Manor'],
 ["Queen's Park"],
 ['Islington Avenue'],
 ['Rouge', 'Malvern']]

In [18]:
# join the neighborhood names, so that they only take 1 column
for i in range(0,len(neighlist)):
    neighlist[i]= ', '.join(neighlist[i])

In [19]:
neighlist[:7]

['Parkwoods',
 'Victoria Village',
 'Harbourfront, Regent Park',
 'Lawrence Heights, Lawrence Manor',
 "Queen's Park",
 'Islington Avenue',
 'Rouge, Malvern']

In [20]:
# Create the dataframe
df_new=pd.DataFrame({"Postcode":POlist,"Borough":boroughlist, "Neighbourhoods":neighlist})

In [21]:
# Sort by Postcode
df_new.sort_values(by='Postcode',inplace=True)
df_new.reset_index(inplace=True,drop=True)

In [22]:
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [23]:
df_new.shape

(103, 3)

# Neighborhood Locations

### Geocoder is not working. We will skip it, and work with the csv.

In [24]:
# import geocoder # import geocoder

# initialize your variable to None
# lat_lng_coords = None

# loop until you get the coordinates
# while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format('M1C'))
#    lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

### Import the dataset

In [25]:
url='https://cocl.us/Geospatial_data/Geospatial_coordinates.csv'
dfll=pd.read_csv(url)

In [26]:
dfll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the two datasets

In [27]:
dfll.rename(columns={"Postal Code":"Postcode"},inplace=True)

In [28]:
df_Neigh = pd.merge(df_new,dfll)

In [29]:
df_Neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhoods,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Clustering

## Imports

### Packages

In [30]:
import json

import random 

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors


import folium # plotting library

from sklearn.cluster import KMeans

## Map of Toronto

In [31]:
address= 'Toronto, Ontario'
geolocator=Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {},{}'.format(latitude, longitude))

The coordinates of Toronto are 43.653963,-79.387207


In [32]:
Torloc=[latitude,longitude]
map_Tor = folium.Map(location=Torloc, zoom_start=11)

for PO, lat, lng, borough, neighborhood in zip(df_Neigh.Postcode,
                                           df_Neigh.Latitude,
                                           df_Neigh.Longitude,
                                           df_Neigh.Borough,
                                           df_Neigh.Neighbourhoods):

    label = '{} : {}'.format(PO,neighborhood)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
         [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='neighborhoods.borough',
        fill_opacity=0.7,
        ).add_to(map_Tor)

In [33]:
map_Tor

In [34]:
df_Neigh.shape

(103, 5)

We will examine only the central Toronto neighborhoods:

In [35]:
df_Toronto=df_Neigh[df_Neigh.Borough.str.contains("Toronto")].reset_index(drop=True)

In [36]:
df_Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhoods,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Let's see a map of Toronto center:

In [37]:
Torloc=[latitude,longitude]
map_Tor = folium.Map(location=Torloc, zoom_start=12)

for PO, lat, lng, borough, neighborhood in zip(df_Toronto.Postcode,
                                           df_Toronto.Latitude,
                                           df_Toronto.Longitude,
                                           df_Toronto.Borough,
                                           df_Toronto.Neighbourhoods):

    label = '{} : {}'.format(PO,neighborhood)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
         [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='neighborhoods.borough',
        fill_opacity=0.7,
        ).add_to(map_Tor)
    
map_Tor

Now let's import the venues of each neighborhood from Foursquare.

### Useful Functions 

In [38]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [39]:
# Function that gives the venues located on all neighborhoods in the dataframe

LIMIT  = 100
CLIENT_ID = 'HWG4GRWAET5CTID3P3CGEWPIMQJCZS5FEMQQWCDL5FCCSC5X' # your Foursquare ID
CLIENT_SECRET = '53CLPTEMYTZJZUKGRDBVD40WSJ54ZRILJ4NF43CU33HDBSH4' # your Foursquare Secret
VERSION = '20180605'

rad = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Import the venues at Toronto

First, we should give our credentials:

In [40]:
CLIENT_ID = 'HWG4GRWAET5CTID3P3CGEWPIMQJCZS5FEMQQWCDL5FCCSC5X' # your Foursquare ID
CLIENT_SECRET = '53CLPTEMYTZJZUKGRDBVD40WSJ54ZRILJ4NF43CU33HDBSH4' # your Foursquare Secret
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HWG4GRWAET5CTID3P3CGEWPIMQJCZS5FEMQQWCDL5FCCSC5X
CLIENT_SECRET:53CLPTEMYTZJZUKGRDBVD40WSJ54ZRILJ4NF43CU33HDBSH4


In [41]:
# Call the toronto venues function

toronto_venues=getNearbyVenues(df_Toronto.Neighbourhoods,
                df_Toronto.Latitude, 
                df_Toronto.Longitude)



The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [42]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
4,"The Danforth West, Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


Let's categorize the venues:

In [43]:
toronto_venues['Venue Category'].value_counts().head()

Coffee Shop           158
Café                   96
Restaurant             50
Hotel                  43
Italian Restaurant     43
Name: Venue Category, dtype: int64

In [44]:
print('There are {} unique venue categories in Toronto.'.format(len(toronto_venues['Venue Category'].unique())))

There are 231 unique venue categories in Toronto.


### One-hot Encoding

In [117]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']],prefix='',prefix_sep='')
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[157]] + list(toronto_onehot.columns[0:157])+ list(toronto_onehot.columns[158:])
toronto_onehot=toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
toronto_onehot.shape

(1691, 231)

### Group by neighborhood

In [119]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [120]:
toronto_grouped.shape

(38, 231)

In [121]:
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business reply mail Processing Centre969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Show most common venues

In [122]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]
    

In [123]:
n_top_venues = 3
columns= ['Neighborhood']

indicators = ['st', 'nd', 'rd']

for ind in np.arange(n_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

        
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(
        toronto_grouped.iloc[ind, :], 
        n_top_venues)

In [124]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot
3,Business reply mail Processing Centre969 Eastern,Yoga Studio,Auto Workshop,Garden Center
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport Service


## K-means Clustering

In [228]:
kclusters=4
toronto_grouped_clustering= toronto_grouped.drop('Neighborhood',1)
                                           
k_means= KMeans(init='k-means++',n_clusters = kclusters,n_init=50).fit(toronto_grouped_clustering)
k_means.labels_[0:30]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,
       3, 1, 0, 1, 1, 3, 2, 1])

In [229]:
toronto_merged=df_Toronto

toronto_merged['Cluster Labels']=k_means.labels_

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhoods')

In [230]:
toronto_merged.head(18)

Unnamed: 0,Postcode,Borough,Neighbourhoods,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Coffee Shop,Pub,Yoga Studio
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Ice Cream Shop
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,1,Sandwich Place,Park,Brewery
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Bakery
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Bus Line,Park,Swim School
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Food & Drink Shop,Sandwich Place,Burger Joint
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1,Coffee Shop,Sporting Goods Shop,Clothing Store
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,Sandwich Place,Dessert Shop,Coffee Shop
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1,Playground,Filipino Restaurant,Farmers Market
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,1,Coffee Shop,Pub,Pizza Place


In [231]:

map_clusters= folium.Map(location=Torloc, zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]

colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors=[]
for lat, lon, poi, cluster in zip(toronto_merged.Latitude,
                                  toronto_merged.Longitude,
                                  toronto_merged.Neighbourhoods,
                                  toronto_merged['Cluster Labels']):
    label= folium.Popup(str(poi)+', Cluster ' + str(cluster),parse_html=True)
    folium.CircleMarker(
    [lat,lon],
    radius=5,
    popup=label,
    color= rainbow[cluster],
    fill_color=rainbow[cluster],
    fill = True,
    fill_opacity=0.7).add_to(map_clusters)

In [232]:
map_clusters

### Clustering Results

In [233]:
toronto_merged.loc[toronto_merged['Cluster Labels']==0, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhoods,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
24,"The Annex, North Midtown, Yorkville",0,Café,Sandwich Place,Coffee Shop


In [234]:
toronto_merged.loc[toronto_merged['Cluster Labels']==1, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]] 

Unnamed: 0,Neighbourhoods,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,The Beaches,1,Coffee Shop,Pub,Yoga Studio
1,"The Danforth West, Riverdale",1,Greek Restaurant,Coffee Shop,Ice Cream Shop
2,"The Beaches West, India Bazaar",1,Sandwich Place,Park,Brewery
3,Studio District,1,Café,Coffee Shop,Bakery
4,Lawrence Park,1,Bus Line,Park,Swim School
5,Davisville North,1,Food & Drink Shop,Sandwich Place,Burger Joint
6,North Toronto West,1,Coffee Shop,Sporting Goods Shop,Clothing Store
7,Davisville,1,Sandwich Place,Dessert Shop,Coffee Shop
8,"Moore Park, Summerhill East",1,Playground,Filipino Restaurant,Farmers Market
9,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",1,Coffee Shop,Pub,Pizza Place


In [235]:
toronto_merged.loc[toronto_merged['Cluster Labels']==2, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]] 

Unnamed: 0,Neighbourhoods,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
28,Stn A PO Boxes 25 The Esplanade,2,Coffee Shop,Café,Seafood Restaurant


In [236]:
toronto_merged.loc[toronto_merged['Cluster Labels']==3, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]] 

Unnamed: 0,Neighbourhoods,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
17,Central Bay Street,3,Coffee Shop,Café,Sandwich Place
22,Roselawn,3,Garden,Yoga Studio,Fast Food Restaurant
27,"CN Tower, Bathurst Quay, Island airport, Harbo...",3,Airport Lounge,Airport Terminal,Airport Service
