# Toronto KMeans Clustering

>**NOTE:** I used some code cells that I've included in this final stage just for reference but they are now Row cells.

## Import Packages

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

## Fetch Data

In [2]:
try:
    res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
except Exception as e:
    print('Failed to get Wiki Content: ', e)
    exit()

In [3]:
html = BeautifulSoup(res.text, 'html.parser')
tbody = html.select('table')[0].select('tbody')[0]
tbody

<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" tit

In [4]:
columns = np.array([c.text.replace('\n', '') for c in tbody.select('tr')[0].select('th')])
columns[0] = 'PostalCode'
print(columns)

['PostalCode' 'Borough' 'Neighbourhood']


In [5]:
values = np.array([c.text[1:-1].split('\n') for c in tbody.select('tr')[1:]])
values[:3]

array([['M1A', 'Not assigned', 'Not assigned'],
       ['M2A', 'Not assigned', 'Not assigned'],
       ['M3A', 'North York', 'Parkwoods']], dtype='<U49')

In [6]:
df = pd.DataFrame(data=values, columns=columns)
df.head(1)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned


In [7]:
df.shape

(289, 3)

## Remove Invalid Rows

### Fix Boroughs

In [8]:
df = df[(df['Borough'] != 'Not assigned')]
df.shape

(212, 3)

### Fix Neighbourhoods

In [9]:
df[(df['Neighbourhood'] == 'Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [10]:
df['Neighbourhood'] = df[['Borough', 'Neighbourhood']].apply(lambda r: r['Borough'] if r['Neighbourhood'] == 'Not assigned' else r['Neighbourhood'], axis=1)
df[(df['Neighbourhood'] == 'Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighbourhood


#### Merge Neighbourhoods by PostalCode

In [11]:
df2 = df.groupby(['PostalCode', 'Borough'])[['Neighbourhood']].count()
df2['Neighbourhood'] = df.groupby(['PostalCode', 'Borough']).apply(lambda r: ', '.join(i for i in r['Neighbourhood']))
df2.reset_index(inplace=True)
df = df2
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
df.shape

(103, 3)

## Fetch Coordinates

### Read CSV

In [13]:
df_geo = pd.read_csv('canada_coordinates.csv')
df_geo.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_geo.set_index('PostalCode', inplace=True)
df_geo.head()

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [14]:
df.set_index('PostalCode', inplace=True)
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


### Add Coordinates to Toronto DataFrame

In [15]:
df = pd.merge(df, df_geo, left_index=True, right_index=True, how='outer')
df.reset_index(inplace=True)
df.to_csv('canada_dataset.csv', index=False)
df.shape

(103, 5)

In [16]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
# Only for Offline Work
# df = pd.read_csv('canada_dataset.csv')
# df.head()

## Pre Visualize Neighbourhoods

In [18]:
import folium
from geopy.geocoders import Nominatim

import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

### Fetch Toronto's coordinates

In [19]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Visualize Toronto Neighbouhoods

In [20]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Fetch Venues from Foursquare API

In [46]:
CLIENT_ID = '{YOUR CLIENT_ID HERE}' # your Foursquare ID
CLIENT_SECRET = '{YOUR CLIENT_SECRET HERE}' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: {YOUR CLIENT_ID HERE}
CLIENT_SECRET:{YOUR CLIENT_SECRET HERE}


In [22]:
def get_venues_url(endpoint, lat=0, lon=0, limit=50, radius=500):
    BASE_HOST = 'https://api.foursquare.com/v2/venues'
    AUTH_PARAMS = '&client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION)
    
    return BASE_HOST + endpoint + '?&ll={},{}&radius={}&limit={}'.format(lat, lon, radius, limit) + AUTH_PARAMS


In [23]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        try:
            
            # create the API request URL
            url = get_venues_url('/explore', lat=lat, lon=lng, radius=radius, limit=100)

            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']
            print('\tTotal Venues: ', len(results))

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        except Exception as e:
            print('\tFAILED: ', e)

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = [
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'
    ]
    
    return(nearby_venues)

### Fetch venues for each neighbourhood

In [25]:
toronto_venues = getNearbyVenues(
    names=df['Neighbourhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude']
)

Rouge, Malvern
	Total Venues:  1
Highland Creek, Rouge Hill, Port Union
	Total Venues:  1
Guildwood, Morningside, West Hill
	Total Venues:  7
Woburn
	Total Venues:  3
Cedarbrae
	Total Venues:  7
Scarborough Village
	Total Venues:  2
East Birchmount Park, Ionview, Kennedy Park
	Total Venues:  8
Clairlea, Golden Mile, Oakridge
	Total Venues:  8
Cliffcrest, Cliffside, Scarborough Village West
	Total Venues:  3
Birch Cliff, Cliffside West
	Total Venues:  4
Dorset Park, Scarborough Town Centre, Wexford Heights
	Total Venues:  6
Maryvale, Wexford
	Total Venues:  8
Agincourt
	Total Venues:  4
Clarks Corners, Sullivan, Tam O'Shanter
	Total Venues:  9
Agincourt North, L'Amoreaux East, Milliken, Steeles East
	Total Venues:  3
L'Amoreaux West, Steeles West
	Total Venues:  15
Upper Rouge
	Total Venues:  0
Hillcrest Village
	Total Venues:  4
Fairview, Henry Farm, Oriole
	Total Venues:  65
Bayview Village
	Total Venues:  4
Silver Hills, York Mills
	Total Venues:  1
Newtonbrook, Willowdale
	Total Ven

In [26]:
print(toronto_venues.shape)
toronto_venues.head()

(2237, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


### Venues OneHot Encoding

In [27]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Delete Venue Category 'Neighborhood' to avoid confusion
toronto_onehot.drop(columns=['Neighborhood'], axis=1, inplace=True)

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# # move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]


# For Offline Work
toronto_onehot.to_csv('toronto_onehot.csv', index=False)
print(toronto_onehot.shape)
toronto_onehot.head(3)

(2237, 277)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped

(100, 277)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.010000,0.000000,0.000000,0.000000,0.0000,0.010000,0.000000,0.010000,0.000000
1,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.083333,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.055556,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
6,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
8,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000


### Get Top N Venues

In [29]:
n_top_venues = 10

for hood in toronto_grouped['Neighborhood']:
    print("---- " + hood + " ----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(n_top_venues))
    print('\n')

---- Adelaide, King, Richmond ----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3      Thai Restaurant  0.04
4  American Restaurant  0.04
5                  Bar  0.03
6               Bakery  0.03
7           Restaurant  0.03
8                Hotel  0.03
9     Asian Restaurant  0.03


---- Agincourt ----
                             venue  freq
0                   Sandwich Place  0.25
1                   Breakfast Spot  0.25
2                           Lounge  0.25
3               Chinese Restaurant  0.25
4                Accessories Store  0.00
5       Modern European Restaurant  0.00
6                    Movie Theater  0.00
7                            Motel  0.00
8              Monument / Landmark  0.00
9  Molecular Gastronomy Restaurant  0.00


---- Agincourt North, L'Amoreaux East, Milliken, Steeles East ----
                             venue  freq
0                       Playground  0.33
1                     

                      venue  freq
0       Japanese Restaurant  0.07
1          Sushi Restaurant  0.06
2               Coffee Shop  0.06
3                   Gay Bar  0.05
4              Burger Joint  0.03
5                Restaurant  0.03
6           Bubble Tea Shop  0.02
7      Fast Food Restaurant  0.02
8                 Gastropub  0.02
9  Mediterranean Restaurant  0.02


---- Clairlea, Golden Mile, Oakridge ----
                             venue  freq
0                           Bakery  0.25
1                         Bus Line  0.25
2                     Soccer Field  0.12
3             Fast Food Restaurant  0.12
4                     Intersection  0.12
5                      Bus Station  0.12
6  Molecular Gastronomy Restaurant  0.00
7                    Movie Theater  0.00
8                            Motel  0.00
9              Monument / Landmark  0.00


---- Clarks Corners, Sullivan, Tam O'Shanter ----
                  venue  freq
0           Pizza Place  0.22
1    Chinese Restau

                             venue  freq
0                    Jewelry Store  0.25
1                             Park  0.25
2                 Sushi Restaurant  0.25
3                            Trail  0.25
4                Accessories Store  0.00
5                Mobile Phone Shop  0.00
6                            Motel  0.00
7              Monument / Landmark  0.00
8  Molecular Gastronomy Restaurant  0.00
9       Modern European Restaurant  0.00


---- Glencairn ----
                       venue  freq
0        Japanese Restaurant  0.25
1                       Park  0.25
2                     Bakery  0.25
3                        Pub  0.25
4              Metro Station  0.00
5                Men's Store  0.00
6         Mexican Restaurant  0.00
7  Middle Eastern Restaurant  0.00
8         Miscellaneous Shop  0.00
9                Music Store  0.00


---- Guildwood, Morningside, West Hill ----
                      venue  freq
0               Pizza Place  0.14
1       Rental Car Location 

                             venue  freq
0       Construction & Landscaping  0.25
1                           Bakery  0.25
2                             Park  0.25
3                 Basketball Court  0.25
4                Accessories Store  0.00
5                Mobile Phone Shop  0.00
6                    Movie Theater  0.00
7                            Motel  0.00
8              Monument / Landmark  0.00
9  Molecular Gastronomy Restaurant  0.00


---- Maryvale, Wexford ----
                       venue  freq
0  Middle Eastern Restaurant  0.25
1          Accessories Store  0.12
2             Sandwich Place  0.12
3             Breakfast Spot  0.12
4                 Smoke Shop  0.12
5              Shopping Mall  0.12
6                Auto Garage  0.12
7            Organic Grocery  0.00
8       Other Great Outdoors  0.00
9                Men's Store  0.00


---- Moore Park, Summerhill East ----
                             venue  freq
0                       Playground   0.5
1           

                             venue  freq
0                              Pub  0.33
1                      Coffee Shop  0.33
2                Accessories Store  0.00
3                    Moving Target  0.00
4                    Movie Theater  0.00
5                            Motel  0.00
6              Monument / Landmark  0.00
7  Molecular Gastronomy Restaurant  0.00
8       Modern European Restaurant  0.00
9                Mobile Phone Shop  0.00


---- The Beaches West, India Bazaar ----
                venue  freq
0             Brewery  0.05
1                 Gym  0.05
2    Sushi Restaurant  0.05
3        Liquor Store  0.05
4          Steakhouse  0.05
5      Sandwich Place  0.05
6  Italian Restaurant  0.05
7                 Pub  0.05
8       Burrito Place  0.05
9          Board Shop  0.05


---- The Danforth West, Riverdale ----
                  venue  freq
0      Greek Restaurant  0.24
1           Coffee Shop  0.10
2        Ice Cream Shop  0.07
3    Italian Restaurant  0.05
4      

In [30]:
def return_most_common_venues(row, num_top_venues = 10):
    #print('*********** row:\n', row[:num_top_venues])
    row_categories = row.iloc[1:] # Remove: Neighborhood, Neighborhood_NAME
    #print('\n\n*********** row_categories:\n', row_categories[:num_top_venues])
    row_categories_sorted = row_categories.sort_values(ascending=False)
    #print('\n\n*********** row_categories_sorted:\n', row_categories_sorted[:num_top_venues])
    
    return row_categories_sorted.index.values[0:num_top_venues]


indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(n_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
# for ind in np.arange(2):
    #print('\n\n\n[{}] ------------------------------------'.format(ind))
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues=n_top_venues)

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(100, 11)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,American Restaurant,Restaurant,Hotel,Bakery,Bar,Asian Restaurant
1,Agincourt,Chinese Restaurant,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Coffee Shop,Playground,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Coffee Shop,Fried Chicken Joint,Pharmacy,Pizza Place,Discount Store,Sandwich Place,Beer Store,Japanese Restaurant,Fast Food Restaurant
4,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Skating Rink,Pharmacy,Athletics & Sports,Pub,Sandwich Place,Pool,Yoga Studio


## Cluster Neighborhoods

In [31]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 5, 0, 0, 0, 0, 0, 0, 0])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [32]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_, )

toronto_merged = df.copy()

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

# drop neighborhoods with NO venues
toronto_merged.dropna(inplace=True)

toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].map(int)

toronto_merged.head() # check the last columns!
# toronto_merged[(toronto_merged['Neighbourhood'] == 'Agincourt')]

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,2,Fast Food Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Field
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,8,Bar,Yoga Studio,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Diner
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Rental Car Location,Electronics Store,Medical Center,Pizza Place,Breakfast Spot,Tech Startup,Mexican Restaurant,Doner Restaurant,Dim Sum Restaurant,Diner
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Korean Restaurant,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Athletics & Sports,Hakka Restaurant,Fried Chicken Joint,Bakery,Caribbean Restaurant,Thai Restaurant,Bank,Donut Shop,Dog Run,Doner Restaurant


Finally, let's visualize the resulting clusters


In [33]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11) # lat, lon corresponds to Toronto

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' (Cluster ' + str(cluster) + ')', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster.

In [34]:
print('TOTAL CLUSTERS: ', kclusters)
print('Classes: ', np.unique(toronto_merged['Cluster Labels']))

TOTAL CLUSTERS:  10
Classes:  [0 1 2 3 4 5 6 7 8 9]


In [35]:
cols = toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]
cols

Index(['Neighbourhood', '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue', '6th Most Common Venue',
       '7th Most Common Venue', '8th Most Common Venue',
       '9th Most Common Venue', '10th Most Common Venue'],
      dtype='object')

#### Cluster 1

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,"Guildwood, Morningside, West Hill",Rental Car Location,Electronics Store,Medical Center,Pizza Place,Breakfast Spot,Tech Startup,Mexican Restaurant,Doner Restaurant,Dim Sum Restaurant,Diner
3,Woburn,Coffee Shop,Korean Restaurant,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
4,Cedarbrae,Athletics & Sports,Hakka Restaurant,Fried Chicken Joint,Bakery,Caribbean Restaurant,Thai Restaurant,Bank,Donut Shop,Dog Run,Doner Restaurant
6,"East Birchmount Park, Ionview, Kennedy Park",Discount Store,Department Store,Bus Station,Coffee Shop,Hobby Shop,Convenience Store,Train Station,Dumpling Restaurant,Dog Run,Doner Restaurant
7,"Clairlea, Golden Mile, Oakridge",Bakery,Bus Line,Intersection,Bus Station,Fast Food Restaurant,Soccer Field,Cosmetics Shop,Construction & Landscaping,Comic Shop,Farmers Market
8,"Cliffcrest, Cliffside, Scarborough Village West",Motel,American Restaurant,Skating Rink,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio
9,"Birch Cliff, Cliffside West",College Stadium,Café,Skating Rink,General Entertainment,Yoga Studio,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant
10,"Dorset Park, Scarborough Town Centre, Wexford ...",Indian Restaurant,Pet Store,Chinese Restaurant,Vietnamese Restaurant,Latin American Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
11,"Maryvale, Wexford",Middle Eastern Restaurant,Accessories Store,Shopping Mall,Sandwich Place,Breakfast Spot,Auto Garage,Smoke Shop,Electronics Store,Empanada Restaurant,Eastern European Restaurant
12,Agincourt,Chinese Restaurant,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant


#### Cluster 2

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,"Silver Hills, York Mills",Cafeteria,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio,Dim Sum Restaurant


#### Cluster 3

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Rouge, Malvern",Fast Food Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Field
25,Parkwoods,Food & Drink Shop,Park,Fast Food Restaurant,Event Space,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dim Sum Restaurant
74,Caledonia-Fairbanks,Park,Fast Food Restaurant,Market,Pharmacy,Women's Store,Gym,Drugstore,Dim Sum Restaurant,Diner,Discount Store


#### Cluster 4

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,Downsview Central,Baseball Field,Food Truck,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio,Diner
91,"Humber Bay, King's Mill Park, Kingsway Park So...",Baseball Field,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio,Diner
97,"Emery, Humberlea",Baseball Field,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio,Diner


#### Cluster 5

In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
94,"Cloverdale, Islington, Martin Grove, Princess ...",Bank,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Electronics Store,Diner


#### Cluster 6

In [41]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 5, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough Village,Women's Store,Playground,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
14,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Coffee Shop,Playground,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
48,"Moore Park, Summerhill East",Park,Playground,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
50,Rosedale,Park,Playground,Trail,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant


#### Cluster 7

In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 6, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Roselawn,Garden,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant


#### Cluster 8

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 7, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
96,Humber Summit,Pizza Place,Empanada Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
100,"Kingsview Village, Martin Grove Gardens, Richv...",Pizza Place,Park,Mobile Phone Shop,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


#### Cluster 9

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 8, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Highland Creek, Rouge Hill, Port Union",Bar,Yoga Studio,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Diner


#### Cluster 10

In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 9, cols]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,York Mills West,Park,Bank,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
98,Weston,Park,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
