# Segmenting and Clustering Neighborhoods in Toronto

## 1. Install & Import Libraries

In [1]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


import folium # map rendering library



import types
from botocore.client import Config
import ibm_boto3

print('Libraries imported.')

Libraries imported.


   ## 2. Read html using BeautifulSoup library

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M';
html = requests.get(url).text
soup = BeautifulSoup(html,'lxml')

In [4]:
# find the table
postal_codes_table = soup.find('table',{'class':'wikitable sortable'})

In [5]:
# extract td's only
table_cells = postal_codes_table.find_all('td')

In [6]:
# group each 3 cells together
N = 3
table_rows = [table_cells[n:n+N] for n in range(0, len(table_cells), N)]
postal_code = []
borough = []
neighborhood = []
for row in table_rows:
    postal_code_text = row[0].text
    borough_text = row[1].text
    neighborhood_text = row[2].text.rstrip('\n')
    if borough_text != 'Not assigned':
        postal_code.append(postal_code_text)
        borough.append(borough_text)
        neighborhood.append(neighborhood_text)

## 3. Create & Explore Dataset

In [7]:
df = pd.DataFrame()
df['PostalCode'] = postal_code
df['Borough'] = borough
df['Neighborhood'] = neighborhood
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [8]:
# group neighborhoods of the same postal code
df = df.groupby(['PostalCode','Borough']).aggregate(lambda x : ', '.join(x)).reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# check M5A 
df.loc[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [10]:
# fill 'Not assigned' neighborhood to the proper borough
df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df['Borough']

In [11]:
# check M5A 
df.loc[df['PostalCode'] == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [12]:
# df shape
df.shape

(103, 3)

## Insert Geospatial Coordinates CSV file to the code

In [13]:
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share your notebook.
client_1f2743066d6b4a08862baaf50b794f6d = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='DaADxXD1xYXv4gPZuq9gM2enGDBpOTc_pftcgowbyk-C',
    ibm_auth_endpoint="https://iam.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_1f2743066d6b4a08862baaf50b794f6d.get_object(Bucket='mydatascienceproject-donotdelete-pr-4no7swxe9h5vwt',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()



Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Perform Left Join to the data frames.

In [14]:

df_data_1.rename(columns={'Postal Code':'PostalCode'},inplace=True)
df_data_1.head()
df = pd.merge(df, df_data_1, on='PostalCode', how='left')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
# check M5G 
df.loc[df['PostalCode'] == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


## Now let's create a map of Toronto with neighborhoods superimposed on top.

first, let's work with only borough that containts the name Toronto 

In [16]:
df = df[df['Borough'].str.contains("Toronto")].reset_index()
df.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [17]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [18]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186c0',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

## 4. Explore Neighborhoods

#### Define Foursquare Credentials and Version

In [19]:
CLIENT_ID = 'B2CWZ3L4M04KBQOBSR0JDVTDGVCDFFW0EVYKBBZL5JVVXWFJ' # your Foursquare ID
CLIENT_SECRET = 'WNG3ODIQUSFPIQ0DEJGDJJT5U0TSJA1KOCARDPQSVR3IPDKC' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

### Let's explore the first neighborhood

In [20]:
df.loc[0, 'Neighborhood']

'The Beaches'

#### get latitude and longitude of the neighborhood

In [21]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

In [22]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# get resullt
results = requests.get(url).json()

In [23]:
results

{'meta': {'code': 200, 'requestId': '5ca4b54cdd57977ce652ec2e'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4ad4c062f964a52011f820e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_',
          'suffix': '.png'},
         'id': '50aa9e744b90af0d42d5de0e',
         'name': 'Health Food Store',
         'pluralName': 'Health Food Stores',
         'primary': True,
         'shortName': 'Health Food Store'}],
       'id': '4ad4c062f964a52011f820e3',
       'location': {'address': '125 Southwood Dr',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'distance': 471,
        'formattedAddress': ['125 Southwood Dr',
         'Toronto ON M4E 0B8',
         'Canada'],
        'labeledLatLngs': [{'label': 'display',
      

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### Now let's clean the json and structure it into a *pandas* dataframe.

In [25]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
1,Grover Pub and Grub,Pub,43.679181,-79.297215
2,Starbucks,Coffee Shop,43.678798,-79.298045
3,Upper Beaches,Neighborhood,43.680563,-79.292869


### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [27]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [28]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


#### Let's check how many venues were returned for each neighborhood

In [29]:
toronto_venues.groupby('Neighborhood').count()['Venue']

Neighborhood
Adelaide, King, Richmond                                                                                      100
Berczy Park                                                                                                    57
Brockton, Exhibition Place, Parkdale Village                                                                   23
Business Reply Mail Processing Centre 969 Eastern                                                              17
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara     14
Cabbagetown, St. James Town                                                                                    43
Central Bay Street                                                                                             81
Chinatown, Grange Park, Kensington Market                                                                     100
Christie                                                                   

#### Let's find out how many unique categories can be curated from all the returned venues

In [30]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 240 uniques categories.


## 5. Analyze Each Neighborhood

In [31]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']])

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()



Unnamed: 0,Neighborhood,Venue Category_Accessories Store,Venue Category_Adult Boutique,Venue Category_Afghan Restaurant,Venue Category_Airport,Venue Category_Airport Food Court,Venue Category_Airport Gate,Venue Category_Airport Lounge,Venue Category_Airport Service,Venue Category_Airport Terminal,...,Venue Category_Toy / Game Store,Venue Category_Trail,Venue Category_Train Station,Venue Category_Transportation Service,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Vietnamese Restaurant,Venue Category_Wine Bar,Venue Category_Wings Joint,Venue Category_Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# new data frame shape
toronto_onehot.shape

(1709, 241)

#### Now, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [33]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Venue Category_Accessories Store,Venue Category_Adult Boutique,Venue Category_Afghan Restaurant,Venue Category_Airport,Venue Category_Airport Food Court,Venue Category_Airport Gate,Venue Category_Airport Lounge,Venue Category_Airport Service,Venue Category_Airport Terminal,...,Venue Category_Toy / Game Store,Venue Category_Trail,Venue Category_Train Station,Venue Category_Transportation Service,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Vietnamese Restaurant,Venue Category_Wine Bar,Venue Category_Wings Joint,Venue Category_Yoga Studio
0,"Adelaide, King, Richmond",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.012346,0.0,0.012346
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.01,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.011236,0.011236,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011236,0.011236,0.0,0.011236,0.022472


#### Now let's confirm the new size

In [34]:
toronto_grouped.shape

(38, 241)

#### Let's print each neighborhood along with the top 5 most common venues

In [35]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                            venue  freq
0      Venue Category_Coffee Shop  0.06
1       Venue Category_Steakhouse  0.04
2             Venue Category_Café  0.04
3  Venue Category_Thai Restaurant  0.04
4     Venue Category_Burger Joint  0.03


----Berczy Park----
                               venue  freq
0         Venue Category_Coffee Shop  0.07
1        Venue Category_Cocktail Bar  0.05
2      Venue Category_Farmers Market  0.04
3          Venue Category_Restaurant  0.04
4  Venue Category_Seafood Restaurant  0.04


----Brockton, Exhibition Place, Parkdale Village----
                                 venue  freq
0        Venue Category_Breakfast Spot  0.09
1             Venue Category_Nightclub  0.09
2           Venue Category_Coffee Shop  0.09
3                  Venue Category_Café  0.09
4  Venue Category_Gym / Fitness Center  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                               venue  freq
0  Venue Category_L

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [36]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [37]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Venue Category_Coffee Shop,Venue Category_Steakhouse,Venue Category_Café,Venue Category_Thai Restaurant,Venue Category_Bakery,Venue Category_Restaurant,Venue Category_Gym,Venue Category_Burger Joint,Venue Category_Hotel,Venue Category_American Restaurant
1,Berczy Park,Venue Category_Coffee Shop,Venue Category_Cocktail Bar,Venue Category_Bakery,Venue Category_Steakhouse,Venue Category_Pub,Venue Category_Cheese Shop,Venue Category_Farmers Market,Venue Category_Restaurant,Venue Category_Seafood Restaurant,Venue Category_Café
2,"Brockton, Exhibition Place, Parkdale Village",Venue Category_Breakfast Spot,Venue Category_Nightclub,Venue Category_Café,Venue Category_Coffee Shop,Venue Category_Caribbean Restaurant,Venue Category_Stadium,Venue Category_Furniture / Home Store,Venue Category_Italian Restaurant,Venue Category_Climbing Gym,Venue Category_Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Venue Category_Light Rail Station,Venue Category_Yoga Studio,Venue Category_Garden,Venue Category_Pizza Place,Venue Category_Restaurant,Venue Category_Burrito Place,Venue Category_Brewery,Venue Category_Skate Park,Venue Category_Smoke Shop,Venue Category_Farmers Market
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Venue Category_Airport Terminal,Venue Category_Airport Service,Venue Category_Airport Lounge,Venue Category_Boat or Ferry,Venue Category_Airport Gate,Venue Category_Sculpture Garden,Venue Category_Plane,Venue Category_Airport Food Court,Venue Category_Airport,Venue Category_Harbor / Marina


## 6. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [38]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

add clustering labels

In [39]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [40]:

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


toronto_merged.tail() # check the last columns!

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,78,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,0,Venue Category_Breakfast Spot,Venue Category_Nightclub,Venue Category_Café,Venue Category_Coffee Shop,Venue Category_Caribbean Restaurant,Venue Category_Stadium,Venue Category_Furniture / Home Store,Venue Category_Italian Restaurant,Venue Category_Climbing Gym,Venue Category_Restaurant
34,82,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763,0,Venue Category_Café,Venue Category_Mexican Restaurant,Venue Category_Bar,Venue Category_Flea Market,Venue Category_Arts & Crafts Store,Venue Category_Music Venue,Venue Category_Diner,Venue Category_Cajun / Creole Restaurant,Venue Category_Sandwich Place,Venue Category_Fast Food Restaurant
35,83,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325,0,Venue Category_Breakfast Spot,Venue Category_Gift Shop,Venue Category_Bookstore,Venue Category_Dessert Shop,Venue Category_Movie Theater,Venue Category_Restaurant,Venue Category_Bank,Venue Category_Dog Run,Venue Category_Italian Restaurant,Venue Category_Bar
36,84,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445,0,Venue Category_Coffee Shop,Venue Category_Café,Venue Category_Italian Restaurant,Venue Category_Pizza Place,Venue Category_Sushi Restaurant,Venue Category_Food & Drink Shop,Venue Category_Pub,Venue Category_Indie Movie Theater,Venue Category_Latin American Restaurant,Venue Category_Dessert Shop
37,87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0,Venue Category_Light Rail Station,Venue Category_Yoga Studio,Venue Category_Garden,Venue Category_Pizza Place,Venue Category_Restaurant,Venue Category_Burrito Place,Venue Category_Brewery,Venue Category_Skate Park,Venue Category_Smoke Shop,Venue Category_Farmers Market


clusters sizes

In [41]:
toronto_merged.groupby('Cluster Labels').count()['index']

Cluster Labels
0    33
1     1
2     1
3     2
4     1
Name: index, dtype: int64

Finally, let's visualize the resulting clusters

In [42]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories.

#### Cluster 1

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,-79.293031,0,Venue Category_Health Food Store,Venue Category_Coffee Shop,Venue Category_Pub,Venue Category_Neighborhood,Venue Category_Coworking Space,Venue Category_Cosmetics Shop,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Falafel Restaurant,Venue Category_Event Space
1,M4K,-79.352188,0,Venue Category_Greek Restaurant,Venue Category_Coffee Shop,Venue Category_Ice Cream Shop,Venue Category_Italian Restaurant,Venue Category_Yoga Studio,Venue Category_Pizza Place,Venue Category_Bookstore,Venue Category_Brewery,Venue Category_Bubble Tea Shop,Venue Category_Café
2,M4L,-79.315572,0,Venue Category_Park,Venue Category_Fast Food Restaurant,Venue Category_Ice Cream Shop,Venue Category_Burger Joint,Venue Category_Liquor Store,Venue Category_Burrito Place,Venue Category_Fish & Chips Shop,Venue Category_Sandwich Place,Venue Category_Steakhouse,Venue Category_Sushi Restaurant
3,M4M,-79.340923,0,Venue Category_Café,Venue Category_Coffee Shop,Venue Category_Gastropub,Venue Category_Italian Restaurant,Venue Category_Bakery,Venue Category_American Restaurant,Venue Category_Yoga Studio,Venue Category_Park,Venue Category_Brewery,Venue Category_Seafood Restaurant
5,M4P,-79.390197,0,Venue Category_Burger Joint,Venue Category_Sandwich Place,Venue Category_Breakfast Spot,Venue Category_Gym,Venue Category_Park,Venue Category_Hotel,Venue Category_Food & Drink Shop,Venue Category_Yoga Studio,Venue Category_Donut Shop,Venue Category_Dumpling Restaurant
6,M4R,-79.405678,0,Venue Category_Clothing Store,Venue Category_Sporting Goods Shop,Venue Category_Coffee Shop,Venue Category_Yoga Studio,Venue Category_Fast Food Restaurant,Venue Category_Mexican Restaurant,Venue Category_Diner,Venue Category_Dessert Shop,Venue Category_Cosmetics Shop,Venue Category_Park
7,M4S,-79.38879,0,Venue Category_Dessert Shop,Venue Category_Sandwich Place,Venue Category_Pizza Place,Venue Category_Italian Restaurant,Venue Category_Café,Venue Category_Coffee Shop,Venue Category_Thai Restaurant,Venue Category_Sushi Restaurant,Venue Category_Flower Shop,Venue Category_Farmers Market
9,M4V,-79.400049,0,Venue Category_Coffee Shop,Venue Category_Pub,Venue Category_Pizza Place,Venue Category_American Restaurant,Venue Category_Convenience Store,Venue Category_Medical Center,Venue Category_Sports Bar,Venue Category_Bagel Shop,Venue Category_Supermarket,Venue Category_Sushi Restaurant
11,M4X,-79.367675,0,Venue Category_Coffee Shop,Venue Category_Restaurant,Venue Category_Bakery,Venue Category_Café,Venue Category_Pub,Venue Category_Pizza Place,Venue Category_Italian Restaurant,Venue Category_Beer Store,Venue Category_Bank,Venue Category_Diner
12,M4Y,-79.38316,0,Venue Category_Japanese Restaurant,Venue Category_Coffee Shop,Venue Category_Sushi Restaurant,Venue Category_Gay Bar,Venue Category_Restaurant,Venue Category_Burger Joint,Venue Category_Pub,Venue Category_Bubble Tea Shop,Venue Category_Café,Venue Category_Men's Store


#### Cluster 2

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,M4T,-79.38316,1,Venue Category_Playground,Venue Category_Historic Site,Venue Category_Filipino Restaurant,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Falafel Restaurant,Venue Category_Event Space,Venue Category_Ethiopian Restaurant,Venue Category_Electronics Store,Venue Category_Eastern European Restaurant


#### Cluster 3

In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M5N,-79.416936,2,Venue Category_Garden,Venue Category_Yoga Studio,Venue Category_Fish Market,Venue Category_Filipino Restaurant,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Falafel Restaurant,Venue Category_Event Space,Venue Category_Ethiopian Restaurant,Venue Category_Electronics Store


#### Cluster 4

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,M4W,-79.377529,3,Venue Category_Park,Venue Category_Playground,Venue Category_Trail,Venue Category_Diner,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Falafel Restaurant,Venue Category_Event Space,Venue Category_Ethiopian Restaurant,Venue Category_Electronics Store
23,M5P,-79.411307,3,Venue Category_Trail,Venue Category_Bus Line,Venue Category_Park,Venue Category_Jewelry Store,Venue Category_Sushi Restaurant,Venue Category_Electronics Store,Venue Category_Donut Shop,Venue Category_Dumpling Restaurant,Venue Category_Eastern European Restaurant,Venue Category_Yoga Studio


#### Cluster 5

In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4N,-79.38879,4,Venue Category_Park,Venue Category_Swim School,Venue Category_Bus Line,Venue Category_Yoga Studio,Venue Category_Dog Run,Venue Category_Filipino Restaurant,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Falafel Restaurant,Venue Category_Event Space
