# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Obtaining the postal codes table

First, we obtain a list of the tables in the url

In [1]:
import pandas as pd
import numpy as np

WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html_tables = pd.read_html(WIKI_URL)
for df in html_tables:
    print(df.head())
    print(df.shape)

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
(289, 3)
                                                  0   \
0                                                NaN   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NL   
3                                                  A   

                                                  1   \
0                              Canadian postal codes   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NS   
3                                                  B   

                                                  2    3    4    5    6    7   \
0                                                NaN  NaN  NaN  Na

We can notice that the postal code tables is the first dataframe in the list, so we assign it to a variable

In [2]:
postal_df = html_tables[0]
print(postal_df.shape)
postal_df.head()

(289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Now we drop all the rows with 'Not assigned' borough

In [3]:
postal_df['Borough'].replace('Not assigned', np.nan, inplace=True)
postal_df.dropna(subset=['Borough'], inplace=True)
print(postal_df.shape)
postal_df.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Now we concatenate all the Neighbourhood of the Postcode values to the first row it appears and drop all the other rows

In [4]:
for p in postal_df['Postcode'].unique():
    subset_df = postal_df[postal_df['Postcode']==p]
    first_index = None
    neighbourhoods = []
    for index in subset_df.index:
        if (first_index == None):
            first_index = index
        neighbourhoods.append(subset_df.loc[index,'Neighbourhood'])
    postal_df.loc[first_index, 'Neighbourhood'] = ', '.join(neighbourhoods)
postal_df.drop_duplicates('Postcode', inplace=True)
postal_df.index = range(postal_df.shape[0])
print(postal_df.shape)
postal_df.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


Lastly, we assign the Borough name to all 'Not assigned' Neighbourhoods and print the shape of the final dataframe

In [5]:
for i in postal_df[postal_df['Neighbourhood']=='Not assigned'].index:
    postal_df.loc[i, 'Neighbourhood'] = postal_df.loc[i, 'Borough']
postal_df.shape

(103, 3)

## Obtaining the Postal code coordinates

First we obtain the lat and long data from the csv file

In [6]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data', index_col='Postal Code')
geo_df

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
M1J,43.744734,-79.239476
M1K,43.727929,-79.262029
M1L,43.711112,-79.284577
M1M,43.716316,-79.239476
M1N,43.692657,-79.264848


In [7]:
postal_df = postal_df.join(geo_df, on='Postcode')
postal_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## Clustering the Neighborhoods

In [8]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [9]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [10]:
neighborhoods = postal_df[postal_df['Borough'].str.contains('Toronto')].reset_index(drop=True)

# create map of Toronto using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{} - {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

### Preparing and using the Foursquare api

First we prepare the credentials and version of the api to be used

In [11]:
CLIENT_ID = 'M0BHW1CM5X4XAI2CATS2PT4SD5LJC3JW5JBM0WI2NVKUKZ4R' # your Foursquare ID
CLIENT_SECRET = 'BZGORQPJMLZUTT0RY4XILZ11ZAVYF2PPZNPZ3W22BH0ZPJAZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: M0BHW1CM5X4XAI2CATS2PT4SD5LJC3JW5JBM0WI2NVKUKZ4R
CLIENT_SECRET:BZGORQPJMLZUTT0RY4XILZ11ZAVYF2PPZNPZ3W22BH0ZPJAZ


Now we borrow the getNearbyVenues function from the lab

In [12]:
def getNearbyVenues(postcodes, names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for postcode, name, lat, lng in zip(postcodes, names, latitudes, longitudes):
        print(', '.join([postcode, name]))
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            postcode, 
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Neighborhoods', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now we get the venues data with the function

In [13]:
toronto_venues = getNearbyVenues(neighborhoods['Postcode'], neighborhoods['Neighbourhood'], neighborhoods['Latitude'], neighborhoods['Longitude'])

M5A, Harbourfront, Regent Park
M5B, Ryerson, Garden District
M5C, St. James Town
M4E, The Beaches
M5E, Berczy Park
M5G, Central Bay Street
M6G, Christie
M5H, Adelaide, King, Richmond
M6H, Dovercourt Village, Dufferin
M5J, Harbourfront East, Toronto Islands, Union Station
M6J, Little Portugal, Trinity
M4K, The Danforth West, Riverdale
M5K, Design Exchange, Toronto Dominion Centre
M6K, Brockton, Exhibition Place, Parkdale Village
M4L, The Beaches West, India Bazaar
M5L, Commerce Court, Victoria Hotel
M4M, Studio District
M4N, Lawrence Park
M5N, Roselawn
M4P, Davisville North
M5P, Forest Hill North, Forest Hill West
M6P, High Park, The Junction South
M4R, North Toronto West
M5R, The Annex, North Midtown, Yorkville
M6R, Parkdale, Roncesvalles
M4S, Davisville
M5S, Harbord, University of Toronto
M6S, Runnymede, Swansea
M4T, Moore Park, Summerhill East
M5T, Chinatown, Grange Park, Kensington Market
M4V, Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
M5V, CN Tower, Bathurst 

### Preparing the data for clustering

First we transpose the Venue Category column

In [14]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postcode and neighborhood column back to dataframe
toronto_onehot['Postcode'] = toronto_venues['Postcode']
toronto_onehot['Neighborhoods'] = toronto_venues['Neighborhoods']

# move postcode and neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-2]] + [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-2])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postcode,Neighborhoods,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M5A,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we group the rows by postcode and by taking the mean of the frequency of occurrence of each category

In [15]:
toronto_grouped = toronto_onehot.groupby(['Postcode', 'Neighborhoods']).mean().reset_index()
toronto_grouped

Unnamed: 0,Postcode,Neighborhoods,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,"The Danforth West, Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,"The Beaches West, India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025
4,M4N,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05
7,M4S,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0


### Preparing for further examination

First, we borrow the function return_most_common_venues from the labs

In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[2:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now we use it to get what are the most common venues on each postcode

In [17]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
toronto_venues_sorted = pd.DataFrame(columns=columns)
toronto_venues_sorted['Postcode'] = toronto_grouped['Postcode']

for ind in np.arange(toronto_grouped.shape[0]):
    toronto_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Neighborhood,Health Food Store,Coffee Shop,Pub,Yoga Studio,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
1,M4K,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Italian Restaurant,Cosmetics Shop,Brewery,Bubble Tea Shop,Caribbean Restaurant,Restaurant
2,M4L,Park,Pizza Place,Sushi Restaurant,Ice Cream Shop,Italian Restaurant,Fish & Chips Shop,Liquor Store,Fast Food Restaurant,Movie Theater,Pet Store
3,M4M,Café,Coffee Shop,American Restaurant,Italian Restaurant,Gastropub,Bakery,Fish Market,Latin American Restaurant,Bookstore,Cheese Shop
4,M4N,Park,Swim School,Dim Sum Restaurant,Bus Line,Yoga Studio,Dog Run,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


### Creating the clusters

First we create the clusters with kmeans algorithm

In [18]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop(['Postcode','Neighborhoods'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 3, 2, 2, 2, 4, 2], dtype=int32)

Then we assemble the data to a more readable dataframe

In [19]:
# add clustering labels
toronto_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighborhoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_venues_sorted.set_index('Postcode'), on='Postcode')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2,Coffee Shop,Café,Pub,Bakery,Park,Mexican Restaurant,Theater,Gym / Fitness Center,Breakfast Spot,Restaurant
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Restaurant,Fast Food Restaurant,Middle Eastern Restaurant,Juice Bar,Burger Joint,Tea Room
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Restaurant,Hotel,Café,Breakfast Spot,Italian Restaurant,Park,Cosmetics Shop,Cocktail Bar,Clothing Store
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Neighborhood,Health Food Store,Coffee Shop,Pub,Yoga Studio,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Coffee Shop,Cocktail Bar,Cheese Shop,Bakery,Restaurant,Café,Farmers Market,Seafood Restaurant,Beer Bar,Steakhouse


Now we create a visual representation of the clusters

In [20]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Now we examine the clusters

### Cluster 1

In [21]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[list(range(0,3)) + list(range(6, toronto_merged.shape[1]))]]]

  result = getitem(key)


Unnamed: 0,Postcode,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,M5P,Central Toronto,"Forest Hill North, Forest Hill West",Jewelry Store,Park,Sushi Restaurant,Trail,Yoga Studio,Eastern European Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Electronics Store
32,M4W,Downtown Toronto,Rosedale,Park,Playground,Trail,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


### Cluster 2

In [22]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[list(range(0,3)) + list(range(6, toronto_merged.shape[1]))]]]

Unnamed: 0,Postcode,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M5N,Central Toronto,Roselawn,Garden,Yoga Studio,Fish Market,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


### Cluster 3

In [23]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[list(range(0,3)) + list(range(6, toronto_merged.shape[1]))]]]

Unnamed: 0,Postcode,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",Coffee Shop,Café,Pub,Bakery,Park,Mexican Restaurant,Theater,Gym / Fitness Center,Breakfast Spot,Restaurant
1,M5B,Downtown Toronto,"Ryerson, Garden District",Coffee Shop,Clothing Store,Cosmetics Shop,Café,Restaurant,Fast Food Restaurant,Middle Eastern Restaurant,Juice Bar,Burger Joint,Tea Room
2,M5C,Downtown Toronto,St. James Town,Coffee Shop,Restaurant,Hotel,Café,Breakfast Spot,Italian Restaurant,Park,Cosmetics Shop,Cocktail Bar,Clothing Store
3,M4E,East Toronto,The Beaches,Neighborhood,Health Food Store,Coffee Shop,Pub,Yoga Studio,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
4,M5E,Downtown Toronto,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Bakery,Restaurant,Café,Farmers Market,Seafood Restaurant,Beer Bar,Steakhouse
5,M5G,Downtown Toronto,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Burger Joint,Bar,Bubble Tea Shop,Spa,Chinese Restaurant,Juice Bar,Middle Eastern Restaurant
6,M6G,Downtown Toronto,Christie,Grocery Store,Café,Park,Convenience Store,Baby Store,Diner,Italian Restaurant,Coffee Shop,Restaurant,Nightclub
7,M5H,Downtown Toronto,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant,American Restaurant,Hotel,Restaurant,Burger Joint,Bakery
8,M6H,West Toronto,"Dovercourt Village, Dufferin",Pharmacy,Supermarket,Bakery,Discount Store,Park,Music Venue,Liquor Store,Middle Eastern Restaurant,Café,Bar
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",Coffee Shop,Aquarium,Hotel,Pizza Place,Café,Italian Restaurant,Scenic Lookout,Brewery,Restaurant,Bakery


### Cluster 4

In [24]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[list(range(0,3)) + list(range(6, toronto_merged.shape[1]))]]]

Unnamed: 0,Postcode,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,M4N,Central Toronto,Lawrence Park,Park,Swim School,Dim Sum Restaurant,Bus Line,Yoga Studio,Dog Run,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


### Cluster 5

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[list(range(0,3)) + list(range(6, toronto_merged.shape[1]))]]]

Unnamed: 0,Postcode,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
28,M4T,Central Toronto,"Moore Park, Summerhill East",Playground,Gym,Restaurant,Trail,Dumpling Restaurant,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant


## Conclusions

The data show that there are only two assumptions we could make: a) the majority of the Toronto neighborhoods are very much alike; or b) the clusters 1, 2, 4 and 5 are outliers and we should focus our analysis on the cluster 3 neighborhoods, or use some other algorithm more resistent to outliers, such as DBScan