# Coursera Capstone Project
This notebook will be used for the final Coursera Capstone Project "Battle of Neighborhoods"

In [367]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import geocoder
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Part 1: Data Wrangling

#### Data scraping from Wikipedia using "requests" and "BeautifulSoup"

In [368]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
website_content = response.text
soup = BeautifulSoup(website_content, 'lxml')

# Extract the postal code table from the HTML and convert it to a DataFrame
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail with a readable message
    raise ValueError('No "wikitable sortable" table found at {}'.format(url))
df = pd.read_html(str(table))[0]

df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


#### Check data

In [369]:
# Summary statistics of the scraped table
print(df.describe())
print("_"*50)

# Boolean masks marking the rows that carry the 'Not assigned' placeholder
borough_missing = df['Borough'] == 'Not assigned'
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'

# Count the placeholder rows per column and the rows where both columns are affected
print("Borough 'NA': {}".format(borough_missing.sum()))
print("Neighbourhood 'NA': {}".format(neighbourhood_missing.sum()))
print("Borough and Neighbourhood 'NA': {}".format((borough_missing & neighbourhood_missing).sum()))

       Postcode       Borough Neighbourhood
count       287           287           287
unique      180            11           209
top         M9V  Not assigned  Not assigned
freq          8            77            77
__________________________________________________
Borough 'NA': 77
Neighbourhood 'NA': 77
Borough and Neighbourhood 'NA': 77


There are 77 rows with the borough not assigned and 77 rows with the neighbourhood not assigned. The two always occur together: all 77 rows are missing both values.

#### Data Wrangling and cleaning

In [370]:
#Rename Postcode column to 'Postal Code'
df = df.rename(columns={'Postcode': 'Postal Code'})

#Drop rows where borough not assigned (copy so the assignment below works on an independent frame)
df = df[df['Borough'] != 'Not assigned'].copy()

#Assign borough where neighbourhood not assigned.
#This is done BEFORE the aggregation: afterwards the placeholder could be hidden
#inside a joined string such as "Not assigned, X" and would no longer match.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

#Aggregate neighbourhoods with same Postcode into one comma-separated string
df = df.groupby(['Postal Code', 'Borough'], as_index=False)['Neighbourhood'].agg(', '.join)

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [371]:
# Report the number of distinct boroughs and the number of rows.
# After the aggregation above each row is one postal code / borough group,
# so the second number counts those groups.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        df['Borough'].nunique(),
        df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


## Part 2: Get and assign coordinates

As the retrieval of the coordinates did not work with geocoder, the csv file with coordinates was used.

In [372]:
# Read the CSV file with the coordinates from the course URL
geo_csv_url = 'https://cocl.us/Geospatial_data'
coords = pd.read_csv(geo_csv_url)
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Assign coordinates by using the postal code as key.

In [373]:
# Attach latitude/longitude to every postal code.
# Coordinate columns left over from an earlier run of this cell are dropped first, so
# re-running the cell does not produce 'Latitude_x' / 'Latitude_y' duplicates.
df = pd.merge(
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
    coords,
    on='Postal Code',
    validate='many_to_one',  # raises if a postal code appears more than once in coords
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [374]:
# Show the first rows of df again (same frame as displayed by the previous cell)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Cluster neighbourhoods

#### Visualize neighbourhoods with Folium

Get the coordinates of the city of Toronto.

In [375]:
# Look up the coordinates of Toronto via the OpenStreetMap geocoder
toronto = geocoder.osm('Toronto, CA')
if not toronto.latlng:
    # Without a result the indexing below (and the maps further down) would fail with an unclear error
    raise RuntimeError('Geocoding "Toronto, CA" returned no coordinates')
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(toronto.latlng[0], toronto.latlng[1]))

The geograpical coordinate of Toronto, Canada are 43.653963, -79.387207.


Create an overview map visualizing the neighbourhoods.

In [376]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=toronto.latlng, zoom_start=10)

# add one circle marker per row of df, labelled "<neighbourhoods>, <borough>"
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    # parse_html=True on the popup so that the label text is not interpreted as HTML
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    # parse_html belongs to Popup only, so it is no longer passed to CircleMarker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

#### Venue data from Foursquare

Use the Foursquare API to explore all neighbourhoods by borrowing the function from the lab exercise.

In [377]:
#Credentials (taken out afterwards)
#getNearbyVenues below reads the names CLIENT_ID, CLIENT_SECRET and VERSION;
#they have to be defined in this cell before the following cells are run.

In [378]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=100, timeout=30):
    """Collect the venues Foursquare's "explore" endpoint returns around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with zip; one API request is made per (name, lat, lng) triple.
    radius : int, default 1000
        Forwarded unchanged as the ``radius`` query parameter.
    limit : int, default 100
        Forwarded unchanged as the ``limit`` query parameter.
    timeout : float, default 30
        Timeout in seconds handed to ``requests.get`` for every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the names CLIENT_ID, CLIENT_SECRET and VERSION from the notebook namespace.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string instead of formatting it by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; fail on an HTTP error rather than on a missing key later
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue can come without any category; record None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works when rows is empty
    nearby_venues = pd.DataFrame(rows, columns=column_names)

    return nearby_venues

In [379]:
# Query the venues around every row of df (one API request per row)
toronto_venues = getNearbyVenues(
    names=df['Neighbourhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

Check the categories of the retrieved data.

In [380]:
# Number of distinct values in the 'Venue Category' column
n_categories = len(toronto_venues['Venue Category'].unique())
print('There are {} unique categories.'.format(n_categories))

toronto_venues.head()

There are 324 unique categories.


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Rouge, Malvern",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
3,"Rouge, Malvern",43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
4,"Rouge, Malvern",43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store


Use one-hot encoding to turn the venue categories into binary indicator columns and prepare the neighbourhoods for clustering.

In [381]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# put the neighbourhood label in front as the first column.
# insert() places it at position 0 directly and raises if a venue category of the
# same name already exists, instead of silently overwriting that column and then
# moving an unrelated "last" column to the front.
toronto_onehot.insert(0, 'Neighbourhood', toronto_venues['Neighbourhood'])

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Check the data size before clustering.

In [382]:
# (number of rows, number of columns) of the one-hot encoded frame
toronto_onehot.shape

(4914, 325)

The data will be grouped by neighbourhood and the frequency of occurrence of each venue category calculated.

In [383]:
# Average the indicator columns per neighbourhood and turn the group key back into a column
neighbourhood_groups = toronto_onehot.groupby('Neighbourhood')
toronto_grouped = neighbourhood_groups.mean().reset_index()

toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.020000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,...,0.000000,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Willowdale West,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
98,Woburn,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
99,"Woodbine Gardens, Parkview Hill",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
100,Woodbine Heights,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.033333,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0


In [384]:
# (number of rows, number of columns) of the grouped frame
toronto_grouped.shape

(102, 325)

Display the top 10 venues of each neighbourhood:

In [414]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of a row, ignoring its first element.

    The first element of ``row`` is skipped, the remaining values are sorted in
    descending order and the index labels of the first ``num_top_venues`` entries
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

# column headers: the neighbourhood plus one ranked column per top venue
columns = ['Neighbourhood'] + ['{}. Most Common Venue'.format(rank) for rank in range(1, num_top_venues + 1)]

# top venue categories of every row of toronto_grouped, collected in one pass
top_rows = [return_most_common_venues(grouped_row, num_top_venues) for _, grouped_row in toronto_grouped.iterrows()]

# assemble the table: ranked venue columns first, then the neighbourhood in front
neighborhoods_venues_sorted = pd.DataFrame(top_rows, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', toronto_grouped['Neighbourhood'])

neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Hotel,Restaurant,Theater,Pizza Place,Bakery,Seafood Restaurant,Beer Bar,Concert Hall
1,Agincourt,Chinese Restaurant,Shopping Mall,Pizza Place,Bakery,Sandwich Place,Caribbean Restaurant,Pool,Supermarket,Sri Lankan Restaurant,Latin American Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,Pizza Place,Bakery,BBQ Joint,Park,Bubble Tea Shop,Dessert Shop,Korean Restaurant,Caribbean Restaurant,Malay Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Pizza Place,Grocery Store,Sandwich Place,Gym Pool,Beer Store,Caribbean Restaurant,Park,Discount Store,Fried Chicken Joint,Japanese Restaurant
4,"Alderwood, Long Branch",Discount Store,Pharmacy,Convenience Store,Pizza Place,Gas Station,Intersection,Skating Rink,Shopping Mall,Donut Shop,Liquor Store
...,...,...,...,...,...,...,...,...,...,...,...
97,Willowdale West,Pharmacy,Convenience Store,Bus Line,Butcher,Eastern European Restaurant,Baby Store,Park,Coffee Shop,Discount Store,Bakery
98,Woburn,Park,Coffee Shop,Mobile Phone Shop,Pharmacy,Fast Food Restaurant,Indian Restaurant,Chinese Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
99,"Woodbine Gardens, Parkview Hill",Pizza Place,Construction & Landscaping,Bakery,Brewery,Bus Line,Gym / Fitness Center,Coffee Shop,Bank,Rock Climbing Spot,Restaurant
100,Woodbine Heights,Coffee Shop,Pizza Place,Sandwich Place,Thai Restaurant,Park,Athletics & Sports,Farmers Market,Spa,Skating Rink,Café


## Clustering

Run the clustering algorithm (k-Means) with 6 clusters:

In [415]:
no_clusters = 6 #define number of clusters

#drop neighbourhood column so that only the numeric frequency columns remain
#(axis given by keyword: the positional form drop('Neighbourhood', 1) is rejected by pandas 2.x)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

#run algorithm; random_state fixes the initialisation and n_init is set explicitly
#because its default differs between scikit-learn versions
kmeans = KMeans(n_clusters=no_clusters, random_state=1, n_init=10).fit(toronto_grouped_clustering)

kmeans.labels_

array([1, 3, 3, 0, 3, 3, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1,
       1, 3, 3, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 0, 3, 3, 1, 3, 1,
       1, 1, 1, 1, 3, 1, 1, 1, 1, 5, 3, 1, 5, 3, 3, 3, 3, 1, 3, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 3, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1,
       1, 1, 3, 1, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3], dtype=int32)

Update the dataframe with cluster labels:

In [416]:
#add clustering labels as first column.
#A label column left over from an earlier run of this cell is removed first;
#insert() would otherwise raise because the column already exists.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#join the cluster labels and top venues onto df (postal code, borough, latitude/longitude),
#drop the rows that contain missing values and restore the int type of the cluster labels
df_merged = (
    df.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
    .dropna()
    .astype({"Cluster Labels": int})
)

df_merged

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1,Fast Food Restaurant,Trail,Coffee Shop,Gym,Fruit & Vegetable Store,Greek Restaurant,Caribbean Restaurant,Chinese Restaurant,Restaurant,Paper / Office Supplies Store
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,5,Italian Restaurant,Breakfast Spot,Burger Joint,Playground,Park,Zoo,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,3,Pizza Place,Coffee Shop,Fast Food Restaurant,Supermarket,Sports Bar,Food & Drink Shop,Fried Chicken Joint,Liquor Store,Beer Store,Sandwich Place
3,M1G,Scarborough,Woburn,43.770992,-79.216917,3,Park,Coffee Shop,Mobile Phone Shop,Pharmacy,Fast Food Restaurant,Indian Restaurant,Chinese Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3,Coffee Shop,Bakery,Pharmacy,Gas Station,Indian Restaurant,Sporting Goods Shop,Intersection,Chinese Restaurant,Caribbean Restaurant,Music Store
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188,3,Coffee Shop,Pizza Place,Train Station,Middle Eastern Restaurant,Breakfast Spot,Soccer Field,Café,Skating Rink,Fried Chicken Joint,Furniture / Home Store
99,M9P,Etobicoke,Westmount,43.696319,-79.532242,3,Pizza Place,Gas Station,Middle Eastern Restaurant,Intersection,Breakfast Spot,Supermarket,Chinese Restaurant,Park,Golf Course,Golf Driving Range
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724,3,Pharmacy,Pizza Place,Intersection,Business Service,Shopping Mall,Supermarket,Supplement Shop,Beer Store,Gas Station,Chinese Restaurant
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,0,Pizza Place,Grocery Store,Sandwich Place,Gym Pool,Beer Store,Caribbean Restaurant,Park,Discount Store,Fried Chicken Joint,Japanese Restaurant


Final visualization with Folium:

In [417]:
# create map
map_clusters = folium.Map(location=toronto.latlng, zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, no_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighbourhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to no_clusters-1, so they index the color list directly
    # (cluster-1 only worked for label 0 through negative-index wraparound)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examine the clusters and determine categories

In [418]:
# Rows of cluster 0: column 1 plus every column from position 5 onward
cluster_view_positions = [1] + list(range(5, df_merged.shape[1]))
in_cluster = df_merged['Cluster Labels'] == 0
df_merged.loc[in_cluster, df_merged.columns[cluster_view_positions]]

Unnamed: 0,Borough,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
31,North York,0,Park,Bank,Moving Target,Pizza Place,Shopping Mall,Vietnamese Restaurant,Grocery Store,Ethiopian Restaurant,Drugstore,Dumpling Restaurant
94,Etobicoke,0,Park,Pizza Place,Bank,Restaurant,Mexican Restaurant,Café,Theater,Clothing Store,Grocery Store,Gym
101,Etobicoke,0,Pizza Place,Grocery Store,Sandwich Place,Gym Pool,Beer Store,Caribbean Restaurant,Park,Discount Store,Fried Chicken Joint,Japanese Restaurant


### -> Cluster 0: Park & Food

In [420]:
# Rows of cluster 1: column 1 plus every column from position 5 onward
cluster_view_positions = [1] + list(range(5, df_merged.shape[1]))
in_cluster = df_merged['Cluster Labels'] == 1
df_merged.loc[in_cluster, df_merged.columns[cluster_view_positions]]

Unnamed: 0,Borough,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
0,Scarborough,1,Fast Food Restaurant,Trail,Coffee Shop,Gym,Fruit & Vegetable Store,Greek Restaurant,Caribbean Restaurant,Chinese Restaurant,Restaurant,Paper / Office Supplies Store
9,Scarborough,1,Convenience Store,College Stadium,Thai Restaurant,General Entertainment,Park,Diner,Skating Rink,Gym,Gym Pool,Restaurant
10,Scarborough,1,Bakery,Furniture / Home Store,Asian Restaurant,Restaurant,Coffee Shop,Electronics Store,Indian Restaurant,Chinese Restaurant,Pharmacy,Fast Food Restaurant
11,Scarborough,1,Middle Eastern Restaurant,Grocery Store,Pizza Place,Burger Joint,Supermarket,Bakery,Bar,Restaurant,Korean Restaurant,Gas Station
18,North York,1,Clothing Store,Coffee Shop,Sandwich Place,Fast Food Restaurant,Juice Bar,Bakery,Japanese Restaurant,Electronics Store,Food Court,Liquor Store
21,North York,1,Korean Restaurant,Café,Pizza Place,Coffee Shop,Middle Eastern Restaurant,Trail,Ramen Restaurant,Park,Dessert Shop,Sandwich Place
22,North York,1,Ramen Restaurant,Coffee Shop,Bubble Tea Shop,Pizza Place,Japanese Restaurant,Korean Restaurant,Fast Food Restaurant,Sandwich Place,Café,Sushi Restaurant
26,North York,1,Coffee Shop,Japanese Restaurant,Pizza Place,Burger Joint,Liquor Store,Bank,Bar,Baseball Field,Thai Restaurant,Salad Place
27,North York,1,Restaurant,Coffee Shop,Gym,Beer Store,Supermarket,Japanese Restaurant,Asian Restaurant,Bank,Sporting Goods Shop,Sushi Restaurant
30,North York,1,Coffee Shop,Turkish Restaurant,Soccer Field,Gym,Gas Station,Chinese Restaurant,Liquor Store,Electronics Store,Other Repair Shop,Italian Restaurant


### -> Cluster 1: Coffee shops & restaurants

In [421]:
# Rows of cluster 2: column 1 plus every column from position 5 onward
cluster_view_positions = [1] + list(range(5, df_merged.shape[1]))
in_cluster = df_merged['Cluster Labels'] == 2
df_merged.loc[in_cluster, df_merged.columns[cluster_view_positions]]

Unnamed: 0,Borough,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
20,North York,2,Park,Pool,Zoo,Farm,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space


### -> Cluster 2 (single neighbourhood): Park, Pool, Zoo

In [422]:
# Rows of cluster 3: column 1 plus every column from position 5 onward
cluster_view_positions = [1] + list(range(5, df_merged.shape[1]))
in_cluster = df_merged['Cluster Labels'] == 3
df_merged.loc[in_cluster, df_merged.columns[cluster_view_positions]]

Unnamed: 0,Borough,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
2,Scarborough,3,Pizza Place,Coffee Shop,Fast Food Restaurant,Supermarket,Sports Bar,Food & Drink Shop,Fried Chicken Joint,Liquor Store,Beer Store,Sandwich Place
3,Scarborough,3,Park,Coffee Shop,Mobile Phone Shop,Pharmacy,Fast Food Restaurant,Indian Restaurant,Chinese Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
4,Scarborough,3,Coffee Shop,Bakery,Pharmacy,Gas Station,Indian Restaurant,Sporting Goods Shop,Intersection,Chinese Restaurant,Caribbean Restaurant,Music Store
5,Scarborough,3,Ice Cream Shop,Convenience Store,Japanese Restaurant,Sandwich Place,Pizza Place,Fast Food Restaurant,Bowling Alley,Restaurant,Coffee Shop,Train Station
6,Scarborough,3,Coffee Shop,Discount Store,Chinese Restaurant,Grocery Store,Fast Food Restaurant,Intersection,Pharmacy,Sandwich Place,Bank,Metro Station
7,Scarborough,3,Intersection,Bus Line,Convenience Store,Coffee Shop,Bakery,Pizza Place,Mexican Restaurant,Fast Food Restaurant,Sandwich Place,Beer Store
8,Scarborough,3,Pizza Place,Beach,Ice Cream Shop,Sports Bar,Burger Joint,Park,Hardware Store,Cajun / Creole Restaurant,Farm,Electronics Store
12,Scarborough,3,Chinese Restaurant,Shopping Mall,Pizza Place,Bakery,Sandwich Place,Caribbean Restaurant,Pool,Supermarket,Sri Lankan Restaurant,Latin American Restaurant
13,Scarborough,3,Coffee Shop,Fast Food Restaurant,Pizza Place,Sandwich Place,Intersection,Seafood Restaurant,Cantonese Restaurant,Caribbean Restaurant,Shopping Mall,Fried Chicken Joint
14,Scarborough,3,Chinese Restaurant,Pizza Place,Bakery,BBQ Joint,Park,Bubble Tea Shop,Dessert Shop,Korean Restaurant,Caribbean Restaurant,Malay Restaurant


### -> Cluster 3: Pizza & Coffee

In [423]:
# Rows of cluster 4: column 1 plus every column from position 5 onward
cluster_view_positions = [1] + list(range(5, df_merged.shape[1]))
in_cluster = df_merged['Cluster Labels'] == 4
df_merged.loc[in_cluster, df_merged.columns[cluster_view_positions]]

Unnamed: 0,Borough,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
32,North York,4,Vietnamese Restaurant,Thai Restaurant,Baseball Field,Zoo,Farm,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space


### -> Cluster 4 (single neighbourhood): Asian restaurants

In [424]:
# Rows of cluster 5: column 1 plus every column from position 5 onward
cluster_view_positions = [1] + list(range(5, df_merged.shape[1]))
in_cluster = df_merged['Cluster Labels'] == 5
df_merged.loc[in_cluster, df_merged.columns[cluster_view_positions]]

Unnamed: 0,Borough,Cluster Labels,1. Most Common Venue,2. Most Common Venue,3. Most Common Venue,4. Most Common Venue,5. Most Common Venue,6. Most Common Venue,7. Most Common Venue,8. Most Common Venue,9. Most Common Venue,10. Most Common Venue
1,Scarborough,5,Italian Restaurant,Breakfast Spot,Burger Joint,Playground,Park,Zoo,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space
91,Etobicoke,5,Park,Italian Restaurant,Eastern European Restaurant,Ice Cream Shop,Gym / Fitness Center,Falafel Restaurant,Dumpling Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant


### -> Cluster 5: Italian restaurants