# Toronto's Neighborhoods Clustering
### Cristian Castro Álvarez
#### Capstone Project - IBM Data Science Certificate

### Importing 

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

### Web Scrapping

In [2]:
# Load the webpage
req = requests.Session()
r = req.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=10)
r

<Response [200]>

In [3]:
soup = bs(r.content, 'html.parser')
pretty_soup = soup.prettify()
# Title of Wikipedia Page
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [4]:
# Lets scrape the table we want
table_wiki = soup.find('table', {"class": 'wikitable sortable'})

In [5]:
# Number of columns in the table
for row in table_wiki.findAll("tr"):
    cells = row.findAll('td')

len(cells)

3

In [6]:
# number of rows in the table including header
rows = table_wiki.findAll("tr")
len(rows)

181

In [7]:
# header attributes of the table
header = [th.text.rstrip() for th in rows[0].find_all('th')]
print(header)
print('------------')
print(len(header))

['Postal Code', 'Borough', 'Neighbourhood']
------------
3


In [8]:
header = ['Postal Code', 'Borough', 'Neighborhood']
# Lets put the data into a dataframe
pre_df = []
for row in rows[1:]:
            data = [d.text.rstrip() for d in row.find_all('td')]
            pre_df.append(data)

# sample records
pre_df[0:3]            

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

In [9]:
# We conver the data
df = pd.DataFrame(pre_df)
df.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# Let's change the header and we are done
df.columns = header
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Wrangling data

In [11]:
df.count()

Postal Code     180
Borough         180
Neighborhood    180
dtype: int64

In [12]:
# Let's replace "Not assigned" for NaN in order to discriminate
df.replace("Not assigned", np.nan, inplace = True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [13]:
# Let's ignore cells with a Borough that is Not Assined
df = df.dropna(subset=['Borough'])
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [14]:
df.shape

(103, 3)

In [15]:
# I will create a second dataframe, where each Neighborhood has its own postal code.
df2 = df.set_index(['Postal Code', 'Borough']).Neighborhood.str.split(',', expand=True).stack().reset_index(['Postal Code', 'Borough'])
df2.columns = header
df2 = df2.reset_index()
del df2['index']
df2

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Manor
...,...,...,...
212,M8Z,Etobicoke,Mimico NW
213,M8Z,Etobicoke,The Queensway West
214,M8Z,Etobicoke,South of Bloor
215,M8Z,Etobicoke,Kingsway Park South West


In [16]:
df2.shape

(217, 3)

### Latitude and Longitude Data

In [17]:
# Let's load the csv file of the geographical coordinates of each postal code of each Neighborhood
geo_coord = pd.read_csv('Geospatial_Coordinates.csv')
geo_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# We will use the df dataframe for alocate each Postal Code to Latitude and Longitude
df_wgc = pd.merge(df, geo_coord, on='Postal Code')
df_wgc

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [19]:
# We also use the df2 dataframe for alocate each Postal Code to Latitude and Longitude
df2_wgc = pd.merge(df2, geo_coord, on='Postal Code')
df2_wgc

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
3,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
...,...,...,...,...,...
212,M8Z,Etobicoke,Mimico NW,43.628841,-79.520999
213,M8Z,Etobicoke,The Queensway West,43.628841,-79.520999
214,M8Z,Etobicoke,South of Bloor,43.628841,-79.520999
215,M8Z,Etobicoke,Kingsway Park South West,43.628841,-79.520999


### Explore and Cluster!

In [20]:
# Let's work only with boroughs that contain the word Toronto
# We are using the database that individualizes the neighbourghoods (df2)
df2_wgc.groupby(['Borough']).count()

Unnamed: 0_level_0,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,18,18,18,18
Downtown Toronto,39,39,39,39
East Toronto,8,8,8,8
East York,7,7,7,7
Etobicoke,47,47,47,47
Mississauga,1,1,1,1
North York,38,38,38,38
Scarborough,38,38,38,38
West Toronto,13,13,13,13
York,8,8,8,8


In [21]:
# Filtering Boroughs only with the word "Toronto"
df2_final = df2_wgc[(df2_wgc['Borough']== 'Central Toronto')|(df2_wgc['Borough']=='East Toronto')|(df2_wgc['Borough']=='West Toronto')|(df2_wgc['Borough']=='Downtown Toronto')]
df2_final

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
3,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
6,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
7,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494
15,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
...,...,...,...,...,...
196,M5X,Downtown Toronto,First Canadian Place,43.648429,-79.382280
197,M5X,Downtown Toronto,Underground city,43.648429,-79.382280
201,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
202,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558


In [22]:
# Lets visualize the Neighborhoods
latitude = 43.670278
longitude = -79.386667
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11.5)

# add markers to map
for lat, lng, borough, neighborhood in zip(df2_final['Latitude'], df2_final['Longitude'], df2_final['Borough'], df2_final['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

In [23]:
# Foursquare data

CLIENT_ID = 'DALIYLPNG5TKEI2LOG1HGFYCCCFKZN4AQKP2OJUA0UO332NY' # your Foursquare ID
CLIENT_SECRET = 'CWEPZS5UPGYWCTPCTWHLZYPXOQHFLB0513KUMZQWJ4POIVZD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: DALIYLPNG5TKEI2LOG1HGFYCCCFKZN4AQKP2OJUA0UO332NY
CLIENT_SECRET:CWEPZS5UPGYWCTPCTWHLZYPXOQHFLB0513KUMZQWJ4POIVZD


In [24]:
# Explore Foursquare Data

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=df2_final['Neighborhood'],
                                   latitudes=df2_final['Latitude'],
                                   longitudes=df2_final['Longitude']
                                  )

Regent Park
 Harbourfront
Queen's Park
 Ontario Provincial Government
Garden District
 Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond
 Adelaide
 King
Dufferin
 Dovercourt Village
Harbourfront East
 Union Station
 Toronto Islands
Little Portugal
 Trinity
The Danforth West
 Riverdale
Toronto Dominion Centre
 Design Exchange
Brockton
 Parkdale Village
 Exhibition Place
India Bazaar
 The Beaches West
Commerce Court
 Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
 Forest Hill Road Park
High Park
 The Junction South
North Toronto West
  Lawrence Park
The Annex
 North Midtown
 Yorkville
Parkdale
 Roncesvalles
Davisville
University of Toronto
 Harbord
Runnymede
 Swansea
Moore Park
 Summerhill East
Kensington Market
 Chinatown
 Grange Park
Summerhill West
 Rathnelly
 South Hill
 Forest Hill SE
 Deer Park
CN Tower
 King and Spadina
 Railway Lands
 Harbourfront West
 Bathurst Quay
 South Niagara
 Island airport


In [25]:
# Let's check the dataframe
print(toronto_venues.shape)
toronto_venues.head()

(3199, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Regent Park,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Regent Park,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Regent Park,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Regent Park,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [26]:
# Let's see how many venues per Neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lawrence Park,18,18,18,18,18,18
Adelaide,100,100,100,100,100,100
Bathurst Quay,16,16,16,16,16,16
Cabbagetown,48,48,48,48,48,48
Chinatown,74,74,74,74,74,74
...,...,...,...,...,...,...
The Annex,19,19,19,19,19,19
The Beaches,4,4,4,4,4,4
The Danforth West,43,43,43,43,43,43
Toronto Dominion Centre,100,100,100,100,100,100


In [27]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 237 uniques categories.


### Let's analyze each Neighborhood

In [28]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe and put it in the first column
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
insert_col = toronto_onehot.pop('Neighborhood')
toronto_onehot.insert(0, 'Neighborhood', insert_col)

toronto_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Let's see the shape
toronto_onehot.shape

(3199, 237)

In [30]:
# Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Lawrence Park,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.000000,0.0,...,0.0,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.055556
1,Adelaide,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.020000,0.0,...,0.0,0.0,0.000000,0.00,0.010000,0.000000,0.000000,0.000000,0.01,0.000000
2,Bathurst Quay,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.000000,0.0,...,0.0,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
3,Cabbagetown,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.000000,0.0,...,0.0,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
4,Chinatown,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.000000,0.0,...,0.0,0.0,0.000000,0.00,0.054054,0.000000,0.040541,0.013514,0.00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,The Annex,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.000000,0.0,...,0.0,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
73,The Beaches,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.000000,0.0,...,0.0,0.0,0.250000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
74,The Danforth West,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.023256,0.0,...,0.0,0.0,0.023256,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.023256
75,Toronto Dominion Centre,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.0000,0.030000,0.0,...,0.0,0.0,0.000000,0.01,0.010000,0.000000,0.000000,0.010000,0.00,0.000000


In [31]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----  Lawrence Park----
            venue  freq
0     Coffee Shop  0.11
1  Clothing Store  0.11
2     Yoga Studio  0.06
3      Bagel Shop  0.06
4           Diner  0.06


---- Adelaide----
         venue  freq
0  Coffee Shop  0.08
1         Café  0.05
2          Gym  0.04
3        Hotel  0.04
4   Restaurant  0.04


---- Bathurst Quay----
                venue  freq
0      Airport Lounge  0.12
1     Airport Service  0.12
2               Plane  0.06
3    Sculpture Garden  0.06
4  Airport Food Court  0.06


---- Cabbagetown----
                venue  freq
0         Coffee Shop  0.08
1          Restaurant  0.06
2         Pizza Place  0.06
3                Café  0.06
4  Italian Restaurant  0.04


---- Chinatown----
                           venue  freq
0                    Coffee Shop  0.05
1                            Bar  0.05
2  Vegetarian / Vegan Restaurant  0.05
3             Mexican Restaurant  0.05
4                           Café  0.05


---- Deer Park----
                 venue  fr

                  venue  freq
0  Gym / Fitness Center  0.11
1      Department Store  0.11
2                  Park  0.11
3        Breakfast Spot  0.11
4        Sandwich Place  0.11


----Dufferin----
         venue  freq
0     Pharmacy  0.15
1       Bakery  0.15
2         Bank  0.08
3  Music Venue  0.08
4      Brewery  0.08


----First Canadian Place----
         venue  freq
0  Coffee Shop  0.10
1         Café  0.07
2        Hotel  0.06
3   Restaurant  0.04
4          Gym  0.04


----Forest Hill North & West----
               venue  freq
0               Park  0.25
1              Trail  0.25
2      Jewelry Store  0.25
3   Sushi Restaurant  0.25
4  Afghan Restaurant  0.00


----Garden District----
                 venue  freq
0       Clothing Store  0.10
1          Coffee Shop  0.09
2                 Café  0.04
3  Japanese Restaurant  0.03
4      Bubble Tea Shop  0.03


----Harbourfront East----
            venue  freq
0     Coffee Shop  0.13
1        Aquarium  0.05
2            Café  0.

In [32]:
# Let's create another dataframe with this summary
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Lawrence Park,Coffee Shop,Clothing Store,Yoga Studio,Chinese Restaurant,Salon / Barbershop,Fast Food Restaurant,Spa,Diner,Sporting Goods Shop,Restaurant
1,Adelaide,Coffee Shop,Café,Gym,Hotel,Bar,Restaurant,Clothing Store,Thai Restaurant,Deli / Bodega,Steakhouse
2,Bathurst Quay,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
3,Cabbagetown,Coffee Shop,Café,Pizza Place,Restaurant,Italian Restaurant,Market,Park,Chinese Restaurant,Bakery,Pub
4,Chinatown,Mexican Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Café,Bar,Vietnamese Restaurant,Gaming Cafe,Park,Dessert Shop,Bakery


### Clustering Baby

In [33]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 4, 1, 1, 1, 1, 1, 1, 0], dtype=int32)

In [34]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df2_final

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,1,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Shoe Store,Brewery,Restaurant
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Shoe Store,Brewery,Restaurant
6,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1,Coffee Shop,Yoga Studio,Diner,Restaurant,Portuguese Restaurant,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop
7,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494,1,Coffee Shop,Yoga Studio,Diner,Restaurant,Portuguese Restaurant,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop
15,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,1,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Japanese Restaurant,Cosmetics Shop,Lingerie Store,Hotel,Bookstore,Pizza Place


In [35]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine the Clusters

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
113,Central Toronto,0,Park,Bus Line,Swim School,Dim Sum Restaurant,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
123,Central Toronto,0,Park,Jewelry Store,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
124,Central Toronto,0,Park,Jewelry Store,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
187,Downtown Toronto,0,Park,Playground,Trail,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,1,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Shoe Store,Brewery,Restaurant
3,Downtown Toronto,1,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Shoe Store,Brewery,Restaurant
6,Downtown Toronto,1,Coffee Shop,Yoga Studio,Diner,Restaurant,Portuguese Restaurant,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop
7,Downtown Toronto,1,Coffee Shop,Yoga Studio,Diner,Restaurant,Portuguese Restaurant,Park,Music Venue,Mexican Restaurant,Italian Restaurant,Hobby Shop
15,Downtown Toronto,1,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Japanese Restaurant,Cosmetics Shop,Lingerie Store,Hotel,Bookstore,Pizza Place
...,...,...,...,...,...,...,...,...,...,...,...,...
196,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Japanese Restaurant,Gym,Salad Place,Asian Restaurant,Steakhouse
197,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Japanese Restaurant,Gym,Salad Place,Asian Restaurant,Steakhouse
201,Downtown Toronto,1,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Yoga Studio
202,East Toronto,1,Pizza Place,Garden Center,Farmers Market,Butcher,Brewery,Burrito Place,Recording Studio,Garden,Auto Workshop,Fast Food Restaurant


In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
153,Central Toronto,2,Playground,Trail,Yoga Studio,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
154,Central Toronto,2,Playground,Trail,Yoga Studio,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
114,Central Toronto,3,Music Venue,Garden,Yoga Studio,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
167,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
168,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
169,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
170,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
171,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
172,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina
173,Downtown Toronto,4,Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina


###### The neighborhood that i liked the least are the ones in Cluster 4, near the airport. The one that i like the most is the cluster 1 where you have plenty of fun places to attend.