# Segmenting and Clustering Neighborhoods in Toronto

## PART 1 


## Scraping data from Wikipedia using BeautifulSoup

In [10]:
#Import requests for web scraping
import pandas as pd
import requests
import numpy as np

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries installed')

Libraries installed


In [11]:
website_url= rq.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text #Bring the data from the target URL

### Now we shall use BeautifulSoup library

In [18]:
#get html from wiki page and create soup object
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(source.text, 'lxml')

#using soup object, iterate the .wikitable to get the data from the HTML page and store it into a list
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

In [27]:
df=pd.DataFrame(table_contents)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Drop Not asiggned Neighbourhoods as there are no Boroughs assigned to them neither

In [28]:
noNeighbourhood = df[df['Neighborhood'] == 'Not assigned'].index
df.drop(noNeighbourhood, inplace = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Now, we proceed to group our Dataframe by Postcode with a concatenation of a ","

In [30]:
grpdf =df.groupby(['PostalCode','Borough'], as_index=False, sort=False).agg(','.join)
grpdf #Dataframe Grouped by Postcode and joined with ","

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Our last requirement is to verify our Dataframe shape

In [31]:
grpdf.shape

(103, 3)

## PART 2 

In [35]:
urlCSV = 'https://cocl.us/Geospatial_data' #Retreive the data
geoSpatial = pd.read_csv(urlCSV) #Turned to dataFrame
newdf = geoSpatial.rename(columns ={'Postal Code':'PostalCode'}) #Rename our column in order to have the same Column title as our previous DataFrame
newdf.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [36]:
mergedf=pd.merge(grpdf, newdf, on='PostalCode') #Merge by column name and build new dataframe
mergedf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# PART 3

### First we need to import our libraries

In [37]:
!pip -q install folium
import folium
print('Folium imported')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Folium imported


## 1. Exploring the dataset


In [40]:
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=11) # Create Map

# add markers to map
for lat, lng, borough, neighborhood in zip(mergedf['Latitude'], mergedf['Longitude'], mergedf['Borough'], mergedf['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto) 
    
map_toronto


### Adding the Foursquare credentials

In [69]:
#@hidden_cell
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


### For this excercise we will use only the Neighbourhoods from the 'Downtown Toronto' Borough as is quite an important venue 

In [42]:
dt_Toronto_data = mergedf[mergedf['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dt_Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [43]:
neighborhood_latitude = dt_Toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dt_Toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = dt_Toronto_data.loc[0, 'Neighborhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


### Let's create the GET request URL. 

In [44]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=SBTM0LTBIGGWLPMLM0EPDOVF1YFKY2ZLW5P25H1GCR1XML1E&client_secret=DM2MZATDEVOADHMQIVQMOFOAI4RK5K0HHRZYDFK3LYB54QXD&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

### Send the GET request and examine the resutls

In [45]:
results = rq.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60c177a86c717c5afc5d0834'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 43,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '53b8466a498e83df908c3f21',
       'name': 'Tandem Coffee',
       'location': {'address': '368 King St E',
        'crossStreet': 'at Trinity St',
        'lat': 43.65355870959944,
        'lng': -79.36180945913513,
        'labeledLatLngs': [{'label': 'display',
 

### Function that extracts the category of the venue

In [46]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Now we clean the json and structure it into a pandas dataframe.

In [47]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  app.launch_new_instance()


Unnamed: 0,name,categories,lat,lng
0,Tandem Coffee,Coffee Shop,43.653559,-79.361809
1,Roselle Desserts,Bakery,43.653447,-79.362017
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


In [48]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

43 venues were returned by Foursquare.


## 2. Exploring Neighbourhoods in Downtown Toronto

### Function to repeat the same process to all the neighborhoods in Downtown Toronto

In [49]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = rq.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### The code to run the above function on each neighborhood and create a new dataframe called *dt_Toronto__venues*.

In [50]:

dt_Toronto_venues = getNearbyVenues(names=dt_Toronto_data['Neighborhood'],
                                   latitudes=dt_Toronto_data['Latitude'],
                                   longitudes=dt_Toronto_data['Longitude']
                                  )

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


### Check the size of the new dataFrame


In [51]:
print(dt_Toronto_venues.shape)
dt_Toronto_venues.head()

(1070, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


### Let's group our dataframe by Neighbourhood adn count how many venues they have

In [52]:
dt_Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,46,46,46,46,46,46
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",18,18,18,18,18,18
Central Bay Street,61,61,61,61,61,61
Christie,16,16,16,16,16,16
Church and Wellesley,66,66,66,66,66,66
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",57,57,57,57,57,57


In [53]:
# Unique venues categories
print('There are {} uniques categories.'.format(len(dt_Toronto_venues['Venue Category'].unique())))

There are 186 uniques categories.


## 3. Analyze Each Neighbourhood

In [54]:
# one hot encoding
dt_Toronto_onehot = pd.get_dummies(dt_Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
dt_Toronto_onehot['Neighborhood'] = dt_Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [dt_Toronto_onehot.columns[-1]] + list(dt_Toronto_onehot.columns[:-1])
dt_Toronto_onehot = dt_Toronto_onehot[fixed_columns]

dt_Toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theater,Theme Restaurant,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
dt_Toronto_onehot.shape

(1070, 186)

### Next, let's group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category

In [56]:
dt_Toronto_grouped = dt_Toronto_onehot.groupby('Neighborhood').mean().reset_index()
dt_Toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theater,Theme Restaurant,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.015152,0.015152,0.015152,0.0,0.0,0.0,0.0,0.0,0.0,...,0.015152,0.015152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152
5,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
6,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017544,0.0,0.0,0.052632,0.0,0.035088,0.017544,0.0


In [57]:
dt_Toronto_grouped.shape

(17, 186)

### Let's print each neighbourhood along with the top 5 most common venues

In [58]:
num_top_venues = 5

for hood in dt_Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = dt_Toronto_grouped[dt_Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                           venue  freq
0                    Coffee Shop  0.09
1                   Cocktail Bar  0.09
2                 Sandwich Place  0.07
3                         Bakery  0.07
4  Vegetarian / Vegan Restaurant  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0   Airport Service  0.17
1    Airport Lounge  0.11
2  Airport Terminal  0.11
3     Boat or Ferry  0.06
4             Plane  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1      Sandwich Place  0.08
2    Sushi Restaurant  0.07
3  Italian Restaurant  0.05
4                Café  0.05


----Christie----
           venue  freq
0  Grocery Store  0.25
1           Café  0.19
2           Park  0.12
3     Baby Store  0.06
4    Coffee Shop  0.06


----Church and Wellesley----
                 venue  freq
0     Sushi Restaurant  0.09
1  Japanese Restaurant  0.08
2

### Let's put that into a *pandas* dataframe

Sorting venues in descending order

In [59]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [60]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighborhood'] = dt_Toronto_grouped['Neighborhood']

for ind in np.arange(dt_Toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dt_Toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Sandwich Place,Vegetarian / Vegan Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Japanese Restaurant,French Restaurant
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Terminal,Airport Lounge,Bar,Boutique,Harbor / Marina,Sculpture Garden,Coffee Shop,Airport Gate,Airport Food Court
2,Central Bay Street,Coffee Shop,Sandwich Place,Sushi Restaurant,Café,Italian Restaurant,Japanese Restaurant,Salad Place,Middle Eastern Restaurant,Restaurant,Pizza Place
3,Christie,Grocery Store,Café,Park,Athletics & Sports,Nightclub,Coffee Shop,Baseball Field,Restaurant,Baby Store,Italian Restaurant
4,Church and Wellesley,Sushi Restaurant,Japanese Restaurant,Restaurant,Coffee Shop,Burrito Place,Gay Bar,Gym,Indian Restaurant,Fast Food Restaurant,Mediterranean Restaurant
5,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Sandwich Place,Hotel,Restaurant,Gym,Japanese Restaurant,Asian Restaurant,Deli / Bodega,Bank
6,"First Canadian Place, Underground city",Coffee Shop,Sandwich Place,Café,Hotel,Japanese Restaurant,Bank,Gym,Restaurant,Sushi Restaurant,Deli / Bodega
7,"Garden District, Ryerson",Coffee Shop,Sandwich Place,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Pizza Place,Hotel,Bank,Japanese Restaurant
8,"Harbourfront East, Union Station, Toronto Islands",Coffee Shop,Hotel,Café,Aquarium,Pizza Place,Scenic Lookout,Gym,Deli / Bodega,Sporting Goods Shop,Steakhouse
9,"Kensington Market, Chinatown, Grange Park",Café,Coffee Shop,Vegetarian / Vegan Restaurant,Burger Joint,Vietnamese Restaurant,Gaming Cafe,Mexican Restaurant,Bar,Chinese Restaurant,Cheese Shop


## 4. Cluster Neighbourhoods

Run *k*-means to cluster the neighborhood into 5 clusters. we wil be using k=5 as this is only for demostration on the Foursquare API and clustering, we are not analyzing the optimal k

In [61]:
# set number of clusters
kclusters = 5

dt_Toronto_grouped_clustering = dt_Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dt_Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 4, 3, 0, 4, 4, 0, 4, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [62]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dt_Toronto_merged = dt_Toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dt_Toronto_merged = dt_Toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

dt_Toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4,Coffee Shop,Park,Bakery,Restaurant,Pub,Café,French Restaurant,Food Truck,Sandwich Place,Beer Store
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Sandwich Place,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Pizza Place,Hotel,Bank,Japanese Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Cocktail Bar,Café,Italian Restaurant,Restaurant,Clothing Store,Japanese Restaurant,Farmers Market,Gastropub,Gym
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Bakery,Sandwich Place,Vegetarian / Vegan Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Japanese Restaurant,French Restaurant
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,4,Coffee Shop,Sandwich Place,Sushi Restaurant,Café,Italian Restaurant,Japanese Restaurant,Salad Place,Middle Eastern Restaurant,Restaurant,Pizza Place


Finally, let's visualize the resulting clusters

In [63]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dt_Toronto_merged['Latitude'], dt_Toronto_merged['Longitude'], dt_Toronto_merged['Neighborhood'], dt_Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examining the Clusters

#### Cluster 1

In [64]:
dt_Toronto_merged.loc[dt_Toronto_merged['Cluster Labels'] == 0, dt_Toronto_merged.columns[[1] + list(range(5, dt_Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,0,Coffee Shop,Sandwich Place,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Pizza Place,Hotel,Bank,Japanese Restaurant
2,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Café,Italian Restaurant,Restaurant,Clothing Store,Japanese Restaurant,Farmers Market,Gastropub,Gym
3,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Bakery,Sandwich Place,Vegetarian / Vegan Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Japanese Restaurant,French Restaurant
11,Downtown Toronto,0,Café,Coffee Shop,Vegetarian / Vegan Restaurant,Burger Joint,Vietnamese Restaurant,Gaming Cafe,Mexican Restaurant,Bar,Chinese Restaurant,Cheese Shop
14,Downtown Toronto,0,Coffee Shop,Restaurant,Pizza Place,Café,Bakery,Pub,Italian Restaurant,Caribbean Restaurant,Beer Store,Plaza
16,Downtown Toronto,0,Sushi Restaurant,Japanese Restaurant,Restaurant,Coffee Shop,Burrito Place,Gay Bar,Gym,Indian Restaurant,Fast Food Restaurant,Mediterranean Restaurant


#### Cluster 2

In [65]:
dt_Toronto_merged.loc[dt_Toronto_merged['Cluster Labels'] == 1, dt_Toronto_merged.columns[[1] + list(range(5, dt_Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,1,Park,Trail,Playground,Comic Shop,Diner,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Creperie


#### Cluster 3

In [66]:
dt_Toronto_merged.loc[dt_Toronto_merged['Cluster Labels'] == 2, dt_Toronto_merged.columns[[1] + list(range(5, dt_Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Downtown Toronto,2,Airport Service,Airport Terminal,Airport Lounge,Bar,Boutique,Harbor / Marina,Sculpture Garden,Coffee Shop,Airport Gate,Airport Food Court


#### Cluster 4

In [67]:
dt_Toronto_merged.loc[dt_Toronto_merged['Cluster Labels'] == 3, dt_Toronto_merged.columns[[1] + list(range(5, dt_Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downtown Toronto,3,Grocery Store,Café,Park,Athletics & Sports,Nightclub,Coffee Shop,Baseball Field,Restaurant,Baby Store,Italian Restaurant


#### Cluster 5 

In [68]:
dt_Toronto_merged.loc[dt_Toronto_merged['Cluster Labels'] == 4, dt_Toronto_merged.columns[[1] + list(range(5, dt_Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,4,Coffee Shop,Park,Bakery,Restaurant,Pub,Café,French Restaurant,Food Truck,Sandwich Place,Beer Store
4,Downtown Toronto,4,Coffee Shop,Sandwich Place,Sushi Restaurant,Café,Italian Restaurant,Japanese Restaurant,Salad Place,Middle Eastern Restaurant,Restaurant,Pizza Place
6,Downtown Toronto,4,Coffee Shop,Café,Sandwich Place,Gym,Clothing Store,Sushi Restaurant,Restaurant,Asian Restaurant,Lounge,Burrito Place
7,Downtown Toronto,4,Coffee Shop,Hotel,Café,Aquarium,Pizza Place,Scenic Lookout,Gym,Deli / Bodega,Sporting Goods Shop,Steakhouse
8,Downtown Toronto,4,Coffee Shop,Hotel,Café,Sandwich Place,Asian Restaurant,Pharmacy,Restaurant,Salad Place,Deli / Bodega,Japanese Restaurant
9,Downtown Toronto,4,Coffee Shop,Café,Sandwich Place,Hotel,Restaurant,Gym,Japanese Restaurant,Asian Restaurant,Deli / Bodega,Bank
10,Downtown Toronto,4,Café,Coffee Shop,Sandwich Place,Bar,Japanese Restaurant,Pub,Bakery,Beer Bar,Sushi Restaurant,Poutine Place
15,Downtown Toronto,4,Coffee Shop,Sandwich Place,Café,Hotel,Japanese Restaurant,Bank,Gym,Restaurant,Sushi Restaurant,Deli / Bodega
