# Segmenting and Clustering Neighborhoods in Toronto

## ------- Part 1 ------ Scraping Wikipedia Page and creating a dataframe

In [61]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium 
import requests # library to handle requests
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 7.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


### Use pandas to read wikipedia table

In [2]:
# Use pandas to read html table
d = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
# display the dataframe
df = d[0]
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park / Ontario Provincial Government,M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned


In [4]:
# make the list and flatten the list to have all the cells in just one column
from pandas.core.common import flatten
k = list(flatten(df.values.tolist()))

In [5]:
#make a dataframe out of this list and display it
e = pd.DataFrame(k)
e.head(5)

Unnamed: 0,0
0,M1ANot assigned
1,M2ANot assigned
2,M3ANorth York(Parkwoods)
3,M4ANorth York(Victoria Village)
4,M5ADowntown Toronto(Regent Park / Harbourfront)


### Make a new (final) dataframe and clean the data

In [71]:
new = pd.DataFrame()
new['PostalCode'] = e[0].str[:3]  # take 3 first characters from each row and make a column
new['Borough'] = e[0].str[3:]  # take all the characters except the first 3 and make a column
new['Borough'] = new.Borough.str.replace(',' , ' /').str.replace(')' , ' ')   # do some characters replacement for the subsequent column split
new[['Borough','Neighborhood']] = new.Borough.str.split('(', 1, expand=True)  # split the column into 2 columns. Divider is "(" 
new['Neighborhood'] = new.Neighborhood.str.replace(' /' , ',').str.replace('(' , ', ') # do some final data cleaning
new.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park / Ontario Provincial Government,
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


### Removing rows with "Not assigned" Boroughs

In [7]:

new.replace("Not assigned", np.nan, inplace = True)
new.dropna(subset=["Borough"], axis=0, inplace=True)
new.reset_index(drop=True, inplace=True)
new.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park / Ontario Provincial Government,


In [8]:
new['Borough'].value_counts() 

North York                                                      24
Downtown Toronto                                                17
Scarborough                                                     17
Etobicoke                                                       11
Central Toronto                                                  9
West Toronto                                                     6
York                                                             5
East Toronto                                                     4
East York                                                        4
East YorkEast Toronto                                            1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
EtobicokeNorthwest                                               1
Queen's Park / Ontario Provincial Government                     1
Downtown TorontoStn A PO Boxes25 The Esplanade                   1
MississaugaCanada Post Gateway Processing Centre              

### We see that we have some wrong Borough names, so we need to fix this

In [9]:
#Solving the wrong Boroughs issue
new['Borough'].replace("Etobicoke.*", "Etobicoke", regex=True, inplace=True)
new['Borough'].replace("Downtown.*", "Downtown Toronto", regex=True, inplace=True)
new['Borough'].replace("Mississauga.*", "Mississauga", regex=True, inplace=True)
new['Borough'].replace("East Toronto.*", "East Toronto", regex=True, inplace=True)
new['Borough'].replace("Queen.*", "Queen\'s Park", regex=True, inplace=True)
new['Borough'].replace("East York.*", "East York", regex=True, inplace=True)
new['Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [10]:
# Everything is OK with Boroughs now, but what we have with Neighborhoods?
new.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Our final table looks like this

In [11]:
#We have some Neighborhoods with "None" value, so let's give them the name of the respective Borough
i = 0
test = 0
while (i <= 102):
    if (new['Neighborhood'][i]== None):
        new['Neighborhood'][i] = new['Borough'][i]
    i = i+1

In [12]:

new.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [13]:
# here is the shape of our table
new.shape

(103, 3)

## ----- Part 2 ----- Creating a Dataframe with latitudes and longitudes

### Read csv file with pandas

In [16]:
df_geo = pd.read_csv('Geospatial_Coordinates.csv')
df_geo.head()



Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge two dataframes

In [17]:
merged_inner = pd.merge(left=new, right=df_geo, left_on='PostalCode', right_on='Postal Code')
merged_inner.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


### Our final DataFrame looks like this

In [18]:
df_final = merged_inner.drop(['Postal Code'], axis = 1)
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [19]:
df_final.shape

(103, 5)

## ----- Part 3 ----- Explore and cluster the neighborhoods in Toronto

### Create a map of Toronto with neighborhoods superimposed on top.

In [20]:
# create map of Toronto using latitude and longitude values

map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)


In [21]:

# add markers to map
for lat, lng, borough, neighborhood in zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
       
map_toronto

### Now, let's focus on those boroughs that contain the word Toronto

In [31]:
toronto_sub = df_final[df_final['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_sub.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [32]:
toronto_sub.shape

(38, 5)

### Let's visualize corresponding neighborhoods on the map 

In [74]:
map_toronto_sub = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_sub['Latitude'], toronto_sub['Longitude'], toronto_sub['Borough'], toronto_sub['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_sub)  
       
map_toronto_sub

### Now we will explore and segment our neighborhoods with Foursquare API 

In [36]:
CLIENT_ID = 'FVLC0DVBG0MFLFWPBRKVQZWPJ15LJJQ1LN3HT0DTAYU5PGOU' # your Foursquare ID
CLIENT_SECRET = 'WKJFY5I31TA1M0E5W4RUQ135VDRIXSEDRTMOAINIAHPPPO2W' # your Foursquare Secret
VERSION = '20200323' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FVLC0DVBG0MFLFWPBRKVQZWPJ15LJJQ1LN3HT0DTAYU5PGOU
CLIENT_SECRET:WKJFY5I31TA1M0E5W4RUQ135VDRIXSEDRTMOAINIAHPPPO2W


### Get the top 100 venues from each neighborhood within a radius of 300 meters.

In [76]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 300

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [77]:
toronto_sub_venues = getNearbyVenues(names=toronto_sub['Neighborhood'],
                                   latitudes=toronto_sub['Latitude'],
                                   longitudes=toronto_sub['Longitude']
                                  )

Regent Park, Harbourfront 
Garden District, Ryerson 
St. James Town 
The Beaches 
Berczy Park 
Central Bay Street 
Christie 
Richmond, Adelaide, King 
Dufferin, Dovercourt Village 
Harbourfront East, Union Station, Toronto Islands 
Little Portugal, Trinity 
The Danforth West, Riverdale 
Toronto Dominion Centre, Design Exchange 
Brockton, Parkdale Village, Exhibition Place 
India Bazaar, The Beaches West 
Commerce Court, Victoria Hotel 
Studio District 
Lawrence Park 
Roselawn 
Davisville North 
Forest Hill North & West 
High Park, The Junction South 
North Toronto West 
The Annex, North Midtown, Yorkville 
Parkdale, Roncesvalles 
Davisville 
University of Toronto, Harbord 
Runnymede, Swansea 
Moore Park, Summerhill East 
Kensington Market, Chinatown, Grange Park 
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park 
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport 
Rosedale 
Enclave of M5E 
St. James Town, Cabbageto

In [78]:
print(toronto_sub_venues.shape)
toronto_sub_venues.head()

(1671, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


### Let's see how many venues were returned for each neighborhood

In [79]:
toronto_sub_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
"Brockton, Parkdale Village, Exhibition Place",22,22,22,22,22,22
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",14,14,14,14,14,14
Central Bay Street,81,81,81,81,81,81
Christie,17,17,17,17,17,17
Church and Wellesley,88,88,88,88,88,88
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8
"Dufferin, Dovercourt Village",19,19,19,19,19,19


### How many unique categories are in all the returned venues?

In [80]:
print('There are {} uniques categories.'.format(len(toronto_sub_venues['Venue Category'].unique())))

There are 237 uniques categories.


## Let's Analyze Each Neighborhood

In [81]:
# one hot encoding
toronto_sub_onehot = pd.get_dummies(toronto_sub_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_sub_onehot['Neighborhood'] = toronto_sub_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_sub_onehot.columns[-1]] + list(toronto_sub_onehot.columns[:-1])
toronto_sub_onehot = toronto_sub_onehot[fixed_columns]
print(toronto_sub_onehot.shape)
toronto_sub_onehot.head()

(1671, 237)


Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### let's group rows by neighborhood

In [82]:
toronto_sub_grouped = toronto_sub_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_sub_grouped.shape)
toronto_sub_grouped.head()

(38, 237)


Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.071429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.012346,0.0,0.0,0.012346,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's create the new dataframe and display the top 10 venues for each neighborhood

In [83]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [88]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_sub_grouped['Neighborhood']

for ind in np.arange(toronto_sub_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_sub_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Seafood Restaurant,Beer Bar,Farmers Market,Cocktail Bar,Bakery,Cheese Shop,Café,Restaurant,Irish Pub
1,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Café,Breakfast Spot,Bakery,Convenience Store,Performing Arts Venue,Pet Store,Climbing Gym,Restaurant,Burrito Place
2,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boat or Ferry,Coffee Shop,Boutique,Airport Terminal,Harbor / Marina,Airport Gate,Airport Food Court,Airport
3,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Japanese Restaurant,Thai Restaurant,Salad Place,Bubble Tea Shop,Café
4,Christie,Grocery Store,Café,Park,Candy Store,Restaurant,Diner,Italian Restaurant,Nightclub,Gas Station,Coffee Shop


 ## Cluster Neighborhoods

In [89]:
# set number of clusters
kclusters = 5

toronto_sub_grouped_clustering = toronto_sub_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_sub_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

### The new dataframe will include clusters together with the top 10 venues for each neighborhood.

In [90]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_sub_merged = toronto_sub

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_sub_merged = toronto_sub_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_sub_merged.head() # check the last columns!


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Restaurant,Theater,Café,Mexican Restaurant,Chocolate Shop,Dessert Shop
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Japanese Restaurant,Café,Cosmetics Shop,Middle Eastern Restaurant,Restaurant,Bubble Tea Shop,Pizza Place,Bookstore
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Restaurant,Café,Hotel,Italian Restaurant,Cosmetics Shop,Breakfast Spot,Beer Bar,Diner,Bakery
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Trail,Pub,Health Food Store,Doner Restaurant,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Women's Store
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Seafood Restaurant,Beer Bar,Farmers Market,Cocktail Bar,Bakery,Cheese Shop,Café,Restaurant,Irish Pub


### Put the clasters on the map

In [91]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_sub_merged['Latitude'], toronto_sub_merged['Longitude'], toronto_sub_merged['Neighborhood'], toronto_sub_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters