# Cluster Toronto Neightbourhoods
## Capstone project, Week 3 Notebook 2

In [122]:
# Get the latest version of BeautifulSoup; use XML parser for speed
#!pip install beautifulsoup4
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from lxml import html
import requests

### Get the data
Read the wikipedia page using BeautifoulSoup

In [2]:
# if we don't submit a User-Agent header, the connection is refused by the server
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}

# The list of postal codes in Canada where the first letter is M
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")

### Get HTML table
Get the HTML elements that interest us

In [3]:
# The content we want is in tr elements inside a tbody tag 
# We must select only the tr's inside the tbody, avoid the extra tr's after that
html_table = soup.tbody
html_rows = html_table.find_all('tr')

### Convert HTML table into dictionary
Create a dictionary for each table row, of the form {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}
This will prove useful when we want to use logic to exclude records, change values etc.

In [4]:
# table header, remove newlines before extracting 
table_header = []
for th in html_rows[0].find_all('th'):
    table_header.append(th.text.replace('\n', ' ').strip())

table_data = []
for tr in html_rows:
    t_row = {}
    # Each table row is stored in the form of t_row = {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}
    for td, th in zip(tr.find_all("td"), table_header):
        cell = td.text.replace('\n', '')
        t_row[th] = cell 
    
    # skip the first empty row
    if not t_row:
        continue
        
    # add to the dataset only if the Borough is not assigned
    if (t_row['Borough'] != 'Not assigned'):
        table_data.append(t_row)
        
    # If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the
    # same as the borough. Today there is no data on the page that fits this rule, let's implement it anyway
    if (t_row['Borough'] != 'Not assigned' and t_row['Neighborhood'] == ''):
        t_row['Neighborhood'] = t_row['Borough']


### Put into dataframe

In [5]:
df = pd.DataFrame(table_data)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Check the number of records
df.shape

(103, 3)

### Get geocoding information
The Google provider returns nothing, Bing et al. need API keys, we use Geocode.Farm that provides 250 free query requests per day. The while loop is still useful because sometimes the API returns no values

In [7]:
#!pip install geocoder
import time

latitudes = []
longitudes = []
import geocoder
for postal_code in df['Postal Code']:
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.geocodefarm('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # print(postal_code, " returned lat =", g.lat)
        time.sleep(1)

    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])
    
print ("Got ", len(latitudes), "elements for latitude and ", len(longitudes), "for longitude")

Got  103 elements for latitude and  103 for longitude


Now we combine dataframe and lists to produce the desired result

In [8]:
df['Latitude'] = latitudes
df['Longitude'] = longitudes

In [9]:
 # check it out
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.751881,-79.33036
1,M4A,North York,Victoria Village,43.730419,-79.31282
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.362648
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723209,-79.451408
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.393021
5,M9A,Etobicoke,Islington Avenue,43.662769,-79.528313
6,M1B,Scarborough,"Malvern, Rouge",43.811531,-79.195518
7,M3B,North York,Don Mills,43.74929,-79.361687
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707939,-79.3116
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65736,-79.378181


In [10]:
# Save results in case the API becomes unavailable or our experimentation hits daily usage limits 
df.to_csv('Toronto_neigbourhoods_geocoded.csv')

### Use the Foursquare API
First let's define app keys and version

In [13]:
CLIENT_ID = '52VQHWMFSFEKTROCKQBDCZG1OSHFQJLG4E3JYHFS30AK3JHA' # your Foursquare ID
CLIENT_SECRET = 'PP5LO3LWQ2KC3DICE1PGSOCU2ZGGVGRIACOMCTKLJJR3PHO3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

### Systematic extraction of venue data
Let's create a function that will, for each neighborhood in Toronto
1. Create a Foursquare API call with the appropriate coordinates
2. Retrieve nearby venues in JSON format
3. Filter data to extract venue category
4. Return a dataframe containing venue name, venue location, venue category

__We define a radius of 1000m to account for the apparent lower density of Toronto__ 

In [53]:
LIMIT = 100    # limit of number of venues returned by Foursquare API
radius = 1000   # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run this function to explore all neighborhoods (postal codes) in Toronto and receive one big list of venues, with the exact location and category of each, the neighborhood to which they belong to and the location of the neighborhood

In [54]:
toronto_venues = getNearbyVenues( names = df['Neighborhood'],
                                   latitudes = df['Latitude'],
                                   longitudes = df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

In [55]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.751881,-79.33036,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.751881,-79.33036,PetSmart,43.748639,-79.333488,Pet Store
2,Parkwoods,43.751881,-79.33036,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.730419,-79.31282,Memories of Africa,43.726602,-79.312427,Grocery Store
4,Victoria Village,43.730419,-79.31282,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection


Let's check how many venues were collected per each neighborhood

In [56]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,7,7,7,7,7,7
"Alderwood, Long Branch",5,5,5,5,5,5
"Bathurst Manor, Wilson Heights, Downsview North",1,1,1,1,1,1
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",20,20,20,20,20,20
...,...,...,...,...,...,...
"Willowdale, Newtonbrook",16,16,16,16,16,16
Woburn,4,4,4,4,4,4
Woodbine Heights,16,16,16,16,16,16
York Mills West,4,4,4,4,4,4


Let's find out how many unique categories can be curated from all the returned venues

In [57]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 259 unique categories.


### Analyze venue data, get top 5 venues for each neighborhood

We are going to create a new dataframe that contains the top 5 venues for each neighborhood

In [115]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe, put it in first column
toronto_onehot.drop('Neighborhood', axis=1, inplace=True)
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'], allow_duplicates=False) 

toronto_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [135]:
toronto_onehot.shape

(2380, 259)

Next, let's group rows by neighborhood and take the mean of the frequency of occurrence of each category¶

In [136]:
toronto_onehot.groupby('Neighborhood')

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,"Willowdale, Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Get the new size

In [119]:
toronto_grouped.shape

(97, 259)

Let's print each neighborhood along with the top 5 most common venues

In [139]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
              venue  freq
0     Shopping Mall  0.14
1    Discount Store  0.14
2   Badminton Court  0.14
3              Pool  0.14
4  Sushi Restaurant  0.14


----Alderwood, Long Branch----
               venue  freq
0  Convenience Store   0.2
1                Pub   0.2
2     Sandwich Place   0.2
3                Gym   0.2
4        Pizza Place   0.2


----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                IT Services   1.0
1                        ATM   0.0
2    New American Restaurant   0.0
3  Middle Eastern Restaurant   0.0
4         Miscellaneous Shop   0.0


----Bayview Village----
                        venue  freq
0  Construction & Landscaping  0.25
1          Golf Driving Range  0.25
2                        Park  0.25
3                       Trail  0.25
4                         ATM  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.15
1         Coffee Shop

4                    Bank  0.06


----Little Portugal, Trinity----
          venue  freq
0  Cocktail Bar  0.07
1    Restaurant  0.07
2           Bar  0.07
3   Coffee Shop  0.05
4      Wine Bar  0.05


----Malvern, Rouge----
                       venue  freq
0               Home Service   1.0
1                   Platform   0.0
2         Mexican Restaurant   0.0
3  Middle Eastern Restaurant   0.0
4         Miscellaneous Shop   0.0


----Milliken, Agincourt North, Steeles East, L'Amoreaux East----
                       venue  freq
0                   Pharmacy   1.0
1                        ATM   0.0
2                Music Venue   0.0
3         Mexican Restaurant   0.0
4  Middle Eastern Restaurant   0.0


----Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West----
                         venue  freq
0                         Bank  0.17
1  Eastern European Restaurant  0.08
2                 Optical Shop  0.08
3                 Burger Joint  0.08

Let's write a function to sort the venues in descending order

In [120]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [234]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.tail(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
87,"West Deane Park, Princess Gardens, Martin Grov...",Movie Theater,Yoga Studio,Comic Shop,Fish Market,Fish & Chips Shop
88,Westmount,Pizza Place,Middle Eastern Restaurant,Chinese Restaurant,Sandwich Place,Coffee Shop
89,Weston,Coffee Shop,Train Station,Pizza Place,Grocery Store,Diner
90,"Wexford, Maryvale",Auto Garage,Yoga Studio,Falafel Restaurant,Dry Cleaner,Eastern European Restaurant
91,Willowdale,Coffee Shop,Pizza Place,Grocery Store,Bank,Fast Food Restaurant
92,"Willowdale, Newtonbrook",Korean Restaurant,Pizza Place,Middle Eastern Restaurant,Japanese Restaurant,Coffee Shop
93,Woburn,Construction & Landscaping,Park,Business Service,Coffee Shop,Fast Food Restaurant
94,Woodbine Heights,Pharmacy,Bus Line,Grocery Store,Pizza Place,Gas Station
95,York Mills West,Convenience Store,Bank,Park,Coffee Shop,Yoga Studio
96,"York Mills, Silver Hills",Music Venue,Comic Shop,Fish Market,Fish & Chips Shop,Field


### Cluster Neighborhoods

Now let's cluster neighborhoods using k-means.

In [253]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

array([0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [236]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_, False)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.751881,-79.33036,0.0,Pet Store,Food & Drink Shop,Park,Yoga Studio,Falafel Restaurant
1,M4A,North York,Victoria Village,43.730419,-79.31282,0.0,Grocery Store,Intersection,Yoga Studio,Dry Cleaner,Eastern European Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.362648,0.0,Coffee Shop,Breakfast Spot,Theater,Event Space,Spa
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723209,-79.451408,0.0,Clothing Store,Furniture / Home Store,Cosmetics Shop,Bookstore,Food Court
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.393021,0.0,Coffee Shop,Park,Café,Museum,Sandwich Place


### Visualize results
Finally, let's visualize the resulting clusters. To show the map, we must locate the center of the city of Toronto

In [238]:
address = 'Toronto, Canada'

map_center = geocoder.geocodefarm(address)
lat_lng_coords = map_center.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinates of Toronto, Canada are 43.6532715214454, -79.383181300614.


In [239]:
# To display yhe map we'll use the numeric values

toronto_merged['Cluster Labels'] = pd.to_numeric(toronto_merged['Cluster Labels'], errors='coerce')

# Let's drop any neighborhoods that have no venues at all, the label in K-means is calculated as NaN
# There is one such venue in the data, Upper Rouge
toronto_merged = toronto_merged[toronto_merged['Cluster Labels'].notna()]

In [254]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], \
                                  toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

__Important notice:__ how to view the color map on Github  <br>
If you are viewing this via the Github repository, you cannot see the colour map because Github only shows static HTML, it disables interactive JavaScript such as the one used by interactive Folium maps. <br><p>

You can see the same notebook and the map using the jupyter.org nbviewer service, like this:<br>
<a href="https://nbviewer.jupyter.org/github/ak501/Coursera_Capstone/blob/master/Week3_NB3.ipynb">Here is the notebook with the map<a/>

The map seems ok, but cluster population are not equally wighted. K-means clustering assigned the vast majority of the neighborhoods to cluster 0, with only a handful being assigned to clusters 1, 2, 3 and 4. 

### Examination of clusters
Let's look at the contents of each cluster

In [243]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, \
                   toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,0.0,Pet Store,Food & Drink Shop,Park,Yoga Studio,Falafel Restaurant
1,North York,0.0,Grocery Store,Intersection,Yoga Studio,Dry Cleaner,Eastern European Restaurant
2,Downtown Toronto,0.0,Coffee Shop,Breakfast Spot,Theater,Event Space,Spa
3,North York,0.0,Clothing Store,Furniture / Home Store,Cosmetics Shop,Bookstore,Food Court
4,Downtown Toronto,0.0,Coffee Shop,Park,Café,Museum,Sandwich Place
...,...,...,...,...,...,...,...
97,Downtown Toronto,0.0,Coffee Shop,Café,Hotel,Gym,American Restaurant
99,Downtown Toronto,0.0,Coffee Shop,Japanese Restaurant,Restaurant,Sushi Restaurant,Dance Studio
100,East Toronto,0.0,Coffee Shop,Hotel,Café,Restaurant,Japanese Restaurant
101,Etobicoke,0.0,Coffee Shop,Fast Food Restaurant,Chinese Restaurant,Sushi Restaurant,Italian Restaurant


In [244]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, \
                   toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Scarborough,1.0,Home Service,Yoga Studio,Dog Run,Fish Market,Fish & Chips Shop
61,Central Toronto,1.0,Park,Home Service,Farm,Dry Cleaner,Eastern European Restaurant
62,Central Toronto,1.0,Home Service,Yoga Studio,Dog Run,Fish Market,Fish & Chips Shop


In [246]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, \
                   toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
45,North York,2.0,Music Venue,Comic Shop,Fish Market,Fish & Chips Shop,Field


In [247]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, \
                   toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
68,Central Toronto,3.0,Park,Yoga Studio,Dog Run,Fish Market,Fish & Chips Shop
69,West Toronto,3.0,Residential Building (Apartment / Condo),Park,Yoga Studio,Event Space,Donut Shop
98,Etobicoke,3.0,Park,Yoga Studio,Dog Run,Fish Market,Fish & Chips Shop


In [250]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, \
                   toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
28,North York,4.0,IT Services,Yoga Studio,Dog Run,Fish Market,Fish & Chips Shop


### Comment on clustering results
The results do not look perfect because there is one predominant cluster that overshadows others. <br>
Perhaps this is intrinsic to the nature of our exercise, the area of Toronto with postcode M seems pretty homogeneous, with its basic destination of use is centered around food venues.<br>

We can see that each cluster has good homogeneity:
-  Cluster 0 is centered around __Coffee Shops and Restaurants__ of various types
-  Cluster 1 is of a __semi-central__ type, with Home Service and Yoga Studio as prevailing venues
-  Cluster 2 contains 1 neighborhood, it is not central but close to a major road, it contains __Music__ venues and the __Fish market__
-  Cluster 3 contains 3 neighbourhoods with __Parks, residential buildings and yoga studios__
-  Cluster 4 contains 1 neighbourhood and is dominated by __IT Services__, it is not central but close to the airport, so must be convenient for business travel.
 