# Part3 of Segmenting and Clustering Neighborhoods in Toronto by Haowen Wang

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        18.9 MB  conda-forge
    libcblas-3.8.0             |      11_openblas        

<a id='item1'></a>

In [2]:
data = pd.read_csv('result.csv').iloc[:,1:]
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,['Scarborough'],"['Rouge', 'Malvern']",43.806686,-79.194353
1,M1C,['Scarborough'],"['Highland Creek', 'Rouge Hill', 'Port Union']",43.784535,-79.160497
2,M1E,['Scarborough'],"['Guildwood', 'Morningside', 'West Hill']",43.763573,-79.188711
3,M1G,['Scarborough'],['Woburn'],43.770992,-79.216917
4,M1H,['Scarborough'],['Cedarbrae'],43.773136,-79.239476


#### Get the address of Toronto

In [3]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### draw map

In [4]:
# create map of New York using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [8]:
toronto_data = data[data['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data = toronto_data.iloc[:,1:]
print(toronto_data.shape)
toronto_data.head()

(39, 3)


Unnamed: 0,Neighbourhood,Latitude,Longitude
0,['The Beaches'],43.676357,-79.293031
1,"['The Danforth West', 'Riverdale']",43.679557,-79.352188
2,"['The Beaches West', 'India Bazaar']",43.668999,-79.315572
3,['Studio District'],43.659526,-79.340923
4,['Lawrence Park'],43.72802,-79.38879


In [9]:
CLIENT_ID = 'MW5J5HH04KCYPNEBQVGSXP5L3H1RPMCZ4YMO1YXOJDA0TAKS' # your Foursquare ID
CLIENT_SECRET = 'M1O4YU2OIYB4MZQK2IGEGCXIAKYBHC1I4JDFNIOQCB4J45FS' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MW5J5HH04KCYPNEBQVGSXP5L3H1RPMCZ4YMO1YXOJDA0TAKS
CLIENT_SECRET:M1O4YU2OIYB4MZQK2IGEGCXIAKYBHC1I4JDFNIOQCB4J45FS


In [10]:
# The following function retrieves the venues given the names and coordinates and stores it into dataframe.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
toronto_neighborhoods = toronto_data
toronto_venues = getNearbyVenues(names = toronto_neighborhoods['Neighbourhood'], latitudes = toronto_neighborhoods['Latitude'], longitudes = toronto_neighborhoods['Longitude'])

['The Beaches']
['The Danforth West', 'Riverdale']
['The Beaches West', 'India Bazaar']
['Studio District']
['Lawrence Park']
['Davisville North']
['North Toronto West']
['Davisville']
['Moore Park', 'Summerhill East']
['Deer Park', 'Forest Hill SE', 'Rathnelly', 'South Hill', 'Summerhill West']
['Rosedale']
['Cabbagetown', 'St. James Town']
['Church and Wellesley']
['Harbourfront']
['Ryerson', 'Garden District']
['St. James Town']
['Berczy Park']
['Central Bay Street']
['Adelaide', 'King', 'Richmond']
['Harbourfront East', 'Toronto Islands', 'Union Station']
['Design Exchange', 'Toronto Dominion Centre']
['Commerce Court', 'Victoria Hotel']
['Roselawn']
['Forest Hill North', 'Forest Hill West']
['The Annex', 'North Midtown', 'Yorkville']
['Harbord', 'University of Toronto']
['Chinatown', 'Grange Park', 'Kensington Market']
['CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara']
['Stn A PO Boxes 25 The Esplanade']
['Fir

In [12]:
print(toronto_venues.shape)
toronto_venues.head()

(862, 7)


Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,['The Beaches'],43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,['The Beaches'],43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,['The Beaches'],43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,['The Beaches'],43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,['The Beaches'],43.676357,-79.293031,Seaspray Restaurant,43.678888,-79.298167,Asian Restaurant


#### Count of venues were returned for each Borough

In [13]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"[""Queen's Park""]",30,30,30,30,30,30
"['Adelaide', 'King', 'Richmond']",30,30,30,30,30,30
['Berczy Park'],30,30,30,30,30,30
"['Brockton', 'Exhibition Place', 'Parkdale Village']",23,23,23,23,23,23
['Business Reply Mail Processing Centre 969 Eastern'],16,16,16,16,16,16
"['CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara']",16,16,16,16,16,16
"['Cabbagetown', 'St. James Town']",30,30,30,30,30,30
['Central Bay Street'],30,30,30,30,30,30
"['Chinatown', 'Grange Park', 'Kensington Market']",30,30,30,30,30,30
['Christie'],19,19,19,19,19,19


In [14]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 190 uniques categories.


### Analyze each neighborhood

In [15]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,['The Beaches'],0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,['The Beaches'],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,['The Beaches'],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,['The Beaches'],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,['The Beaches'],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
toronto_onehot.shape

(862, 191)

In [17]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"[""Queen's Park""]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333
1,"['Adelaide', 'King', 'Richmond']",0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
2,['Berczy Park'],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
3,"['Brockton', 'Exhibition Place', 'Parkdale Vil...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
4,['Business Reply Mail Processing Centre 969 Ea...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"['CN Tower', 'Bathurst Quay', 'Island airport'...",0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"['Cabbagetown', 'St. James Town']",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,['Central Bay Street'],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"['Chinatown', 'Grange Park', 'Kensington Market']",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.033333,0.0
9,['Christie'],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
toronto_grouped.shape

(39, 191)

In [19]:
num_top_venues = 5
for neigh in toronto_grouped['Neighbourhood']:
    print("----"+neigh+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == neigh].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----["Queen's Park"]----
         venue  freq
0  Coffee Shop  0.13
1         Park  0.07
2          Gym  0.07
3        Diner  0.03
4         Café  0.03


----['Adelaide', 'King', 'Richmond']----
                venue  freq
0          Steakhouse  0.10
1                Café  0.10
2  Seafood Restaurant  0.07
3               Hotel  0.07
4    Asian Restaurant  0.07


----['Berczy Park']----
            venue  freq
0     Coffee Shop  0.10
1        Beer Bar  0.07
2            Café  0.07
3  Farmers Market  0.07
4    Cocktail Bar  0.07


----['Brockton', 'Exhibition Place', 'Parkdale Village']----
            venue  freq
0  Breakfast Spot  0.09
1     Coffee Shop  0.09
2       Nightclub  0.09
3            Café  0.09
4          Bakery  0.04


----['Business Reply Mail Processing Centre 969 Eastern']----
                venue  freq
0          Comic Shop  0.06
1                 Spa  0.06
2       Garden Center  0.06
3              Garden  0.06
4  Light Rail Station  0.06


----['CN Tower', 'Bathurst 

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape

(39, 11)

#### neighbourhood clustering

In [21]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:10])
print(len(kmeans.labels_))

[2 2 2 2 2 0 2 2 2 2]
39


In [22]:
toronto_neighborhoods.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,['The Beaches'],43.676357,-79.293031
1,"['The Danforth West', 'Riverdale']",43.679557,-79.352188
2,"['The Beaches West', 'India Bazaar']",43.668999,-79.315572
3,['Studio District'],43.659526,-79.340923
4,['Lawrence Park'],43.72802,-79.38879


In [23]:
toronto_merged = toronto_neighborhoods

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,['The Beaches'],43.676357,-79.293031,2,Trail,Neighborhood,Health Food Store,Pub,Asian Restaurant,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,"['The Danforth West', 'Riverdale']",43.679557,-79.352188,2,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Bakery,Cosmetics Shop,Coffee Shop,Pizza Place,Dessert Shop,Diner,Pub
2,"['The Beaches West', 'India Bazaar']",43.668999,-79.315572,2,Italian Restaurant,Brewery,Burger Joint,Burrito Place,Ice Cream Shop,Light Rail Station,Fast Food Restaurant,Fish & Chips Shop,Steakhouse,Liquor Store
3,['Studio District'],43.659526,-79.340923,2,Café,Coffee Shop,Italian Restaurant,Bakery,Coworking Space,Park,Middle Eastern Restaurant,Pet Store,Cheese Shop,Stationery Store
4,['Lawrence Park'],43.72802,-79.38879,2,Park,Bus Line,Swim School,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'],kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters