In [1]:
# Install libraries
!pip install geopy
!pip install folium


Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 6.6 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


### Load the Toronto postal code data from Wikipedia, clean it, and add coordinates from a CSV file

In [100]:
import json # to handle JSON
import os # to read API credentials from the environment

import folium # for maps
import matplotlib as mpl # plotting
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests # for handling requests
from geopy.geocoders import Nominatim # For getting lat/lon coordinates for an address
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans # For clustering


# Read the first table on the Wikipedia page. Cells that read 'Not assigned'
# are parsed as NaN so that rows without a borough can be dropped.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url, na_values=['Not assigned'])
df = dfs[0].dropna(subset=['Borough']).reset_index(drop=True)

# Latitude / longitude per postal code.
geo_url = 'https://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_url)

# Join (pandas' default inner join) the coordinates onto the postal codes and
# rename the neighbourhood column to the spelling used in the rest of the notebook.
df_toronto = (
    pd.merge(left=df, right=geo_df, on='Postal Code')
    .rename(columns={'Neighbourhood': 'Neighborhood'})
)
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Create the map of Toronto neighborhoods

In [101]:
# Get coordinates for Toronto
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
toronto_lat = location.latitude
toronto_lon = location.longitude

# Create map centred on the geocoded coordinates
map_toronto = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=12)

# One circle marker per postal code; the popup shows "<postal code> <neighborhood>".
for lat, lon, postalcode, neighborhood in zip(
        df_toronto['Latitude'],
        df_toronto['Longitude'],
        df_toronto['Postal Code'],
        df_toronto['Neighborhood']):
    popup = folium.Popup(postalcode + ' ' + neighborhood, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto



### Get Foursquare Data

In [116]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in (or committed with) the notebook. Any key that was previously
# hardcoded here should be treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before requesting venues')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # value sent as the `limit` query parameter of each request

def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    names : iterable
        Label of each location; printed as progress output and stored in the
        'Postal Code' column of the result.
    latitudes, longitudes : iterable
        Coordinates of each location, parallel to ``names``.
    radius : int, default 500
        Sent as the ``radius`` query parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal Code',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. A location
        with no venues contributes a single placeholder row whose 'Venue' and
        'Venue Category' are both 'REMOVE'. Empty input yields an empty frame
        that still carries these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Postal Code',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for i, (name, lat, lng) in enumerate(zip(names, latitudes, longitudes), start=1):
        print('{}: {}'.format(i, name))

        # Let requests build and encode the query string instead of formatting
        # credentials and coordinates into the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Surface auth/quota problems as an HTTP error rather than a KeyError
        # while digging through an error payload.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        if not results:
            # Placeholder row so the location is still present downstream.
            rows.append((name, lat, lng, 'REMOVE', lat, lng, 'REMOVE'))
            print('No results')
            continue

        for item in results:
            venue = item['venue']
            # Guard against a venue that comes back with no category entries.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else 'Uncategorized'
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing `columns` at construction keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Collect the nearby venues for every Toronto postal code, keyed by the
# postal code itself and located by its latitude / longitude.
toronto_venues = getNearbyVenues(
    names=df_toronto['Postal Code'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
)


1: M3A
2: M4A
3: M5A
4: M6A
5: M7A
6: M9A
No results
7: M1B
8: M3B
9: M4B
10: M5B
11: M6B
12: M9B
13: M1C
14: M3C
15: M4C
16: M5C
17: M6C
18: M9C
19: M1E
20: M4E
21: M5E
22: M6E
23: M1G
24: M4G
25: M5G
26: M6G
27: M1H
28: M2H
29: M3H
30: M4H
31: M5H
32: M6H
33: M1J
34: M2J
35: M3J
36: M4J
37: M5J
38: M6J
39: M1K
40: M2K
41: M3K
42: M4K
43: M5K
44: M6K
45: M1L
46: M2L
No results
47: M3L
48: M4L
49: M5L
50: M6L
51: M9L
52: M1M
53: M2M
54: M3M
55: M4M
56: M5M
57: M6M
58: M9M
59: M1N
60: M2N
61: M3N
62: M4N
63: M5N
64: M6N
65: M9N
66: M1P
67: M2P
68: M4P
69: M5P
70: M6P
71: M9P
72: M1R
73: M2R
74: M4R
75: M5R
76: M6R
77: M7R
78: M9R
79: M1S
80: M4S
81: M5S
82: M6S
83: M1T
84: M4T
85: M5T
86: M1V
87: M4V
88: M5V
89: M8V
90: M9V
91: M1W
92: M4W
93: M5W
94: M8W
95: M9W
96: M1X
No results
97: M4X
98: M5X
99: M8X
100: M4Y
101: M7Y
102: M8Y
103: M8Z


### Venues Analysis

Finding top 10 venue types in each neighborhood

In [129]:
# One-hot encode the venue categories: one indicator column per category,
# named exactly after the category (no prefix).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix='', prefix_sep='')

# Place the postal code in the first column, ahead of the category indicators.
toronto_onehot.insert(0, 'Postal Code', toronto_venues['Postal Code'])

# Averaging the indicators per postal code gives the relative frequency of
# each venue category within that postal code.
toronto_grpd = (
    toronto_onehot
    .groupby('Postal Code')
    .mean()
    .reset_index()
)

def most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (callers pass rows whose first entry
    is the postal code); the remaining entries are ordered from largest to
    smallest value and the index labels of the first ``num_top_venues`` are
    returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

# How many of the most frequent venue categories to keep per postal code.
top = 10

# Ordinal suffixes for ranks 1-3; every later rank gets 'th'.
# NOTE: this simple rule is only correct for top <= 20 (it would produce '21th').
indicators = ['st', 'nd', 'rd']

# Column labels: 'Postal Code', '1st Most Common Venue', '2nd ...', ..., '10th ...'.
# An explicit bounds check replaces the former bare `except:`, which would
# also have hidden unrelated errors.
columns = ['Postal Code']
for ind in range(top):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Rank the categories of every grouped row, then build the frame in one go
# instead of assigning into it row by row.
top_venue_rows = [
    most_common_venues(toronto_grpd.iloc[ind, :], top)
    for ind in range(toronto_grpd.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Postal Code', toronto_grpd['Postal Code'])

neighborhoods_venues_sorted.shape


(103, 11)

### Clustering Neighborhoods based on Venue Category Frequencies

Using the k-means clustering method with 5 clusters

In [130]:
# Number of clusters
kclusters = 5

# Feature matrix: the per-postal-code category frequencies without the
# 'Postal Code' label column. `columns=` is used because newer pandas no
# longer accepts the axis as a positional argument.
toronto_grouped_clustering = toronto_grpd.drop(columns='Postal Code')

# Run k-means clustering. n_init is pinned to the historical default of 10;
# newer scikit-learn releases changed the default to 'auto', which could
# otherwise alter the resulting clusters.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Attach the labels as the first column. Drop a label column left over from a
# previous run first, so that re-executing this cell does not fail on insert().
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and top venues to the postal-code / coordinate table.
toronto = df_toronto.join(neighborhoods_venues_sorted.set_index('Postal Code'), on='Postal Code')
toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Food & Drink Shop,Park,Construction & Landscaping,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Hockey Arena,Pizza Place,Coffee Shop,Portuguese Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,0,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Brewery,Shoe Store,Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Clothing Store,Furniture / Home Store,Accessories Store,Athletics & Sports,Event Space,Miscellaneous Shop,Coffee Shop,Boutique,Women's Store,Vietnamese Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Yoga Studio,Discount Store,Italian Restaurant,Japanese Restaurant,Beer Bar,Fast Food Restaurant,Smoothie Shop,Sandwich Place
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,2,Park,River,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,0,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Fast Food Restaurant,Gay Bar,Restaurant,Yoga Studio,Café,Pub,Hotel
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,0,Light Rail Station,Brewery,Farmers Market,Butcher,Auto Workshop,Burrito Place,Garden,Garden Center,Fast Food Restaurant,Restaurant
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,0,Baseball Field,Yoga Studio,Drugstore,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Dim Sum Restaurant


In [126]:
# Show the raw k-means label of every row that was clustered.
# NOTE(review): the saved output below lists labels 0-7, so it comes from a run
# with more clusters than the current kclusters = 5; re-run the cell to refresh it.
kmeans.labels_

array([7, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 5, 0, 1, 1, 5, 2,
       1, 6, 1, 2, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 4, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 1], dtype=int32)

### Plot Clustering Results on map

In [131]:
map_clusters = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=11)

# Colour scheme: one evenly spaced rainbow colour per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Rows without a cluster label (NaN after the join) cannot be coloured, and a
# NaN would also turn the label column into floats -- skip such rows and cast
# the label to int before using it as a list index.
clustered = toronto.dropna(subset=['Cluster Labels'])

# Add one marker per neighborhood, coloured by its cluster.
for lat, lon, poi, cluster in zip(
        clustered['Latitude'],
        clustered['Longitude'],
        clustered['Neighborhood'],
        clustered['Cluster Labels']):
    cluster = int(cluster)
    popup = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette directly by the label (labels start at 0) instead of
    # `cluster-1`, which only worked for cluster 0 through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters