In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
# Source page for the Toronto ("M") postal codes; read_html returns one
# DataFrame per HTML table found on the page.
WIKI_POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables_list = pd.read_html(WIKI_POSTAL_CODES_URL)

In [3]:
# Work with the first table parsed from the page; show its size and first rows.
df = tables_list[0]
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Lower-case both text columns so later string comparisons (for example
# against 'not assigned') do not depend on capitalisation.
for text_column in ('Borough', 'Neighborhood'):
    df[text_column] = df[text_column].str.lower()

### remove rows whose Borough is 'not assigned'

In [5]:
# Keep only rows that have a real borough (the placeholder is lower-case
# because the text columns were lower-cased beforehand). `.copy()` makes the
# result an independent frame, so assigning to its columns later does not
# raise SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'not assigned'].copy()

In [6]:
# Size and first ten rows of the table after the borough filter.
print(df.shape)
df.head(10)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,north york,parkwoods
3,M4A,north york,victoria village
4,M5A,downtown toronto,"regent park, harbourfront"
5,M6A,north york,"lawrence manor, lawrence heights"
6,M7A,downtown toronto,"queen's park, ontario provincial government"
8,M9A,etobicoke,"islington avenue, humber valley village"
9,M1B,scarborough,"malvern, rouge"
11,M3B,north york,don mills
12,M4B,east york,"parkview hill, woodbine gardens"
13,M5B,downtown toronto,"garden district, ryerson"


### checking if multiple rows share the same Postal Code

In [7]:
# True when at least one postal code is repeated, i.e. the number of distinct
# codes differs from the number of rows.
flag = df['Postal Code'].nunique() != df.shape[0]

print(flag)


False


### checking if unassigned neighborhoods exist in the table

In [8]:
# True when any remaining row still carries the 'not assigned' placeholder as
# its neighborhood. bool() keeps `flag` a plain Python bool.
flag = bool((df['Neighborhood'] == 'not assigned').any())

print(flag)

False


In [9]:
# Row and column count of the cleaned postal-code table.
df.shape

(103, 3)

### grabbing geo data

In [10]:
# CSV with one latitude/longitude pair per postal code.
GEO_DATA_URL = 'https://cocl.us/Geospatial_data'
geodf = pd.read_csv(GEO_DATA_URL)

In [11]:
# Size and first rows of the coordinates table.
print(geodf.shape)
geodf.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# dtype of the join key in the coordinates table
geodf.dtypes['Postal Code']

dtype('O')

In [13]:
# Give the join key the same dtype on both sides before merging.
# astype({...}) returns a new frame instead of assigning into `df`, which is a
# filtered slice of the scraped table; in-place column assignment on such a
# slice raises SettingWithCopyWarning.
df = df.astype({'Postal Code': 'object'})
geodf = geodf.astype({'Postal Code': 'object'})

In [14]:
# Attach coordinates to every postal code in `df`.
# - how='left' keeps exactly the rows (and row order) of `df`; an outer join
#   would also add coordinate-only rows whose Borough/Neighborhood are missing,
#   which breaks the later string filter on Borough.
# - validate='one_to_one' raises MergeError if a postal code is duplicated on
#   either side instead of silently multiplying rows.
fulldf = df.merge(geodf, how='left', on='Postal Code', validate='one_to_one')
fulldf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,north york,parkwoods,43.753259,-79.329656
1,M4A,north york,victoria village,43.725882,-79.315572
2,M5A,downtown toronto,"regent park, harbourfront",43.65426,-79.360636
3,M6A,north york,"lawrence manor, lawrence heights",43.718518,-79.464763
4,M7A,downtown toronto,"queen's park, ontario provincial government",43.662301,-79.389494


In [15]:
import matplotlib.cm as cm
import matplotlib.colors as colors

#!pip install folium
import folium

In [16]:
# Continue on a copy so that `fulldf` keeps the full merged table.
df = fulldf.copy()

In [17]:
# Centre the map on the mean coordinate of all postal codes. Series.mean()
# skips missing values, whereas np.average returns NaN if any are present.
latitude = df['Latitude'].mean()
longitude = df['Longitude'].mean()

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postal code, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    # parse_html belongs to Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is passed here only.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map

### foursquare

In [18]:
import os
import warnings

# Foursquare credentials are read from the environment so that secrets are
# never stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605'  # sent as the `v` query parameter of each API request

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'Foursquare credentials are not set: export FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET, otherwise the venue requests will fail.'
    )

### filter 'toronto'

In [19]:
# Keep only boroughs whose name contains 'toronto'. na=False treats a missing
# Borough as "no match"; without it a missing value makes the mask non-boolean
# and the row selection raises.
df = df[df['Borough'].str.contains('toronto', na=False)]
print(df.shape)
df.head()

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,downtown toronto,"regent park, harbourfront",43.65426,-79.360636
4,M7A,downtown toronto,"queen's park, ontario provincial government",43.662301,-79.389494
9,M5B,downtown toronto,"garden district, ryerson",43.657162,-79.378937
15,M5C,downtown toronto,st. james town,43.651494,-79.375418
19,M4E,east toronto,the beaches,43.676357,-79.293031


In [20]:
# Re-centre on the Toronto-only rows. Series.mean() skips missing values,
# whereas np.average returns NaN if any are present.
latitude = df['Latitude'].mean()
longitude = df['Longitude'].mean()

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one circle marker per postal code, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    # parse_html belongs to Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is passed here only.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map

In [21]:
import requests
from urllib.request import urlopen

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

In [22]:
# Foursquare "explore" settings: ask for at most LIMIT venues within `radius`
# metres of each neighborhood's coordinates. Only the first venue returned is kept.
LIMIT = 5
radius = 500

EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
REQUEST_TIMEOUT = 10  # seconds; avoids hanging forever on a stalled connection

location_list = []  # collects one entry per neighborhood that has a venue

# The loop variables are deliberately not called `latitude`/`longitude`: those
# module-level names hold the map centre and must not be overwritten here.
for neighborhood, n_lat, n_lng in zip(df.Neighborhood, df.Latitude, df.Longitude):

    # let requests build and escape the query string
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(n_lat, n_lng),
        'radius': radius,
        'limit': LIMIT,
    }
    response = requests.get(EXPLORE_URL, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail with the HTTP error rather than a later KeyError
    data = response.json()

    groups = data.get('response', {}).get('groups') or []
    items = groups[0].get('items', []) if groups else []
    if not items:
        continue  # nothing found near this neighborhood

    venue = items[0]['venue']
    if not venue.get('categories'):
        continue  # the category is the only feature used later; skip venues without one

    # extract info within 'venue'
    name = venue['name']
    lat = venue['location']['lat']
    lon = venue['location']['lng']
    cat = venue['categories'][0]['name']

    # Each entry is a one-element list wrapping the record tuple; the cell that
    # builds the DataFrame flattens this nesting.
    location_list.append([(neighborhood, n_lat, n_lng, name, lat, lon, cat)])

In [23]:
# Flatten the nested `location_list` into one record per row and label the
# columns: neighborhood name and coordinates, then venue name, coordinates
# and category.
venue_records = [record for batch in location_list for record in batch]

temp = pd.DataFrame(venue_records)
temp.columns = [
    'Neighborhood',
    'N_Latitude',
    'N_Longitude',
    'Venue',
    'V_Latitude',
    'V_Longitude',
    'category',
]
temp.head()

Unnamed: 0,Neighborhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,"regent park, harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"queen's park, ontario provincial government",43.662301,-79.389494,Queen's Park,43.663946,-79.39218,Park
2,"garden district, ryerson",43.657162,-79.378937,UNIQLO ユニクロ,43.65591,-79.380641,Clothing Store
3,st. james town,43.651494,-79.375418,Gyu-Kaku Japanese BBQ,43.651422,-79.375047,Japanese Restaurant
4,the beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail


In [24]:
# Report how many venues were collected versus how many neighborhoods were queried.
venue_count = len(temp['Venue'])
neighborhood_count = len(df['Neighborhood'])
summary_template = "{} nearby locations downloaded for {} neighborhood."
print(summary_template.format(venue_count, neighborhood_count))

39 nearby locations downloaded for 39 neighborhood.


In [25]:
# One-hot encode each venue's category, then place the neighborhood name in
# front of the indicator columns.
cat = pd.get_dummies(temp['category'])
df1 = pd.concat((temp.loc[:, ['Neighborhood']], cat), axis='columns')

df1.head()

Unnamed: 0,Neighborhood,Airport,Bakery,Bar,Brewery,Café,Clothing Store,Coffee Shop,Cosmetics Shop,Dessert Shop,...,Mexican Restaurant,Neighborhood.1,Park,Playground,Plaza,Supermarket,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,"regent park, harbourfront",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"queen's park, ontario provincial government",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,"garden district, ryerson",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,st. james town,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,the beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### clustering

In [26]:
# Feature matrix = the one-hot venue-category columns only.
# The name column is removed by position: a venue category can itself be
# called 'Neighborhood', and dropping by label would remove that category
# column together with the name column. A 'label' column left by an earlier
# run of the notebook is excluded as well.
df2 = df1.iloc[:, 1:].drop(columns='label', errors='ignore')

n_group = 6 # we will group neighborhoods into 6 clusters

# run k-means clustering; n_init is pinned because its default differs between
# scikit-learn versions, which would change the labels for the same seed
kmeans = KMeans(n_clusters=n_group, n_init=10, random_state=0).fit(df2)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([5, 1, 5, 3, 4, 5, 5, 5, 5, 5])

In [27]:
# Add the clustering labels as the second column. If the column already exists
# (the cell is being re-run) overwrite it, because DataFrame.insert raises on
# an existing column name.
if 'label' in df1.columns:
    df1['label'] = kmeans.labels_
else:
    df1.insert(1, 'label', kmeans.labels_)

In [28]:
# Preview the one-hot table now that the cluster label sits next to the name.
df1.head() # 'label' generated from k-means included in data frame

Unnamed: 0,Neighborhood,label,Airport,Bakery,Bar,Brewery,Café,Clothing Store,Coffee Shop,Cosmetics Shop,...,Mexican Restaurant,Neighborhood.1,Park,Playground,Plaza,Supermarket,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,"regent park, harbourfront",5,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"queen's park, ontario provincial government",1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,"garden district, ryerson",5,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,st. james town,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,the beaches,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [29]:
# A venue category can itself be called 'Neighborhood', which leaves df1 with
# two columns of that name and makes it unusable as a merge key. Keep the
# first occurrence (the neighborhood name) and rename any later one to
# 'Neighborhood1'. All other columns keep their names, so this works for
# whatever set of categories the API returned.
renamed_columns = []
name_column_seen = False
for column in df1.columns:
    if column == 'Neighborhood':
        renamed_columns.append('Neighborhood1' if name_column_seen else column)
        name_column_seen = True
    else:
        renamed_columns.append(column)
df1.columns = renamed_columns

In [30]:
# Right join on the neighborhood name: every row of df1 (label plus one-hot
# columns) is kept and gains the postal code, borough and coordinates from df.
toronto_merged = df.merge(df1, how='right', on='Neighborhood')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,label,Airport,Bakery,Bar,Brewery,...,Mexican Restaurant,Neighborhood1,Park,Playground,Plaza,Supermarket,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,M5A,downtown toronto,"regent park, harbourfront",43.65426,-79.360636,5,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M7A,downtown toronto,"queen's park, ontario provincial government",43.662301,-79.389494,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,M5B,downtown toronto,"garden district, ryerson",43.657162,-79.378937,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5C,downtown toronto,st. james town,43.651494,-79.375418,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4E,east toronto,the beaches,43.676357,-79.293031,4,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
# create map, centred on the rows being plotted; computing the centre here
# avoids relying on `latitude`/`longitude` values left over from earlier
# cells, which other cells may have reassigned
map_center = [toronto_merged['Latitude'].mean(), toronto_merged['Longitude'].mean()]
map_clusters = folium.Map(location=map_center, zoom_start=12)

# set color scheme for the clusters: one colour per cluster, evenly spaced
# along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, n_group))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['label']):
    cluster = int(cluster)
    # labels run from 0 to n_group - 1, so they index the palette directly
    cluster_color = rainbow[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters