# <span style="color:purple">Segmenting and Clustering Neighborhoods in Toronto</span>

### <span style="color:darkred">Importing packages</span>

In [1]:
# Importing general packages
import pandas as pd
import dfply

from dfply import *
import numpy as np

# Importing packages for visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
import seaborn as sns

# Importing packages for scraping
#!conda install -c conda-forge wikipedia --yes 
import wikipedia
#!conda install -c conda-forge requests --yes 
import requests
#!conda install -c conda-forge bs4 --yes 
from bs4 import BeautifulSoup

# Importing packages for handling geospatial data
#!conda install -c conda-forge geocoder --yes 
import geocoder
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
import folium # map rendering library

# Packages for hiding sensitive data
from IPython.display import HTML

# Importing k-means for the clustering stage
from sklearn.cluster import KMeans

### <span style="color:darkred">Scraping wiki page for data about Canada's Boroughs</span>

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever if the server does not answer.
html = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)

# Fail right here on an HTTP error (4xx/5xx) instead of parsing an error page below
html.raise_for_status()

# Turn the HTML into a BeautifulSoup object (parsed with the 'lxml' parser)
bs = BeautifulSoup(html.text, 'lxml')

In [3]:
# Character sequences that custom_replace() removes from a string
REPLACE_SEQUENCES = ['\n']


def custom_replace(s):
    """Return `s` without any of the REPLACE_SEQUENCES and without
    leading/trailing whitespace.

    Every sequence listed in REPLACE_SEQUENCES is removed wherever it occurs
    in the string; the result is then stripped.
    """
    cleaned = s
    for unwanted in REPLACE_SEQUENCES:
        cleaned = cleaned.replace(unwanted, '')
    cleaned = cleaned.strip()
    return cleaned

# Collect the cleaned text of every table cell (<td>) found on the page
n_list = [custom_replace(cell.get_text()) for cell in bs.find_all(name='td')]

# The list is cut right after the last 'Not assigned' cell: the cells that
# follow it are treated as not being part of the postcode table.
matches = [i for i, x in enumerate(n_list) if x == 'Not assigned']
if not matches:
    raise ValueError("No 'Not assigned' cell found - the page layout is not the expected one")
n_list = n_list[:matches[-1] + 1]

# Every table row is a (Postcode, Borough, Neighbourhood) triple, so the number
# of cells has to be a multiple of 3 before it can be reshaped into rows.
columns = ['Postcode', 'Borough', 'Neighbourhood']
if len(n_list) % len(columns) != 0:
    raise ValueError(
        'Expected a multiple of {} table cells, got {}'.format(len(columns), len(n_list))
    )

# Creating DataFrame from the list
n_list = np.array(n_list)
df_n = pd.DataFrame(n_list.reshape(-1, len(columns)), columns=columns)

# Dropping those rows where Borough is not assigned (and renumbering the rest)
df_n = df_n[df_n['Borough'] != 'Not assigned'].reset_index(drop=True)

# Note: rows keep the order in which the postcodes appear on the page; no
# sorting is applied.

# A Borough without an assigned Neighbourhood uses the Borough name instead.
# .loc is used so the assignment is written to df_n itself (no chained indexing).
no_neighbourhood = df_n['Neighbourhood'] == 'Not assigned'
df_n.loc[no_neighbourhood, 'Neighbourhood'] = df_n.loc[no_neighbourhood, 'Borough']

# Combining the rows that share a Postcode into one row: the Borough of the
# first row is kept and the Neighbourhood names are joined with ', ' in the
# order they appear. sort=False keeps the postcodes in first-appearance order.
df_n = df_n.groupby('Postcode', sort=False, as_index=False).agg(
    {'Borough': 'first', 'Neighbourhood': ', '.join}
)

#Checking the dataframe
df_n

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [4]:
# Show (number of rows, number of columns) of the postcode table
print((len(df_n.index), len(df_n.columns)))

(103, 3)


### <span style="color:darkred">Adding geospatial data</span>

In [5]:
# Load the latitude/longitude table and rename its "Postal Code" column to
# "Postcode" so that it carries the same key name as df_n.
df_gsp = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={"Postal Code": "Postcode"})
)
df_gsp.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Adding Lat and Long to df (left join on Postcode).
# Columns that come from df_gsp are dropped first if they are already present,
# so that running this cell a second time does not produce suffixed duplicates
# (Latitude_x / Latitude_y) that would break the cells below.
coordinate_columns = [col for col in df_gsp.columns if col != 'Postcode']
df_n = df_n.drop(columns=coordinate_columns, errors='ignore') >> left_join(df_gsp, by="Postcode")
df_n

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


### <span style="color:darkred"> Adding the most common venue categories in each neighborhood in Toronto </span>

In [7]:
# Foursquare API settings. The credentials are typed in at run time
# (getpass does not echo the input), so they never appear in the notebook.
import getpass

# Values used when building the Foursquare request in getNearbyVenues
VERSION = '20180605'
LIMIT = 100

CLIENT_ID = getpass.getpass(prompt='Enter your Foursquare CLIENT_ID')
CLIENT_SECRET = getpass.getpass(prompt='Enter your Foursquare CLIENT_SECRET')

print('Your credentials are stored')

Enter your Foursquare CLIENT_ID········
Enter your Foursquare CLIENT_SECRET········
Your credentials are stored


### <span style="color:darkred"> Filtering neighbourhoods that belong to Toronto </span>

In [8]:
# Keep only the rows whose Borough name contains 'Toronto'
# (a missing Borough value counts as "no match")
is_toronto_borough = df_n['Borough'].str.contains('Toronto', na=False)
df_n_tor = df_n[is_toronto_borough.values]
df_n_tor

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
31,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
36,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752


In [9]:
# Defining function for using the API of Foursquare
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch the venues Foursquare recommends around each given location.

    One GET request is sent to the Foursquare ``venues/explore`` endpoint per
    (name, latitude, longitude) triple. The module-level CLIENT_ID,
    CLIENT_SECRET, VERSION and LIMIT values are sent with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Location names and their coordinates; the three iterables are zipped
        together, one request per triple.
    radius : int, default 500
        Forwarded unchanged as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns Neighbourhood,
        Neighbourhood_Latitude, Neighbourhood_Longitude, Venue,
        Venue_Latitude, Venue_Longitude and Venue_Category. The frame has
        these columns even when no venue was returned at all.
        Venue_Category is None for a venue that has no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighbourhood',
               'Neighbourhood_Latitude',
               'Neighbourhood_Longitude',
               'Venue',
               'Venue_Latitude',
               'Venue_Longitude',
               'Venue_Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Query parameters are passed separately so that requests encodes them
        # and the credentials are not formatted into a URL string by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout avoids hanging on a dead connection
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information of each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come without any category
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names here (instead of assigning .columns afterwards)
    # also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [10]:
# Collect the nearby venues of every Toronto neighbourhood (radius 500)
df_Tor_venues = getNearbyVenues(
    names=df_n_tor['Neighbourhood'],
    latitudes=df_n_tor['Latitude'],
    longitudes=df_n_tor['Longitude'],
    radius=500,
)

df_Tor_venues.head(10)

Unnamed: 0,Neighbourhood,Neighbourhood_Latitude,Neighbourhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
5,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
6,"Harbourfront, Regent Park",43.65426,-79.360636,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
7,"Harbourfront, Regent Park",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
8,"Harbourfront, Regent Park",43.65426,-79.360636,Corktown Common,43.655618,-79.356211,Park
9,"Harbourfront, Regent Park",43.65426,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site


### <span style="color:darkred"> Encoding df containing info about nearby venues </span>

In [89]:
# One indicator column per venue category (column name = category name)
toronto_onehot = pd.get_dummies(df_Tor_venues[['Venue_Category']], prefix="", prefix_sep="")

# Append the neighbourhood of each venue, then rotate that last column to the front
toronto_onehot['Neighbourhood'] = df_Tor_venues['Neighbourhood']
onehot_columns = list(toronto_onehot.columns)
toronto_onehot = toronto_onehot[onehot_columns[-1:] + onehot_columns[:-1]]

# Mean of the indicators per neighbourhood = relative frequency of each category
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074074
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.133333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,...,0.0,0.0,0.0,0.011628,0.0,0.0,0.011628,0.0,0.0,0.011628
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.06,0.0,0.04,0.01,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,0.011494,0.0,...,0.0,0.0,0.0,0.0,0.011494,0.0,0.0,0.011494,0.0,0.022989


### <span style="color:darkred"> Checking the most frequent venues by Neighbourhoods </span>

In [90]:
# Number of venues per (Neighbourhood, Venue_Category) pair, ordered by
# neighbourhood name and, within a neighbourhood, by descending count.
df_Tor_venues_grouped = (
    df_Tor_venues
    >> group_by(X.Neighbourhood, X.Venue_Category)
    >> summarize(Count=X.Venue_Category.count())
    >> select(X.Neighbourhood, X.Venue_Category, X.Count)
).sort_values(by=['Neighbourhood', 'Count'], ascending=[True, False])
df_Tor_venues_grouped

Unnamed: 0,Neighbourhood,Venue_Category,Count
13,"Adelaide, King, Richmond",Coffee Shop,7
12,"Adelaide, King, Richmond",Café,5
5,"Adelaide, King, Richmond",Bar,4
58,"Adelaide, King, Richmond",Thai Restaurant,4
0,"Adelaide, King, Richmond",American Restaurant,3
3,"Adelaide, King, Richmond",Asian Restaurant,3
10,"Adelaide, King, Richmond",Burger Joint,3
27,"Adelaide, King, Richmond",Hotel,3
48,"Adelaide, King, Richmond",Restaurant,3
56,"Adelaide, King, Richmond",Steakhouse,3


In [91]:
# Selecting the (at most) 10 most frequent Venue Categories of each Neighbourhood.
# df_Tor_venues_grouped is already ordered by Neighbourhood and descending
# Count, so the first rows of every group are its most frequent categories.
# groupby(...).head(n) keeps the original row order and index, and returns all
# rows of a neighbourhood that has fewer than n categories.
# (DataFrame.append, used here before, no longer exists in pandas >= 2.0.)
top_n_categories = 10
df_venue = df_Tor_venues_grouped.groupby('Neighbourhood', sort=False).head(top_n_categories)
df_venue

Unnamed: 0,Neighbourhood,Venue_Category,Count
13,"Adelaide, King, Richmond",Coffee Shop,7
12,"Adelaide, King, Richmond",Café,5
5,"Adelaide, King, Richmond",Bar,4
58,"Adelaide, King, Richmond",Thai Restaurant,4
0,"Adelaide, King, Richmond",American Restaurant,3
3,"Adelaide, King, Richmond",Asian Restaurant,3
10,"Adelaide, King, Richmond",Burger Joint,3
27,"Adelaide, King, Richmond",Hotel,3
48,"Adelaide, King, Richmond",Restaurant,3
56,"Adelaide, King, Richmond",Steakhouse,3


In [92]:
# Adding geo data to the data frame.
# The venues were collected for the Toronto rows only (df_n_tor), so the lookup
# table is restricted to those rows and to one row per neighbourhood name; a
# name that also occurs elsewhere in df_n can then neither duplicate venue rows
# nor attach the data of a non-Toronto borough.
neighbourhood_geo = df_n_tor.drop_duplicates(subset='Neighbourhood')

# Columns supplied by the lookup table are dropped first if they already exist,
# so that re-running this cell does not create suffixed duplicates (_x / _y).
geo_columns = [col for col in neighbourhood_geo.columns if col != 'Neighbourhood']
df_venue = df_venue.drop(columns=geo_columns, errors='ignore') >> left_join(neighbourhood_geo, by="Neighbourhood")
df_venue


Unnamed: 0,Neighbourhood,Venue_Category,Count,Postcode,Borough,Latitude,Longitude
0,"Adelaide, King, Richmond",Coffee Shop,7,M5H,Downtown Toronto,43.650571,-79.384568
1,"Adelaide, King, Richmond",Café,5,M5H,Downtown Toronto,43.650571,-79.384568
2,"Adelaide, King, Richmond",Bar,4,M5H,Downtown Toronto,43.650571,-79.384568
3,"Adelaide, King, Richmond",Thai Restaurant,4,M5H,Downtown Toronto,43.650571,-79.384568
4,"Adelaide, King, Richmond",American Restaurant,3,M5H,Downtown Toronto,43.650571,-79.384568
5,"Adelaide, King, Richmond",Asian Restaurant,3,M5H,Downtown Toronto,43.650571,-79.384568
6,"Adelaide, King, Richmond",Burger Joint,3,M5H,Downtown Toronto,43.650571,-79.384568
7,"Adelaide, King, Richmond",Hotel,3,M5H,Downtown Toronto,43.650571,-79.384568
8,"Adelaide, King, Richmond",Restaurant,3,M5H,Downtown Toronto,43.650571,-79.384568
9,"Adelaide, King, Richmond",Steakhouse,3,M5H,Downtown Toronto,43.650571,-79.384568


In [93]:
### <span style="color:darkred"> Clustering neighbourhoods based on most frequent venue types </span>

In [94]:
# set number of clusters
kclusters = 4

# Feature matrix for clustering: the category frequencies without the name column.
# `columns=` is used because the positional axis argument of DataFrame.drop
# is no longer accepted by pandas >= 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering
# n_init is set explicitly (10 was the long-standing default) so that the
# result does not depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3,
       0, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3])

In [95]:
# One row per neighbourhood with its k-means label in the 'Clusters' column
# (kmeans.labels_ is in the same row order as toronto_grouped)
toronto_clusters = toronto_grouped[['Neighbourhood']].assign(Clusters=kmeans.labels_)
toronto_clusters

Unnamed: 0,Neighbourhood,Clusters
0,"Adelaide, King, Richmond",3
1,Berczy Park,3
2,"Brockton, Exhibition Place, Parkdale Village",3
3,Business Reply Mail Processing Centre 969 Eastern,3
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",3
5,"Cabbagetown, St. James Town",3
6,Central Bay Street,3
7,"Chinatown, Grange Park, Kensington Market",3
8,Christie,3
9,Church and Wellesley,3


In [96]:
# Adding clusters to Df venue.
# A 'Clusters' column left over from an earlier run (e.g. after re-running the
# clustering with another kclusters) is removed first; otherwise the join would
# create Clusters_x / Clusters_y and the cells below would not find X.Clusters.
df_venue = df_venue.drop(columns='Clusters', errors='ignore') >> left_join(toronto_clusters, by="Neighbourhood")
df_venue

Unnamed: 0,Neighbourhood,Venue_Category,Count,Postcode,Borough,Latitude,Longitude,Clusters
0,"Adelaide, King, Richmond",Coffee Shop,7,M5H,Downtown Toronto,43.650571,-79.384568,3
1,"Adelaide, King, Richmond",Café,5,M5H,Downtown Toronto,43.650571,-79.384568,3
2,"Adelaide, King, Richmond",Bar,4,M5H,Downtown Toronto,43.650571,-79.384568,3
3,"Adelaide, King, Richmond",Thai Restaurant,4,M5H,Downtown Toronto,43.650571,-79.384568,3
4,"Adelaide, King, Richmond",American Restaurant,3,M5H,Downtown Toronto,43.650571,-79.384568,3
5,"Adelaide, King, Richmond",Asian Restaurant,3,M5H,Downtown Toronto,43.650571,-79.384568,3
6,"Adelaide, King, Richmond",Burger Joint,3,M5H,Downtown Toronto,43.650571,-79.384568,3
7,"Adelaide, King, Richmond",Hotel,3,M5H,Downtown Toronto,43.650571,-79.384568,3
8,"Adelaide, King, Richmond",Restaurant,3,M5H,Downtown Toronto,43.650571,-79.384568,3
9,"Adelaide, King, Richmond",Steakhouse,3,M5H,Downtown Toronto,43.650571,-79.384568,3


In [97]:
# Frame used for the map: one row per neighbourhood (the first one found),
# without the per-category columns, renumbered from 0.
df_toronto = (
    df_venue
    .drop(columns=['Venue_Category', 'Count'])
    .drop_duplicates(subset=['Neighbourhood'])
    .reset_index(drop=True)
)
df_toronto

Unnamed: 0,Neighbourhood,Postcode,Borough,Latitude,Longitude,Clusters
0,"Adelaide, King, Richmond",M5H,Downtown Toronto,43.650571,-79.384568,3
1,Berczy Park,M5E,Downtown Toronto,43.644771,-79.373306,3
2,"Brockton, Exhibition Place, Parkdale Village",M6K,West Toronto,43.636847,-79.428191,3
3,Business Reply Mail Processing Centre 969 Eastern,M7Y,East Toronto,43.662744,-79.321558,3
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",M5V,Downtown Toronto,43.628947,-79.39442,3
5,"Cabbagetown, St. James Town",M4X,Downtown Toronto,43.667967,-79.367675,3
6,Central Bay Street,M5G,Downtown Toronto,43.657952,-79.387383,3
7,"Chinatown, Grange Park, Kensington Market",M5T,Downtown Toronto,43.653206,-79.400049,3
8,Christie,M6G,Downtown Toronto,43.669542,-79.422564,3
9,Church and Wellesley,M4Y,Downtown Toronto,43.66586,-79.38316,3


### <span style="color:darkred"> Mapping clusters </span>

In [98]:
# Finding geospatial data of Toronto
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() gives back None when nothing matches the address; report that
# clearly instead of failing on `.latitude` below.
if location is None:
    raise ValueError('No geocoding result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronot are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronot are 43.653963, -79.387207.


In [99]:
# create map centred on the coordinates found for Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per neighbourhood to the map
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['Clusters']):
    # The labels in df_toronto['Clusters'] start at 0, so they index the colour
    # list directly (label k -> rainbow[k]); `cluster - 1` would send label 0
    # to rainbow[-1], i.e. wrap around to the last colour.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### <span style="color:darkred"> Exploring Clusters </span>

In [100]:
# Cluster 1 (label 0): distinct venue categories among the rows of df_venue
# that belong to this cluster
df_venue.loc[df_venue['Clusters'] == 0, ['Venue_Category']].drop_duplicates(subset=['Venue_Category'])

Unnamed: 0,Venue_Category
215,Bus Line
216,Park
217,Swim School


In [101]:
# Cluster 2 (label 1): distinct venue categories among the rows of df_venue
# that belong to this cluster
df_venue.loc[df_venue['Clusters'] == 1, ['Venue_Category']].drop_duplicates(subset=['Venue_Category'])

Unnamed: 0,Venue_Category
170,Bus Line
171,Jewelry Store
172,Park
173,Sushi Restaurant
174,Trail
253,Building
254,Playground
318,Health Food Store
319,Neighborhood
320,Other Great Outdoors


In [102]:
# Cluster 3 (label 2): distinct venue categories among the rows of df_venue
# that belong to this cluster
df_venue.loc[df_venue['Clusters'] == 2, ['Venue_Category']].drop_duplicates(subset=['Venue_Category'])

Unnamed: 0,Venue_Category
256,Garden
257,Home Service


In [103]:
# Cluster 4 (label 3): distinct venue categories among the rows of df_venue
# that belong to this cluster
df_venue.loc[df_venue['Clusters'] == 3, ['Venue_Category']].drop_duplicates(subset=['Venue_Category'])

Unnamed: 0,Venue_Category
0,Coffee Shop
1,Café
2,Bar
3,Thai Restaurant
4,American Restaurant
5,Asian Restaurant
6,Burger Joint
7,Hotel
8,Restaurant
9,Steakhouse
