<h1> Segmenting and Clustering Neighborhoods in Toronto </h1>

In [2]:
#imports
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

<h2> Web Scraping </h2>

In [3]:
#Download the Wikipedia page listing the "M" postal codes and parse its HTML.
#A timeout keeps the cell from hanging on a stalled connection, and
#raise_for_status() surfaces HTTP errors instead of parsing an error page.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs(response.text, 'html.parser')

In [4]:
#Scrape PostalCode / Borough / Neighborhood out of the first <tbody> of the page.
#`df` starts as a dict of column lists and is replaced by a DataFrame at the end of this cell.
df = {'PostalCode':[],'Borough':[],'Neighborhood':[]}

for i in soup.tbody.find_all(name='tr'):
    
    #getting all of the current row's td elements
    temp = i.find_all(name='td')
    
    #removing any td elements whose HTML contains 'color:#ccc'
    #(presumably the styling of "not assigned" postal codes on the page -- TODO confirm)
    na = [j for j in i.find_all(name='td') if re.search('color:#ccc',str(j))]
    for j in na:
        temp.remove(j)
    
    #scraping and assigning values: one entry is appended to each of the three lists per remaining cell
    for j in temp:
        #children of the cell's <p><span>: element 0 is read as the borough, elements 2+ as
        #neighborhood fragments; element 1 is skipped
        children = [i for i in j.p.span.children]
        #Getting the Neighborhood data
        t = []
        for n in children[2:]:
            #use .text when the child provides it, otherwise fall back to its stripped string form;
            #NOTE(review): the bare except hides every error raised here, not only a missing attribute
            try:
                n = n.text
            except:
                n = str(n).strip()
            #NOTE(review): this is a substring test against '()/, ', so it drops any fragment that is a
            #substring of that literal (including the empty string), not only single separator characters
            if str(n) not in '()/, ':
                    t.append(n)
        if t:
            df['Neighborhood'].append(t)
        #Getting the Borough data (same .text-or-raw-value fallback as above)
        try:
            df['Borough'].append(children[0].text)
        except:
            df['Borough'].append(children[0])
        #Assigning the Borough's value to any cell that has no Neighborhood fragments
        if not t:
            df['Neighborhood'].append(df['Borough'][-1])
        #Getting the Postal Code data from the cell's <b> element
        df['PostalCode'].append(j.p.b.text)
        
#Cleaning the Neighborhood Data: each entry (a list of fragments or a borough string) is
#converted with str() and stripped of quotes, parentheses and square brackets, so a list
#ends up as a comma-separated string. Apostrophes inside names are removed as well.
temp = []
for i in df['Neighborhood']:
    t = str(i).strip().replace("'",'').replace('"','').replace(')','').replace('(','').replace('[','').replace(']','')
    temp.append(t)
df['Neighborhood'] = temp

#Creating the Dataframe and displaying its (rows, columns) shape
df = pd.DataFrame(df)
df.shape

(103, 3)

<h1> Adding Coordinates (Latitude and Longitude) </h1>

In [5]:
#imports
#! pip install geocoder
import geocoder as gc

In [None]:
#Getting Latitude and Longitude for each Postal Code.
#Fixes over the previous version of this cell:
#  * iterate over the postal-code VALUES (df['PostalCode']); iterating df[['PostalCode']]
#    yields the column labels, so only the string 'PostalCode' was ever geocoded
#  * read the coordinates from the result's `latlng` attribute
#  * bound the number of retries so a failing geocoder cannot loop forever
MAX_GEOCODE_ATTEMPTS = 5
lat_lng_dic = {'Latitude' : [], 'Longitude': []}
for postal_code in df['PostalCode']:
    coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        coords = gc.google('{}, Toronto, Ontario'.format(postal_code)).latlng
        if coords:
            break
    if not coords:
        #keep the lists aligned with df even when geocoding never succeeds
        coords = [float('nan'), float('nan')]
    lat_lng_dic['Latitude'].append(coords[0])
    lat_lng_dic['Longitude'].append(coords[1])
lat_lng_dic

<h3> Due to problems with the geocoder library, I am going to use the provided CSV file instead. </h3>

In [6]:
#Read the coordinates CSV directly from its URL and preview the first rows
#!wget https://cocl.us/Geospatial_data
GEO_CSV_URL = 'https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv'
ldf = pd.read_csv(GEO_CSV_URL)
ldf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
#Adding Coordinates to the main Dataframe.
#Look the coordinates up by postal code instead of scanning both frames with nested loops:
#  * no hard-coded row count
#  * a postal code without a match gets NaN instead of shifting every later row
#  * only the first occurrence of a code in ldf is used (what the old `break` did)
#  * existing Latitude/Longitude columns are dropped first, so the cell can be re-run safely
coord_lookup = (
    ldf.drop_duplicates(subset='Postal Code')
       .set_index('Postal Code')[['Latitude', 'Longitude']]
)
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .join(coord_lookup, on='PostalCode')
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


<h1> Exploring and Clustering Neighborhoods in Toronto </h1>

In [9]:
#imports
#!pip install folium
import folium
#!pip install geopy
from geopy.geocoders import Nominatim

<h4>Superimposing Neighborhoods in Toronto on top of a map of Toronto</h4>

In [10]:
#Getting Toronto's coordinates from the Nominatim geocoding service
address = 'Toronto, CA'
geolocator = Nominatim(user_agent='ny_explorer')
location = geolocator.geocode(address)
#geocode() returns None when nothing is found; fail with a clear message
#instead of an AttributeError on the next line
if location is None:
    raise RuntimeError('Nominatim returned no result for {!r}'.format(address))
latitude, longitude = location.latitude, location.longitude
print(latitude,longitude)

43.6534817 -79.3839347


In [85]:
#creating the folium map centred on Toronto
toronto_map = folium.Map(location=[latitude,longitude], zoom_start=12)
#creating a list of all boroughs with "Toronto" in their name (reused by later cells)
brs = [b for b in df['Borough'].unique() if re.search('Toronto', b)]
#keep only the rows that belong to those boroughs
toronto_rows = df[df['Borough'].isin(brs)]
#adding each neighborhood as a marker
for lat, lng, br, neigh in zip(toronto_rows['Latitude'], toronto_rows['Longitude'],
                               toronto_rows['Borough'], toronto_rows['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup='{}\n{}'.format(neigh, br),
        color='blue',
        fill=True,
        #'light blue' (with a space) is not a CSS colour name; 'lightblue' is
        fill_color='lightblue',
        fill_opacity=0.7,
        #NOTE(review): parse_html is documented as a folium.Popup option; kept unchanged here --
        #confirm that CircleMarker accepts/ignores it in the installed folium version
        parse_html=False,
    ).add_to(toronto_map)
toronto_map

<h3> I am going to cluster neighborhoods based on their best-rated venues and most common venue categories </h3>

In [12]:
#getting venues for each neighborhood using the Foursquare API
#Credentials are read from the environment so that no secret is stored in the notebook.
#Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running this cell.
#NOTE(review): the key pair that used to be hard-coded here should be treated as leaked and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180604'
#Unfortunately getting a venue's rating is considered a premium call which are limited to 500 for personal
#accounts, so I have to go for 13 venues per neighborhood
LIMIT = 13
#seconds to wait for each HTTP response before giving up
REQUEST_TIMEOUT = 30
#query parameters shared by every Foursquare request
auth_params = {'client_id': CLIENT_ID, 'client_secret': CLIENT_SECRET, 'v': VERSION}
#the main dictionary where the final data will be saved; exactly one entry per Toronto
#neighborhood is appended, in df row order, because a later cell matches them up by position
venue_data = {'Avg. Rating (Normalized)' : [], 'Most Common Category' : []}

for lat,lng,br in zip(df['Latitude'],df['Longitude'],df['Borough']):
    #make a request for each neighborhood in Toronto
    if br not in brs:
        continue

    #initial request showing venues that are close to the neighborhood
    explore = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params=dict(auth_params, ll='{},{}'.format(lat, lng), limit=LIMIT),
        timeout=REQUEST_TIMEOUT,
    )
    #surface HTTP errors instead of failing later on a missing key
    explore.raise_for_status()
    groups = explore.json()['response']['groups']
    #iterate over what was actually returned: there may be fewer than LIMIT venues
    items = groups[0]['items'][:LIMIT] if groups else []

    categories = []
    ratings = []
    for item in items:
        venue = item['venue']
        #a venue can come back without any category
        venue_categories = venue.get('categories') or []
        categories.append(venue_categories[0]['name'] if venue_categories else np.nan)
        try:
            #second request, by venue id, to get the venue's rating
            details = requests.get(
                'https://api.foursquare.com/v2/venues/{}'.format(venue['id']),
                params=auth_params,
                timeout=REQUEST_TIMEOUT,
            ).json()
            ratings.append(details['response']['venue']['rating'])
        except (requests.RequestException, KeyError, TypeError, ValueError):
            #request failed, response was not JSON, or the venue has no rating
            ratings.append(np.nan)

    #assigning values back to the main dictionary (NaN when nothing usable was returned)
    category_counts = pd.Series(categories, dtype=object).value_counts()
    venue_data['Most Common Category'].append(
        category_counts.index[0] if len(category_counts) else np.nan)
    venue_data['Avg. Rating (Normalized)'].append(
        pd.Series(ratings, dtype=float).mean(skipna=True))

venue_data = pd.DataFrame(venue_data)

{'Avg. Rating (Normalized)': [8.807692307692308, 8.323076923076922, 8.607692307692307, 8.523076923076923, 8.499999999999998, 8.399999999999999, 8.607692307692307, 8.46923076923077, 8.37692307692308, 8.753846153846153, 8.823076923076922, 8.476923076923073, 8.384615384615385, 8.492307692307692, 8.523076923076923, 8.392307692307693, 8.461538461538463, 8.323076923076922, 8.392307692307691, 8.407692307692308, 8.2, 8.669230769230769, 8.307692307692308, 8.315384615384614, 8.692307692307692, 8.184615384615382, 8.453846153846154, 7.946153846153847, 8.453846153846156, 8.684615384615386, 8.361538461538462, 8.392307692307693, 8.761538461538462, 8.400000000000002, 8.423076923076923, 8.392307692307693, 8.446153846153846, 8.607692307692307], 'Most Common Category': ['Coffee Shop', 'Café', 'Coffee Shop', 'Beach', 'Farmers Market', 'Coffee Shop', 'Cocktail Bar', 'Coffee Shop', 'Café', 'Plaza', 'Cocktail Bar', 'Greek Restaurant', 'Café', 'Coffee Shop', 'Beach', 'Café', 'Coffee Shop', 'Bakery', 'Coffee S

<h2> Clustering </h2>

<h3> Creating a new Dataframe </h3>

In [102]:
#Creating a new Dataframe that holds all the data relating ONLY to neighborhoods in Toronto.
#Rows are selected with a boolean mask instead of a manual counter loop; venue_data holds one
#row per selected neighborhood in the same order, so its columns are attached by position.
toronto_df = (
    df.loc[df['Borough'].isin(brs),
           ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
      .rename(columns={'PostalCode': 'Postal Code'})
      .reset_index(drop=True)
)
#fail loudly if the venue data does not line up with the selected neighborhoods
if len(toronto_df) != len(venue_data):
    raise ValueError('venue_data has {} rows but {} Toronto neighborhoods were selected'
                     .format(len(venue_data), len(toronto_df)))
toronto_df['Avg. Rating'] = venue_data['Avg. Rating (Normalized)'].values
toronto_df['Most Common Category'] = venue_data['Most Common Category'].values

<h3> Preprocessing our new Dataframe </h3>

In [104]:
#One-hot encode the most common venue category of each neighborhood
category_column = toronto_df['Most Common Category']
cats = pd.get_dummies(category_column)
#keep only the first occurrence of any repeated column label
is_first_occurrence = ~cats.columns.duplicated()
cats = cats.loc[:, is_first_occurrence]
cats.head()

Unnamed: 0,Bakery,Beach,Café,Cocktail Bar,Coffee Shop,Dance Studio,Dog Run,Farmers Market,Greek Restaurant,Italian Restaurant,Liquor Store,Park,Pizza Place,Plaza
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [119]:
#Feature table used only for fitting the model: the average rating followed by the one-hot category columns
model_inputs = [toronto_df['Avg. Rating'], cats]
fit_df = pd.concat(model_inputs, axis=1)
fit_df.head()

Unnamed: 0,Avg. Rating,Bakery,Beach,Café,Cocktail Bar,Coffee Shop,Dance Studio,Dog Run,Farmers Market,Greek Restaurant,Italian Restaurant,Liquor Store,Park,Pizza Place,Plaza
0,8.807692,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,8.323077,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,8.607692,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,8.523077,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,8.5,0,0,0,0,0,0,0,1,0,0,0,0,0,0


<h3> Building and Fitting a k-means Clustering Model </h3>

In [120]:
from sklearn.cluster import KMeans
#k-means with 3 clusters, k-means++ seeding, 12 initialisations and a fixed random_state
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=12,
    random_state=0,
)
#fitting the model on the feature table
kmeans.fit(fit_df)
#cluster label assigned to each row of fit_df
labels = kmeans.labels_
labels

array([2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 1, 1,
       1, 0, 2, 1, 1, 1, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2], dtype=int32)

<h3> Visualizing Clusters </h3>

In [121]:
#Map of the Toronto neighborhoods, one marker per neighborhood, coloured by cluster label
toronto_map2 = folium.Map(location=[latitude,longitude], zoom_start=12)
#this is a dictionary matching each label (as a string) with a different colour
colours = {'0' : 'blue', '1' : 'red' , '2' : 'green' , '3' : 'orange' , '4' : 'purple'}
for lat,lng,neigh,br,label in zip(toronto_df['Latitude'],toronto_df['Longitude'],toronto_df['Neighborhood'],toronto_df['Borough'],labels):
    colour = colours[str(label)]
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup='{},{}'.format(neigh,br),
        color=colour,
        fill=True,
        #'light <colour>' (e.g. 'light red') is not a valid CSS colour name, so the fill
        #reuses the outline colour; fill_opacity below keeps it visually lighter
        fill_color=colour,
        fill_opacity=0.7,
        #NOTE(review): parse_html is documented as a folium.Popup option; kept unchanged here --
        #confirm that CircleMarker accepts/ignores it in the installed folium version
        parse_html=False
    ).add_to(toronto_map2)

toronto_map2