# Segmenting and Clustering Neighborhoods in Toronto

### This notebook aims at segmenting and clustering the different neighborhoods in Toronto. 

### Import libraries

In [1]:
import pandas as pd
import numpy as np

# library to open URLs
import urllib.request

#library for parsing HTML and XML documents
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
#import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

### Request the data from the wiki page

In [2]:
# define URL of the page we want to scrap
url = 'https://web.archive.org/web/20200303121502/https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
        
# Open URL using urllib.request and put the HTML into the page variable
wiki_page = urllib.request.urlopen(url)

In [3]:
# Parse the HTML from the URL into the BeautifulSoup parse tree format

soup = BeautifulSoup(wiki_page,'lxml' )

In [4]:
# find the right table in the HTML code

table = soup.find('table', class_ ="wikitable sortable" )

##### Create a dataframe with the information from the wiki page

In [5]:
# define empty lists for the 3 columns 
A=[]
B=[]
C=[]

# iterate through the rows to extract the information for the different postcodes
# find row start by searching for "tr"
for row in table.findAll('tr'): 
    # find column start and end by searching for "td"
    cells = row.findAll('td')
    # fill in the column lists, when information for 3 columns received
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [6]:
# put all 3 column lists to one dataframe
df = pd.DataFrame(A, columns=['Postcode'])
df['Borough'] = B
df['Neighbourhood'] = C
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


##### Data wrangling

In [7]:
# only display Boroughs, which are assigned
df = df[df['Borough'] != 'Not assigned']

#reset index for future use
df.reset_index(drop=True, inplace=True)

##### Group the neighbourhoods with the same postcode and write them in one cell

In [8]:
# define n as the number of the last index to be able to iterate from the back
n = df['Postcode'].count() -1

# interate from the back with a for loop
for postcode in range(n):
    
    # check whether the postcode is the same as the one above
    if df['Postcode'][n] == df['Postcode'][n-1]:
        
        # if the postcodes are the same, define the Neighbourhood, which is above, as both Neighbourhoods seperated by a comma
        df['Neighbourhood'][n-1] = df['Neighbourhood'][n] + ", " + df['Neighbourhood'][n-1] 
        
        # drop the line below to prevent duplicates 
        df.drop(n, inplace=True)
    n = n-1
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
5,M7A,Downtown Toronto,Queen's Park


In [9]:
df.shape

(103, 3)

### Retrieve the gepspatial data from the CSV file

In [10]:
# read the csv file with the geospatial data
geo = pd.read_csv('http://cocl.us/Geospatial_data')

In [11]:
# rename the column for the Postcode for merging purposes
geo.rename(columns={'Postal Code': 'Postcode'}, inplace=True) 
geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# merge the two dataframe on the Postcode
df_geo = pd.merge(df,geo,how='inner', on='Postcode')
df_geo.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


### Visualize the map of Toronto

##### Get latitude and longitude from geolocator

In [13]:
address='Toronto, ON'

geolocator = Nominatim(user_agent='my_explorer')
location=geolocator.geocode(address)
lat = location.latitude
lon = location.longitude
print(lat, lon)

43.653963 -79.387207


##### Create map

In [16]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[lat, lon], zoom_start=11)

# add markers to map
for lati, long, label in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lati, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Forusquare ID

In [7]:
# The code was removed by Watson Studio for sharing.

##### From this point, we will only concentrate on the boroughs, which have 'Toronto' in their names. Therefore, I will apply a filter on the 'Borough' column.

In [15]:
# filter on Boroughs, which have a 'Toronoto' in their name
df_toronto = df_geo[df_geo['Borough'].str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District\n, Ryerson\n",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


##### Now, let's explore the different Postcodes in the dataframe defined above

In [16]:
import requests

In [17]:
# create function to iterate through the dataframe and create requests of the venues in the neighbourhoods
LIMIT = 50

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
# run function for each postcode

toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
print('Information imported')

Information imported


In [19]:
toronto_venues.head()

Unnamed: 0,Neighbourhood,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [20]:
toronto_venues.shape

(1204, 7)

##### Analysis of the different postcodes

In [41]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [44]:
# group rows by neighbourhood and calculate the mean of the frequency of occurrence of each category

toronto_grouped = toronto_onehot.groupby('Neighbourhood').sum().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,Berczy Park,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,Business Reply Mail Processing Centre 969 East...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Central Bay Street,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,Christie,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Church and Wellesley,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
5,Davisville,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Dufferin\n, Dovercourt Village",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,"Forest Hill West\n, Forest Hill North",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,"Garden District\n, Ryerson\n",0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# each neighbourhoods top 5 common vanues

num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.10
1                Café  0.04
2            Beer Bar  0.04
3         Cheese Shop  0.04
4  Seafood Restaurant  0.04


----Business Reply Mail Processing Centre 969 Eastern
----
              venue  freq
0       Yoga Studio  0.06
1     Auto Workshop  0.06
2        Comic Shop  0.06
3              Park  0.06
4  Recording Studio  0.06


----Central Bay Street
----
                 venue  freq
0          Coffee Shop  0.18
1   Italian Restaurant  0.06
2      Bubble Tea Shop  0.04
3       Ice Cream Shop  0.04
4  Japanese Restaurant  0.04


----Christie
----
                venue  freq
0       Grocery Store  0.22
1                Café  0.17
2                Park  0.11
3               Diner  0.06
4  Italian Restaurant  0.06


----Church and Wellesley----
         venue  freq
0  Coffee Shop  0.06
1    Gastropub  0.04
2  Men's Store  0.04
3   Restaurant  0.04
4      Gay Bar  0.04


----Davisville
----
            venue  freq


In [71]:
# define function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

##### Create the new dataframe and display the top 10 venues for each neighborhood

In [74]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Restaurant,Bakery,Beer Bar,Cocktail Bar,Farmers Market,Cheese Shop,Seafood Restaurant,Café,Japanese Restaurant
1,Business Reply Mail Processing Centre 969 East...,Yoga Studio,Auto Workshop,Light Rail Station,Brewery,Park,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant
2,Central Bay Street,Coffee Shop,Italian Restaurant,Chinese Restaurant,Bubble Tea Shop,Japanese Restaurant,Middle Eastern Restaurant,Ice Cream Shop,Café,Bar,Spa
3,Christie,Grocery Store,Café,Park,Candy Store,Baby Store,Athletics & Sports,Italian Restaurant,Diner,Nightclub,Coffee Shop
4,Church and Wellesley,Coffee Shop,Men's Store,Gastropub,Gay Bar,Restaurant,Distribution Center,Salon / Barbershop,Smoke Shop,Hobby Shop,Beer Bar


##### Clustering of the neighbourhoods

In [79]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:39] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 2,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [77]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighbourhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Mexican Restaurant,Café,Restaurant,Theater,Event Space,Chocolate Shop
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0,Coffee Shop,Park,Yoga Studio,Café,Beer Bar,Music Venue,Boutique,Seafood Restaurant,Sandwich Place,Juice Bar
9,M5B,Downtown Toronto,"Garden District\n, Ryerson\n",43.657162,-79.378937,0,Coffee Shop,Café,Bookstore,Clothing Store,Cosmetics Shop,Restaurant,Theater,Ramen Restaurant,Burrito Place,Beer Bar
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Restaurant,Italian Restaurant,Cocktail Bar,BBQ Joint,Park,Farmers Market,Hotel,Café,Japanese Restaurant
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Trail,Health Food Store,Neighborhood,Yoga Studio,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Dog Run


##### Visualize the resulting clusters

In [84]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [88]:
# create map
map_clusters = folium.Map(location=[lat, lon], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters