# Segmenting and Clustering Neighborhoods in Toronto

## 1. Scrape Wikipedia page and transform the data into a dataframe.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Parse data from wikipedia and save it into a dataframe

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html_data = requests.get(URL)
soup = BeautifulSoup(html_data.text, 'html.parser')

title = soup.title

Toronto_data = pd.read_html(str(soup.table))[0]
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Clean and transform data.

In [3]:
# rename column
Toronto_data.rename(columns={"Postal Code": "PostalCode", "Neighbourhood": "Neighborhood"}, inplace=True)
print('The columns of the Dataframe are:', Toronto_data.columns)

# drop rows where Borough == 'Not assigned'
Toronto_data.drop(Toronto_data[Toronto_data.Borough == 'Not assigned'].index, inplace=True)

# groupby
Toronto_data = Toronto_data.groupby('PostalCode',as_index=False).agg(lambda x: ','.join(set(x.dropna())))

# neighborhood will be the same as the borough, if Neighborhood == 'Not assigned'
Toronto_data.loc[Toronto_data.loc[Toronto_data.Neighborhood == 'Not assigned'].index, 'Neighborhood'] = Toronto_data.loc[Toronto_data.Neighborhood == 'Not assigned'].Borough

# reset index
Toronto_data.reset_index(drop=True, inplace=True)

Toronto_data.head()

The columns of the Dataframe are: Index(['PostalCode', 'Borough', 'Neighborhood'], dtype='object')


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [4]:
print('Number of rows and columns: ', Toronto_data.shape)

Number of rows and columns:  (103, 3)


## 2. Get latitude and longitude coordinates of each neighborhood and add it to the dataframe

In [5]:
#!pip install geocoder
import geocoder

Toronto_data['Latitude'] = ''
Toronto_data['Longitude'] = ''

# loop all PostalCodes
for postal_code in Toronto_data.PostalCode:
    # initialize a variable to None
    lat_lng_coords = None

    # loop until getting the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng

    # add latitude and longitude to dataframe
    index = Toronto_data.loc[Toronto_data.PostalCode == postal_code].index[0]
    latitude = lat_lng_coords[0]
    Toronto_data.at[index, 'Latitude'] = latitude
    longitude = lat_lng_coords[1]
    Toronto_data.at[index, 'Longitude'] = longitude
Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8114,-79.1966
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7857,-79.1587
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7658,-79.1747
3,M1G,Scarborough,Woburn,43.7681,-79.2176
4,M1H,Scarborough,Cedarbrae,43.7694,-79.2389


Work with boroughs that contain the word "Toronto".

In [6]:
neighborhoods = Toronto_data[Toronto_data.Borough.str.contains("Toronto")].reset_index(drop=True)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.6771,-79.2955
1,M4K,East Toronto,"The Danforth West, Riverdale",43.6838,-79.3551
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668,-79.3147
3,M4M,East Toronto,Studio District,43.6621,-79.335
4,M4N,Central Toronto,Lawrence Park,43.7284,-79.3871


Create a map of Toronto with neighborhoods.

In [7]:
#!pip install geopy
from geopy.geocoders import Nominatim

#Use geopy library to get the latitude and longitude values of Toronto.
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.6534817, -79.3839347.


In [8]:
#!pip install folium
import folium

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

Define Foursquare Credentials and Version

In [9]:
# Foursquare ID and Secret
CLIENT_ID = 'R0BTC2R3MZHPAJOYXQ425KRW40Z5Y1ZCSARGKBMXQNNMFB0B'
CLIENT_SECRET = '01IAJY4C0B34G2SMLT42WFWJ3KGGCZLY4PBQVZGZQDNPNGZ5'

# Foursquare API version
VERSION = '20180605'
# A default Foursquare API limit value
LIMIT = 100

## 3.1. Explore Neighborhoods in Toronto

Create a function to explore all the neighborhoods in Toronto

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):          
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
# Create new dataframe with the venues
toronto_venues = getNearbyVenues(names=neighborhoods['Neighborhood'], latitudes=neighborhoods['Latitude'], longitudes=neighborhoods['Longitude'])
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.67709,-79.29547,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.67709,-79.29547,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.67709,-79.29547,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.67709,-79.29547,Seaspray Restaurant,43.678888,-79.298167,Asian Restaurant
4,The Beaches,43.67709,-79.29547,Upper Beaches,43.680563,-79.292869,Neighborhood


Amount of venues returned for each neighborhood

In [12]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,64,64,64,64,64,64
"Brockton, Parkdale Village, Exhibition Place",85,85,85,85,85,85
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",74,74,74,74,74,74
Central Bay Street,61,61,61,61,61,61
Christie,11,11,11,11,11,11
Church and Wellesley,82,82,82,82,82,82
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,27,27,27,27,27,27
Davisville North,7,7,7,7,7,7


## Analyze Each Neighborhood

In [13]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
cols=list(toronto_onehot.columns.values)
cols.pop(cols.index('Neighborhood'))
toronto_onehot=toronto_onehot[['Neighborhood']+cols]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [14]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.015625,0.0,0.015625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,0.0,0.015625
1,"Brockton, Parkdale Village, Exhibition Place",0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.023529,0.0,...,0.011765,0.0,0.0,0.0,0.011765,0.0,0.0,0.0,0.0,0.011765
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.03,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013514,...,0.0,0.0,0.0,0.013514,0.0,0.0,0.0,0.0,0.0,0.013514
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.016393,0.0,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.012195,0.012195,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012195
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.04,0.0,0.0,0.01,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.037037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Print each neighborhood along with the top 5 most common venues

In [15]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.08
1        Cocktail Bar  0.05
2              Bakery  0.05
3  Seafood Restaurant  0.05
4            Beer Bar  0.03


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0     Coffee Shop  0.06
1             Bar  0.06
2            Café  0.06
3      Restaurant  0.05
4  Sandwich Place  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
              venue  freq
0       Coffee Shop  0.07
1             Hotel  0.06
2              Café  0.04
3        Restaurant  0.03
4  Asian Restaurant  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0         Coffee Shop  0.07
1  Italian Restaurant  0.07
2                Café  0.05
3                Park  0.04
4   French Restaurant  0.04


----Central Bay Street----
                       venue  freq
0            

Function to sort the venues in descending order.

In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the new dataframe and display the top 10 venues for each neighborhood.

In [17]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Farmers Market,Lounge,Pharmacy,Beer Bar,Breakfast Spot,Cheese Shop
1,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Bar,Café,Restaurant,Gift Shop,Sandwich Place,Supermarket,Japanese Restaurant,Furniture / Home Store,French Restaurant
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Hotel,Café,Asian Restaurant,Restaurant,Gym,Steakhouse,Bar,Sushi Restaurant,Taco Place
3,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Coffee Shop,Café,Bar,Park,French Restaurant,Gym / Fitness Center,Restaurant,Lounge,Bakery
4,Central Bay Street,Coffee Shop,Clothing Store,Plaza,Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Sandwich Place,Cosmetics Shop,Ramen Restaurant,Bookstore


## 3.2. Cluster Neighborhoods

In [18]:
#!pip install sklearn
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighborhoods

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.6771,-79.2955,0,Asian Restaurant,Health Food Store,Pub,Trail,Yoga Studio,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space
1,M4K,East Toronto,"The Danforth West, Riverdale",43.6838,-79.3551,0,Ice Cream Shop,Grocery Store,Park,Bus Line,Business Service,Discount Store,Yoga Studio,Escape Room,Ethiopian Restaurant,Event Service
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668,-79.3147,0,Park,Fast Food Restaurant,Pet Store,Restaurant,Italian Restaurant,Steakhouse,Movie Theater,Ice Cream Shop,Pub,Fruit & Vegetable Store
3,M4M,East Toronto,Studio District,43.6621,-79.335,0,Bakery,Italian Restaurant,Diner,Brewery,Gastropub,Bar,Sushi Restaurant,Coffee Shop,Arts & Crafts Store,Pizza Place
4,M4N,Central Toronto,Lawrence Park,43.7284,-79.3871,3,Bus Line,Swim School,Yoga Studio,Dumpling Restaurant,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Event Service


In [20]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters