# Segmenting and Clustering Neighborhoods in Toronto

Explore, segment, and cluster the neighborhoods in the city of Toronto.

#### Import all required libraries

In [1]:
!conda install -c conda-forge geopy --yes 
!conda install -c conda-forge folium=0.5.0 --yes
!conda install lxml --yes

import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
import folium # plotting library
import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.json import json_normalize # tranforming json file into a pandas dataframe library
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from IPython.display import Image # libraries for displaying images
from IPython.core.display import HTML # libraries for displaying images
from sklearn.cluster import KMeans # import k-means from clustering stage

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        18.9 MB  conda-forge
    libcblas-3.8.0             |      11_openblas        

## **PART 1**

#### Scrape data from the Wikipedia page into a DataFrame

Read the table with pandas and rename column titles

In [2]:
# Download every HTML table on the Wikipedia page and keep the first one.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# Relabel the table: index labels are passed through str, and the two
# British-spelled / abbreviated headers get the names used in the rest of
# the notebook.
df0 = wiki_tables[0].rename(
    index=str,
    columns={"Postcode": "PostalCode", "Neighbourhood": "Neighborhood"},
)

Manage "Not assigned" values

In [3]:
# Keep only the rows whose borough is assigned; the index is rebuilt so it
# runs 0..n-1 again after the filter.
df1 = df0[df0.Borough != "Not assigned"].reset_index(drop=True)

# A neighborhood listed as "Not assigned" takes the name of its borough.
# The write goes through a single .loc call: the previous chained form
# (df1.Neighborhood[mask] = ...) triggers SettingWithCopyWarning and is not
# guaranteed to modify df1 itself.
unnamed = df1['Neighborhood'] == 'Not assigned'
df1.loc[unnamed, 'Neighborhood'] = df1.loc[unnamed, 'Borough']

Combine the neighborhoods that share the same postal code and borough into a single comma-separated row

In [4]:
# One output row per (PostalCode, Borough) pair; the remaining text values
# of each group are concatenated into a single comma-separated string.
group_keys = ["PostalCode", "Borough"]
grouped = df1.groupby(group_keys, as_index=False)
df2 = grouped.agg(lambda names: ", ".join(names))

# Display (rows, columns) of the grouped frame.
df2.shape

(103, 3)

## **PART 2**

#### Add geographical coordinates to the DataFrame

Import data with coordinates and rename

In [5]:
# Load the coordinates CSV, map its index labels through str, and rename
# the "Postal Code" header so it matches the key column of the scraped table.
geo_raw = pd.read_csv('https://cocl.us/Geospatial_data')
dfc = geo_raw.rename(index=str, columns={"Postal Code": "PostalCode"})

Join the two data frames

In [6]:
# Index the coordinates by postal code, then attach them to each row of the
# grouped frame by looking up that row's PostalCode value.
coords_by_code = dfc.set_index('PostalCode')
df3 = df2.join(coords_by_code, on='PostalCode')

## **PART 3**

Print the number of boroughs and postal codes

In [7]:
# Distinct boroughs and postal codes present in the joined frame.
borough_names = df3['Borough'].unique()
postal_codes = df3['PostalCode'].unique()

summary = 'The dataframe has {} boroughs and {} postal codes.'
print(summary.format(len(borough_names), len(postal_codes)))
print('The name of the boroughs are: ', borough_names)

The dataframe has 11 boroughs and 103 postal codes.
The name of the boroughs are:  ['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' 'Mississauga' 'Etobicoke'
 "Queen's Park"]


#### Create the DataFrame of Neighborhoods for our analysis

Let's work only with the boroughs whose name contains the word "Toronto"

In [8]:
# Boolean mask: True where the borough name contains the substring "Toronto".
is_toronto = df3['Borough'].str.contains("Toronto")
dfN = df3[is_toronto]

# Report how many distinct neighborhood strings survive the filter.
neighborhood_count = len(dfN['Neighborhood'].unique())
print('The new dataframe has {} neighborhoods'.format(neighborhood_count))

The new dataframe has 39 neighborhoods


Create a map with the postal codes' areas

In [9]:
# Geocode the city name to get the coordinates the map is centred on.
location = Nominatim(user_agent="foursquare_agent").geocode('Toronto')
if location is None:
    # geocode() returns None when the service finds no match; fail here with
    # a clear message instead of an AttributeError on the next line.
    raise RuntimeError("Nominatim returned no result for 'Toronto'; cannot centre the map.")

# Named map_toronto rather than `map` so the built-in map() is not shadowed
# for every cell executed after this one.
map_toronto = folium.Map(location=[location.latitude, location.longitude], zoom_start=12)

# Two marker layers, drawn in this order:
#   - every postal code in df3 as a small blue circle
#   - the Toronto-only subset dfN as a larger green circle on top
marker_layers = [
    (df3, 2.5, 'blue'),
    (dfN, 7.5, 'green'),
]
for frame, radius, color in marker_layers:
    for lat, lng, label in zip(frame.Latitude, frame.Longitude, frame.PostalCode):
        folium.features.CircleMarker([lat, lng], radius=radius, color=color, popup=label).add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto

#### Define Foursquare credentials

In [10]:
import os


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, so the notebook
            stops here rather than at the first API request.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            "Environment variable {} is not set; export your Foursquare "
            "credentials before running this notebook.".format(name)
        )
    return value


# Credentials are read from the environment so they never end up in the
# saved notebook. Any key pair that was previously committed in this cell
# should be treated as compromised and regenerated.
CLIENT_ID = _require_env('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = _require_env('FOURSQUARE_CLIENT_SECRET')

# Sent as the `v` query parameter of every Foursquare request.
VERSION = '20180604'

#### Create the DataFrame of Venues for our analysis

Let's get up to 100 venues within a 500 m radius of each Toronto neighborhood

In [11]:
LIMIT = 100  # maximum number of venues requested per neighborhood
R = 500      # search radius around each neighborhood's coordinates (500m)

EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

venue_columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']

# One inner list of venue tuples per neighborhood.
venues_list = []

for NB, LT, LG in zip(dfN['Neighborhood'],dfN['Latitude'],dfN['Longitude']):

    # Query parameters are handed to requests instead of being formatted
    # into the URL by hand, so they are encoded properly.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(LT, LG),
        'radius': R,
        'limit': LIMIT,
    }

    # make the GET request; raise on HTTP errors (bad credentials, quota, ...)
    # rather than failing later with a KeyError on the JSON payload
    response = requests.get(EXPLORE_URL, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # keep only the relevant information for each nearby venue
    neighborhood_venues = []
    for v in results:
        venue = v['venue']
        categories = venue['categories']
        # A venue with an empty category list gets None instead of raising IndexError.
        category_name = categories[0]['name'] if categories else None
        neighborhood_venues.append((NB, LT, LG, venue['name'], venue['location']['lat'], venue['location']['lng'], category_name))
    venues_list.append(neighborhood_venues)

# Flatten the per-neighborhood lists into one row per venue. Passing the
# column names here also yields a correctly-shaped frame when no venues
# were returned at all.
dfV = pd.DataFrame(
    [item for venue_list in venues_list for item in venue_list],
    columns=venue_columns,
)

print('There are {} venues'.format(dfV.shape[0]))

There are 1702 venues


Let's find out how many unique categories can be curated from all the returned venues

In [12]:
# Number of distinct values in the 'Venue Category' column.
category_count = len(dfV['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 232 uniques categories.


### One Hot Encoding

In [13]:
# One indicator column per venue category (no prefix on the column names).
OHE = pd.get_dummies(dfV[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of each venue. The column uses the British
# spelling 'Neighbourhood' while the source column is 'Neighborhood'.
OHE['Neighbourhood'] = dfV['Neighborhood']

# Rotate the last column to the front and keep the others in order.
last_column, other_columns = OHE.columns[-1], OHE.columns[:-1]
OHE = OHE[[last_column, *other_columns]]

Group rows by neighborhood, taking the mean frequency of occurrence of each category

In [14]:
# Mean of every indicator column per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category.
category_share = OHE.groupby('Neighbourhood').mean()

# Turn the group key back into an ordinary first column.
OHG = category_share.reset_index()

Show each neighborhood along with its 10 most common venue categories

In [15]:
ntv = 10  # number of top venue categories kept per neighborhood

indicators = ['st', 'nd', 'rd']


def _ordinal(n):
    """Return `n` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 10..20 within each hundred always take 'th' (11th, 12th, 13th), as does
    # any number whose last digit is not 1, 2 or 3.
    if 10 <= n % 100 <= 20 or not 1 <= n % 10 <= 3:
        return '{}th'.format(n)
    return '{}{}'.format(n, indicators[n % 10 - 1])


# create columns according to number of top venues
columns = ['Neighborhood'] + [_ordinal(rank) for rank in range(1, ntv + 1)]

# Build one record per neighborhood and construct the frame once, instead of
# assigning into an empty frame row by row.
top_rows = []
for ind in range(OHG.shape[0]):
    # Drop the leading 'Neighbourhood' cell, then rank the categories of this
    # row from most to least frequent.
    ranked = OHG.iloc[ind, :].iloc[1:].sort_values(ascending=False)
    top = list(ranked.index.values[0:ntv])
    # Pad when fewer than ntv categories exist so every record has the same width.
    top += [None] * (ntv - len(top))
    top_rows.append([OHG['Neighbourhood'].iloc[ind]] + top)

# create a new dataframe
NVS = pd.DataFrame(top_rows, columns=columns)

NVS

Unnamed: 0,Neighborhood,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th
0,"Adelaide, King, Richmond",Coffee Shop,Bar,Steakhouse,Café,Restaurant,Asian Restaurant,Breakfast Spot,Thai Restaurant,Hotel,Seafood Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Café,Steakhouse,Bakery,Cheese Shop,Beer Bar,Farmers Market,Seafood Restaurant,Bistro
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Breakfast Spot,Café,Bakery,Grocery Store,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Comic Shop,Park,Pizza Place,Recording Studio,Restaurant,Burrito Place,Brewery,Light Rail Station
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Rental Car Location,Bar,Plane,Coffee Shop
5,"Cabbagetown, St. James Town",Coffee Shop,Italian Restaurant,Park,Pizza Place,Bakery,Restaurant,Café,Pub,Grocery Store,Diner
6,Central Bay Street,Coffee Shop,Café,Ice Cream Shop,Italian Restaurant,Sandwich Place,Chinese Restaurant,Burger Joint,Japanese Restaurant,Juice Bar,Bar
7,"Chinatown, Grange Park, Kensington Market",Café,Vietnamese Restaurant,Chinese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Bar,Mexican Restaurant,Grocery Store,Burger Joint
8,Christie,Grocery Store,Café,Park,Candy Store,Restaurant,Diner,Italian Restaurant,Gas Station,Baby Store,Nightclub
9,Church and Wellesley,Coffee Shop,Japanese Restaurant,Restaurant,Sushi Restaurant,Gay Bar,Café,Gym,Hotel,Mediterranean Restaurant,Fast Food Restaurant


#### Clustering

Run k-means to cluster the Toronto areas into 5 clusters

In [16]:
kclusters = 5 # set number of clusters

# Feature matrix: the per-neighborhood category frequencies without the
# label column. `columns=` is used because passing the axis positionally
# (drop('Neighbourhood', 1)) is rejected by newer pandas releases.
OHC = OHG.drop(columns='Neighbourhood')

# run k-means clustering; random_state fixes the initialisation and n_init
# is pinned explicitly because the library default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(OHC)

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [17]:
# add clustering labels as the first column of NVS.
# insert() raises if the column already exists, so on a re-run of this cell
# the existing column is overwritten in place instead.
if 'Cluster Labels' in NVS.columns:
    NVS['Cluster Labels'] = kmeans.labels_
else:
    NVS.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venues of each neighborhood to the
# Toronto frame, which carries the latitude/longitude of each neighborhood
dfM = dfN.join(NVS.set_index('Neighborhood'), on='Neighborhood')

# A neighborhood for which no venues were returned has no row in NVS and
# gets NaN here; drop those rows and restore the integer dtype so the labels
# can be used to index a colour list.
dfM = dfM.dropna(subset=['Cluster Labels']).astype({'Cluster Labels': int})

dfM

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Park,Health Food Store,Neighborhood,Pub,Trail,Dumpling Restaurant,Donut Shop,Doner Restaurant,Department Store,Eastern European Restaurant
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Fruit & Vegetable Store,Restaurant,Pub,Pizza Place
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Park,Movie Theater,Steakhouse,Sushi Restaurant,Fish & Chips Shop,Brewery,Pub,Italian Restaurant,Fast Food Restaurant,Liquor Store
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Brewery,Bakery,Italian Restaurant,American Restaurant,Yoga Studio,Comfort Food Restaurant,Sandwich Place
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Swim School,Bus Line,Fast Food Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Park,Sandwich Place,Department Store,Food & Drink Shop,Hotel,Gym,Breakfast Spot,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Coffee Shop,Sporting Goods Shop,Fast Food Restaurant,Diner,Mexican Restaurant,Dessert Shop,Park,Clothing Store,Chinese Restaurant,Café
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Sandwich Place,Dessert Shop,Italian Restaurant,Coffee Shop,Gym,Café,Sushi Restaurant,Pizza Place,Pharmacy,Seafood Restaurant
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2,Gym,Playground,Tennis Court,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,0,Pub,Coffee Shop,American Restaurant,Restaurant,Fried Chicken Joint,Sports Bar,Sushi Restaurant,Pizza Place,Supermarket,Liquor Store


In [18]:
# create map, centred on the mean position of the clustered neighborhoods
# (computed from dfM itself rather than from variables left over by an
# earlier loop)
map_center = [float(dfM['Latitude'].mean()), float(dfM['Longitude'].mean())]
map_clusters = folium.Map(location=map_center, zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster label,
# evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(dfM['Latitude'], dfM['Longitude'], dfM['Neighborhood'], dfM['Cluster Labels']):
    if pd.isna(cluster):
        # no cluster was assigned to this neighborhood; nothing to colour
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Label k uses colour k directly; the previous `cluster-1` mapped label 0
    # to the last colour through negative indexing.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters