# Segmenting and Clustering Neighbourhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
import requests

In [3]:
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_page = requests.get(wiki)


In [17]:
wiki_raw = pd.read_html(wiki_page.content, header = 0)[0]
df=wiki_raw[wiki_raw.Neighbourhood !='Not Assigned']
df.reset_index(inplace = True)
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
df.groupby(['Postal Code']).first()

Unnamed: 0_level_0,index,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1A,0,Not assigned,Not assigned
M1B,9,Scarborough,"Malvern, Rouge"
M1C,18,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,27,Scarborough,"Guildwood, Morningside, West Hill"
M1G,36,Scarborough,Woburn
M1H,45,Scarborough,Cedarbrae
M1J,54,Scarborough,Scarborough Village
M1K,63,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
M1L,72,Scarborough,"Golden Mile, Clairlea, Oakridge"
M1M,81,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"


In [13]:
len(df['Postal Code'].unique())

180

In [14]:
df[df['Borough'] == 'Not assigned']

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
7,7,M8A,Not assigned,Not assigned
10,10,M2B,Not assigned,Not assigned
15,15,M7B,Not assigned,Not assigned
16,16,M8B,Not assigned,Not assigned
19,19,M2C,Not assigned,Not assigned
24,24,M7C,Not assigned,Not assigned
25,25,M8C,Not assigned,Not assigned
28,28,M2E,Not assigned,Not assigned


In [15]:
df.shape

(180, 4)

# Part 2

In [21]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 8.6MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [22]:
import geocoder

In [23]:
url = 'http://cocl.us/Geospatial_data'

In [24]:
df_geo = pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
df_geo.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [26]:
df.dtypes

index             int64
Postal Code      object
Borough          object
Neighbourhood    object
dtype: object

In [29]:
df.shape

(180, 4)

In [28]:
df_geo.shape

(103, 3)

In [30]:
df = df.join(df_geo.set_index('Postal Code'), on='Postal Code')
df

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,0,M1A,Not assigned,Not assigned,,
1,1,M2A,Not assigned,Not assigned,,
2,2,M3A,North York,Parkwoods,43.753259,-79.329656
3,3,M4A,North York,Victoria Village,43.725882,-79.315572
4,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
7,7,M8A,Not assigned,Not assigned,,
8,8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
9,9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353


In [31]:
df = df.reset_index()

In [32]:
df.drop(['index'], axis = 'columns', inplace = True)

In [33]:
df = df.set_index('level_0')

In [34]:
df.head()

Unnamed: 0_level_0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,M1A,Not assigned,Not assigned,,
1,M2A,Not assigned,Not assigned,,
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636


In [35]:
df = df.rename(index = {'level_0' : 'index'})
df.index.name = 'index'
df.head()

Unnamed: 0_level_0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,M1A,Not assigned,Not assigned,,
1,M2A,Not assigned,Not assigned,,
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636


In [36]:
df.shape

(180, 5)