# Segmenting and Clustering Neighborhoods in Toronto

Explore, segment, and cluster the neighborhoods in the city of Toronto starting with data from a Wikipedia page of Toronto neighborhoods.

### Part 1: Data procurement and pre-processing

First, load libraries.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!pip install wikitextparser
#!pip install beautifulsoup4
#import wikitextparser as wtp

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Then conduct some data manipulations on the raw data to set the dataframe.

In [2]:
# download and review wikipedia page
#raw_wiki = requests.get(wikipedia_link)
#raw = requests.get('http://en.wikipedia.org/w/index.php?title='
#    + 'List_of_postal_codes_of_Canada:_M&action=raw')

#page = raw_wiki_page.text
#print(page)
#wiki_text = wtp.parse(raw.text)

#print(wiki_text)
#print(json.dumps({'root': wiki_text.tables[0].data(span=False)[0:5]}))
#print(json.dumps({'root': wiki_text.tables[0].data()[0:10]}, indent=2))

In [3]:
# Read wikipedia page into dataframe
wiki_html = pd.read_html(wikipedia_link, header =0)
print(type(wiki_html))
df = pd.DataFrame(wiki_html[0])
print(df.shape)
#print(df.head)
df.head(25)

<class 'list'>
(289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [4]:
# Ignore cells with a borough that is not assigned.

df = df[df.Borough != 'Not assigned']
#df.query('Borough != 0')
df.reset_index(inplace=True)
df.head(10)

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights
5,7,M6A,North York,Lawrence Manor
6,8,M7A,Queen's Park,Not assigned
7,10,M9A,Etobicoke,Islington Avenue
8,11,M1B,Scarborough,Rouge
9,12,M1B,Scarborough,Malvern


In [5]:
# More than one neighborhood can exist in one postal code area and should be combined, comma separated.

#df_continents = df_can.groupby('Continent', axis=0).sum() #summing
df_merge = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
#df.groupby([‘Column_1’, ‘Column_2’], as_index=false).agg(‘sum’)
df_merge.head(10)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
# Cells not assigned neighborhoods will have the same name as assigned borough. (ie, M7A)

df_merge['Neighbourhood'] = np.where(df_merge['Neighbourhood'] == 'Not assigned', df_merge['Borough'], df_merge['Neighbourhood'])
df_merge.tail(25)


Unnamed: 0,Postcode,Borough,Neighbourhood
78,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village"
79,M6L,North York,"Maple Leaf Park, North Park, Upwood Park"
80,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn"
81,M6N,York,"The Junction North, Runnymede"
82,M6P,West Toronto,"High Park, The Junction South"
83,M6R,West Toronto,"Parkdale, Roncesvalles"
84,M6S,West Toronto,"Runnymede, Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern


In [7]:
print('Original Row & Column Counts:', df.shape)
print('Final Row & Column Counts:', df_merge.shape)

Original Row & Column Counts: (212, 4)
Final Row & Column Counts: (103, 3)
