# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

We use the Beautiful Soup library to scrape the postal code table from the Wikipedia page.

In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M".
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# find() returns the first <table> element of the document, or None when there is none;
# fail here with a clear message rather than with an AttributeError in a later cell.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

We fetch the entire table containing the postal data and store it in the variable `table`. We now extract the column headers and the rows from it.

In [3]:
# Column names: the text of every <th> cell in the table, cut at its first newline.
columns = [th.text.split('\n')[0] for th in table.find_all('th')]
columns

['Postcode', 'Borough', 'Neighbourhood']

In [4]:
# One list per <tr> of the table, holding the text of each <td> cut at its first newline.
# The list starts with a single empty entry, and a row without <td> cells adds an empty list.
li = [[]]
li.extend(
    [cell.text.split('\n')[0] for cell in row.find_all('td')]
    for row in table.find_all('tr')
)
li[0:10]

[[],
 [],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor']]

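We convert this list into a DataFrame and add the column names. We then drop the empty rows and the postal codes whose borough is "Not assigned", and where a neighbourhood is "Not assigned" we use the borough name instead.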

In [5]:
# Build the frame from the scraped rows; the empty lists in `li` turn into rows of
# missing values, which dropna() removes below.
postcodes_raw = pd.DataFrame(li, columns=columns)

# Keep only the postal codes that have a borough. Every step returns a new frame
# (no inplace mutation), and reset_index() comes before the assignment below so that
# the .loc write targets an independent frame rather than the result of a boolean filter.
df = (
    postcodes_raw
    .dropna()
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

# Where no neighbourhood is assigned, use the borough name instead.
# The mask is computed once and reused on both sides of the assignment.
no_neighbourhood = df['Neighbourhood'] == 'Not assigned'
df.loc[no_neighbourhood, 'Neighbourhood'] = df.loc[no_neighbourhood, 'Borough']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [6]:
# One row per distinct (Postcode, Borough) pair, keeping the first occurrence of each
# and renumbering the index from 0.
df_grouped = (
    df.loc[:, ['Postcode', 'Borough']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_grouped.head(10)

Unnamed: 0,Postcode,Borough
0,M3A,North York
1,M4A,North York
2,M5A,Downtown Toronto
3,M6A,North York
4,M7A,Queen's Park
5,M9A,Etobicoke
6,M1B,Scarborough
7,M3B,North York
8,M4B,East York
9,M5B,Downtown Toronto


We use the string `join` function to combine all the neighbourhoods that share a postal code into a single comma-separated entry.

In [7]:
# Comma-separated neighbourhoods for each postal code; sort=False keeps the postal
# codes in their order of first appearance in `df`.
neighbourhoods_by_code = df.groupby('Postcode', sort=False)['Neighbourhood'].agg(', '.join)

# The code column is called 'Postcode' on the first run and 'PostalCode' once this cell
# has renamed it; accepting both lets the cell be re-run without a KeyError.
code_column = 'PostalCode' if 'PostalCode' in df_grouped.columns else 'Postcode'

# map() looks each postal code up in the joined series in one pass, replacing the
# per-group boolean-mask assignment. rename() returns a new frame and, by default,
# ignores a key that is not present.
df_grouped = (
    df_grouped
    .assign(Neighbourhood=df_grouped[code_column].map(neighbourhoods_by_code))
    .rename(columns={'Postcode': 'PostalCode'})
)
df_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [8]:
# Dimensions of the grouped frame as a (rows, columns) tuple.
df_grouped.shape

(103, 3)