# Segmenting and Clustering Neighborhoods in Toronto

In [5]:
#import the packages
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M";
# its HTML text is the input for Beautiful Soup.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page flow silently into the parsing steps.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
site = response.text

In [7]:
# Parse the downloaded HTML text into a Beautiful Soup tree, using the lxml parser
soupsite = BeautifulSoup(markup=site, features='lxml')

In [8]:
# Import the neighbourhoods table from the Wikipedia page into a dataframe.
# The first <table> element on the page is taken to be the postal-code table.
torontoTable = soupsite.find('table')
if torontoTable is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than handing the string 'None' to the HTML table parser.
    raise ValueError('No <table> element found in the downloaded page')
# read_html accepts a file-like object; wrapping the markup in StringIO avoids the
# deprecated "literal HTML string" input path of newer pandas versions.
df = pd.read_html(StringIO(str(torontoTable)))[0]

In [9]:
# Remove, in place, every row whose borough is 'Not assigned'.
unassignedBorough = df.index[df.Borough == 'Not assigned']
df.drop(index=unassignedBorough, inplace=True)
# Print each remaining row whose neighbourhood is still 'Not assigned',
# so it can be filled in from its borough.
for row in df[df.Neighbourhood == "Not assigned"].itertuples():
    print(row)

Pandas(Index=8, Postcode='M7A', Borough="Queen's Park", Neighbourhood='Not assigned')


In [10]:
# Where the neighbourhood is 'Not assigned', use the borough name instead.
# A boolean mask replaces the hard-coded row label (previously 8): the label depends
# on the current layout of the scraped table, so a changed page could patch the
# wrong row, and any additional 'Not assigned' rows would be missed.
notAssigned = df['Neighbourhood'] == 'Not assigned'
df.loc[notAssigned, 'Neighbourhood'] = df.loc[notAssigned, 'Borough']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [11]:
# Collapse rows that share a postal code (and borough) into a single entry whose
# value is the list of that postal code's neighbourhoods.
neighbourhoodsByPostcode = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
postcodeGroups = neighbourhoodsByPostcode.apply(list)

In [12]:
# Preview the first five grouped entries (head() defaults to 5 rows)
postcodeGroups.head()

Postcode  Borough    
M1B       Scarborough                            [Rouge, Malvern]
M1C       Scarborough    [Highland Creek, Rouge Hill, Port Union]
M1E       Scarborough         [Guildwood, Morningside, West Hill]
M1G       Scarborough                                    [Woburn]
M1H       Scarborough                                 [Cedarbrae]
Name: Neighbourhood, dtype: object

In [13]:
# Convert the grouped Series into a dataframe: the Series becomes the
# 'Neighbourhood' column and both index levels ('Postcode', 'Borough')
# are moved back into ordinary columns.
dfPCG = postcodeGroups.to_frame().reset_index()
dfPCG.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [14]:
# Show the (rows, columns) shape of the grouped dataframe:
# one row per (Postcode, Borough) group.
dfPCG.shape

(103, 3)