Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup # use the BeautifulSoup package for web scraping

import requests # library to handle requests

print('Libraries imported.')

Libraries imported.


In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, website_url holds the page's HTML text, not the URL.
website_url = response.text


In [3]:
# Parse the downloaded HTML text with the lxml parser.
soup = BeautifulSoup(markup=website_url, features='lxml')

In [4]:
# Inspecting the page's HTML shows that the data to extract lives in the
# table whose class attribute is "wikitable sortable".
Toronto_table = soup.find("table", attrs={"class": "wikitable sortable"})


In [5]:
# Collect one [postcode, borough, neighborhood] record per table row.
PostalTable = []

# The first <tr> holds the column titles, so start from the second row.
for row in Toronto_table.find_all('tr')[1:]:
    cols = row.find_all('td')
    # A row without three <td> cells cannot form a complete record;
    # skip it instead of failing with an IndexError on cols[2].
    if len(cols) < 3:
        continue
    PostalTable.append([cell.text.strip() for cell in cols[:3]])

In [6]:
import pandas as pd # library for data analysis

In [7]:
# First field of every scraped record -> one-column frame of postal codes.
postcode = [record[0] for record in PostalTable]
df_postcode = pd.DataFrame(data=postcode)


In [8]:
# Second field of every scraped record -> one-column frame of boroughs.
borough = [record[1] for record in PostalTable]
df_borough = pd.DataFrame(data=borough)

In [9]:
# Third field of every scraped record -> one-column frame of neighborhoods.
neighborhood = [record[2] for record in PostalTable]
df_neighborhood = pd.DataFrame(data=neighborhood)

In [10]:
# Column labels for the scraped table, in the order the cells were collected.
column_names = ['Postcode', 'Borough', 'Neighborhood']

# Each PostalTable entry is already a [postcode, borough, neighborhood] record,
# so build the frame in one step rather than concatenating three one-column
# frames and relabelling the result afterwards.
df_PostalCode = pd.DataFrame(PostalTable, columns=column_names)

df_PostalCode.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [11]:
# Keep only the rows whose borough has been assigned and renumber them from 0.
# reset_index() is chained (not inplace) so df_Toronto is bound to a new frame
# rather than mutating the filtered result after the fact.
df_Toronto = (
    df_PostalCode[df_PostalCode['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

df_Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [12]:
# If a row has a borough but a "Not assigned" neighborhood,
# the neighborhood takes the borough's name.
#
# Rows yielded by DataFrame.iterrows() may be copies, so assigning to them is
# not guaranteed to change df_Toronto. Replace the values column-wise instead
# and bind the result to a new frame.
is_unassigned = df_Toronto['Neighborhood'] == 'Not assigned'
df_Toronto = df_Toronto.assign(
    Neighborhood=df_Toronto['Neighborhood'].mask(is_unassigned, df_Toronto['Borough'])
)
df_Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [13]:
# A postal code area can contain several neighborhoods.
# Collapse those rows into a single row per (Postcode, Borough) pair,
# with the neighborhood names joined by ", ".
df_Toronto = (
    df_Toronto
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .to_frame()
    .reset_index()
)

df_Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [14]:
df_Toronto.shape # (number of rows, number of columns) of df_Toronto

(103, 3)