Import some libraries

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

Download the webpage/data

In [2]:
!wget -q -O 'Toronto_postal_code.html' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
print('Data downloaded!')

Data downloaded!


Open the downloaded file and parse it into a BeautifulSoup object

In [3]:
# Parse the saved page. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding, which can fail or
# garble non-ASCII characters in the UTF-8 page on some systems.
with open('Toronto_postal_code.html', 'r', encoding='utf-8') as fh:
    soup = BeautifulSoup(fh, 'html.parser')
# Show the page title as a quick check that the expected page was parsed.
soup.title.text

'List of postal codes of Canada: M - Wikipedia'

Find all table rows (`<tr>` elements) in the HTML code

In [4]:
# Collect every <tr> element found anywhere in the parsed page (not just one
# table); later cells index into this list to read the header and data rows.
test1=soup.find_all('tr')
# Number of rows found.
len(test1)

294

Find the column headers in the html table and convert them into a list. 

In [5]:
# The <th> cells of the first row hold the column labels.
header_cells = test1[0].find_all('th')
column_names = [cell.text.strip() for cell in header_cells]
column_names

['Postcode', 'Borough', 'Neighbourhood']

Create a dataframe for the table info, with column_names as the columns

In [6]:
# Start from an empty frame whose columns are the scraped header names.
df=pd.DataFrame(columns=column_names)

In [7]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood


Find all the rows in the postal code table. The `len(row)==3` check is needed because some of the later `<tr>` elements are not postal code rows; only rows with exactly three `<td>` cells (Postcode, Borough, Neighbourhood) are kept.


In [8]:
# Gather the text of every data row first and build the frame in one step.
# Appending with df.loc[len(df)] re-allocates on each insert and would add
# duplicate rows if this cell were run a second time.
rows = []
for tr in test1[1:]:  # test1[0] is the header row
    row = [td.text.strip() for td in tr.find_all('td')]
    # Keep only rows with exactly three cells; other <tr> elements on the
    # page are not postal code entries.
    if len(row) == 3:
        rows.append(row)

df = pd.DataFrame(rows, columns=column_names)

Do some checks of the dataframe

In [9]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
df.dtypes

Postcode         object
Borough          object
Neighbourhood    object
dtype: object

Drop the rows where there is no Borough entry ('Not assigned') and reset the index.

In [11]:
# Remove, in place, every row whose Borough is the 'Not assigned' placeholder.
borough_unassigned = df['Borough'] == 'Not assigned'
df.drop(index=df.index[borough_unassigned], inplace=True)

In [12]:
# Renumber the remaining rows 0..n-1, discarding the old index labels.
df = df.reset_index(drop=True)

Check if there are any Neighbourhoods with a "Not assigned" entry.

In [13]:
df[df['Neighbourhood']=='Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
6,M7A,Queen's Park,Not assigned


Only one entry is affected; replace its Neighbourhood with the Borough name.

In [14]:
# Copy the Borough name into every Neighbourhood that is still 'Not assigned'.
# Selecting by mask instead of the hard-coded label 6 keeps this correct if
# the source page changes its row order or gains more such rows.
neighbourhood_unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[neighbourhood_unassigned, 'Neighbourhood'] = df.loc[neighbourhood_unassigned, 'Borough']

In [15]:
df.loc[6,'Neighbourhood']

"Queen's Park"

Get a quick look at what we have left now.

In [16]:
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,211,211,211
unique,103,11,209
top,M9V,Etobicoke,St. James Town
freq,8,45,2


Combine the entries where the postal code is the same and create a comma-separated list of Neighbourhoods for each entry. Write that to a new dataframe.

In [17]:
# One output row per (Postcode, Borough) pair, with that pair's neighbourhood
# names joined into a single comma-delimited string.
neighbourhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df_postcode = neighbourhoods_by_code.agg(','.join).reset_index()

The new dataframe has only unique postal codes.

In [18]:
df_postcode.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,103,103,103
unique,103,11,103
top,M1E,North York,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
freq,1,24,1


In [19]:
df_postcode.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
df_postcode.shape

(103, 3)

In [21]:
# Report how many rows the grouped frame contains.
row_count = len(df_postcode)
print("Number of rows in df_postcode dataframe: ", row_count)

Number of rows in df_postcode dataframe:  103
