In [1]:
# Optional one-time setup: uncomment the lines below to install the packages
# this notebook relies on.
#!pip install pandas
#!pip install requests
#!pip install bs4
#!pip install plotly

## Importing required libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Web scraping
We start by scraping the Wikipedia page and preparing the soup.

In [3]:
# Wikipedia page "List of postal codes of Canada: M", the source of the table scraped below.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

In [5]:
# Parse the downloaded HTML into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(html_data, 'html.parser')

By navigating the page's parse tree, we find that the postal code, borough and neighbourhoods are all found at the same level of the tree.

In [6]:
# Preview every <p> tag inside the first <tbody>; in the output, each tag holds
# one postal code together with its borough and neighbourhood text.
soup.find_all('tbody')[0].find_all("p")

[<p><b>M1A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>,
 <p><b>M2A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>,
 <p><b>M3A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
 </p>,
 <p><b>M4A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
 </p>,
 <p><b>M5A</b><br/><span style="font-size:85%;"><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a><br/>(<a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a> / <a href="/wiki/Harbourfront,_Toronto" title="Harbourfront, Toronto">Harbourfront</a>)</span>
 </p>,
 <p><b>M6A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Lawrence_Manor

So we begin by creating a list that we use to create the data frame

In [7]:
# Collect the text of every <p> cell in the first <tbody>.
# The tags are looked up once and iterated directly, rather than re-running
# the whole find_all() query on each iteration and indexing into the result
# with a manual counter.
postal_cells = soup.find_all('tbody')[0].find_all("p")
ls = [cell.text for cell in postal_cells]
print(ls)
    

['M1ANot assigned\n', 'M2ANot assigned\n', 'M3ANorth York(Parkwoods)\n', 'M4ANorth York(Victoria Village)\n', 'M5ADowntown Toronto(Regent Park / Harbourfront)\n', 'M6ANorth York(Lawrence Manor / Lawrence Heights)\n', "M7AQueen's Park(Ontario Provincial Government)\n", 'M8ANot assigned\n', 'M9AEtobicoke(Islington Avenue)\n', 'M1BScarborough(Malvern / Rouge)\n', 'M2BNot assigned\n', 'M3BNorth York(Don Mills)North\n', 'M4BEast York(Parkview Hill / Woodbine Gardens)\n', 'M5BDowntown Toronto(Garden District, Ryerson)\n', 'M6BNorth York(Glencairn)\n', 'M7BNot assigned\n', 'M8BNot assigned\n', 'M9BEtobicoke(West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)\n', 'M1CScarborough(Rouge Hill / Port Union / Highland Creek)\n', 'M2CNot assigned\n', 'M3CNorth York(Don Mills)South(Flemingdon Park)\n', 'M4CEast York(Woodbine Heights)\n', 'M5CDowntown Toronto(St. James Town)\n', 'M6CYork(Humewood-Cedarvale)\n', 'M7CNot assigned\n', 'M8CNot assigned\n', 'M9CEtobicoke(Eringate / 

The list is converted into a pandas DataFrame.

In [8]:
# One row per scraped cell string, held in a single column labelled 0.
df = pd.DataFrame(ls)

In [9]:
# Preview the first rows of the raw, still-unsplit frame.
df.head()

Unnamed: 0,0
0,M1ANot assigned\n
1,M2ANot assigned\n
2,M3ANorth York(Parkwoods)\n
3,M4ANorth York(Victoria Village)\n
4,M5ADowntown Toronto(Regent Park / Harbourfront)\n


---
## Cleaning the Dataframe
We create a dataframe with the required columns and perform the first data split.

In [10]:
# Every raw cell string begins with a three-character postal code; whatever
# follows is kept in "Borough" for now and split further in a later step.
raw_cells = df.iloc[:, 0]

toronto_df = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])
toronto_df["PostalCode"] = raw_cells.str.slice(0, 3)
toronto_df["Borough"] = raw_cells.str.slice(3)

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned\n,
1,M2A,Not assigned\n,
2,M3A,North York(Parkwoods)\n,
3,M4A,North York(Victoria Village)\n,
4,M5A,Downtown Toronto(Regent Park / Harbourfront)\n,


Removing <code>\n</code> from cells

In [11]:
# Drop the newline character carried by each scraped cell; regex=False spells
# out that the pattern is matched as a literal character.
borough_text = toronto_df["Borough"]
toronto_df["Borough"] = borough_text.str.replace("\n", "", regex=False)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York(Parkwoods),
3,M4A,North York(Victoria Village),
4,M5A,Downtown Toronto(Regent Park / Harbourfront),


Removing rows whose borough is <code>Not assigned</code>

In [12]:
# Keep only the rows that have an assigned borough. The explicit .copy() makes
# the result an independent frame, so the column assignments performed in the
# following cells do not trigger pandas' SettingWithCopyWarning.
toronto_df = toronto_df[toronto_df.Borough != "Not assigned"].copy()
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York(Parkwoods),
3,M4A,North York(Victoria Village),
4,M5A,Downtown Toronto(Regent Park / Harbourfront),
5,M6A,North York(Lawrence Manor / Lawrence Heights),
6,M7A,Queen's Park(Ontario Provincial Government),


Split boroughs and neighborhoods

In [13]:
# Split at the first "(" only: the text before it is the borough, the rest
# (still carrying its closing parenthesis) is the neighbourhood list.
# NOTE(review): a cell without any "(" keeps its whole text in Borough and gets
# a missing Neighborhood; such rows need separate handling.
borough_parts = toronto_df["Borough"].str.split('(', n=1, expand=True)
toronto_df[['Borough', 'Neighborhood']] = borough_parts
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods)
3,M4A,North York,Victoria Village)
4,M5A,Downtown Toronto,Regent Park / Harbourfront)
5,M6A,North York,Lawrence Manor / Lawrence Heights)
6,M7A,Queen's Park,Ontario Provincial Government)


Cleaning <code>Neighborhood</code>

In [14]:
# Remove the leftover closing parenthesis and turn the " / " separators into
# ", ". Both patterns are literal text, so regex=False is passed explicitly:
# ")" on its own is not a valid regular expression, and relying on the
# version-dependent default of `regex` triggers a FutureWarning in pandas.
toronto_df["Neighborhood"] = (
    toronto_df["Neighborhood"]
    .str.replace(")", "", regex=False)
    .str.replace(" / ", ", ", regex=False)
)

  toronto_df["Neighborhood"] = toronto_df["Neighborhood"].str.replace(")","")


Reset the index

In [15]:
# Renumber the rows from 0 and discard the gapped labels left behind by the
# earlier row filtering.
toronto_df = toronto_df.reset_index(drop=True)

In [16]:
# (rows, columns) of the cleaned frame.
toronto_df.shape

(103, 3)

Replacing neighborhoods that are not assigned with the name of the Borough

In [17]:
# Show the Borough text and the row label for the two postal codes under review.
for code in ("M5W", "M7Y"):
    matches = toronto_df.PostalCode == code
    print(toronto_df.Borough[matches])
    print(toronto_df[matches].index)



92    Downtown TorontoStn A PO Boxes25 The Esplanade
Name: Borough, dtype: object
Int64Index([92], dtype='int64')
100    East TorontoBusiness reply mail Processing Cen...
Name: Borough, dtype: object
Int64Index([100], dtype='int64')


In [18]:
# The cells for M5W and M7Y contain no "(neighbourhood)" part, so their Borough
# still holds the full cell text and Neighborhood is missing. Set both columns
# to the borough name.
# Rows are selected by postal code rather than by hard-coded position, so the
# fix keeps targeting the right rows if the scraped table gains or loses entries.
for postal_code, borough in (("M5W", "Downtown Toronto"), ("M7Y", "East Toronto")):
    row_mask = toronto_df["PostalCode"] == postal_code
    toronto_df.loc[row_mask, ["Borough", "Neighborhood"]] = borough

In [19]:
# Print the Borough value and row label for M5W and M7Y to confirm the correction.
for code in ("M5W", "M7Y"):
    matches = toronto_df.PostalCode == code
    print(toronto_df.Borough[matches])
    print(toronto_df[matches].index)

92    Downtown Toronto
Name: Borough, dtype: object
Int64Index([92], dtype='int64')
100    East Toronto
Name: Borough, dtype: object
Int64Index([100], dtype='int64')


Exploring the data to ensure it is ready for analysis

In [21]:
# First rows of the cleaned frame.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [22]:
# Last rows of the cleaned frame, including the manually corrected M7Y row.
toronto_df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,East Toronto
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [20]:
# Final (rows, columns) of the cleaned frame.
toronto_df.shape

(103, 3)