# Scraping the list of Toronto postal codes from Wikipedia

### Import libraries

In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Wikipedia page "List of postal codes of Canada: M"; this is the page that
# gets downloaded and parsed further down in the notebook.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Query the website and keep the HTTP response in the variable `page`.
# The timeout stops the notebook from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error (4xx/5xx) into an
# exception here instead of letting an error page be parsed as the article.
page = requests.get(url, timeout=30)
page.raise_for_status()


### Use BeautifulSoup to parse the html page

In [5]:
# Parse the response body into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(markup=page.text, features="lxml")
# To inspect the parsed document: print(soup.prettify())

### Extract the postal codes table from the HTML

In [6]:
# Look up the <table> whose class attribute is "wikitable sortable".
# `table` and `postal_codes_table` are two names for the same element.
table = postal_codes_table = soup.find(
    name="table",
    attrs={"class": "wikitable sortable"},
)
# To inspect the element: print(postal_codes_table)

In [7]:
# Collect every <tr> element found inside the table.
table_rows = table.find_all(name="tr")

In [14]:
# One inner list per <tr>, holding the stripped text of each of its <td>
# cells. A row without any <td> (e.g. a header row) yields an empty list,
# which is why the first dataframe row is dropped later on.
enclosing_list = [
    [cell.text.strip() for cell in table_row.find_all("td")]
    for table_row in table_rows
]
# To inspect the result: print(enclosing_list)

In [None]:
### Transform list of lists to pandas dataframe

In [15]:
# Turn the list of row lists into a pandas dataframe with named columns.
column_names = ["Postcode", "Borough", "Neighborhood"]
df = pd.DataFrame(data=enclosing_list, columns=column_names)


### Clean the dataframe: drop the empty first row and the rows whose borough is 'Not assigned'

In [16]:
# Drop the first (empty) row of the dataframe, then renumber the index from 0.
df = df.drop(index=df.index[0]).reset_index(drop=True)

In [17]:
# Keep only the rows whose borough is not "Not assigned", then renumber
# the index from 0.
has_borough = df["Borough"].ne("Not assigned")
df = df.loc[has_borough].reset_index(drop=True)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### When a postal code area contains more than one neighborhood, combine its rows into one row with the neighborhoods separated by commas.

In [30]:
# Collapse the dataframe to one row per postcode (Postcode becomes the index).
# - Borough: take the first non-null value of the group. This is
#   deterministic, whereas pulling element [0] out of a set of strings can
#   give a different borough from one interpreter run to the next if a
#   postcode is ever listed under more than one borough.
# - Neighborhood: join all neighborhoods of the group with ", ".
df = df.groupby("Postcode").agg(
    {
        "Borough": "first",
        "Neighborhood": lambda names: ", ".join(names),
    }
)
df.head(10)

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
M1N,Scarborough,"Birch Cliff, Cliffside West"


In [32]:
# Copy the Postcode index labels into a regular column of the same name.
df = df.assign(Postcode=df.index)
df.head(5)

Unnamed: 0_level_0,Borough,Neighborhood,Postcode
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,Scarborough,"Rouge, Malvern",M1B
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C
M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E
M1G,Scarborough,Woburn,M1G
M1H,Scarborough,Cedarbrae,M1H


In [None]:
### Transform dataframe as per requested format:

In [33]:
# Swap the Postcode index for a default integer index, then rotate the
# columns so that the last one (Postcode) comes first.
df = df.reset_index(drop=True)
rotated = [*df.columns[-1:], *df.columns[:-1]]
df = df.loc[:, rotated]
df.head(10)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [34]:
# Dataframe dimensions as (number of rows, number of columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(103, 3)