#### Importing Requests, BeautifulSoup and Pandas libraries

In [555]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout keeps this cell from hanging indefinitely if the server never
# answers, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page as if it were the article.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(response.text, 'html.parser')

In [556]:
# Keep only the first table matched by class_="wikitable sortable".
table = soup.find_all('table', class_="wikitable sortable")[0]

In [557]:
# Every <tr> of the table: the first one holds the headers, the rest hold data.
rows = table.find_all(name='tr')
len(rows)

289

#### Removing newline characters and getting the column headings

In [558]:
# Column labels are the <th> cells of the header row, with newlines removed.
header_cells = rows[0].find_all('th')
columns = [header_cell.text.replace('\n','') for header_cell in header_cells]
columns

['Postcode', 'Borough', 'Neighbourhood']

In [559]:
# Start from an empty DataFrame whose columns are the scraped header labels.
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


In [560]:
# Collect every data row (everything after the header row) as a plain list and
# build the DataFrame once at the end. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and it copied the whole frame on every call.
records = []
for row in rows[1:]:
    cells = [td.text.replace('\n', '') for td in row.find_all('td')]
    # Only the first three cells are used; explicit indexing still raises
    # IndexError on a row that has fewer than three <td> cells.
    records.append([cells[0], cells[1], cells[2]])

df = pd.DataFrame(records, columns=columns)

In [561]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [562]:
# Number of non-null Postcode entries in the scraped frame.
df.count()['Postcode']

288

#### Only process the cells that have an assigned borough. Ignore cells whose borough is Not assigned.

In [563]:
# Drop rows whose borough is "Not assigned". The explicit .copy() makes the
# result an independent frame, so later .loc assignments on df do not trigger
# pandas' SettingWithCopyWarning. The original row labels are kept.
df = df[df.Borough != "Not assigned"].copy()

In [564]:
# Preview the first five rows that have an assigned borough.
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [565]:
# Number of non-null Postcode entries left after the borough filter.
df.count()['Postcode']

211

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [566]:
# Look at postcode M7A before the neighbourhood is filled in.
is_m7a = df['Postcode'] == 'M7A'
df.loc[is_m7a]

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [567]:
# Rows with a real borough but no neighbourhood take the borough name as
# their neighbourhood; the assignment aligns on the row index.
has_no_neighbourhood = df['Neighbourhood'] == 'Not assigned'
has_borough = df['Borough'] != 'Not assigned'
df.loc[has_no_neighbourhood & has_borough, 'Neighbourhood'] = df['Borough']

In [568]:
# Look at postcode M7A again to confirm the neighbourhood was filled in.
is_m7a = df['Postcode'] == 'M7A'
df.loc[is_m7a]

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


#### More than one neighborhood can exist in one postal code area. Combine them into one row, with the neighborhoods separated by a comma

In [569]:
# One row per (Postcode, Borough) pair; the remaining string column is joined
# with the separator below.
separator = ' , '
by_postcode_and_borough = df.groupby(['Postcode', 'Borough'], as_index=False)
df = by_postcode_and_borough.agg(separator.join)

In [570]:
# Preview the first five combined rows.
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [571]:
# Row count of the grouped frame.
len(df)

103