### Importing the necessary libraries

In [2]:
import os
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Notebook-wide pandas display settings: show up to 100 columns, never wrap a
# wide frame onto several lines, and never truncate long cell text.
pd.set_option('display.max_columns', 100)
pd.set_option('display.expand_frame_repr', False)
# None means "no width limit" (pandas >= 1.0). The old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by newer releases; the fully qualified key avoids
# relying on pattern matching of the option name.
pd.set_option('display.max_colwidth', None)

In [3]:
# Download the Wikipedia page that lists the "M" postal codes.
# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL;
# the name is kept because the parsing cell below reads it.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not let a stalled connection hang the kernel indefinitely
)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
website_url = response.text

In [4]:
# Parse the downloaded HTML text into a searchable tree using the lxml parser.
soup = BeautifulSoup(markup=website_url, features='lxml')

In [5]:
# Collect every <table> element on the page and keep the first one.
page_tables = soup.find_all('table')
table = page_tables[0]

In [6]:
# Convert the HTML table into a DataFrame. read_html returns a list of frames,
# one per <table> found in its input; only one table is passed in, so take [0].
# The markup is wrapped in StringIO because passing literal HTML as a plain
# string is deprecated in recent pandas releases; file-like input works everywhere.
df = pd.read_html(StringIO(str(table)))[0]

In [7]:
# Use the values of row 0 as the column labels, then remove that row from the data.
# NOTE(review): this cell rebinds `df`, so running it twice would consume a real
# data row - run it exactly once after the read_html cell.
header_row = df.iloc[0]
df.columns = header_row
df = df.drop(index=0)

#### The dataframe will consist of three columns: Postcode, Borough, and Neighbourhood

In [8]:
# Preview the first rows to confirm the column labels were taken from the header row.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [9]:
# (rows, columns) of the scraped table before any filtering.
df.shape

(288, 3)

#### Only process the cells that have an assigned borough. Ignore cells whose borough is `Not assigned`.

In [10]:
# Keep only the rows whose borough is something other than the 'Not assigned' placeholder.
has_borough = df['Borough'].ne('Not assigned')
df_filter = df.loc[has_borough]

In [11]:
# Preview the rows that remain after removing unassigned boroughs.
df_filter.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [11]:
# (rows, columns) after filtering; compare with df.shape to see how many rows were removed.
df_filter.shape

(211, 3)

#### More than one neighborhood can exist in one postal code area. The code below combines them into a single row per postal code, with the neighborhood names separated by commas.

In [12]:
# Collapse the rows of each (Postcode, Borough) pair into one row whose
# Neighbourhood value is the comma-joined list of that pair's neighbourhoods.
postcode_groups = df_filter.groupby(['Postcode', 'Borough'])
df_groupby = postcode_groups.agg({'Neighbourhood': ','.join})
df_groupby.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


In [13]:
# (rows, columns) of the grouped frame; the group keys sit in the index here,
# so only the Neighbourhood column is counted.
df_groupby.shape

(103, 1)

In [14]:
# Move the group keys (Postcode, Borough) out of the index and back into
# ordinary columns. The guard makes the cell safe to re-run: once the keys are
# columns, a second reset_index() would add a spurious 'index' column.
if 'Postcode' in df_groupby.index.names:
    df_groupby = df_groupby.reset_index()

In [15]:
# Confirm that the frame now exposes its column labels as a flat Index.
df_groupby.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [16]:
# Preview the grouped table after the index reset.
df_groupby.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [17]:
# List the rows whose neighbourhood is still the 'Not assigned' placeholder.
df_groupby.loc[df_groupby['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [18]:
# Wherever the neighbourhood is exactly the 'Not assigned' placeholder,
# substitute the borough name of the same row; other rows are left untouched.
column_name = 'Neighbourhood'
condition = df_groupby[column_name].eq('Not assigned')
df_groupby[column_name] = df_groupby[column_name].mask(condition, df_groupby['Borough'])

In [19]:
# Sanity check: show the Queen's Park rows, whose Neighbourhood should now
# repeat the borough name instead of the placeholder.
df_groupby.loc[df_groupby['Borough'].eq("Queen's Park")]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [20]:
# Preview the cleaned table once more after the placeholder replacement.
df_groupby.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [22]:
# Save the cleaned table to a CSV file in the working directory.
# index=False (the documented boolean form, rather than relying on None being
# falsy) leaves the positional row index out of the file.
df_groupby.to_csv('Neighbourhood_data.csv', index=False)

In [21]:
# Final (rows, columns) of the cleaned table.
df_groupby.shape

(103, 3)