In [1]:
#importing required libraries etc
from io import StringIO

import pandas as pd
import requests

<h1> Scraping the Wiki page </h1>

In [2]:
# Download the Wikipedia page named "List of postal codes of Canada: M".
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps this cell from hanging indefinitely if the server never answers.
wiki_page = requests.get(wiki_link, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page further down.
wiki_page.raise_for_status()
wiki_doc = wiki_page.text

<h1> Reading the table with BeautifulSoup </h1>

In [4]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML and look up the first <table> matching the
# class "wikitable sortable".
soup = BeautifulSoup(wiki_doc, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; fail here with a clear message
# rather than letting a later cell try to parse the string "None".
if table is None:
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")

<h1> Converting the table to a pandas dataframe </h1>

In [5]:
col_names = ["PostalCode", "Borough", "Neighborhood"]
# read_html returns a list of DataFrames; only one <table> element is passed
# in, so the first entry is the one to keep. The markup is wrapped in StringIO
# so pandas receives a file-like object: recent pandas releases deprecate
# passing a literal HTML string.
# NOTE(review): skiprows=1 appears to discard the first parsed row -- confirm
# that no real data row is being consumed together with the header.
tables = pd.read_html(StringIO(str(table)), skiprows=1)
df = tables[0]
# Replace whatever header pandas inferred with the names used in later cells.
df.columns = col_names
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M2A,Not assigned,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"


<h1> Removing rows with no borough assigned </h1>

In [7]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the index from zero.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h1> Joining neighborhoods with the same postal code into one row, separating them by a comma </h1>

In [8]:
def combine_neighborhoods(series):
    """Join the string values of ``series`` into one comma-separated string.

    Parameters
    ----------
    series : pandas.Series
        String values to combine (here: the neighborhoods of one group).

    Returns
    -------
    str
        The values joined with ``', '`` in their original order. Missing
        values are left out, as ``Series.str.cat`` does when no ``na_rep``
        is given.
    """
    separator = ', '
    combined = series.str.cat(sep=separator)
    return combined

# Group rows sharing the same (PostalCode, Borough) pair, collapse each
# group's Neighborhood values into a single string, and turn the group keys
# back into ordinary columns.
df_by_postcode = df.groupby(["PostalCode", "Borough"])
neighborhoods_joined = df_by_postcode["Neighborhood"].agg(combine_neighborhoods)
df = neighborhoods_joined.reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h1> Assigning borough name as neighborhood name where the latter is missing </h1>

In [9]:
def impute_neighborhood(row):
    """Return ``row`` with a missing Neighborhood replaced by its Borough.

    A neighborhood counts as missing when it is NaN/None, an empty or
    whitespace-only string, or the placeholder ``'Not assigned'``.

    Parameters
    ----------
    row : pandas.Series
        One record with at least the ``'Neighborhood'`` and ``'Borough'``
        entries.

    Returns
    -------
    pandas.Series
        ``row`` itself when the neighborhood is present; otherwise a copy of
        ``row`` whose ``'Neighborhood'`` holds the ``'Borough'`` value.
    """
    neighborhood = row['Neighborhood']
    is_missing = pd.isna(neighborhood) or (
        isinstance(neighborhood, str)
        and neighborhood.strip() in ('', 'Not assigned')
    )
    if not is_missing:
        return row

    # Work on a copy: functions handed to DataFrame.apply should not mutate
    # the object they receive.
    filled = row.copy()
    filled['Neighborhood'] = row['Borough']
    return filled

# Run the imputation row by row: axis='columns' (same as axis=1) passes each
# row to the function as a Series.
df = df.apply(impute_neighborhood, axis='columns')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h1> Total number of rows in the final dataframe </h1>

In [12]:
# Number of rows in the final dataframe.
len(df)

103