#### Import required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Download website's html doc for scraping

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

data = requests.get(url).text

#### Create Soup object and extract table data

In [3]:
soup = BeautifulSoup(data, 'html5lib')
table = soup.table
tableData = table.find_all('td')

In [4]:
#make rows of desired dataframe into a list and append all into a  general list
data = []
for ind in range(len(tableData)):
    rows = []
    for string in tableData[ind].stripped_strings:
        rows.append(repr(string))
    data.append(rows)

#retain list of all data without a borough = "Not assigned"
dataFiltered = []
for ind in range(len(data)):
    if len(data[ind]) > 2:
        dataFiltered.append(data[ind])

#delete apostrophe
for ind in range(len(dataFiltered)):
    for ind2 in range(len(dataFiltered[ind])):
        dataFiltered[ind][ind2] = dataFiltered[ind][ind2].replace("'","")

#make each list a length of 3 by concatenating all elements except the first two  
for ind in range (len(dataFiltered)):
    dataFiltered[ind] = dataFiltered[ind][0:2] + [''.join(dataFiltered[ind][2:(len(dataFiltered[ind]))])]

data

[["'M1A'", "'Not assigned'"],
 ["'M2A'", "'Not assigned'"],
 ['M3A', 'North York', '(', 'Parkwoods', ')'],
 ['M4A', 'North York', '(', 'Victoria Village', ')'],
 ['M5A', 'Downtown Toronto', '(', 'Regent Park', '/', 'Harbourfront', ')'],
 ['M6A', 'North York', '(', 'Lawrence Manor', '/', 'Lawrence Heights', ')'],
 ['M7A', '"Queens Park"', '(Ontario Provincial Government)'],
 ["'M8A'", "'Not assigned'"],
 ['M9A', 'Etobicoke', '(', 'Islington Avenue', ')'],
 ['M1B', 'Scarborough', '(', 'Malvern', '/', 'Rouge', ')'],
 ["'M2B'", "'Not assigned'"],
 ['M3B', 'North York', '(', 'Don Mills', ')', 'North'],
 ['M4B', 'East York', '(', 'Parkview Hill', '/', 'Woodbine Gardens', ')'],
 ['M5B', 'Downtown Toronto', '(', 'Garden District', ',', 'Ryerson', ')'],
 ['M6B', 'North York', '(Glencairn)'],
 ["'M7B'", "'Not assigned'"],
 ["'M8B'", "'Not assigned'"],
 ['M9B',
  'Etobicoke',
  '(',
  'West Deane Park',
  '/',
  'Princess Gardens',
  '/',
  'Martin Grove',
  '/',
  'Islington',
  '/ Cloverdale)']

#### Convert to dataframe

In [5]:
neighFrame = pd.DataFrame(dataFiltered)
neighFrame.columns = ["Postal Code", "Borough", "Neighbourhood"]
neighFrame

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,(Parkwoods)
1,M4A,North York,(Victoria Village)
2,M5A,Downtown Toronto,(Regent Park/Harbourfront)
3,M6A,North York,(Lawrence Manor/Lawrence Heights)
4,M7A,"""Queens Park""",(Ontario Provincial Government)
...,...,...,...
98,M8X,Etobicoke,(The Kingsway/ Montgomery Road /Old MillNorth)
99,M4Y,Downtown Toronto,(Church and Wellesley)
100,M7Y,East Toronto,Business reply mailProcessing Centre969 Easter...
101,M8Y,Etobicoke,"(Old Mill""South / Kings Mill Park /""Sunnylea/H..."


#### Clean Data

In [6]:
#delete all double quotes
for column in neighFrame.columns:
    neighFrame[column] = neighFrame[column].str.replace('"','')

#delete all brackets in "Neighbourhood" column data starting with "("
colStartingWthBrack = neighFrame[neighFrame["Neighbourhood"].str.startswith("(")]
#remove ")"
neighFrame.loc[colStartingWthBrack.index,"Neighbourhood"] = \
neighFrame.loc[colStartingWthBrack.index,"Neighbourhood"].str.replace(')','')
#remove "("
neighFrame.loc[colStartingWthBrack.index,"Neighbourhood"] = \
neighFrame.loc[colStartingWthBrack.index,"Neighbourhood"].str.replace('(','')

#delete "Business reply mail" from Neighbourhood column row 100
neighFrame.loc[100, "Neighbourhood"] = \
neighFrame.loc[100, "Neighbourhood"][(len("Business reply mail")):]

#change all "/" to ", "
neighFrame["Neighbourhood"] = neighFrame["Neighbourhood"].str.replace('/',', ')

#change all " ," to ","
neighFrame["Neighbourhood"] = neighFrame["Neighbourhood"].str.replace(' ,',',')

#change all "  " to " "
neighFrame["Neighbourhood"] = neighFrame["Neighbourhood"].str.replace('  ',' ')

In [7]:
neighFrame.shape

(103, 3)