## Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name in Toronto

# 1. import libraries

In [1]:
import pandas as pd #to handale data
import urllib.request # to handle requests
from bs4 import BeautifulSoup #to parse html documents

# 2. Scrape data from the Wikipedia page into a pandas dataframe

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the raw HTML; the context manager closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    page = response.read()

# Name the parser explicitly: without it BeautifulSoup warns and picks
# whichever parser happens to be installed, so the parsed tree can differ
# from one machine to another. "html.parser" ships with Python.
soup = BeautifulSoup(page, "html.parser")

In [4]:
# Column labels of the postal-code table, in the order they are displayed
column_names = [
    'Postcode',
    'Borough',
    'Neighborhood',
]

# Start from an empty dataframe that carries only those column labels
df = pd.DataFrame(columns=column_names)

In [5]:
# Collect the table rows as plain dicts and build the dataframe once at the
# end. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it once per row copies the whole frame on every iteration.
rows = []
for tr in soup.find_all('tr')[2:]:  # the first two <tr> elements are skipped
    tds = tr.find_all('td')
    if len(tds) == 1:  # stop scanning at the first single-cell row
        break
    if len(tds) < 3:  # not a Postcode/Borough/Neighborhood row; indexing below would fail
        continue
    borough = tds[1].text
    if borough == 'Not assigned':  # rows without a borough are dropped
        continue
    neigh = tds[2].text.rstrip('\n')  # the last cell carries a trailing newline
    if neigh == 'Not assigned':
        # a row with a borough but no neighborhood uses the borough name for both
        neigh = borough
    rows.append({
        'Postcode': tds[0].text,
        'Borough': borough,
        'Neighborhood': neigh,
    })

# Passing the column list keeps the column order and also yields the right
# (empty) frame when no rows were scraped.
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [6]:
# Merge rows that share the same postcode and borough into a single row whose
# Neighborhood value is the comma-separated list of the merged neighborhoods
by_postcode = df.groupby(['Postcode', 'Borough'])
joined_neighborhoods = by_postcode['Neighborhood'].apply(','.join)
# reset_index turns the Postcode/Borough group keys back into ordinary columns
df = joined_neighborhoods.reset_index()

# Print the shape of the dataframe

In [7]:
# Display the dataframe's dimensions as a (rows, columns) tuple
df.shape

(103, 3)