### Step1: Import Required Libraries

In [35]:
import numpy as np 
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents
print("Libraries imported.")

Libraries imported.


### Step 2: Get Canada's Postal Code URL 

In [36]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
cap_data=requests.get(url).text

### Step 3:Using BeautifulSoup for parsing

In [37]:
soup = BeautifulSoup(cap_data, "html.parser")

#Let's find out BeatifulSoup is working....get the Title from the url
title = soup.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


### Step 4: Get the Table and its contents

In [38]:
table = soup.find("table")
rows = table.find_all("tr")

# storing the table content
table_contents = []  
for tr in rows:
    if rows.index(tr) == 0 : 
        row_cells = [ th.getText().strip() for th in tr.find_all('th') if th.getText().strip() != '' ]  
    else : 
        row_cells = ([ tr.find('th').getText() ] if tr.find('th') else [] ) + [ td.getText().strip() for td in tr.find_all('td') if td.getText().strip() != '' ] 
    if len(row_cells) > 1 : 
        table_contents += [ row_cells ]
table_contents

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 [

### Step 5: Storing Table data into Panda's Dataframe

In [39]:
#Storing Canada's Postal Code Data into Dataframe

PC_Canada_df = pd.DataFrame(table_contents)

# Cheking the dataframe

PC_Canada_df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [40]:
#Creating 1st row as column

PC_Canada_df.columns = PC_Canada_df.iloc[0]
PC_Canada_df=PC_Canada_df[1:]

In [41]:
#Let's find out how it looks like
PC_Canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Step 6: Changing the Column name

In [42]:
#Rename the column as required and reset index

PC_Canada_df=PC_Canada_df.rename(columns = {'Postcode':'PostalCode','Neighbourhood':'Neighborhood'}).reset_index(drop=True)
PC_Canada_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [43]:
# Confirming the column heads

PC_Canada_df.columns

Index(['PostalCode', 'Borough', 'Neighborhood'], dtype='object', name=0)

In [44]:
# Lets see what is the shape of the dataframe
PC_Canada_df.shape

(288, 3)

### Step 7: Deleting Rows having "Not assigned" value iin Borough Column

In [45]:
# Finding the value 'Not assigned' in Borough column
df1=PC_Canada_df[PC_Canada_df.Borough == 'Not assigned']
df1

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
9,M8A,Not assigned,Not assigned
13,M2B,Not assigned,Not assigned
20,M7B,Not assigned,Not assigned
21,M8B,Not assigned,Not assigned
30,M2C,Not assigned,Not assigned
36,M7C,Not assigned,Not assigned
37,M8C,Not assigned,Not assigned
45,M2E,Not assigned,Not assigned


In [46]:
#How many data contains the 'Not assigned' value
df1.shape

(77, 3)

In [47]:
#Deleting rows that have 'Not assigned Value' in Borough Column

PC_Canada_df=PC_Canada_df[PC_Canada_df.Borough != 'Not assigned'].reset_index(drop=True)
PC_Canada_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [48]:
#Checking the shape of the dataframe after deleting
PC_Canada_df.shape

(211, 3)

### Step 8: Joining the Neighbors

In [49]:
#Joining Neighbors in the same Postal Code Area
PC_Canada_df= PC_Canada_df.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda val: ", ".join(val))
PC_Canada_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [50]:
#Checking the shape of the dataframe after joining
PC_Canada_df.shape

(103, 3)

### Step 9: Changing the Neighborhood value

In [51]:
# Let's create another dataframe to find out how many rows contains the Neighborhood with "Not assigned" value
#df2=PC_Canada_df
df2=PC_Canada_df[PC_Canada_df.Neighborhood=='Not assigned']
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Not assigned


In [52]:
#Changing the value "Not assigned" in the Neighborhood with the value in Borough Column
PC_Canada_df.loc[PC_Canada_df.Neighborhood=='Not assigned','Neighborhood']=PC_Canada_df["Borough"]
#Lets see the value has changed
PC_Canada_df.iloc[85,:]

0
PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 85, dtype: object

### Step 10: Shape of the dataframe

In [53]:
#Number of rows in dataframe now
PC_Canada_df.shape

(103, 3)

In [54]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [55]:
Geo_Cord.shape


(103, 3)

In [56]:
Geo_Cord.rename(columns = {'Postal Code':'PostalCode'},inplace=True)
Geo_Cord.head(10)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [57]:
PC_Canada_df=PC_Canada_df.merge(Geo_Cord,on='PostalCode',how='left')

In [58]:
PC_Canada_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [59]:
PC_Canada_df.shape

(103, 5)