## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Web Scraping of the `Wikipedia` page

In [2]:
# Download the Wikipedia list of Canadian postal codes starting with "M" and locate its data table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The context manager closes the HTTP response as soon as the page has been parsed,
# instead of leaving it open for the lifetime of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "html5lib")
right_table = soup.find('table', class_='wikitable sortable')
# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if right_table is None:
    raise ValueError("No 'wikitable sortable' table found at " + url)

## Creating a DataFrame from the scraped data

In [3]:
# Build the postal-code DataFrame from the scraped table: one record per <tr>
# that holds exactly three <td> cells; rows with any other cell count are skipped.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
records = []
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == len(column_names):
        # get_text().strip() removes the surrounding whitespace/newline without cutting
        # off a real character when a cell has no trailing newline, returns '' for an
        # empty cell, and keeps the full text of cells that contain nested markup.
        records.append([cell.get_text().strip() for cell in cells])
df = pd.DataFrame(records, columns=column_names)
print("Old shape of dataframe - ",df.shape)

Old shape of dataframe -  (180, 3)


## Cleaning data for `Borough` and `Neighborhood`

In [4]:
#Keep only the rows whose 'Borough' is assigned and renumber the index from 0
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
#Where 'Neighborhood' is 'Not assigned', fall back to the 'Borough' value of the same row
missing_neighborhood = df['Neighborhood'] == 'Not assigned'
df.loc[missing_neighborhood, 'Neighborhood'] = df.loc[missing_neighborhood, 'Borough']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Checking whether any `Not assigned` value remains in the `Neighborhood` column

In [5]:
#Select the rows whose 'Neighborhood' still equals the 'Not assigned' placeholder (empty when none remain)
df.loc[df['Neighborhood'].eq('Not assigned')].head()

Unnamed: 0,PostalCode,Borough,Neighborhood


## Printing the shape of the final dataset

In [6]:
#Report the (rows, columns) of df at this point in the notebook
final_shape = df.shape
print('New shape of dataframe - ', final_shape)

New shape of dataframe -  (103, 3)


## Reading the dataset of latitude and longitude coordinates

In [7]:
#Load the coordinates table from the CSV served at this URL
postalcode_coor = pd.read_csv('http://cocl.us/Geospatial_data')
#Rename 'Postal Code' to 'PostalCode' so the key column is spelled the same as in df
new_df = postalcode_coor.rename({'Postal Code': 'PostalCode'}, axis='columns')
new_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging the two datasets

In [8]:
#Inner-join df with the coordinates table on the shared 'PostalCode' column
merged_data = df.merge(new_df, how='inner', on='PostalCode')
merged_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
