# Part 1 - Web Scraping

## Install BeautifulSoup, lxml, html5lib, requests

In [1]:
!pip install beautifulsoup4



In [2]:
!pip install lxml



In [3]:
!pip install html5lib



In [4]:
!pip install requests



### Import data from URL

In [5]:
from bs4 import BeautifulSoup
import requests

In [6]:
# Download the Wikipedia list of Canadian postal codes starting with "M" and parse it with lxml.
# The timeout stops the notebook from hanging on a stalled connection, and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [7]:
# First <table> inside the article body container (div.mw-parser-output).
content = soup.find('div', attrs={'class': 'mw-parser-output'}).find('table')

In [8]:
import pandas as pd

# Column layout of the scraped postal-code table.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from an empty frame that has only the column headers.
neighborhoods = pd.DataFrame(data=None, columns=column_names)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
# Lift the row limit so displayed frames are never truncated.
pd.options.display.max_rows = None

## Parse data and populate DataFrame

In [10]:
# Walk every table cell and collect one record per assigned postal code, then
# build the frame in a single step. Rebuilding from an empty list keeps the cell
# idempotent on re-run. DataFrame.append was removed in pandas 2.0, and growing
# a frame row by row is quadratic, so records are gathered in a plain list.
records = []
for row in content.find_all("tr"):
    for cell in row.find_all("td"):
        label = cell.p.span.text
        if label == 'Not assigned':
            continue  # postal code with no borough assigned

        postal_code = cell.p.b.text

        # The cell text has the form "Borough(Neighborhood A / Neighborhood B)".
        parts = label.split("(")
        borough = parts[0]
        if len(parts) > 1:
            # Keep the text between the first "(" and the first ")",
            # turning the " /" separators into commas.
            neighborhood = parts[1].replace(" /", ",").split(")")[0]
        else:
            # Cells without a parenthesised list used to be skipped inside the
            # loop, and a stray append after the loop re-added stale values.
            # NOTE(review): falling back to the borough name as the
            # neighborhood is an assumption -- confirm the desired convention.
            neighborhood = borough

        records.append({
            'PostalCode': postal_code,
            'Borough': borough,
            'Neighborhood': neighborhood,
        })

neighborhoods = pd.DataFrame(records, columns=column_names)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M9A,Etobicoke,Islington Avenue
5,M1B,Scarborough,"Malvern, Rouge"
6,M3B,North York,Don Mills
7,M4B,East York,"Parkview Hill, Woodbine Gardens"
8,M5B,Downtown Toronto,"Garden District, Ryerson"
9,M6B,North York,Glencairn


In [11]:
# Dimensions of the scraped table as (rows, columns).
(len(neighborhoods.index), len(neighborhoods.columns))

(103, 3)

In [12]:
# Load the latitude/longitude lookup table (one row per postal code) from the hosted CSV.
csv_path = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
df_longlat = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows.
df_longlat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Align the key column name with the scraped table so the two frames can be joined.
df_longlat = df_longlat.rename(columns={'Postal Code': 'PostalCode'})
df_longlat.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Attach coordinates to every scraped postal code. The left join keeps all
# scraped rows even when no coordinates are found for a code.
postalcodes = neighborhoods.merge(df_longlat, how='left', on='PostalCode')
postalcodes

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
5,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
6,M3B,North York,Don Mills,43.745906,-79.352188
7,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
8,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
9,M6B,North York,Glencairn,43.709577,-79.445073
