#### importing libraries
The requests library is used to fetch data from a given URL.

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


import requests  


#### Importing BeautifulSoup library.

Next we need to import BeautifulSoup from the Beautiful Soup package, which will let us parse and work with the HTML we fetch from our Wiki page:

In [2]:
from bs4 import BeautifulSoup #importing BeautifulSoup library for web scraping

Then we use Beautiful Soup to parse the HTML text we stored in our ‘url’ variable and
store the parsed result in a new variable called ‘soup’.
Beautiful Soup warns when no parser is specified, so we explicitly choose the “lxml” parser:

In [3]:
# Download the Wikipedia page that lists the "M" postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 5xx, ...) right here instead
# of letting an error page be parsed as if it were the real article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
url = response.text  # NOTE: despite its name, `url` holds the page's HTML text

# Parse the downloaded HTML with the lxml parser so it can be searched.
soup = BeautifulSoup(url, 'lxml')


#### Find the Table we want
We will do this using the find_all() method.

In [4]:
# Collect every <table> element carrying the "wikitable sortable" class value.
My_table = soup.find_all('table', attrs={'class': 'wikitable sortable'})


#### Ignore the headers and find the rows


In [5]:
# Gather every <tr> row from the first matching table.
# find_all is the current Beautiful Soup spelling; the camelCase findAll is a
# deprecated legacy alias, and find_all matches the call that built My_table.
rows = My_table[0].find_all('tr')


#### Find the columns
Within the loop we are going to use find_all again to search each row for <td> tags with the ‘td’ string. We will store all of these in a variable called ‘columns’ and then check to make sure that there are 3 items in our ‘columns’ list (i.e. one for each column).

In [6]:
# One list per output column; each data row of the table appends one entry
# to all three lists, so they always stay the same length.
Postal_Code = []
Borough = []
Neighbourhood = []

for row in rows:
    columns = row.find_all('td')
    # Only rows with exactly three <td> cells are treated as data rows;
    # anything else is skipped.
    if len(columns) == 3:
        # get_text() concatenates every text node inside the cell, so a cell
        # mixing plain text and links is captured in full rather than cut off
        # after its first text node. It also returns '' for an empty cell
        # instead of None, and strip() removes whitespace on both sides.
        code, borough, neighbourhood = (cell.get_text().strip() for cell in columns)
        Postal_Code.append(code)
        Borough.append(borough)
        Neighbourhood.append(neighbourhood)
        

#### creating a DataFrame
We will create a dataframe and assign each of the above lists to a column.

In [7]:
# Start the frame with a single column holding the scraped postal codes.
df = pd.DataFrame(data=Postal_Code, columns=['PostalCode'])
df.head(n=5)

Unnamed: 0,PostalCode
0,M1A
1,M2A
2,M3A
3,M4A
4,M5A


In [8]:
# Attach the two remaining scraped lists as new columns, in this order.
df = df.assign(Borough=Borough, Neighbourhood=Neighbourhood)
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Removing rows with a Borough that is 'Not assigned'

In [9]:
# Keep only the rows whose Borough differs from the 'Not assigned' placeholder.
assigned_mask = df['Borough'] != 'Not assigned'
df = df[assigned_mask]
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
# Filtering left gaps in the index; renumber the rows 0..n-1 and discard the old labels.
df = df.reset_index(drop=True)

#### Using the .shape attribute to show the number of rows and columns in the dataframe

In [11]:
# A notebook cell only auto-displays its final expression, so the shape has
# to be printed explicitly; as a bare statement it was silently discarded.
print(df.shape)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### importing 'Geospatial_Coordinates.csv' file

In [12]:
# Read the coordinates file from the working directory and preview it.
df1 = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df1.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Rename the key column so it matches the "PostalCode" column of the scraped frame.
df1 = df1.rename(columns={"Postal Code": "PostalCode"})
df1.head(n=5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge two DataFrames to get the coordinates

In [14]:
# Left join: keep every row of df and pull in the coordinates from df1
# for rows whose PostalCode matches.
toronto_df = pd.merge(left=df, right=df1, how="left", on="PostalCode")
toronto_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### Finally, check to make sure the coordinates are added as required by the question

In [15]:
column_names = ["PostalCode", "Borough", "Neighbourhood", "Latitude", "Longitude"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0, and growing a frame one append
# at a time copies it on every iteration. Instead, collect the matching rows
# for each code (in test_list order) and concatenate them once.
matches = [toronto_df[toronto_df["PostalCode"] == postcode] for postcode in test_list]

# reindex pins the result to the expected column set and order.
test_df = pd.concat(matches, ignore_index=True).reindex(columns=column_names)

test_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
