# Getting Data From Wikipedia

In [5]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an error
# page be parsed further down.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
wiki_page = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (results can differ between machines) and emits a
# GuessedAtParserWarning.
soup = bs(wiki_page, 'html.parser')
mytable = soup.find('table', {'class': 'wikitable sortable'})
if mytable is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("Could not find a 'wikitable sortable' table on the page")

# One entry per <tr> row of the table (find_all is the non-deprecated spelling
# of findAll).
data = mytable.find_all('tr')

# Maps postal code -> [borough, comma-separated neighborhood names].
data_dic = {}

for row in data:
    # find_all is the supported spelling; findAll is a deprecated alias.
    cells = row.find_all("td")
    if len(cells) != 3:
        # Only rows with exactly three <td> cells carry
        # (postal code, borough, neighborhood); skip everything else.
        continue

    # Extract the cell text and strip surrounding whitespace / newlines.
    postal_code, borough, neighborhood = (cell.text.strip() for cell in cells)

    # Postal codes without an assigned borough are ignored entirely.
    if borough == "Not assigned":
        continue

    # A missing neighborhood falls back to the borough name.
    if neighborhood == "Not assigned":
        neighborhood = borough

    if postal_code in data_dic:
        # Postal code already seen: append this neighborhood after a comma.
        data_dic[postal_code][1] = data_dic[postal_code][1] + ", " + neighborhood
    else:
        data_dic[postal_code] = [borough, neighborhood]

# Flatten the mapping into [postal code, borough, neighborhoods] rows.
li = [[code, info[0], info[1]] for code, info in data_dic.items()]

# Build the DataFrame from those rows and preview the first entries.
dataframe = pd.DataFrame(
    li,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
print(dataframe.head())

  PostalCode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Harbourfront, Regent Park
3        M6A        North York  Lawrence Heights, Lawrence Manor
4        M7A      Queen's Park                      Queen's Park


In [2]:
# Report the (rows, columns) dimensions of the assembled table.
print(dataframe.shape)

(103, 3)


# Getting Geolocation Data
I downloaded the geospatial coordinates CSV (`Geospatial_Coordinates.csv`) and read it into a dataframe.

I merge the two dataframes (a left join) on the postal code.

Then I drop the redundant `Postal Code` column that the merge brings in from the CSV.

In [6]:
# Load the coordinates table from the local CSV file.
geospat = pd.read_csv('Geospatial_Coordinates.csv')

# Left-join the coordinates onto the postal-code table (validated as
# one-to-one), then discard the right-hand key column, which is redundant
# with 'PostalCode' once the join is done.
dataframe = dataframe.merge(
    geospat,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
    validate="1:1",
).drop(columns='Postal Code')
print(dataframe.head())

  PostalCode           Borough                      Neighborhood   Latitude  \
0        M3A        North York                         Parkwoods  43.753259   
1        M4A        North York                  Victoria Village  43.725882   
2        M5A  Downtown Toronto         Harbourfront, Regent Park  43.654260   
3        M6A        North York  Lawrence Heights, Lawrence Manor  43.718518   
4        M7A      Queen's Park                      Queen's Park  43.662301   

   Longitude  
0 -79.329656  
1 -79.315572  
2 -79.360636  
3 -79.464763  
4 -79.389494  
