## Importing required libraries

In [1]:
import pandas as pd # library for data analsysis
import urllib.request
from bs4 import BeautifulSoup
import numpy as np
import geocoder
import signal
from geopy.geocoders import Nominatim

### Fetching data from the web

Assign the URL of the Wikipedia page to a variable.

In [2]:
# Wikipedia page listing the Canadian postal codes in the "M" series.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Now we make a GET request to obtain the HTML of the given URL.

In [3]:
# Download the raw HTML for `url`.
# The `with` block closes the HTTP response as soon as the body has been read,
# and the timeout stops a stalled connection from hanging the notebook.
# `page` ends up holding the response body as bytes, which BeautifulSoup accepts.
with urllib.request.urlopen(url, timeout=30) as response:
    page = response.read()

Using BeautifulSoup, parse the HTML content into a tree that is easy to navigate and search

In [4]:
# Parse the downloaded HTML into a searchable tree.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed, and so bs4 does not warn about
# having to guess one. "html.parser" ships with the standard library.
soup = BeautifulSoup(page, "html.parser")

#### Now we are assuming that there is only one table present on the page, and our data is present in the body of that table alone

So, let us store the contents of the table>tbody tag into a variable

In [5]:
# Body (<tbody>) of the first <table> element found in the parsed page.
k = soup.find('table').find('tbody')

Now we need to locate all 'td' which store the actual required values

In [6]:
# Collect every <td> element found under the table body.
l = k.find_all(name='td')

### Creating the dataframe

Let us create an empty dataframe

In [7]:
# Column layout of the table we are about to fill from the scraped cells.
column_names = ['Postal Code', 'Borough', 'Neighbourhood']

# Start from a frame that has the columns above and no rows yet.
neighborhoods = pd.DataFrame(data=None, columns=column_names)

# Display the (still empty) frame.
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighbourhood


#### Now we need to retrieve the postal code, borough and neighbourhood and append them to the dataframe created earlier

In [8]:
# Walk the flat list of <td> cells three at a time: each group of three is
# read as (postal code, borough, neighbourhood), in that order.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call; building the frame in one go also makes this cell safe to
# re-run without duplicating rows.
records = []
for i in range(0, len(l), 3):
    # str() of the contents list looks like "['M3A']"; the slice drops the
    # leading "['" and the trailing "']".
    pc = str(l[i].contents)[2:-2]
    bor = ""

    try:
        # Prefer the text of the first link inside each cell.
        bor = str(l[i+1].a.contents[0])
        neigh = str(l[i+2].a.contents[0])
    except (AttributeError, IndexError):
        # AttributeError: the cell has no <a> link (`.a` is None).
        # IndexError: the cell list ran out, or the link has no contents.
        if not bor:
            # The borough itself could not be read from a link, so skip the row.
            # NOTE(review): presumably these are the "Not assigned" boroughs;
            # an assigned borough that is not a link is skipped too - confirm.
            continue
        # Borough is known but the neighbourhood cell has no link: fall back to
        # the raw cell text. The slice drops "['" and the last four characters
        # (presumably an escaped newline plus "']" - confirm against the page).
        neigh = str(l[i+2].contents)[2:-4]

    # An unassigned neighbourhood takes the name of its borough.
    if neigh == 'Not assigned':
        neigh = bor

    records.append({'Postal Code': pc,
                    'Borough': bor,
                    'Neighbourhood': neigh})

# `columns=` keeps the expected layout even when no rows were collected.
neighborhoods = pd.DataFrame(records, columns=column_names)

Let us have a quick look at our data

In [9]:
# Preview the first five scraped rows.
neighborhoods.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [10]:
# Report the number of distinct boroughs and the number of rows
# (one row per scraped postal code / neighbourhood pair).
print(
    'The dataframe has {} boroughs and {} neighborhoods.'.format(
        neighborhoods['Borough'].nunique(dropna=False),
        len(neighborhoods.index),
    )
)

The dataframe has 10 boroughs and 208 neighborhoods.


#### Now we need to group (merge) our data, so as to show each postal code only once

In [11]:
# Collapse the rows to one per postal code: keep the first borough seen for
# the code and join all of its neighbourhood names with ", ".
new_neighbourhood = (
    neighborhoods
    .groupby('Postal Code')
    .agg({
        'Borough': 'first',
        'Neighbourhood': ', '.join,
    })
    .reset_index()  # turn the 'Postal Code' group key back into a column
)

In [13]:
# Preview the first five grouped rows.
new_neighbourhood.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Let us load the geospatial coordinates (latitude and longitude) of each postal code from a CSV file

In [14]:
# Read the coordinates table from the CSV file in the working directory.
# It is joined to the neighbourhood data on its 'Postal Code' column below.
cord = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

In [16]:
# Attach the coordinates to each postal code. This is an inner join, so only
# postal codes present in both frames are kept.
tor_neigh = new_neighbourhood.merge(cord, how="inner", on="Postal Code")

In [18]:
# Preview the first five rows of the merged neighbourhood/coordinate table.
tor_neigh.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
