## Importing required libraries

In [1]:
import pandas as pd # library for data analsysis
import urllib.request
from bs4 import BeautifulSoup
import numpy as np
import geocoder
import signal
from geopy.geocoders import Nominatim

### Fetching data from the web

Assigning the URl to a variable

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

Now we need to make a get request to obtain the html of the given url

In [3]:
page = urllib.request.urlopen(url)

Using BeautifulSoup, beautify the html content in order to make it more user readable

In [4]:
soup = BeautifulSoup(page)

#### Now we are assuming that there is only one table present on the page, and our data is present in the body of that table alone

So, let us store the contents of the table>tbody tag into a variable

In [None]:
k = soup.table.tbody

Now we need to locate all 'td' which store the actual required values

In [None]:
l = k.find_all('td')

### Creating the dataframe

Let us create an empty dataframe

In [None]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighbourhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

neighborhoods

Unnamed: 0,PostalCode,Borough,Neighbourhood


#### Now we need to reetrive the postal code, borough and neighbourhood and append them to our earlier created dataframe

In [None]:
for i in range(0,len(l),3):
    pc = str(l[i].contents)[2:-2]
    bor = ""

    try:
        bor = l[i+1].a.contents[0]
        neigh = l[i+2].a.contents[0]
    except:
        if(bor=='' or bor==None):
            continue
        neigh = str(l[i+2].contents)[2:-4]
    
    if(neigh=='Not assigned'):
        neigh = bor
    neighborhoods = neighborhoods.append({'PostalCode': pc,
                                          'Borough': bor,
                                          'Neighbourhood': neigh}, ignore_index=True)

Let us have a quick look at our data

In [None]:
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [None]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 10 boroughs and 208 neighborhoods.


#### Now we need to group (merge) our data, so as show unique postal codes

In [None]:
new_neighbourhood = neighborhoods.groupby('PostalCode').agg({'Borough':'first', 
                             'Neighbourhood': ', '.join}).reset_index()

In [None]:
new_neighbourhood

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Let us check the size of our dataframe

In [None]:
new_neighbourhood.shape

(100, 3)

In [None]:
new_neighbourhood['Latitude'] = np.nan
new_neighbourhood['Longitude'] = np.nan

In [None]:
ext_list = ['M1E', 'M1H']

In [None]:
def getCordinates(postal_code):
    while(True):
        try:
            address = '{}, Toronto, Ontario'.format(postal_code)
            geolocator = Nominatim(user_agent="toronto_explorer")
            location = geolocator.geocode(address)
        except:
            pass
        if(location is None):
            pass
        else:
            return location[1]

In [None]:
i = 0
for postal_code in new_neighbourhood['PostalCode']:
    
    if(postal_code in ext_list):
        continue
    
    try:
        location = getCordinates(postal_code)
        new_neighbourhood['Latitude'][i] = location[0]
        new_neighbourhood['Longitude'][i] = location[1]
        print("Co-ordinates for {} are {}, {}".format(postal_code, location[0], location[1]))
    except:
        print("time out for: "+str(postal_code))
        
    i+=1

time out for: M1B


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Co-ordinates for M1C are 43.653963, -79.387207
Co-ordinates for M1G are 43.6449033, -79.3818364
time out for: M1J


In [None]:
k = getCordinates('M1G')

In [None]:
k

In [None]:
new_neighbourhood