In [53]:
import pandas as pd
import numpy as np
import requests

# install bs4 if missing
#!conda install -c conda-forge bs4 --yes
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse the downloaded HTML with BeautifulSoup.
data_beauty = BeautifulSoup(data, 'html.parser')

# The first table inside the main content div is taken as the postal code table.
content = data_beauty.find('div', class_='mw-parser-output')
if content is None or content.table is None:
    raise ValueError('Could not find the postal code table at {}'.format(url))

# Use the <tbody> when the markup has one, otherwise the table element itself;
# find_all() searches recursively, so the rows are found either way.
table = content.table.tbody
if table is None:
    table = content.table

# Collect every table row, then keep one [PostalCode, Borough, Neighbourhood]
# record per usable row.
ll = table.find_all('tr')
xyz = []
for row in ll:
    # Assumes PostalCode is the 1st, Borough the 2nd and Neighbourhood the 3rd column.
    row_items = row.find_all('td')
    # Rows without <td> cells and incomplete rows are skipped
    # rather than failing with an IndexError.
    if len(row_items) < 3:
        continue
    # strip() removes the trailing newline of each cell as well as any leading whitespace.
    pc, bor, nh = (cell.text.strip() for cell in row_items[:3])
    # Rows whose borough is "Not assigned" are dropped.
    if bor.lower() == 'not assigned':
        continue
    # A neighbourhood that is "Not assigned" takes the borough name instead.
    if nh.lower() == 'not assigned':
        nh = bor
    xyz.append([pc, bor, nh])


In [54]:
# Turn the scraped [PostalCode, Borough, Neighbourhood] records into a DataFrame
columns = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(xyz, columns=columns)

# Summary statistics for each column of the frame
df.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M4G,North York,Downsview
freq,1,24,4


In [59]:
# Report the dimensions of the frame, then preview its first rows
shape_message = 'Shape of the data frame : {}'.format(df.shape)
print(shape_message)
df.head()

Shape of the data frame : (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [61]:
#!conda install -c conda-forge geocoder --yes
import geocoder

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    brotlipy-0.7.0             |py36h8c4c3a4_1000         346 KB  conda-forge
    chardet-3.0.4              |py36h9f0ad1d_1006         188 KB  conda-forge
    click-7.1.2                |     pyh9f0ad1d_0          64 KB  conda-forge
    cryptography-2.9.2         |   py36h45558ae_0         613 KB  conda-forge
    future-0.18.2              |   py36h9f0ad1d_1         714 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    pysocks-1.7.1              |   py36h9f0ad1d_1          27 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ---------------------

In [69]:
def get_coordinates(PostalCode, max_attempts=5):
    """Look up the latitude and longitude of a Toronto postal code.

    The Google provider of ``geocoder`` is queried with
    ``'<PostalCode>, Toronto, Ontario'`` until it returns coordinates or
    ``max_attempts`` queries have been made.

    Parameters
    ----------
    PostalCode : str
        Postal code to geocode, e.g. ``'M5A'``.
    max_attempts : int, optional
        Maximum number of queries before giving up (default 5).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` on success, or ``(0, 0)`` when no attempt
        returned coordinates.
    """
    query = '{}, Toronto, Ontario'.format(PostalCode)

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        # Success is decided by the result itself, not by the attempt counter,
        # so coordinates obtained on the final attempt are still returned.
        if lat_lng_coords is not None:
            return (lat_lng_coords[0], lat_lng_coords[1])

    # Every attempt failed: return the (0, 0) sentinel.
    return (0, 0)

In [70]:
# Geocode every postal code, then split the (latitude, longitude) pairs
# into the two coordinate columns of the frame.
post_codes = df['PostalCode']
coordinate_pairs = [get_coordinates(code) for code in post_codes]
lat = [pair[0] for pair in coordinate_pairs]
long = [pair[1] for pair in coordinate_pairs]
df['Lattitude'] = lat
df['Longitude'] = long
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Lattitude,Longitude
0,M3A,North York,Parkwoods,0,0
1,M4A,North York,Victoria Village,0,0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",0,0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",0,0
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",0,0
99,M4Y,Downtown Toronto,Church and Wellesley,0,0
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",0,0
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",0,0


In [75]:
# One-off probe of a single address to check whether the Google geocoding
# API responds at all; the result suggests the API is not working.
probe_query = '{}, Regent Park, Harbourfront, Downtown Toront'.format('M5A')
g = geocoder.google(probe_query)
lat_lng_coords = g.latlng
(lat_lng_coords, g)

(None, <[REQUEST_DENIED] Google - Geocode [empty]>)

In [84]:
# The Google geocoder did not work, so the latitude and longitude are read
# from a CSV file instead.
csv = 'http://cocl.us/Geospatial_data'
df_csv = pd.read_csv(csv)
post_codes = df['PostalCode']

# Index the CSV by postal code once instead of filtering the whole frame for
# every row. Only the first row per code is kept.
coords_by_code = df_csv.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Fail with a clear message when a postal code has no coordinates in the CSV.
missing_codes = sorted(set(post_codes) - set(coords_by_code.index))
if missing_codes:
    raise KeyError('No coordinates found for postal codes: {}'.format(missing_codes))

lat = post_codes.map(coords_by_code['Latitude']).to_list()
long = post_codes.map(coords_by_code['Longitude']).to_list()
# NOTE: the 'Lattitude' column spelling is kept unchanged so code that already
# refers to this column keeps working.
df['Lattitude'] = lat
df['Longitude'] = long
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Lattitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
