# Augment Neighbourhood Dataframe with Geocoding
## Capstone project, Week 3 Notebook 2

In [2]:
# Get the latest version of BeautifulSoup; the lxml parser is used for speed
#!pip install beautifulsoup4
import pandas as pd
from bs4 import BeautifulSoup
from lxml import html
import requests

### Get the data
Read the Wikipedia page using BeautifulSoup

In [3]:
# If we don't submit a User-Agent header, the connection is refused by the
# server, so a browser-like header is sent along with the request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}

# The list of postal codes in Canada where the first letter is M
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Make a GET request to fetch the raw HTML content.
# The headers must actually be passed to the call, and the timeout keeps the
# cell from hanging indefinitely if the server never answers.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on a 4xx/5xx answer instead of silently parsing an error page
response.raise_for_status()
html_content = response.text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")

### Get HTML table
Get the HTML elements that interest us

In [4]:
# The rows we need are the <tr> elements inside the page's first <tbody>.
# Searching only within that <tbody> leaves out the extra <tr>'s that come after it.
html_table = soup.find('tbody')
html_rows = html_table.find_all(name='tr')

### Convert HTML table into dictionary
Create a dictionary for each table row, of the form {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}
This will prove useful when we want to use logic to exclude records, change values etc.

In [6]:
# Table header: the column names come from the <th> cells of the first row.
# Newlines are replaced and surrounding whitespace stripped before extracting.
table_header = [
    th.text.replace('\n', ' ').strip()
    for th in html_rows[0].find_all('th')
]

table_data = []
for tr in html_rows:
    # Each table row is stored in the form
    # t_row = {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}
    # Cell text is stripped so that comparisons against 'Not assigned' are
    # not defeated by stray leading/trailing whitespace.
    t_row = {
        th: td.text.replace('\n', '').strip()
        for td, th in zip(tr.find_all("td"), table_header)
    }

    # The header row has no <td> cells and yields an empty dict: skip it
    if not t_row:
        continue

    # Keep the row only if its Borough IS assigned
    if t_row['Borough'] == 'Not assigned':
        continue

    # If a cell has a borough but a Not assigned (or empty) neighborhood, then
    # the neighborhood will be the same as the borough. Today there is no data
    # on the page that fits this rule, let's implement it anyway.
    # This is done before appending so the stored row is already complete.
    if t_row['Neighborhood'] in ('', 'Not assigned'):
        t_row['Neighborhood'] = t_row['Borough']

    table_data.append(t_row)


### Put into dataframe

In [7]:
# One dataframe row per dictionary collected in table_data
df = pd.DataFrame(data=table_data)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Check the number of records: shape is (number of rows, number of columns)
df.shape

(103, 3)

# Get geocoding information
The Google provider returns nothing, and Bing and most other providers need API keys, so we use Geocode.Farm, which provides 250 free query requests per day. Retrying in a loop is still useful because the API sometimes returns no values.

In [None]:
#!pip install geocoder
import time

import geocoder

# Upper bound on the number of requests made for a single postal code.
# Without a bound, the cell would spin forever if the service keeps returning
# nothing (for instance once the free daily quota is used up).
MAX_ATTEMPTS = 10

# Pause (in seconds) after every request, successful or not
PAUSE_SECONDS = 1

latitudes = []
longitudes = []
for postal_code in df['Postal Code']:
    query = '{}, Toronto, Ontario'.format(postal_code)

    # stays None until the service returns coordinates
    lat_lng_coords = None

    # retry until we get the coordinates, but give up after MAX_ATTEMPTS
    for attempt in range(MAX_ATTEMPTS):
        g = geocoder.geocodefarm(query)
        lat_lng_coords = g.latlng
        # print(postal_code, " returned lat =", g.lat)
        time.sleep(PAUSE_SECONDS)
        # Treat both None and an empty result as "no answer yet"
        if lat_lng_coords:
            break
    else:
        # The loop ended without a break: every attempt came back empty.
        # Fail with a clear message rather than looping indefinitely.
        raise RuntimeError(
            'No coordinates returned for {!r} after {} attempts'.format(
                query, MAX_ATTEMPTS
            )
        )

    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])

In [66]:
# Sanity check: report how many values were collected in each coordinate list
print(
    "Got ", len(latitudes),
    "elements for latitude and ", len(longitudes),
    "for longitude",
)

Got  103 elements for latitude and  103 for longitude


Now we combine the dataframe and the coordinate lists to produce the desired result.

In [67]:
# Attach the collected coordinates to the dataframe as two new columns
df['Latitude'], df['Longitude'] = latitudes, longitudes

In [68]:
 # Preview the first rows of the dataframe, now including the coordinate columns
df.head(n=12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.751881,-79.33036
1,M4A,North York,Victoria Village,43.730419,-79.31282
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.362648
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723209,-79.451408
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.393021
5,M9A,Etobicoke,Islington Avenue,43.662769,-79.528313
6,M1B,Scarborough,"Malvern, Rouge",43.811531,-79.195518
7,M3B,North York,Don Mills,43.74929,-79.361687
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707939,-79.3116
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65736,-79.378181
