#  Applied Data Science Capstone: Week 3

In [1]:
from io import StringIO
from unicodedata import normalize

import html5lib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
#from bs4 import BeautifulSoup

## Scraping the Table of Canada's Postal Codes

In [2]:
# Download the Wikipedia page once and parse every HTML table found on it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of trying to parse an error page.
page.raise_for_status()
# Reuse the body that was already downloaded rather than fetching the URL a second time.
# Wrapping the text in StringIO makes pandas read it as HTML content, not as a path or URL.
table = pd.read_html(StringIO(page.text))

In [3]:
# Report how many tables were parsed from the page.
table_count = len(table)
print(f'Total tables: {table_count}')

Total tables: 3


## We want the first table

In [4]:
# Keep only the first of the parsed tables.
# Guarded so the cell can be re-run safely: once `table` is already a DataFrame,
# indexing it with [0] again would look up a column labelled 0 (or raise KeyError)
# instead of selecting a table.
if isinstance(table, list):
    table = table[0]

## Each cell of the table packs the values we want as separate columns into a single string, so we flatten the table into a one-dimensional array and then use string slicing to pull out the features we want.

In [5]:
# Collapse the 2-D grid of table cells into a single 1-D array (row by row).
cell_grid = table.values
postal = cell_grid.flatten()

In [6]:
# Wrap the flattened values in a DataFrame so column-wise string operations can be used on them.
postal = pd.DataFrame(data=postal)

### The Postal Code is contained in the first three positions of the string. We extract it and give it its own column.

In [7]:
# Name the single column, then peel the three-character postal code off the
# front of every string; whatever follows stays in 'source'.
postal.columns = ['source']
raw_text = postal['source']
postal['Postal_Code'] = raw_text.str.slice(stop=3)
postal['source'] = raw_text.str.slice(start=3)

### We remove any unassigned Postal Codes.

In [8]:
# Find the rows whose remaining text is exactly "Not assigned" and remove them in place.
unassigned_rows = postal.index[postal['source'] == "Not assigned"]
postal.drop(unassigned_rows, inplace=True)

### Cleaning up any extra characters to make for easy feature selection

In [9]:
# Replace every non-letter character with a space so the text can be split on whitespace later.
# regex=True is passed explicitly: pandas 2.0+ treats the pattern as a literal string by
# default, in which case nothing would match and the column would be left unchanged.
postal["source"] = postal["source"].str.replace('[^a-zA-Z]', ' ', regex=True)
postal.head()

Unnamed: 0,source,Postal_Code
2,North York Parkwoods,M3A
3,North York Victoria Village,M4A
4,Downtown Toronto Regent Park Harbourfront,M5A
5,North York Lawrence Manor Lawrence Heights,M6A
6,Queen s Park Ontario Provincial Government,M7A


### Splitting the "source" column into multiple columns, using space as a delimiter

In [10]:
# Break each cleaned string into whitespace-separated words, one word per column.
# Shorter rows are padded with missing values on the right.
source_text = postal["source"]
new = source_text.str.split(expand=True)
new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
2,North,York,Parkwoods,,,,,,,,...,,,,,,,,,,
3,North,York,Victoria,Village,,,,,,,...,,,,,,,,,,
4,Downtown,Toronto,Regent,Park,Harbourfront,,,,,,...,,,,,,,,,,
5,North,York,Lawrence,Manor,Lawrence,Heights,,,,,...,,,,,,,,,,
6,Queen,s,Park,Ontario,Provincial,Government,,,,,...,,,,,,,,,,


### Borough is taken from the first column. We then join the remaining columns to form the Neighborhoods column and strip out the "None" placeholders left over from the split.

In [11]:
# Use the first word of each entry as the borough.
# NOTE(review): this keeps only one word, so a multi-word borough such as
# "North York" is stored as "North" and the rest ends up in Neighborhoods.
first_word = new.iloc[:, 0]
postal["Borough"] = first_word

In [12]:
# Join every word after the first into one comma-separated Neighborhoods string.
# All remaining columns are used (1 onward): the split produced columns up to 23, so the
# earlier 1:8 slice silently discarded every word past the eighth for the longer entries.
# Padding cells are stringified by astype(str), leaving ", None" fragments in the result.
postal["Neighborhoods"] = new.iloc[:, 1:].apply(lambda row: ', '.join(row.values.astype(str)), axis=1)

In [13]:
# Remove the ", None" fragments that came from joining padded (empty) cells as text.
joined_names = postal["Neighborhoods"]
postal["Neighborhoods"] = joined_names.str.replace(', None', '')

### Here are our final dataframe and its shape

In [14]:
# 'source' has been unpacked into the other columns, so it is no longer needed.
postal = postal.drop(columns=['source'])
postal.head()

Unnamed: 0,Postal_Code,Borough,Neighborhoods
2,M3A,North,"York, Parkwoods"
3,M4A,North,"York, Victoria, Village"
4,M5A,Downtown,"Toronto, Regent, Park, Harbourfront"
5,M6A,North,"York, Lawrence, Manor, Lawrence, Heights"
6,M7A,Queen,"s, Park, Ontario, Provincial, Government"


In [15]:
# Display the size of the cleaned frame as a (rows, columns) tuple.
postal.shape

(103, 3)

## Geocoder didn't work, so I'm loading the values from CSV

In [16]:
# Disabled geocoder lookup, kept for reference only: every line in this cell is commented out.
# NOTE(review): the SyntaxError recorded as this cell's output does not match the cell as
# written (there is no triple-quoted string here) - presumably stale output from an earlier
# version; clear it or re-run the cell.
# NOTE(review): if this is ever re-enabled, `postal.Postal_Code` is the whole column rather
# than a single postal code, and the while-loop has no retry limit - confirm before use.

#!pip install geocoder
#import geocoder # import geocoder

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
  #g = geocoder.google('{}, Toronto, Ontario'.format(postal.Postal_Code))
  #lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-16-4ec0c20a907f>, line 14)

In [17]:
# Read the latitude/longitude lookup table from the local CSV file.
data = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
# Rename the coordinate columns so the key is spelled 'Postal_Code', matching `postal`.
coordinate_columns = ['Postal_Code', 'Latitude', 'Longitude']
data.columns = coordinate_columns
# With no `on=` argument, merge joins on the columns the two frames have in common
# and keeps only the rows present in both (inner join).
merged = postal.merge(data)
merged.head()

Unnamed: 0,Postal_Code,Borough,Neighborhoods,Latitude,Longitude
0,M3A,North,"York, Parkwoods",43.753259,-79.329656
1,M4A,North,"York, Victoria, Village",43.725882,-79.315572
2,M5A,Downtown,"Toronto, Regent, Park, Harbourfront",43.65426,-79.360636
3,M6A,North,"York, Lawrence, Manor, Lawrence, Heights",43.718518,-79.464763
4,M7A,Queen,"s, Park, Ontario, Provincial, Government",43.662301,-79.389494
