# Imports

In [2]:
# !conda install -c anaconda beautifulsoup4 --yes
# !conda install -c anaconda lxml --yes
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

print("Packages loaded")

Packages loaded


# Load Data from web page into DataFrame

In [3]:
# Fetch the Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
page.raise_for_status()

# Create a BeautifulSoup object to parse the page content
soup = BeautifulSoup(page.content, 'lxml')

# Find the postal-code table in the content and assign it to `table`
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # soup.find returns None when nothing matches; without this guard the
    # string "None" would be handed to pd.read_html below.
    raise ValueError(f"No table with class 'wikitable sortable' found at {url}")

# pd.read_html returns a list of DataFrames, so take the first element.
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in recent pandas releases.
dfs = pd.read_html(StringIO(str(table)))
df = dfs[0]
# Set column names
df.columns = ['PostalCode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Remove records with Borough Not assigned

In [15]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the index from 0 so it has no gaps.
neighborhoods = (
    df.loc[df['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Verify the requirements

### It appears that postal codes are no longer duplicated in the source table on the Wikipedia page.
### Additionally, after removing the records where Borough == Not assigned, no rows remain with Neighborhood == Not assigned.

In [49]:
# Examples to explore the dataset (uncomment a single line to run it)

# neighborhoods['Neighborhood'].unique()
# neighborhoods[neighborhoods['Neighborhood'] == 'Not assigned']
# neighborhoods[neighborhoods['PostalCode'] == 'M9V']

# DataFrame Shape

In [16]:
# (rows, columns) of the frame left after dropping the 'Not assigned' boroughs
neighborhoods.shape

(103, 3)

# Use CSV for LatLong as geocoder package is unreliable

In [8]:
# Download the postal-code coordinates CSV with requests instead of shelling
# out to `wget -q`: no external binary is needed, and a failed download raises
# here rather than being followed by an unconditional success message.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
# Save under the same file name the next cell reads from.
with open('geospatial_data.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print('Data downloaded!')

Data downloaded!


In [25]:
# Load the coordinates CSV downloaded in the previous cell
latlong_df = pd.read_csv('geospatial_data.csv')
# Attach the coordinate columns to each neighborhood row by matching
# `PostalCode` against the CSV's 'Postal Code' column. join() returns a new
# frame, so `neighborhoods` itself is left untouched; it is a left join, so a
# postal code missing from the CSV would get NaN coordinates.
neighborhoodslatlong = neighborhoods.join(
    latlong_df.set_index('Postal Code'),
    on='PostalCode',
)
neighborhoodslatlong.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [26]:
# (rows, columns) after joining the coordinate columns onto the neighborhoods
neighborhoodslatlong.shape

(103, 5)