In [1]:
# Install Beautiful Soup From Conda Forge
!conda install -c conda-forge beautifulsoup4 --yes

# Install HTML parser
!conda install -c conda-forge lxml --yes

# Install geocoder
!conda install -c conda-forge geopy --yes

Collecting package metadata (current_repodata.json): done
Solving environment: / 
  - anaconda::ca-certificates-2019.8.28-0, anaconda::certifi-2019.9.11-py37_0, anaconda::openssl-1.1.1d-h1de35cc_2
  - anaconda::ca-certificates-2019.8.28-0, anaconda::certifi-2019.9.11-py37_0, defaults::openssl-1.1.1d-h1de35cc_2
  - anaconda::certifi-2019.9.11-py37_0, anaconda::openssl-1.1.1d-h1de35cc_2, defaults::ca-certificates-2019.8.28-0
  - anaconda::certifi-2019.9.11-py37_0, defaults::ca-certificates-2019.8.28-0, defaults::openssl-1.1.1d-h1de35cc_2
  - anaconda::openssl-1.1.1d-h1de35cc_2, defaults::ca-certificates-2019.8.28-0, defaults::certifi-2019.9.11-py37_0
  - defaults::ca-certificates-2019.8.28-0, defaults::certifi-2019.9.11-py37_0, defaults::openssl-1.1.1d-h1de35cc_2
  - anaconda::ca-certificates-2019.8.28-0, anaconda::openssl-1.1.1d-h1de35cc_2, defaults::certifi-2019.9.11-py37_0
  - anaconda::ca-certificates-2019.8.28-0, defaults::certifi-2019.9.11-py37_0, defaults::openssl-1.1.1d-h1de35ccd

In [20]:
# Import the required Libraries we will be using
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [21]:
# Set the wikipedia data source in a variable
wiki_data_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_parser = "lxml"

# Scrape the web page through a GET request and keep the html as text.
# The timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were the article.
response = requests.get(wiki_data_url, timeout=30)
response.raise_for_status()
source = response.text

# Pass the scraped text to Beautiful Soup
soup = BeautifulSoup(source, html_parser)

# Target the first table element on the page, which holds the postcode data
post_code_data = soup.find("table")
if post_code_data is None:
    # find() returns None when nothing matches; fail with a clear message
    raise ValueError("No <table> element found at " + wiki_data_url)

# Load the table data into a dataframe using pandas read_html.
# Also possible to loop through each TR, extract the elements
# and append a row in the dataframe. Less typing this way.
# read_html returns a list of dataframes; a single table is passed in, so
# element 0 is taken. The markup is wrapped in StringIO because handing
# read_html a literal HTML string is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(post_code_data)))[0]

# Check Number of rows
df.shape


(287, 3)

In [22]:
# Give the postal-code column the header used by the rest of the notebook
column_renames = {'Postcode': 'PostalCode'}
df = df.rename(columns=column_renames)

In [23]:
# Clean up the data Part 1

# Drop rows whose Borough is the literal placeholder "Not assigned".
# When this notebook was last run no row had a null Borough and 77 rows
# carried the placeholder (287 rows before, 210 after).
unassigned_borough = df["Borough"] == "Not assigned"

# Filter with a boolean mask and rebind `df` rather than dropping in place,
# so the frame shown by the earlier cell is not silently mutated.
# The original row index labels are kept (no reset_index).
df = df[~unassigned_borough]
df.shape



(210, 3)

In [24]:
# Clean up the data Part 2

# Collapse postal codes that are listed with several neighbourhoods into a
# single row each: group on (PostalCode, Borough) and join the Neighborhood
# strings of every group with commas.
postcode_groups = df.groupby(["PostalCode", "Borough"])
joined_neighborhoods = postcode_groups["Neighborhood"].apply(",".join)
merged_df = joined_neighborhoods.reset_index()

# Check that every postal code now occurs exactly once in the merged frame
postcode_counts = merged_df["PostalCode"].value_counts()
print(postcode_counts)

# Check Number of rows
merged_df.shape


M6N    1
M5W    1
M2L    1
M8Y    1
M9A    1
      ..
M4C    1
M5E    1
M9P    1
M5C    1
M4W    1
Name: PostalCode, Length: 103, dtype: int64


(103, 3)

In [25]:
# Clean up the data Part 3
# If a cell has a borough but unassigned neighbourhood then the neighbourhood will be the borough
# Note: it would arguably make more sense to do parts 2 and 3 the other way
# round, but this follows the order given in the question.

# Rows whose entire Neighborhood value is the placeholder
# (only M9A when this notebook was last run).
unassigned_neighborhood = merged_df["Neighborhood"] == "Not assigned"

# Write the borough name into those rows with a single .loc assignment on the
# frame itself. Calling .replace(..., inplace=True) on the selected column is
# a chained in-place update, which is not written back to merged_df when
# pandas copy-on-write is active.
merged_df.loc[unassigned_neighborhood, "Neighborhood"] = merged_df.loc[unassigned_neighborhood, "Borough"]

# Check the update worked
merged_df[merged_df["PostalCode"]=="M9A"]

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Queen's Park


In [26]:
# Count the postal codes and neighbourhood entries recorded for each borough
rows_per_borough = merged_df.groupby("Borough").count()
rows_per_borough

Unnamed: 0_level_0,PostalCode,Neighborhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,9,9
Downtown Toronto,19,19
East Toronto,5,5
East York,5,5
Etobicoke,11,11
Mississauga,1,1
North York,24,24
Queen's Park,1,1
Scarborough,17,17
West Toronto,6,6


In [27]:
# Save the data frame so we can easily load it in future exercises.
# The file is written to the current working directory, with a header row
# and without the index column.
# to_csv returns None when it is given a path, so the result is not kept.
merged_df.to_csv("part1.csv", header=True, index=False)

In [28]:
# Show the row and column counts of the data frame
merged_df.shape

(103, 3)