In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
import geocoder
import folium
import sklearn
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
postal_codes_wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

I use requests to download the raw HTML of the Toronto postal codes Wikipedia page and BeautifulSoup to parse it. The page contains only one table with the class _wikitable_.

In [4]:
# Download the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
wiki_html = requests.get(postal_codes_wiki_url, timeout=30)
wiki_html.raise_for_status()
wiki_soup = BeautifulSoup(wiki_html.text, "lxml")

# Select the table by its CSS class instead of taking whichever <table> happens
# to come first in the document.
attrs = {'class': 'wikitable'}
table = wiki_soup.find('table', attrs=attrs)
if table is None:
    # Fail here with a clear message rather than with an AttributeError on None
    # in a later cell.
    raise ValueError("No table with class 'wikitable' found at " + postal_codes_wiki_url)

For each cell in the original table that has a borough assigned, I create a new dictionary and append it to the list _table_contents_. I then turn _table_contents_ into a pandas DataFrame and clean up the borough names in it.

In [8]:
# Build one record (postal code, borough, neighborhoods) per assigned table cell.
table_contents = []
for td in table.find_all('td'):  # every 'td' tag is one cell of the original table
    if td.find('i'):  # an 'i' tag inside the cell means no borough is assigned
        continue
    postal_code = td.find('b').text
    # Split the span text on '(': the borough comes before it, the neighborhoods after.
    span_parts = td.span.text.split('(')
    borough = span_parts[0]
    neighborhoods = span_parts[1].split(')')[0]
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhoods.replace(' / ', ', '),  # format as a comma-separated list
    })

In [10]:
# Borough strings that need cleaning, keyed by the exact scraped text and mapped
# to the label to use instead.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# One row per scraped record; only exact matches in the Borough column are rewritten.
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace(borough_fixes)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [12]:
# Pickle the cleaned DataFrame into the working directory so it can be reused elsewhere.
df.to_pickle("scraped_dataframe.pkl") # Save df to be used in other notebooks

In [11]:
# Show the (rows, columns) dimensions of the DataFrame.
print(df.shape)

(103, 3)
