In [1]:
import pandas as pd
from bs4 import BeautifulSoup as BS
import requests
import re

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

## Scrap the wikipedia page for Postal codes of canada

In [3]:
res = requests.get(url)
res

<Response [200]>

#### convert the content into soup...

In [4]:
soup = BS(res.content, 'html5lib')

#### find all tables, we are interested in the first one...

In [5]:
table = soup.find_all('table')[0]

**Note**:
* leaving "Not assigned" postal codes.
* collecting postal codes in a dict, postal code as key and borough and neighborhood as values to keep sanity intact.
* later on will separate them in a dict as postal code, borough and neighborhood.

In [6]:
data_dict = dict()

for row in table.find_all('tr'):
    for col in row.find_all('td'):
        pc = col.find('b').get_text()
        b_n = col.find_all('span')[0].get_text()
        if b_n != "Not assigned":
            m1 = re.search(r'(?<=\()[a-zA-z/ ]+', b_n)
            m2 = re.search(r'[a-zA-Z ]+', b_n)
            data_dict[pc] = (m1.group(0), m2.group(0))
        else:
            continue
        
print(f"data_dict keys count: {len(data_dict.keys())}")
    

data_dict keys count: 103


In [7]:
# for sanity check...
data_dict.keys()

dict_keys(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B', 'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C', 'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R', 'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S', 'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V', 'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X', 'M4Y', 'M7Y', 'M8Y', 'M8Z'])

In [8]:
# separate them in their own values to feed them into pandas as a dict...
canada = {'postal_code': [], 'borough': [], 'neighborhood': []}
for k, v in data_dict.items():
    canada['postal_code'].append(k)
    canada['borough'].append(v[1])
    canada['neighborhood'].append(v[0].replace('/', ','))

df = pd.DataFrame.from_dict(canada)
df.head(10)

Unnamed: 0,postal_code,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,Garden District


In [9]:
df.shape

(103, 3)

save the df as csv file for further exploration

In [9]:
df.to_csv('./toranto_postal_codes.csv', index=False)