In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [73]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [74]:
# Download the raw HTML of the Wikipedia page listing the "M" postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page flow into the parsing cells below.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text


In [75]:
# Parse the downloaded HTML with Python's built-in parser so that the page
# can be searched by tag.
soup = BeautifulSoup(markup=data, features='html.parser')

In [76]:
# Three independent, initially empty accumulators: one per output column.
postalCodeList, boroughList, neighborhoodList = [], [], []

In [77]:
# Collect the first three cells of every data row of the first table on the
# page into the three column lists.

# Reset the accumulators in this cell so that re-running it does not append a
# second copy of every row.
postalCodeList, boroughList, neighborhoodList = [], [], []

table = soup.find('table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No <table> element found in the downloaded page")

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (presumably the header row) are skipped, as are
    # rows too short to supply all three columns.
    if len(cells) >= 3:
        # Strip surrounding whitespace from every cell so that later exact
        # comparisons such as == "Not assigned" are not defeated by a stray
        # newline.
        postalCodeList.append(cells[0].text.strip())
        boroughList.append(cells[1].text.strip())
        neighborhoodList.append(cells[2].text.strip())

In [78]:
# Assemble the three scraped lists into one table, one column per list.
scraped_columns = {
    "Postal Code": postalCodeList,
    "Borough": boroughList,
    "Neighborhood": neighborhoodList,
}
df = pd.DataFrame(scraped_columns)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


A new dataframe keeps only the rows whose borough is assigned, i.e. whose Borough value is not 'Not assigned'.

In [79]:
# Keep only the rows whose borough is assigned and renumber them from 0.
has_borough = df["Borough"] != "Not assigned"
newdf = df[has_borough].reset_index(drop=True)
newdf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


A new dataframe groups the rows by postal code and borough, then uses a lambda function to join the neighbourhoods that share both into a single comma-separated string.

In [80]:
# One output row per (postal code, borough) pair; the remaining column is
# collapsed into a single ", "-separated string per group.
group_keys = ["Postal Code", "Borough"]
rows_by_code_and_borough = newdf.groupby(group_keys, as_index=False)
groupeddf = rows_by_code_and_borough.agg(lambda names: ", ".join(names))
groupeddf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Every row is checked: if a row's neighbourhood is 'Not assigned', it is replaced with that row's borough.

In [81]:
# Wherever the neighbourhood is "Not assigned", fall back to the borough name.
# This is written as a boolean-mask assignment through .loc rather than a
# row-by-row loop: assigning into the row object yielded by iterrows() may
# only change a temporary copy and leave groupeddf itself untouched.
unassigned = groupeddf["Neighborhood"] == "Not assigned"
groupeddf.loc[unassigned, "Neighborhood"] = groupeddf.loc[unassigned, "Borough"]

groupeddf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [82]:
# Dimensions of the grouped frame as (rows, columns).
groupeddf.shape

(103, 3)