In [3]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Fetch the data

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() surfaces an HTTP error here instead of letting
# an error page flow into the parser below.
response = requests.get(url, timeout=30)
response.raise_for_status()
website_url = response.text  # holds the page HTML (name kept for later cells)

soup = BeautifulSoup(website_url, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on `table.text`.
    raise ValueError("No table with class 'wikitable sortable' found at " + url)

# The parsing below assumes the table's flattened text separates rows with a
# triple newline and cells with a single newline; the first row is used as the
# header.
rows = table.text.split('\n\n\n')
data = [row.replace('\n\n', '').split('\n') for row in rows]

df = pd.DataFrame(data=data[1:], columns=data[0])

#### Clean the DataFrame


In [5]:
# Rename the 'Postcode' column to 'PostalCode'. rename() returns a new frame
# and ignores a missing key, so re-running this cell does not raise.
df = df.rename(columns={'Postcode': 'PostalCode'})

# Drop rows whose borough is 'Not assigned'. The explicit .copy() makes the
# filtered result an independent frame so the assignment below writes to it
# directly instead of to a temporary slice.
df = df[df['Borough'] != 'Not assigned'].copy()

# Where a row has a borough but a 'Not assigned' neighborhood, use the borough
# as the neighborhood. A single .loc call replaces the chained
# df['Neighborhood'][mask] = ... assignment, which triggers
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
mask = (df['Neighborhood'] == 'Not assigned')
df.loc[mask, 'Neighborhood'] = df.loc[mask, 'Borough']

# Collapse to one row per postal code, joining its neighborhoods with commas,
# then re-attach the borough and drop the duplicate rows the merge produces.
transformed_df = df.groupby(['PostalCode'])['Neighborhood'].apply(','.join).reset_index()
df = pd.merge(left=transformed_df, right=df[['PostalCode', 'Borough']], on='PostalCode', how='left')
df = df[['PostalCode', 'Borough', 'Neighborhood']]
df = df.drop_duplicates()

#### Get the latitude & longitude

In [6]:
# Location of the coordinates CSV. Defaults to the original local path; set the
# GEO_COORDINATES_CSV environment variable to read the file from elsewhere
# without editing the notebook.
filepath = os.environ.get('GEO_COORDINATES_CSV', 'C:\\temp\\Geospatial_Coordinates.csv')

geo_coordinates_df = pd.read_csv(filepath_or_buffer=filepath)

# Use the same key column name as df so the two frames can be merged on it.
geo_coordinates_df = geo_coordinates_df.rename(columns={'Postal Code': 'PostalCode'})

geo_coordinates_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Attach latitude/longitude to each postal code. This is an inner join, so only
# postal codes present in both frames are kept.
df = df.merge(geo_coordinates_df, how='inner', on='PostalCode')

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [8]:
# Show the (rows, columns) dimensions of the merged frame.
df.shape

(103, 5)