### Import Required Libraries (BeautifulSoup, pandas, folium, geopy, scikit-learn, ...)

In [332]:
from bs4 import BeautifulSoup
import folium
from geopy.geocoders import Nominatim
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors

### Convert Canada Postal Codes Wiki HTML into 'soup'

In [333]:
# Address of the page the saved HTML file was taken from. It is not used in
# this cell: parsing runs against the local copy, so no network is needed.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open in binary mode so BeautifulSoup detects the document encoding itself
# instead of relying on the platform's default text encoding.
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the parse tree could differ between machines.
with open('Canada_Postal_Codes_Wiki.html', 'rb') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

### Save Wiki Table Column Names and Create Empty Pandas DataFrame

In [334]:
# The first <tr> inside the first table's <tbody> carries the column titles.
first_table_row = soup.find('table').find('tbody').find('tr')

# Join the row's text fragments with commas, then split them back into a list.
header_text = first_table_row.get_text(",", strip=True)
header_row = header_text.split(',')
print(header_row)

# Empty frame that has only the scraped column names.
toronto_hoods = pd.DataFrame(columns=header_row)
toronto_hoods

['Postcode', 'Borough', 'Neighbourhood']


Unnamed: 0,Postcode,Borough,Neighbourhood


### Add All Table Rows to DataFrame

In [335]:
rows = soup.find('table').find('tbody').find_all('tr')

# Collect plain dicts and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; building from a list also makes this cell safe to re-run
# (the old version appended to the existing frame and duplicated rows).
records = []
for row in rows[1:]:  # rows[0] is the header row
    # Read each <td> separately so a comma inside a cell cannot shift columns.
    cells = [cell.get_text(" ", strip=True) for cell in row.find_all('td')]
    if len(cells) < 3:
        # Not a full Postcode / Borough / Neighbourhood row; nothing to record.
        continue
    postcode, borough, neighbourhood = cells[:3]

    # Skip rows that do not have an assigned Borough
    if borough == 'Not assigned':
        continue

    # Neighbourhood = Borough if Neighbourhood is not assigned
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    records.append({'Postcode': postcode, 'Borough': borough, 'Neighbourhood': neighbourhood})

toronto_hoods = pd.DataFrame(records, columns=['Postcode', 'Borough', 'Neighbourhood'])
toronto_hoods.shape

(211, 3)

In [336]:
# Preview the first five rows (the head() default) of the scraped table.
toronto_hoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Reducing DataFrame so there is one entry per Postcode

In [337]:
# Distinct postcodes, in order of first appearance in the scraped table.
postcodes = toronto_hoods['Postcode'].unique()
postcodes

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

In [338]:
# Collapse the table to one row per postcode.
#   - Borough: the first borough listed for that postcode.
#   - Neighbourhood: all neighbourhoods for that postcode, joined by ", ".
# sort=False keeps postcodes in order of first appearance (the same order as
# Series.unique()), and as_index=False keeps 'Postcode' as a regular column.
# This replaces a nested iterrows() scan that re-read the whole table for
# every postcode and relied on DataFrame.append (removed in pandas 2.0).
toronto_hoods_reduced = (
    toronto_hoods
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
toronto_hoods_reduced

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Showing the 'shape' of the reduced DataFrame

In [339]:
# (rows, columns) of the one-row-per-postcode frame.
toronto_hoods_reduced.shape

(103, 3)

### Matching Latitude and Longitude to each Postcode

In [325]:
# Load the coordinates table from the local CSV file.
lat_long_coords = pd.read_csv('Geospatial_Coordinates.csv')

# Show the first five rows (the head() default).
lat_long_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [340]:
# Attach coordinates to each postcode row.
# The join is an inner join (pandas' default), so a postcode with no match in
# the coordinates table is dropped. The key columns have different names, so
# the redundant 'Postal Code' column is removed after merging.
toronto_neighborhoods = (
    toronto_hoods_reduced
    .merge(lat_long_coords, how='inner', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')
)

In [341]:
# Preview the merged frame: the reduced table plus Latitude and Longitude.
toronto_neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
