#  PART 1 - scraping the postcode data and preprocessing it.

#### Import the necessary modules

In [110]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Lift pandas' cap on displayed rows so frames are rendered without truncation.
pd.options.display.max_rows = None

#### Use BeautifulSoup to extract the table we want

In [113]:
# Download the Wikipedia page of Canadian postal codes beginning with "M".
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# NOTE: despite its name, website_url holds the page's HTML text, not a URL.
website_url = response.text
soup = BeautifulSoup(website_url,'lxml')

# Locate the postcode table by its CSS classes.
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear
    # message rather than with an AttributeError in a later cell.
    raise ValueError("No 'wikitable sortable' table found on the page; the page layout may have changed.")

#### Loop through the table using HTML tags to create a dataframe of the table entries

In [114]:
# Collect every <tr> of the table; the first one (presumably the header row)
# is skipped by the slice below.
rows = table.find_all('tr')
data = []
for row in rows[1:]:
    # Keep every cell in its original position, blank ones included, so a
    # missing value can never shift later values into the wrong column.
    cols = [ele.text.strip() for ele in row.find_all('td')]
    if not cols:
        # Row without any <td> cells: nothing to record.
        continue
    data.append(cols)

result = pd.DataFrame(data, columns=['Postcode', 'Borough', 'Neighborhood'])

#### Verify the dataframe's appearance and size

In [115]:
# Print the first rows, a blank separator line, then the frame's dimensions.
preview = result.head()
print(preview, end='\n\n')
print('Table shape: ', result.shape)

  Postcode           Borough      Neighborhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront

Table shape:  (287, 3)


#### Drop records with no assigned Borough

In [116]:
# Collect the index labels of rows whose Borough is the 'Not assigned'
# placeholder and remove those rows from `result` in place.
unassigned_index = result.index[result['Borough'] == 'Not assigned']
result.drop(index=unassigned_index, inplace=True)
result.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### Group results by Postcode to concatenate Neighborhood values where multiple neighborhoods share a postcode

In [117]:
# One output row per (Postcode, Borough) pair: the neighborhood names in each
# group are joined into a single comma-separated string.
neighborhoods_by_postcode = result.groupby(['Postcode', 'Borough'])['Neighborhood']
df_post = neighborhoods_by_postcode.apply(lambda names: ', '.join(names)).reset_index()
df_post.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Set neighborhood to borough where no neighborhood is assigned

In [118]:
# Rows whose neighborhood is the 'Not assigned' placeholder take the borough
# name as their neighborhood.
no_neighborhood = df_post['Neighborhood'] == 'Not assigned'
df_post.loc[no_neighborhood, 'Neighborhood'] = df_post.loc[no_neighborhood, 'Borough']

#### Display final dataframe shape

In [119]:
# (rows, columns) of the grouped postcode frame at this stage, shown as the
# cell's output.
df_post.shape

(103, 3)

# PART 2 - adding the geo-coordinates to the dataframe

#### Download the geospatial coordinates csv

In [132]:
# Fetch the coordinates CSV in pure Python instead of shelling out to wget:
# this works without an external binary, and raise_for_status() reports a
# failed download right away instead of leaving a bad file for read_csv.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()

# Save a local copy under the same file name the wget command wrote to.
with open('Geospatial_Coordinates.csv', 'wb') as csv_file:
    csv_file.write(geo_response.content)

geo_data = pd.read_csv('Geospatial_Coordinates.csv')
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Set the geo_data csv column names to match the postcode dataframe columns and merge

In [136]:
# Rename the coordinate columns positionally so the postcode column carries the
# same name in both frames, then join the two frames on their shared column
# names.
geo_columns = ['Postcode', 'Latitude', 'Longitude']
geo_data.columns = geo_columns
df_post = pd.merge(df_post, geo_data)
df_post.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# PART 3 - exploring and clustering the neighborhoods