# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Contents
- [Part 1: Scraping and Cleaning the Postcode Data](#Part-1)
- [Part 2: Adding Coordinates to the Postcodes DataFrame](#Part-2)
- [Part 3: Looking at Clusters of the Neighbourhoods](#Part-3)

In [117]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

## Part 1
### Scraping and Cleaning the Postcode Data

In [3]:
# url of the Wikipedia page we are going to scrape
# (list of Canadian postal codes beginning with the letter M)
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#### An image showing the layout of the page we are scraping:

![](./assets/site_snippet.JPG)

In [5]:
# get the page html
# - timeout: never hang the notebook on an unresponsive server
# - raise_for_status: stop here on a 4xx/5xx reply instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [6]:
# convert the raw html string to a BeautifulSoup object, parsed with html5lib
soup = BeautifulSoup(data,"html5lib")

In [17]:
# find the table containing the post codes
# (taken to be the first <table> element on the page)
table = soup.find_all('table')[0]

In [21]:
# get the entries in the table: every <td> cell, one per post code
entries = table.find_all('td')

In [37]:
# total number of <td> cells found in the table
len(entries)

180

In [27]:
# the table entries that have unassigned post codes contain the <i> tag;
# find('i') returns None when a cell has no <i> tag, i.e. the post code is assigned
entries[3].find('i')

None


In [33]:
# creating a new list, removing the unassigned post codes
# (a cell is kept only when it has no <i> tag; `is None` is the idiomatic
# identity test for a missing tag)
populated_entries = [entry for entry in entries if entry.find('i') is None]

In [35]:
# number of cells left after dropping the unassigned post codes
len(populated_entries)

103

In [38]:
# look at the raw html of the first populated cell
populated_entries[0]

<td style="width:11%; vertical-align:top;">
<p><b>M3A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
</p>
</td>

In [39]:
# the post code is the text of the <b> tag inside the cell
populated_entries[0].find('b').text

'M3A'

In [41]:
# create three independent, empty lists to store the key information
postcodes, boroughs, neighbourhoods = [], [], []

In [42]:
# fill the postcodes list: the post code is the text of each cell's <b> tag
# (rebuilt from scratch so re-running the cell never appends duplicates)
postcodes = [ent.find('b').text for ent in populated_entries]

In [46]:
# preview the first five post codes (a slice is safe even if fewer exist)
for code in postcodes[:5]:
    print(code)

M3A
M4A
M5A
M6A
M7A


In [81]:
# preview how each cell's <span> text splits on '(':
# the first piece is the borough, the second holds the neighbourhood(s)
for entry in populated_entries[:10]:
    print(entry.find('span').text.split('('))

['North York', 'Parkwoods)']
['North York', 'Victoria Village)']
['Downtown Toronto', 'Regent Park / Harbourfront)']
['North York', 'Lawrence Manor / Lawrence Heights)']
["Queen's Park", 'Ontario Provincial Government)']
['Etobicoke', 'Islington Avenue)']
['Scarborough', 'Malvern / Rouge)']
['North York', 'Don Mills)North']
['East York', 'Parkview Hill / Woodbine Gardens)']
['Downtown Toronto', 'Garden District, Ryerson)']


In [64]:
# fill the boroughs list: the borough is the <span> text before the first '('
# (rebuilt from scratch so re-running the cell never appends duplicates)
boroughs = [ent.find('span').text.split('(')[0] for ent in populated_entries]

In [82]:
# preview the first five boroughs (a slice is safe even if fewer exist)
for borough in boroughs[:5]:
    print(borough)

North York
North York
Downtown Toronto
North York
Queen's Park


In [80]:
# preview the neighbourhood extraction: the <span> text between the first '('
# and the following ')'
for entry in populated_entries[:10]:
    print(entry.find('span').text.split('(')[1].split(')')[0])

Parkwoods
Victoria Village
Regent Park / Harbourfront
Lawrence Manor / Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern / Rouge
Don Mills
Parkview Hill / Woodbine Gardens
Garden District, Ryerson


In [79]:
# fill the neighbourhoods list: the <span> text between the first '(' and the
# following ')'
# (rebuilt from scratch so re-running the cell never appends duplicates)
# NOTE(review): any text after ')' is dropped, e.g. 'Don Mills)North' yields
# 'Don Mills' -- confirm this is intended
neighbourhoods = [
    ent.find('span').text.split('(')[1].split(')')[0]
    for ent in populated_entries
]

In [87]:
# preview the first ten neighbourhoods (a slice is safe even if fewer exist)
for name in neighbourhoods[:10]:
    print(name)

Parkwoods
Victoria Village
Regent Park / Harbourfront
Lawrence Manor / Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern / Rouge
Don Mills
Parkview Hill / Woodbine Gardens
Garden District, Ryerson


In [94]:
# clean up the neighbourhood entries: replace each ' /' separator with ','
neighbourhoods_cleaned = [name.replace(' /', ',') for name in neighbourhoods]

In [95]:
# preview the first ten cleaned neighbourhoods (a slice is safe even if fewer exist)
for name in neighbourhoods_cleaned[:10]:
    print(name)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson


In [96]:
# number of cleaned neighbourhood entries
len(neighbourhoods_cleaned)

103

In [97]:
neighbourhoods = neighbourhoods_cleaned

In [98]:
# confirm `neighbourhoods` now holds the cleaned values
# (a slice is safe even if fewer than ten entries exist)
for name in neighbourhoods[:10]:
    print(name)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson


In [100]:
# create an empty dataframe that only defines the three output columns
column_names = ['PostalCode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(columns=column_names)

In [104]:
# build the dataframe from the three entry lists in a single step, so lists of
# unequal length raise immediately instead of leaving a partly filled frame;
# re-running the cell always produces the same frame
df = pd.DataFrame({
    'PostalCode': postcodes,
    'Borough': boroughs,
    'Neighbourhood': neighbourhoods,
})

In [106]:
# preview the first ten rows of the postcode dataframe
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [107]:
# (rows, columns) of the postcode dataframe
df.shape

(103, 3)

## Part 2
### Adding Coordinates to the Postcodes DataFrame

In [114]:
# load the postcode coordinates from the local csv file
coords_df = pd.read_csv('./assets/Geospatial_Coordinates.csv')

In [116]:
# preview the first rows of the coordinates dataframe
coords_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [118]:
# add the two coordinate columns, filled with NaN for now, then preview
for coord_col in ('Latitude', 'Longitude'):
    df[coord_col] = np.nan
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Queen's Park,Ontario Provincial Government,,


In [144]:
# combine the two dataframes to give a new df with the coordinates
for index, row in df.iterrows():
    code = row['PostalCode']
    lat = coords_df.loc[coords_df['Postal Code'] == code]['Latitude']
    long = coords_df.loc[coords_df['Postal Code'] == code]['Longitude']
    df.loc[index, 'Latitude'] = lat.values[0]
    df.loc[index, 'Longitude'] = long.values[0]

In [145]:
# preview the dataframe with the coordinate columns filled in
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [151]:
# count rows whose Latitude is still missing (0 means every postcode was matched)
df.Latitude.isna().sum()

0

## Part 3
### Looking at Clusters of the Neighbourhoods