# Neighbourhood Segmentation and Clustering

This notebook is used to explore, segment and cluster neighbourhoods in the city of Toronto.

## Part 1: Dataframe Creation

In [8]:
from io import StringIO

import pandas as pd
import requests

# !conda install -c anaconda beautifulsoup4
# Note: Commented above line since package was installed after first run and hence not required to be rerun
from bs4 import BeautifulSoup

In [34]:
# Parsing source html page

# Download the Wikipedia page that lists the postal codes used in this notebook.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Build the BeautifulSoup tree with Python's built-in HTML parser.
soup = BeautifulSoup(source, 'html.parser')

In [35]:
# Importing parsed data into dataframe

# Collect the tables whose class attribute matches 'wikitable sortable'.
table = soup.find_all('table', class_='wikitable sortable')
if not table:
    # Fail early with an actionable message rather than a generic parser error.
    raise ValueError("No 'wikitable sortable' table found in the parsed page; the page layout may have changed")

# Note: read_html always returns a list of DataFrames, so [0] selects the first one.
# The HTML text is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions.
df = pd.read_html(StringIO(str(table)))[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [36]:
# Processing data in dataframe

# Course Instruction: The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood.
# rename() returns a new frame here so the frame displayed by the previous cell is not mutated in place.
df = df.rename(columns={'Postal Code': 'PostalCode'})

# Course Instruction: Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df = df[df['Borough'] != 'Not assigned']

# Course Instruction: If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# This runs BEFORE the rows are combined: once neighborhoods are joined, a value such as
# 'Not assigned, X' would no longer equal 'Not assigned' and would be left in place.
# Missing (NaN) neighborhoods are treated the same way, since ', '.join cannot handle NaN.
neighborhood_missing = df['Neighborhood'].isna() | (df['Neighborhood'] == 'Not assigned')
df = df.assign(Neighborhood=df['Neighborhood'].mask(neighborhood_missing, df['Borough']))

# Course Instruction: More than one neighborhood can exist in one postal code area.
# For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.
# These two rows will be combined into one row with the neighborhoods separated with a comma.
df = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].agg(', '.join).reset_index()

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [37]:
# Printing number of rows of dataframe

# Course Instruction: In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.
# Note: .shape is an attribute (not a method) holding a (rows, columns) tuple. As the last
# expression of the cell it is displayed automatically, so no print() call is needed.
df.shape

(103, 3)

## Part 2: Coordinates Addition

In [38]:
# Load the postal-code coordinates and align the key column name with df.
df_coordinates = pd.read_csv('https://cocl.us/Geospatial_data').rename(columns={'Postal Code': 'PostalCode'})

# Drop any coordinate columns left over from a previous run of this cell, so that
# re-executing it does not create suffixed duplicates (e.g. Latitude_x / Latitude_y).
coordinate_columns = df_coordinates.columns.difference(['PostalCode'])

# validate='many_to_one' raises a MergeError if the coordinate file lists a postal code
# more than once, which would otherwise silently duplicate rows in df.
df = (
    df.drop(columns=coordinate_columns, errors='ignore')
      .merge(df_coordinates, on='PostalCode', validate='many_to_one')
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Neighbourhood Clustering

In [None]:
# TODO: Neighbourhood clustering is not implemented yet