# Segmenting and Clustering Neighborhoods in Toronto

In this notebook, I explore and cluster the neighborhoods of Toronto by web-scraping postal codes, building an appropriate dataframe, and then visualizing the data.

In [135]:
# inviting the VIPs to the party
import numpy as np
import pandas as pd
import json
import requests

In [136]:
# also invite party people who have good style
# None disables column-width truncation so long neighborhood lists display in full
# (the old -1 sentinel is deprecated since pandas 1.0 and rejected by newer releases)
pd.set_option('display.max_colwidth', None)

In [137]:
# special guests in order to do some postal code web-scraping
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [156]:
# Scrape the Wikipedia page that lists the Canadian postal codes starting with "M"
path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# the context manager closes the HTTP response once the body has been read
with urlopen(path) as response:
    page = response.read()
bsoup = BeautifulSoup(page, 'html.parser')

# The parsing cell iterates over `wiki_table`, so the table must be bound to that name.
# NOTE(review): assumes the first <table> in the body is the postal-code table — confirm.
first_table = bsoup.body.table
# html.parser does not synthesize a <tbody>; fall back to the table itself if it is absent
wiki_table = first_table.tbody or first_table

In [164]:
# Turn every table row into a list of its stripped, non-empty <td> texts
data = []
for table_row in wiki_table.find_all('tr'):
    cells = [cell.text.strip() for cell in table_row.find_all('td') if cell.text]
    # rows with fewer than three collected cells are skipped
    if len(cells) >= 3:
        data.append(cells)

# Assemble the collected rows into the working dataframe and report its size
columns = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data, columns=columns)
print('There are {0} rows and {1} columns in this dataframe.'.format(*df.shape))

There are 288 rows and 3 columns in this dataframe.


In [165]:
# Discard rows whose Borough is 'Not assigned', then renumber the index from 0
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)

In [166]:
# Collapse rows sharing a PostalCode and Borough into one row whose Neighborhood
# is the comma-separated list of the group's neighborhood names
names_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df = names_by_code.apply(lambda names: ','.join(map(str, names))).reset_index()

In [167]:
# Report the dataframe dimensions (rows, columns) after cleaning
print('There are now {0} rows and {1} columns in this dataframe after data cleaning.'.format(*df.shape))

There are now 103 rows and 3 columns in this dataframe after data cleaning.


---

In [168]:
# Attach the columns of the local geospatial file to each postal code.
# The merge uses the default inner join, so codes absent from the file are dropped.
geo = pd.read_csv('./Geospatial_Coordinates.csv')
df = pd.merge(df, geo, left_on='PostalCode', right_on='Postal Code')
# the join key from the file duplicates PostalCode, so remove it
df = df.drop('Postal Code', axis=1)

---

In [172]:
# Keep only the rows whose Borough name includes 'Toronto', renumber, and display
in_toronto = df['Borough'].str.contains("Toronto")
df = df.loc[in_toronto].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",43.686412,-79.400049
