# Segmenting and Clustering Neighborhoods in Toronto

## Web Scraping Toronto Postal Codes

In [71]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

In [72]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [73]:
# Download the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors (4xx/5xx) here
# instead of letting an error page flow into the HTML parsing further down.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [74]:
# Build a parse tree from the response body with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [75]:
# Column names applied to the scraped table, in the table's left-to-right order.
columns = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

In [76]:
# The postal-code table is the first <table> element on the page.
postaltable = soup.find_all('table', limit=1)[0]

### Converting the postal codes HTML table to a pandas DataFrame

In [78]:
# read_html returns a list of DataFrames (one per <table> it finds), so `df` is
# a list and the parsed table is df[0]. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases; a file-like object is accepted by old and new versions alike.
df = pd.read_html(StringIO(str(postaltable)))

In [79]:
# Preview the first five rows of the parsed table.
df[0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [80]:
# Report the row count, column dtypes and non-null counts of the parsed table.
df[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 3 columns):
Postcode         287 non-null object
Borough          287 non-null object
Neighbourhood    287 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [81]:
# Work on a copy so that renaming the columns in the next cell does not also
# modify df[0], the raw parsed table that was displayed above.
postalcodes = df[0].copy()

In [82]:
# Replace the scraped headers with the names in `columns`. The assignment is
# positional, so it relies on the table's column order matching `columns`.
postalcodes.columns = columns

In [83]:
# Confirm the renamed column headers.
postalcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Exclude rows whose Borough is "Not assigned"

In [84]:
# Drop every row whose Borough value is 'Not assigned'.
postalcodes = postalcodes.loc[postalcodes['Borough'].ne('Not assigned')]

### Combining the neighborhoods that share a postal code and borough

In [93]:
# Collapse to one row per (PostalCode, Borough), joining that group's
# neighborhood names into a single comma-separated string. reset_index() is
# chained here rather than applied to `toronto_pcodes` in a separate cell, so
# re-running this cell never adds a stray `index` column.
# NOTE(review): a Neighborhood value of 'Not assigned' is joined as-is; confirm
# whether such rows should fall back to the borough name instead.
toronto_pcodes = (
    postalcodes
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

In [100]:
# Check the row count, dtypes and non-null counts of the grouped result.
toronto_pcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [101]:
# Preview the first rows; postal codes with several neighborhoods show them as
# one comma-separated string.
toronto_pcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [103]:
# (rows, columns) of the final table.
toronto_pcodes.shape

(103, 3)

### In total, 103 postal codes were identified and collected in the DataFrame