# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
! pip install lxml html5lib beautifulsoup4



In [2]:
import pandas as pd

### Download Toronto postal code data from Wikipedia

In [3]:
# Download Toronto postal code data
!wget -q -O 'canada_postal_codes_m.htm' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
print('Data downloaded!')

Data downloaded!


### Read the data from the HTML file. The dataframe has 3 columns: Postcode, Borough, and Neighbourhood

In [4]:
# read_html yields one DataFrame per table found in the saved page;
# the postal-code table is the first one.
df = pd.read_html(io='canada_postal_codes_m.htm')[0]
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignore rows whose borough is 'Not assigned'

In [5]:
# How many rows still have no borough assigned?
df.loc[df['Borough'].eq('Not assigned')].shape

(77, 3)

In [6]:
# Keep only rows with an assigned borough, then renumber the index from 0
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [7]:
# Sanity check: no row with an unassigned borough should remain (expect an empty frame)
df.loc[df['Borough'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Combine neighbourhoods that have the same postal code

In [8]:
# Collapse to one row per (Postcode, Borough) pair; the neighbourhood names that
# share a pair are concatenated into a single ', '-separated string.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a neighbourhood is 'Not assigned', use the borough name as the neighbourhood

In [9]:
# How many rows have a neighbourhood that is still unassigned?
df.loc[df['Neighbourhood'].eq('Not assigned')].shape

(1, 3)

In [10]:
# Wherever the neighbourhood is unassigned, fall back to the row's borough name
df['Neighbourhood'] = df['Neighbourhood'].mask(df['Neighbourhood'].eq('Not assigned'), df['Borough'])

In [11]:
# Sanity check: no unassigned neighbourhood should remain (expect an empty frame)
df.loc[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Print the shape (number of rows and columns) of the cleaned dataframe

In [12]:
# (rows, columns) of the dataframe after the cleaning steps above
df.shape

(103, 3)