# Segmenting and clustering neighbourhoods in the city of Toronto, Canada - Part I

In [1]:
import pandas as pd

### 1. Converting the data into a pandas DataFrame

In [2]:
# Scrape every HTML table on the Wikipedia page and keep the first one.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(WIKI_URL)
df = wiki_tables[0]

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 2. Dropping the rows where Borough is 'Not assigned'

In [4]:

# Collect the index labels of rows whose borough is the "Not assigned"
# placeholder, then remove those rows from df in place.
unassigned_rows = df.index[df["Borough"] == "Not assigned"]
df.drop(index=unassigned_rows, inplace=True)
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### 3. Combining the rows with the same postal code

In [5]:
# Collapse all rows that share a postal code into one row whose
# neighbourhood names are joined into a single string.
# The separator is ", " (comma + space) so that combined values use the same
# format the source table already uses inside a single cell
# (e.g. "Regent Park, Harbourfront").
df_new = (
    df.groupby('Postal Code')['Neighbourhood']
    .agg(', '.join)
    .reset_index()  # turn the 'Postal Code' group key back into a column
)
df_new.head()

Unnamed: 0,Postal Code,Neighbourhood
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [6]:
# Inner-join the filtered table with the per-postal-code neighbourhood
# strings. Both frames carry a 'Neighbourhood' column, so the result holds
# 'Neighbourhood_x' (from df) and 'Neighbourhood_y' (from df_new).
PostalCode_Canada = df.merge(df_new, how='inner', on='Postal Code')
PostalCode_Canada.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood_x,Neighbourhood_y
0,M3A,North York,Parkwoods,Parkwoods
1,M4A,North York,Victoria Village,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights","Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government","Queen's Park, Ontario Provincial Government"


In [7]:
# After the merge, 'Neighbourhood_x' is the raw per-row value from df and
# 'Neighbourhood_y' is the combined per-postal-code string from df_new.
# Dropping 'Neighbourhood_y' outright would throw the combined value away and
# leave one row per original record, so the combining step would have no effect.
# Instead:
#   1. copy the combined string into 'Neighbourhood_x' (the name is kept so the
#      rename in the next cell still applies),
#   2. drop the now-redundant 'Neighbourhood_y',
#   3. keep a single row per postal code and renumber the index.
PostalCode_Canada = (
    PostalCode_Canada
    .assign(Neighbourhood_x=PostalCode_Canada['Neighbourhood_y'])
    .drop(columns=['Neighbourhood_y'])
    .drop_duplicates(subset='Postal Code')
    .reset_index(drop=True)
)
PostalCode_Canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood_x
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Strip the merge suffix so the column is simply called 'Neighbourhood'.
column_renames = {'Neighbourhood_x': 'Neighbourhood'}
PostalCode_Canada.rename(columns=column_renames, inplace=True)
PostalCode_Canada.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### 4. Replacing the 'Not assigned' neighbourhood with the corresponding borough

In [9]:
import numpy as np

In [10]:
# Wherever the neighbourhood is the "Not assigned" placeholder, substitute the
# row's borough; every other neighbourhood value is kept unchanged.
is_unassigned = PostalCode_Canada['Neighbourhood'] == 'Not assigned'
borough_names = PostalCode_Canada['Borough']
current_names = PostalCode_Canada['Neighbourhood']
PostalCode_Canada['Neighbourhood'] = np.where(is_unassigned, borough_names, current_names)
PostalCode_Canada.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### 5. Shape of the final dataset

In [12]:
# Report the final dataset size as a (rows, columns) tuple.
n_rows, n_columns = PostalCode_Canada.shape
(n_rows, n_columns)

(103, 3)