# Segmenting and Clustering Neighborhoods in Toronto

#### 1. Scrape the postal-code table from the Wikipedia page and load it into a dataframe.

In [1]:
import pandas as pd
import numpy as np

# Wikipedia page holding the postal-code table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list with every table it finds on the page; the first
# one is used here, with its first row taken as the column headers.
page_tables = pd.read_html(url, header=0)
df = page_tables[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 2. Remove all rows where Borough is 'Not assigned' and rename the `Postcode` column to `PostalCode`.

In [2]:
# Keep only rows with a real borough, rename the postal-code column, and
# renumber the rows from 0. Building the result in one chain avoids in-place
# edits on a filtered slice (a source of SettingWithCopyWarning), and
# drop=True discards the old index directly instead of materialising it as
# an 'index' column that then has to be deleted.
df = (
    df[df['Borough'] != 'Not assigned']
    .rename(columns={'Postcode': 'PostalCode'})
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


#### 3. Replace every 'Not assigned' Neighbourhood with the name of its Borough.

In [3]:
# Rows whose neighbourhood is the 'Not assigned' placeholder inherit the
# borough name. A boolean mask replaces the row-by-row chained assignment
# (df['Neighbourhood'][m] = ...), which writes through a temporary Series:
# it raises SettingWithCopyWarning and silently does nothing when pandas
# copy-on-write is enabled. The mask also does not depend on the index
# labels being exactly 0..n-1.
unassigned = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(unassigned, df['Borough']))

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


#### 4. Group all neighborhoods under the same postal code, and store neighborhood names under the list *neigh_list*.

In [4]:
postcodes_names = df['PostalCode'].unique()
# Not needed by the grouping below; retained in case later cells reference it.
postcodes_values = df['PostalCode'].values

# Collect each postal code's neighbourhoods in a single grouped pass instead
# of comparing every row against every unique code. sort=False keeps the
# groups in order of first appearance (the same order unique() produces),
# and the reindex makes the alignment explicit, so neigh_list[i] holds the
# neighbourhoods for postcodes_names[i].
neigh_list = (
    df.groupby('PostalCode', sort=False)['Neighbourhood']
    .agg(list)
    .reindex(postcodes_names)
    .tolist()
)

#### 5. Add the new list as a column in the dataframe and keep one row per postal code.

In [5]:
# Lookup table: one row per unique postal code with its list of neighbourhoods.
df_new = pd.DataFrame({'PostalCode': postcodes_names, 'Neighbourhood_list': neigh_list})

# Attach the list to every row, swap it in for the single-name column, then
# collapse to one row per postal code. Written as one chain that returns a
# new frame rather than a series of inplace=True mutations; drop=True
# renumbers the rows without creating a throwaway 'index' column.
df = (
    df.join(df_new.set_index('PostalCode'), on='PostalCode')
    .drop(columns='Neighbourhood')
    .rename(columns={'Neighbourhood_list': 'Neighborhood'})
    .drop_duplicates('PostalCode')
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,[Harbourfront]
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]"
4,M7A,Downtown Toronto,[Queen's Park]
5,M9A,Queen's Park,[Queen's Park]
6,M1B,Scarborough,"[Rouge, Malvern]"
7,M3B,North York,[Don Mills North]
8,M4B,East York,"[Woodbine Gardens, Parkview Hill]"
9,M5B,Downtown Toronto,"[Ryerson, Garden District]"


#### 6. Check the size of the final dataframe.

In [6]:
# (number of rows, number of columns) of the dataframe.
df.shape

(103, 3)