## Segmenting and Clustering Neighbourhoods in Toronto

### Part 1: Converting the Wikipedia HTML table into a DataFrame

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup as bsoup
from urllib.request import urlopen as uReq
import requests
import lxml

In [2]:
# Wikipedia page whose first table lists the postal codes starting with "M".
source_file='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r=requests.get(source_file, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page below.
r.raise_for_status()

#### 1.1 Parse the page HTML with BeautifulSoup

In [3]:
# Build a navigable parse tree from the downloaded HTML text.
page_content = bsoup(markup=r.text, features="html.parser")

#### 1.2 Find the table in the HTML

In [4]:
# Grab the first <table> element in the parsed document.
mytable = page_content.find('table')

In [5]:
# Every <tr> inside the table is one row; row 0 is the header.
results = mytable.find_all(name='tr')
no_lines = len(results)
print(
    'now we have found',
    no_lines,
    ' lines incl. the Header, which is the first line with 0 index',
)

now we have found 288  lines incl. the Header, which is the first line with 0 index


In [6]:
# Split the header row's text on whitespace to get the column names.
header_row = results[0]
header = header_row.get_text().split()
header

['Postcode', 'Borough', 'Neighbourhood']

#### 1.3 Create a loop to extract the table contents into a dataframe df1

In [7]:
# Collect one (PostalCode, Borough, Neighbourhood) tuple per data row.
# results[0] is the header row, so iteration starts at index 1.
lines = []
for row in results[1:no_lines]:
    # Read the <td> cells directly instead of splitting the row text on
    # newlines three times; this does not depend on how the HTML is wrapped.
    cells = [cell.get_text().strip() for cell in row.find_all('td')]
    if len(cells) < 3:
        # Incomplete row (e.g. a header-only or empty row): skip it instead
        # of failing with an IndexError.
        continue
    Postcode, Borough, Neighborhood = cells[:3]
    lines.append((Postcode, Borough, Neighborhood))

df1 = pd.DataFrame(lines, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df1.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [8]:
# (rows, columns) of the frame built from the table rows, header row excluded.
df1.shape

(287, 3)

#### 1.4 Remove all rows with Borough = 'Not assigned' 

In [9]:
# Per-column count of the rows whose Borough is the 'Not assigned' placeholder.
df1.loc[df1['Borough'].eq('Not assigned')].count()

PostalCode       77
Borough          77
Neighbourhood    77
dtype: int64

In [10]:
# Keep only rows with a real Borough and renumber the index from 0.
# Built as one new frame rather than two in-place mutations, so re-running
# the cell never depends on a half-modified df1.
df1 = df1.loc[df1['Borough'] != 'Not assigned'].reset_index(drop=True)

In [11]:
# Preview the first five rows after removing unassigned boroughs.
df1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [12]:
# Preview the last five rows to confirm the index was renumbered.
df1.tail(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
205,M8Z,Etobicoke,Kingsway Park South West
206,M8Z,Etobicoke,Mimico NW
207,M8Z,Etobicoke,The Queensway West
208,M8Z,Etobicoke,Royal York South West
209,M8Z,Etobicoke,South of Bloor


In [13]:
# (rows, columns) after dropping the rows with Borough == 'Not assigned'.
df1.shape

(210, 3)

#### 1.5 Replace 'Not assigned' neighbourhoods with the name of the Borough

In [14]:
# Per-column count of the rows whose Neighbourhood is still 'Not assigned'.
df1.loc[df1['Neighbourhood'].eq('Not assigned')].count()

PostalCode       1
Borough          1
Neighbourhood    1
dtype: int64

In [15]:
# Where the Neighbourhood is the 'Not assigned' placeholder, use the row's
# Borough instead; all other neighbourhood values are left unchanged.
df1['Neighbourhood'] = df1['Neighbourhood'].mask(
    df1['Neighbourhood'].eq('Not assigned'),
    df1['Borough'],
)

In [16]:
# Re-check: no row should have a 'Not assigned' Neighbourhood any more.
df1.loc[df1['Neighbourhood'].eq('Not assigned')].count()

PostalCode       0
Borough          0
Neighbourhood    0
dtype: int64

##### The 1 row with a 'Not assigned' Neighbourhood now has its Borough name instead

In [17]:
# Replacing neighbourhood values must not change the (rows, columns) shape.
df1.shape

(210, 3)

#### 1.6 Combine rows that share a PostalCode and Borough into one row, with the neighbourhoods separated by commas

In [18]:
# The number of distinct (PostalCode, Borough) pairs is the row count expected
# once the neighbourhoods are merged.
print(
    'Target Data Frame should only consists of ',
    df1.groupby(['PostalCode', 'Borough']).ngroups,
    ' lines',
)


Target Data Frame should only consists of  103  lines


In [36]:
# Discard any row whose Borough is missing (NaN/None).
df1 = df1[df1['Borough'].notna()]

In [39]:
# (rows, columns) after filtering out rows with a null Borough.
df1.shape

(210, 3)

In [19]:
# One row per (PostalCode, Borough), with that group's neighbourhood names
# joined into a single comma-separated string.
df_target = (
    df1.groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

In [20]:
# Preview the first five merged rows.
df_target.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [46]:
# (rows, columns) of the merged frame: one row per (PostalCode, Borough) group.
df_target.shape

(103, 3)

### The first part of the target DataFrame is now ready, with only 103 rows

In [21]:
# Show a bounded preview instead of dumping the whole frame into the notebook.
df_target.head(10)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### End of Part 1 -------------------------------------