## Assignment - Segmenting and Clustering Neighbourhoods in the city of Toronto, Canada

`Import libraries and modules`

In [97]:
from bs4 import BeautifulSoup
import pandas as pd 
import requests

**[a] Scrape the Wikipedia page to extract the required information**

In [98]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error response (4xx/5xx)
# into an exception here instead of letting an error page be parsed below.
response = requests.get(url, timeout=30)
response.raise_for_status()

# parse the Wikipedia HTML page using the Beautiful Soup library
soup = BeautifulSoup(response.text, 'html.parser')

# extract the required table: the first <table> element on the page
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(url))

# number of <td> cells inside that table
size = len(table.find_all('td'))

In [99]:
# Collect the text of every 'td' tag inside the 'table' tag into a flat list.
# The cells are looked up once; calling table.find_all('td') again on every
# iteration would re-scan the whole table for each cell (quadratic work).
rowDataVal = [cell.get_text() for cell in table.find_all('td')]
length = len(rowDataVal)

In [100]:
# Split the flat cell list into its three categories. The list is treated as
# repeating (postal code, borough, neighborhood) triples, so each category is
# every third value starting at offset 0, 1 and 2 respectively.
PostalCode = rowDataVal[0:length:3]
Borough = rowDataVal[1:length:3]
Neighborhood = rowDataVal[2:length:3]

**[b] Create a dataframe from the extracted data**

In [101]:
# Assemble the three column lists into one dataframe, then remove the newline
# characters that trail each extracted text value.
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
}).replace('\n', '', regex=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**[c] Clean the dataframe by dropping rows with an unassigned borough**

In [102]:
# Keep only the rows whose borough does not contain 'Not assigned',
# then renumber the remaining rows from 0.
borough_unassigned = df['Borough'].str.contains('Not assigned')
df_clean = df.loc[~borough_unassigned].reset_index(drop=True)
df_clean.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [103]:
# Look for rows that kept a borough but whose neighborhood still contains
# 'Not assigned', and report how many there are.
neighborhood_unassigned = df_clean['Neighborhood'].str.contains('Not assigned')
check = df_clean[neighborhood_unassigned]
print("There are {} cells with 'Not Assigned' Neighborhood".format(check.shape[0]))

There are 0 cells with 'Not Assigned' Neighborhood


In [104]:
# Report the shape and the row count of the cleaned dataframe.
rows = df_clean.shape  # (row count, column count) tuple, despite the name
size_df = rows[0]
summary = [
    "DataFrame Size : {} ".format(rows),
    "Total Rows in Dataframe : {} ".format(size_df),
]
for line in summary:
    print(line)

DataFrame Size : (103, 3) 
Total Rows in Dataframe : 103 
