# Clustering of Toronto Neighborhoods - Final Capstone project

## First Part - Web scraping and importing the data into a dataframe

### Import standard libraries

In [232]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


### Import BeautifulSoup and fetch the Toronto postal codes Wikipedia page

In [233]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [234]:
# Download the Wikipedia page and parse it. The response is opened in a
# `with` block so the underlying HTTP connection is closed as soon as
# BeautifulSoup has finished reading from it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [235]:
# Print the <title> element of the parsed document.
title = soup.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


### Get HTML text data from the webpage, clean it up, and copy it into a dataframe

In [236]:
# Every <tr> element found anywhere in the parsed page.
# NOTE(review): this search is page-wide rather than limited to one table;
# confirm that rows belonging to other tables are not picked up by mistake.
rows = soup.find_all('tr')

# Keep the <td> cells of the last row only. A per-row loop that reassigns
# row_td on each pass ends with exactly this value, so the last row is read
# directly; an empty page yields an empty list instead of an undefined name.
row_td = rows[-1].find_all('td') if rows else []

In [237]:
# Stringify the <td> list held in row_td, then strip the markup by re-parsing
# that string and extracting its text.
# NOTE(review): cleantext is not read by any later cell visible here, and
# str_cells is reassigned inside the next cell's loop.
str_cells = str(row_td)
cleantext = BeautifulSoup(str_cells, "lxml").get_text()

In [238]:
import re

# Pattern matching any HTML tag (non-greedy). It is the same for every row,
# so it is compiled once here rather than on each loop iteration.
clean = re.compile('<.*?>')

# Build one tag-free string per table row: the row's <td> list is stringified
# as a whole, so each entry keeps the surrounding "[...]" and ", " separators.
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean2 = clean.sub('', str_cells)
    list_rows.append(clean2)

In [239]:
# One row per scraped <tr>; the cleaned text sits in a single column labelled 0.
df = pd.DataFrame(list_rows)

### Further cleanup of the data to remove special characters, whitespace, etc.

In [240]:
# Break each scraped row string into one column per line of text.
# The strings are stringified cell lists, presumably shaped like
# "[code\n, borough\n, neighborhood\n]" (TODO confirm against the live page),
# which is why brackets and leading commas have to be stripped below.
df1 = df[0].str.split('\n', expand=True)

# Only the first three columns are kept. Columns from index 3 onward are
# discarded straight away, so no cleanup is spent on them and a row layout
# with fewer than four columns no longer raises a KeyError.
df1 = df1.drop(df1.columns[3:], axis=1)

# Per column: remove the list bracket, then the leading comma separator,
# then any surrounding whitespace.
df1[0] = df1[0].str.strip('[').str.strip()
df1[1] = df1[1].str.lstrip(',').str.strip()
df1[2] = df1[2].str.strip(']').str.lstrip(',').str.strip()

# Drop the first scraped row (presumably the header <tr>, which has no <td>
# cells; verify against the page).
df1 = df1.iloc[1:]

### Get the table headers and clean up the header data

In [241]:
# Collect every <th> element in the parsed page.
col_labels = soup.find_all('th')

In [242]:
# Flatten the list of <th> tags to plain text and keep it as the only item
# of a one-element list.
col_str = str(col_labels)
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
all_header = [cleantext2]
print(all_header)

['[Postal Code\n, Borough\n, Neighborhood\n, Canadian postal codes\n]']


In [243]:
# Single-cell frame holding the header text with newline characters removed.
df2 = pd.DataFrame(all_header).replace('\n', '', regex=True)
df2.head()

Unnamed: 0,0
0,"[Postal Code, Borough, Neighborhood, Canadian ..."


In [244]:
# Rebuild the header text from all_header instead of reading it back from df2.
# Splitting df2 in place makes the cell fail when it is run a second time,
# because df2[0] no longer holds the full comma-separated header string.
header_text = pd.DataFrame(all_header).replace('\n', '', regex=True)

# One column per header label; strip the opening bracket from the first label
# and the surrounding whitespace from the next two.
df2 = header_text[0].str.split(',', expand=True)
df2[0] = df2[0].str.strip('[')
df2[1] = df2[1].str.strip()
df2[2] = df2[2].str.strip()

### Concatenate the header and text data, and do some more data cleanup

In [245]:
# Stack the header frame on top of the data rows, so the header labels end up
# in the first row of the combined frame.
frames = [df2,df1]
df3 = pd.concat(frames)

In [246]:
# Start again from the concatenated frames rather than from df3 itself, so
# that re-running this cell does not fail on the positional drop below (after
# one run df3 has only three columns left).
# The header labels stored in the first row of df2 become the column names.
df3 = pd.concat(frames).rename(columns=df2.iloc[0])

# Discard the fourth column (position 3); only the first three are kept.
df3 = df3.drop(df3.columns[[3]], axis=1)
df3 = df3.rename(columns={'Postal Code': 'PostalCode'})

In [247]:
# Skip the first row (the header labels that were stacked on top of the data)
# and show the resulting column names.
postal_code_df = df3.iloc[1:]
postal_code_df.columns.tolist()

['PostalCode', 'Borough', 'Neighborhood']

### Remove records where Borough is 'Not assigned'; where Neighborhood is 'Not assigned', replace it with the Borough value

In [248]:
# Keep only rows whose Borough is not the 'Not assigned' placeholder, then
# renumber the index from zero.
has_borough = postal_code_df['Borough'] != 'Not assigned'
postal_code_df = postal_code_df[has_borough].reset_index(drop=True)

In [249]:
# Where the neighborhood is the 'Not assigned' placeholder, fall back to the
# borough name; every other neighborhood is left unchanged.
neighborhood = postal_code_df['Neighborhood']
is_unassigned = neighborhood == 'Not assigned'
postal_code_df['Neighborhood'] = np.where(
    is_unassigned, postal_code_df['Borough'], neighborhood
)
postal_code_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Get the shape of the final dataframe

In [250]:
# (number of rows, number of columns) of the cleaned frame.
postal_code_df.shape

(107, 3)