## Assignment - Segmenting and Clustering Neighbourhoods in the city of Toronto, Canada

### Part 1 - Scrape the Wikipedia page, Data Wrangling and Cleaning, Reading into a Pandas DataFrame

`Import Library and Modules`

In [15]:
from bs4 import BeautifulSoup
import pandas as pd 
import requests

**[a] Scrape the Wikipedia page to extract the required information**

In [16]:
# Wikipedia list of Canadian postal codes beginning with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout stops the cell from hanging indefinitely on a stalled connection;
# raise_for_status() aborts on a 4xx/5xx reply instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# parse the Wikipedia HTML page using the Beautiful Soup library
soup = BeautifulSoup(response.text, 'html.parser')

# extract the required table (the first <table> on the page) and count its <td> cells
table = soup.find_all('table')[0]
size = len(table.find_all('td'))

In [17]:
# Collect the text of every 'td' tag inside the 'table' tag into a list.
# The table is searched once here rather than once per cell, so the work grows
# linearly with the number of cells.
rowDataVal = [cell.get_text() for cell in table.find_all('td')]
length = len(rowDataVal)

In [18]:
# Rearrange the data into their respective categories.
# The flat list is read in groups of three (postal code, borough, neighborhood),
# so a stride-3 slice starting at offsets 0, 1 and 2 yields one category each.
PostalCode = rowDataVal[0::3]
Borough = rowDataVal[1::3]
Neighborhood = rowDataVal[2::3]

**[b] Creating dataframe from extracted data**

In [19]:
# Build the frame from the three parallel lists (dict order sets the column
# order), then remove the newline characters left in the scraped text.
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
}).replace('\n', '', regex=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**[c] Cleaning dataframe by dropping rows having unavailable data**

In [20]:
# Keep only the rows whose borough is assigned, and renumber the index from 0
unassigned_borough = df['Borough'].str.contains('Not assigned')
df_clean = df.loc[~unassigned_borough].reset_index(drop=True)
df_clean.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [21]:
# Count rows that have a borough but whose neighborhood is still 'Not assigned'
unassigned_neighborhood = df_clean['Neighborhood'].str.contains('Not assigned')
check = df_clean[unassigned_neighborhood]
print("There are {} cells with 'Not Assigned' Neighborhood".format(len(check)))

There are 0 cells with 'Not Assigned' Neighborhood


In [22]:
# Report the (rows, columns) shape of the clean dataframe and its row count
rows = df_clean.shape
size_df = rows[0]  # first entry of the shape tuple is the number of rows
print("DataFrame Size : {} ".format(rows))
print("Total Rows in Dataframe : {} ".format(size_df))

DataFrame Size : (103, 3) 
Total Rows in Dataframe : 103 


### Part 2 - Getting Latitude and Longitude Coordinates of Neighborhoods

In [37]:
import os
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Placeholder iterator method; it is bound to `body` below only when the
# returned object has no __iter__ of its own.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here.
# NOTE(review): any key that was ever committed in this cell should be treated
# as leaked and revoked/rotated in the IBM Cloud console.
client_8dbf0725bdee4042881dc53e79d63a02 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=os.environ['IBM_COS_API_KEY'],
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_8dbf0725bdee4042881dc53e79d63a02.get_object(Bucket='courseraassignment-donotdelete-pr-pobx7wk7wfmv1x',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Read the CSV and rename its key column to match the 'PostalCode' column used
# by the scraped dataframe, so the two frames can be merged on it.
df_coordinates = pd.read_csv(body).rename(columns={"Postal Code": "PostalCode"})
df_coordinates.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [41]:
# Attach the coordinates to each postal code. The join is an inner join, so
# only postal codes present in both frames are kept.
df_merged = df_clean.merge(df_coordinates, how='inner', on='PostalCode')
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
