# Segmenting and Clustering Neighborhoods in Toronto

## Obtain data from Wikipedia page

#### Import libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files
from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Parse Wikipedia page data

In [2]:
# Make a GET request to fetch the raw HTML content
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_content = requests.get(url).text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")

#Get the table having the class wikitable sortable
neighborhood_table = soup.find("table", attrs={"class": "wikitable sortable"})

# Get headers of table
t_headers = []
for th in neighborhood_table.find_all("th"):
    # remove any newlines and extra spaces from left and right
    t_headers.append(th.text.replace('\n', ' ').strip())
    
# Get all the rows of table
table_data = []
for tr in neighborhood_table.tbody.find_all("tr"): # find all tr's from table's tbody
    t_row = {}
    # find all td's in tr and zip it with t_header
    for td, th in zip(tr.find_all("td"), t_headers):
        t_row[th] = td.text.replace('\n', '').strip()
        t_row[th] = td.text.replace(' /', ',').strip()
        table_data.append(t_row)

#### Convert data to Data Frame

In [3]:
df = pd.DataFrame(data=table_data, columns=t_headers)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M1A,Not assigned,
2,M1A,Not assigned,
3,M2A,Not assigned,
4,M2A,Not assigned,


In [48]:
df.rename(columns={'Postal code':'Postal Code'}, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
29,M1B,Scarborough,"Malvern, Rouge"
55,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
83,M1E,Scarborough,"Guildwood, Morningside, West Hill"
110,M1G,Scarborough,Woburn
137,M1H,Scarborough,Cedarbrae


### Clean data

#### Remove rows with Borough as 'Not assigned'

In [49]:
df['Borough'].value_counts(dropna=False)

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [50]:
# Remove rows with Borough as 'Not assigned'
df['Borough'].replace('Not assigned', np.nan, inplace=True)
df.dropna(subset=['Borough'], axis=0, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
29,M1B,Scarborough,"Malvern, Rouge"
55,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
83,M1E,Scarborough,"Guildwood, Morningside, West Hill"
110,M1G,Scarborough,Woburn
137,M1H,Scarborough,Cedarbrae


In [51]:
df['Borough'].value_counts(dropna=False)

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

#### Set the 'Not assigned' neighborhood to the same as the borough

In [52]:
df.loc[df['Neighborhood']=='Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood


In [56]:
# Set the 'Not assigned' neighborhood to the same as the borough
df['Neighborhood'].replace('Not assigned', df['Borough'], inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
29,M1B,Scarborough,"Malvern, Rouge"
55,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
83,M1E,Scarborough,"Guildwood, Morningside, West Hill"
110,M1G,Scarborough,Woburn
137,M1H,Scarborough,Cedarbrae


#### Combined into one row with the neighborhoods separated with a comma

In [57]:
# sorting by Postal Code
df.sort_values('Postal Code', inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
29,M1B,Scarborough,"Malvern, Rouge"
55,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
83,M1E,Scarborough,"Guildwood, Morningside, West Hill"
110,M1G,Scarborough,Woburn
137,M1H,Scarborough,Cedarbrae


In [58]:
# dropping duplicate values 
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
29,M1B,Scarborough,"Malvern, Rouge"
55,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
83,M1E,Scarborough,"Guildwood, Morningside, West Hill"
110,M1G,Scarborough,Woburn
137,M1H,Scarborough,Cedarbrae


#### Size of the data frame

In [59]:
df.size

309

## Obtain a data frame with coorinates

In [60]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_335529e8571343eaa4cecdd51c945c46 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='nFP-qxW9J6G3d8qv4JZRJrpQHgqrX9W19_zGc_K0oS_J',
    ibm_auth_endpoint="https://iam.eu-gb.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

body = client_335529e8571343eaa4cecdd51c945c46.get_object(Bucket='segmentingandclusteringneighborho-donotdelete-pr-fr6cs4xdaaovpv',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# If you are reading an Excel file into a pandas DataFrame, replace `read_csv` by `read_excel` in the next statement.
df_geo_coor = pd.read_csv(body)
df_geo_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [61]:
# Merge both data frames so all the neighborhoods have their coordinates
df = pd.merge(df, df_geo_coor, how='left', on='Postal Code')
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
