# Segmenting and Clustering Neighbourhoods in Toronto

In [159]:
# Importing libraries
# All third-party dependencies for the notebook are imported in this single cell.

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Display every column and row when a dataframe is shown (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Transform JSON into a pandas dataframe.
# pandas 1.0 exposed json_normalize at the top level and deprecated the
# pandas.io.json path, which may be unavailable in newer releases; fall back
# to the legacy location only when running on an older pandas.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install --yes BeautifulSoup4
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


## Part 1: Web Scraping

In [160]:
# Download the Wikipedia page that lists the Toronto ('M') postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error page
# reach the parser below.
website_data = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
website_data.raise_for_status()

# Use BeautifulSoup to parse the table, inspect source to get the relevant class
soup = BeautifulSoup(website_data.text, 'html.parser')
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError(
        "No table with class 'wikitable sortable' was found; "
        "the Wikipedia page layout may have changed."
    )

# 'table' is a table in HTML, let's parse this and turn this into a dataframe
rows = table.find_all('tr')
postal_code_list = []
borough_list = []
nhood_list = []

# The first row is the table header, so start from the second one
for row in rows[1:]:
    cols = [ele.text.strip() for ele in row.find_all('td')]
    if len(cols) < 3:
        # Rows without three data cells (e.g. extra header or spacer rows)
        # carry no postal code record; skip them instead of raising IndexError.
        continue
    postal_code_list.append(cols[0])
    borough_list.append(cols[1])
    nhood_list.append(cols[2])

# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

df = pd.DataFrame({
    'PostalCode':postal_code_list,
    'Borough':borough_list,
    'Neighborhood':nhood_list
    })

# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
# The original row labels are kept, so the index is no longer contiguous after this step.
df = df.drop(df[df.Borough == "Not assigned"].index)

# As of writing, a lot of the clean up tasks were already completed on the Wikipedia page,
# e.g. Neighbourhood has been combined already, no 'Not assigned' neighbourhood remained after the deletion rule above

print(df.head(10))
print(df.shape)


   PostalCode           Borough                                 Neighborhood
2         M3A        North York                                    Parkwoods
3         M4A        North York                             Victoria Village
4         M5A  Downtown Toronto                    Regent Park, Harbourfront
5         M6A        North York             Lawrence Manor, Lawrence Heights
6         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
8         M9A         Etobicoke      Islington Avenue, Humber Valley Village
9         M1B       Scarborough                               Malvern, Rouge
11        M3B        North York                                    Don Mills
12        M4B         East York              Parkview Hill, Woodbine Gardens
13        M5B  Downtown Toronto                     Garden District, Ryerson
(103, 3)
