# Segmenting and Clustering Neighbourhoods in Toronto (Part 3)
This notebook focuses on part 3 of the peer graded assignment; exploring and clustering neighbourhoods in Toronto

### Start with code from Part 1 to get the data

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
# import the library we use to open URLs
import requests
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup
print('Libraries imported.')

# specify which URL/web page we are going to be scraping
# (a timeout stops the cell hanging forever; raise_for_status surfaces HTTP errors
# here instead of letting an error page be parsed as if it were the article)
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
website_url = response.text

# parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(website_url, 'lxml')
# use the BeautifulSoup function 'prettify' to look at the HTML underlying our chosen web page
print(soup.prettify())

# Use the 'find' function to bring back the table data only using the class id: 'wikitable sortable' (found in code print out above)
My_table = soup.find('table', {'class':'wikitable sortable'})
if My_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")


def _first_text(cell):
    """Return the first text node of a table cell as a plain, whitespace-stripped str.

    Returns '' when the cell contains no text at all, so later string
    comparisons never have to deal with None.
    """
    text = cell.find(string=True)
    return text.strip() if text is not None else ''


# Loop through data: keep only the rows that have exactly three data cells
A = []
B = []
C = []

for row in My_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 3:
        A.append(_first_text(cells[0]))
        B.append(_first_text(cells[1]))
        C.append(_first_text(cells[2]))

df2 = pd.DataFrame(
    {'PostalCode': A, 'Borough': B, 'Neighborhood': C},
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

# Drop postal codes that have no borough assigned (.copy() so the assignment
# below writes to an independent frame rather than a slice of the original)
df2 = df2[df2['Borough'] != 'Not assigned'].copy()

# Where the neighbourhood is not assigned, fall back to the borough name.
# This must be a .loc assignment on the frame itself: writing to the row
# objects yielded by iterrows() is not guaranteed to modify the DataFrame.
unassigned = df2['Neighborhood'].str.contains('Not assigned', regex=False)
df2.loc[unassigned, 'Neighborhood'] = df2.loc[unassigned, 'Borough']

df2.shape

Libraries imported.
<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xk0JdwpAIDEAAFG8H5MAAABX","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgC

(210, 3)

### Continue with code from Part 2 to get the longitude and latitude for each PostalCode

In [2]:
# import data into notebook 
!wget -q -O 'geospatial_data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

# convert from csv to a pandas dataframe 
geo_data = pd.read_csv('geospatial_data.csv')
geo_data.head()

# Change the column names to match df2 
geo_data = geo_data.rename({'Postal Code' : 'PostalCode'}, axis = 1)
geo_data.head()

# Merge data on 'PostalCode' to get the lat and long

toronto_data = pd.merge(df2, geo_data, on = 'PostalCode')
toronto_data.head(30)


Data downloaded!


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188


In [3]:
# Sanity check: (rows, columns) of the merged dataframe
toronto_data.shape

(210, 5)

## Part 3

This section looks at clustering and visualising our data; first we need to download all of the relevant dependencies. This can take a while if they have not been installed before!

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


We use geopy's Nominatim geocoder to obtain the coordinates of Toronto.

In [5]:
# Free-text place name to look up
address = 'Toronto, ON'

# To define an instance of the geocoder, we need to define a user_agent
geolocator = Nominatim(user_agent="to_explorer")

# Use this to obtain the longitude and latitude of Toronto
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('No geocoding result returned for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Creating a map of Toronto with the neighbourhoods superimposed on top. 
I will use Folium to do this, as it is a good visualisation library.  
This is a helpful visual to show how the data is spread out over the City and see if any observations can be made from it. 
Clicking on the blue markers reveals the name of each neighbourhood and the respective borough. 

In [6]:
# Base map centred on the latitude/longitude obtained for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row; the popup shows "<neighbourhood>, <borough>"
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in toronto_data[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

#### Break it down to only Boroughs that contain the word 'Toronto'

In [7]:
# Keep only the rows whose Borough name contains the word "Toronto"
is_toronto_borough = toronto_data['Borough'].str.contains("Toronto")
toron_data = toronto_data.loc[is_toronto_borough]
toron_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
12,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
13,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
26,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
35,M4E,East Toronto,The Beaches,43.676357,-79.293031
36,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
40,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
41,M6G,Downtown Toronto,Christie,43.669542,-79.422564
48,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


Using the same details as before alongside the filtered data set to generate the new map. 

In [8]:
# Base map centred on the same latitude/longitude, zoomed in further
map_toronto2 = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add one circle marker per row of the filtered data; the popup shows "<neighbourhood>, <borough>"
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in toron_data[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto2)

map_toronto2

Here I changed the zoom feature, enabling me to get a clearer view of the individual neighbourhoods.  

#### Focus on one Borough 

As an example, I am going to look at the Borough of Downtown Toronto. It has a lot of neighbourhoods quite close together, so it is worth a closer look.

This is a nice technique if you have specific areas you need to focus on.  

In [9]:
# Keep only the neighbourhoods in the Downtown Toronto borough and renumber the rows from 0
is_downtown = toron_data['Borough'].eq('Downtown Toronto')
dwnt_data = toron_data.loc[is_downtown].reset_index(drop=True)
dwnt_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


Get the coordinates of Downtown Toronto using the Nominatim geocoder, as before.

In [10]:
# Free-text place name to look up
address = 'Downtown Toronto, ON'

# Identify this notebook to Nominatim with the same user_agent as the earlier
# Toronto lookup (the previous "ny_explorer" value was a leftover from another lab)
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('No geocoding result returned for {!r}'.format(address))
# NOTE: this overwrites the city-wide latitude/longitude set earlier, so
# re-running an earlier map cell after this one centres it on Downtown Toronto
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


In [11]:
# Base map centred on the latitude/longitude obtained for Downtown Toronto
dwnt_toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# Add one circle marker per Downtown Toronto row; the popup shows "<neighbourhood>, <borough>"
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in dwnt_data[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(dwnt_toronto)

dwnt_toronto

Again, changing the zoom variable to establish a clearer view of all the neighbourhoods in the Downtown Toronto Borough. 

These are simple yet effective ways to get a general overview of the City of Toronto and visualise how the neighbourhoods may relate to each other. 