# **Segmenting and Clustering Neighborhoods in Toronto**

# In this assignment, we will explore, segment, and cluster the neighborhoods in the city of Toronto.

In [57]:
#library to handle data in a vectorized manner
import numpy as np

#library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#library to handle json files
import json

#convert an address into longitude and latitude values
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#library to handle requests
import requests

#transform json file into pandas dataframe
from pandas.io.json import json_normalize

#matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#import kmeans from clustering stage
from sklearn.cluster import KMeans

#import map rendering library
!conda install -c conda-forge folium=0.5.0 --yes
import folium

#import BeautifulSoup
from bs4 import BeautifulSoup

from IPython.display import Image 
from IPython.core.display import HTML 

import xml

print('libraries imported')

/bin/bash: conda: command not found
/bin/bash: conda: command not found
libraries imported


# For the Toronto neighborhood data, a Wikipedia page exists that has all the information we need to explore and cluster the neighborhoods in Toronto. We will scrape the Wikipedia page and wrangle the data, clean it, and then read it into a pandas dataframe so that it is in a structured format like the New York dataset.

In [60]:
#scrape Wikipedia
#use a timeout and fail loudly on HTTP errors, so a hung connection or an error page is never parsed as if it were the article
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()

#`url` holds the raw HTML text of the page (name kept unchanged)
url = wiki_response.text

#parse the HTML with the lxml parser
soup = BeautifulSoup(url,'lxml')
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"b4a616bf-5ee5-46af-865c-e46dc6dad3bf","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":979555370,"wgRevisionId":979555370,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Ontario","P

In [82]:
#locate table and use tags to find postal code by 'borough' and 'neighbourhood'
table_post = soup.find('table')
if table_post is None:
    raise ValueError('No <table> element found on the scraped page')
fields = table_post.find_all('td')

#the <td> cells form a flat sequence of (postal code, borough, neighborhood) triples;
#only complete triples are used, so a trailing partial row cannot raise an IndexError
cell_text = [field.text.strip() for field in fields]
usable_cells = (len(cell_text) // 3) * 3

postalcode = cell_text[0:usable_cells:3]
borough = cell_text[1:usable_cells:3]
neighborhood = cell_text[2:usable_cells:3]

#build the dataframe column by column instead of transposing a list of lists
df_postalcode = pd.DataFrame({
    'Postalcode': postalcode,
    'Borough': borough,
    'Neighborhood': neighborhood,
})
df_postalcode.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [83]:
#remove 'Not assigned'
#filter with a boolean mask instead of calling replace(..., inplace=True) on a column selection:
#that chained in-place call works on a temporary object under pandas Copy-on-Write and leaves the dataframe unchanged
has_borough = df_postalcode['Borough'].notna() & df_postalcode['Borough'].ne('Not assigned')

#the original row labels are kept, exactly as dropna() kept them
df_postalcode = df_postalcode.loc[has_borough].copy()
df_postalcode.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [85]:
#collapse rows that share a postal code and borough into one row, joining their neighborhoods
#with ', ' so the separator matches the comma-space format already used inside the scraped values
df_postalcode = (
    df_postalcode
    .groupby(['Postalcode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

#reset_index() already yields the columns Postalcode, Borough, Neighborhood, so no renaming is needed
#show a preview rather than the whole dataframe
df_postalcode.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [86]:
#check the dimensions (rows, columns) of df_postalcode
df_postalcode.shape

(103, 3)

# Now that we have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In [75]:
#add latitude and longitude for our Postal codes using this csv file: http://cocl.us/Geospatial_data
#shell escape: needs the `wget` binary on the PATH; -O writes the download to geospatial_data.csv
!wget -O geospatial_data.csv 'http://cocl.us/Geospatial_data'

--2020-10-28 05:32:07--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.63.96.211, 169.63.97.45
Connecting to cocl.us (cocl.us)|169.63.96.211|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2020-10-28 05:32:07--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.63.96.211|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-10-28 05:32:08--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.26.197
Connecting to ibm.box.com (ibm.box.com)|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-10-28 05:32:08--  https://ibm.box.com/public/static/9afzr83pp

In [88]:
#read the downloaded csv and relabel its columns, by position, with the names used in this notebook
coordinate_columns = ['Postalcode', 'Latitude', 'Longitude']
geospatial_data = pd.read_csv('geospatial_data.csv').set_axis(coordinate_columns, axis='columns')
geospatial_data.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [89]:
#join the coordinates onto the postal code table through the shared 'Postalcode' column
df_merged_postalcode = pd.merge(df_postalcode, geospatial_data, on='Postalcode')
df_merged_postalcode.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [91]:
#count the distinct boroughs and the rows of the merged dataframe, then report both
borough_count = len(df_merged_postalcode['Borough'].unique())
row_count = len(df_merged_postalcode)
print(f'The dataframe has {borough_count} boroughs and {row_count} neighborhoods')

The dataframe has 10 boroughs and 103 neighborhoods


In [92]:
#get the geographical coordinates of Toronto
address = 'Toronto'

#to define an instance of the geocoder, we need to define a user_agent
geolocator = Nominatim(user_agent = 't_explorer')
location = geolocator.geocode(address)

#geocode() gives back None when nothing matches; stop with a clear message
#instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347


In [96]:
#visualize Toronto and the neighborhoods in it
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 11)

#appearance shared by every marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

#add one circle marker per row of the merged dataframe, with the neighborhood text as its popup
marker_rows = zip(
    df_merged_postalcode['Latitude'],
    df_merged_postalcode['Longitude'],
    df_merged_postalcode['Neighborhood'],
)
for lat, lng, hood_label in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(hood_label, parse_html=True),
        **marker_style
    )
    marker.add_to(map_toronto)

map_toronto