# Segmenting and Clustering Neighbourhoods in Toronto
## For the Coursera Capstone Project - Assignment 2
### IBM Applied Data Science Professional Certificate

## Scraping Toronto Location Data

In [1]:
#Import all necessary libraries for screen scraping.
import pandas as pd
import numpy as np
!pip install lxml html5lib beautifulsoup4
print("Libraries installed")

Libraries installed


In [2]:
#Defines the URL to scrape and reads its tables into a dataframe
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)

In [3]:
#Shows how many tables are contained in the dataframe
print(len(dfs))

3


In [4]:
#This cell shows which table is in at index 0
print(dfs[0])

    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
5           M6A        North York   
6           M7A  Downtown Toronto   
7           M8A      Not assigned   
8           M9A         Etobicoke   
9           M1B       Scarborough   
10          M2B      Not assigned   
11          M3B        North York   
12          M4B         East York   
13          M5B  Downtown Toronto   
14          M6B        North York   
15          M7B      Not assigned   
16          M8B      Not assigned   
17          M9B         Etobicoke   
18          M1C       Scarborough   
19          M2C      Not assigned   
20          M3C        North York   
21          M4C         East York   
22          M5C  Downtown Toronto   
23          M6C              York   
24          M7C      Not assigned   
25          M8C      Not assigned   
2

In [5]:
#This is the table we're looking for, so let's assign it to a container
hoods = dfs[0]

In [6]:
#Shows the shape of the dataframe
hoods.shape

(180, 3)

In [7]:
#Shows the first five rows of the dataframe
hoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
#Assigns all rows where the Borough column has the value Not Assigned to a container
NAboroughs = hoods[hoods['Borough'] == 'Not assigned'].index
NAboroughs

Int64Index([  0,   1,   7,  10,  15,  16,  19,  24,  25,  28,  29,  33,  34,
             35,  37,  38,  42,  43,  44,  51,  52,  53,  60,  61,  62,  69,
             70,  71,  78,  79,  87,  88,  96,  97, 101, 105, 106, 110, 115,
            118, 119, 123, 124, 125, 127, 128, 131, 132, 133, 134, 136, 137,
            140, 141, 145, 146, 149, 150, 154, 155, 158, 159, 161, 162, 163,
            164, 166, 167, 170, 171, 172, 173, 174, 175, 176, 177, 179],
           dtype='int64')

In [9]:
#Drop the Not Assigned Boroughs from the dataframe & give the shape of the dataframe afterwards
hoods.drop(NAboroughs, inplace=True)
hoods.shape

(103, 3)

In [10]:
#Find duplicate values in the Postal Code column
dupehoods = hoods.duplicated(['Postal Code'])
print("Out of 103 columns in the dataframe, none are duplicate values.")
dupehoods.value_counts()

Out of 103 columns in the dataframe, none are duplicate values.


False    103
dtype: int64

In [11]:
#Find Neighborhoods with Not Assigned values
NANeighborhood = hoods[hoods['Neighborhood'] == 'Not assigned'].index
print('No Neighborhoods are Not Assigned.')
NANeighborhood


No Neighborhoods are Not Assigned.


Int64Index([], dtype='int64')

In [12]:
#Reset Index, show hoods shape
hoods.reset_index(drop=True, inplace=True)
hoods.shape

(103, 3)

## Pulling in Latitude & Longitude

### Using CSV File Due to Geocoder Failure

In [13]:
#Read in the Postal Code Lat & Long file
pc = 'https://cocl.us/Geospatial_data'
pc = pd.read_csv(pc)
pc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
#Merge the Neighborhoods dataframe with the Lat & Long dataframe
hoods_pc = pd.merge(hoods, pc, on="Postal Code")

In [15]:
#Show the newly merged dataframe
hoods_pc

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Explore & Cluster Toronto Neighborhoods

### Working with only the Boroughs with "Toronto" in the name

In [16]:
#Create a new dataframe with only Boroughs with "Toronto" in the name
new_hoods = hoods_pc[hoods_pc.Borough.str.contains("Toronto")]
new_hoods.reset_index(drop=True, inplace=True)
new_hoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [17]:
#Install & import geopy and folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print("Geopy and Folium imported")

Geopy and Folium imported


In [18]:
#Use Geopy to get the longitude & latitude of Toronto

address = 'Toronto, ON'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Toronto are 43.6534817, -79.3839347.


In [19]:
#Create a map of Toronto using latitude & longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(new_hoods['Latitude'], new_hoods['Longitude'], new_hoods['Borough'], new_hoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [20]:
#Let's take a look at which Boroughs are in Toronto
sorted = new_hoods.groupby('Borough')
sorted.first()

Unnamed: 0_level_0,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
Downtown Toronto,M5A,"Regent Park, Harbourfront",43.65426,-79.360636
East Toronto,M4E,The Beaches,43.676357,-79.293031
West Toronto,M6H,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [21]:
#Let's create a new dataframe for each of the Boroughs
CT_data = new_hoods[new_hoods['Borough'] == 'Central Toronto'].reset_index(drop=True)
DT_data = new_hoods[new_hoods['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
ET_data = new_hoods[new_hoods['Borough'] == 'East Toronto'].reset_index(drop=True)
WT_data = new_hoods[new_hoods['Borough'] == 'West Toronto'].reset_index(drop=True)

In [22]:
#Let's get the geographical coordinates for each of the Boroughs
#Central Toronto
CT_address = 'Central Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
CTlocation = geolocator.geocode(CT_address)
CTlatitude = CTlocation.latitude
CTlongitude = CTlocation.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(CTlatitude, CTlongitude))

#Downtown Toronto
DT_address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
DTlocation = geolocator.geocode(DT_address)
DTlatitude = DTlocation.latitude
DTlongitude = DTlocation.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(DTlatitude, DTlongitude))

#East Toronto
ET_address = 'East Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
ETlocation = geolocator.geocode(ET_address)
ETlatitude = ETlocation.latitude
ETlongitude = ETlocation.longitude
print('The geograpical coordinate of East Toronto are {}, {}.'.format(ETlatitude, ETlongitude))

#West Toronto
WT_address = 'West Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
WTlocation = geolocator.geocode(WT_address)
WTlatitude = WTlocation.latitude
WTlongitude = WTlocation.longitude
print('The geograpical coordinate of West Toronto are {}, {}.'.format(WTlatitude, WTlongitude))
#It looks like Central Toronto and West Toronto have the same coordinates

The geograpical coordinate of Central Toronto are 43.6534817, -79.3839347.
The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.
The geograpical coordinate of East Toronto are 43.72178945, -79.37402706301704.
The geograpical coordinate of West Toronto are 43.6534817, -79.3839347.


In [23]:
# create map of Central Toronto
CT_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(CT_data['Latitude'], CT_data['Longitude'], CT_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(CT_map)  
    
CT_map

In [24]:
# create map of Downtown Toronto
DT_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(DT_data['Latitude'], DT_data['Longitude'], DT_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(DT_map)  
    
DT_map

In [25]:
# create map of East Toronto
ET_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(ET_data['Latitude'], ET_data['Longitude'], ET_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(ET_map)  
    
ET_map

In [26]:
# create map of Central Toronto
WT_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(WT_data['Latitude'], WT_data['Longitude'], WT_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(WT_map)  
    
WT_map

In [27]:
#Print some conclusions after looking at the maps of each borough:
print('Central Toronto is much further North than the other three boroughs.')
print('There is a clear delineation between Downtown Toronto and East Toronto.')
print('West Toronto does not have a clear separator from Downtown Toronto.')

Central Toronto is much further North than the other three boroughs.
There is a clear delineation between Downtown Toronto and East Toronto.
West Toronto does not have a clear separator from Downtown Toronto.
