# Assignment on Segmenting and Clustering Neighborhoods in Toronto

Import required libraries

In [359]:
import numpy as np 
import pandas as pd 
import re
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [360]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json # library to handle JSON files
import ast

In [361]:
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

### Method 1 - Scrap Using Beautiful Soup :

In [362]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url)

Convert HTML text to Python Object

In [363]:
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [364]:
# add a check to break the code since there are more than 1 table, table 1 has 180rows, so exist loop after 179
postal = []
borough = []
neighborhood = []
# enumerate usage google
for i, items in enumerate(soup.find_all('tr')[1::1]):
    data = items.find_all(['th','td'])
    postal.append(data[0].get_text().replace('\n', ''))
    borough.append(data[1].get_text().replace('\n', ''))
    neighborhood.append(data[2].get_text().replace('\n', ''))
    if i == 179:
        break

In [365]:
table_dict = {
                "Postal Code": postal,
                 "Borough": borough,
                 "Neighborhood": neighborhood
}

In [366]:
table = pd.DataFrame(table_dict)
table.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Alternative Method - Scrape using Pandas :

In [367]:
# directly read html using pandas
dfs = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# returns a list of all the tables in the webpage
# here we have 3 tables
len(dfs)

3

In [368]:
#The one we need is the first table in webpage
df = dfs[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [369]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal code     180 non-null object
Borough         180 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [372]:
#Create New data frame excluding Borough having Not Assigned value
df_new = df[df['Borough'] != 'Not assigned']
df_new.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [373]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
Postal code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 3.2+ KB


In [374]:
#Create a group of Borough and assign each neighborhood against them
df1=df_new.groupby("Postal code").agg(lambda x: ','.join(x))
df1.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postal code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,Malvern / Rouge
M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
M1E,Scarborough,Guildwood / Morningside / West Hill
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [375]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103 entries, M1B to M9W
Data columns (total 2 columns):
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(2)
memory usage: 2.4+ KB


In [376]:
#Remove the index
df1=df1.reset_index()

In [377]:
#Assign value of Borough to Neighborhood where Neighborhood = "Not assigned"
df1.loc[df1['Neighborhood']=="Not assigned",'Neighborhood']=df1.loc[df1['Neighborhood']=="Not assigned",'Borough']

In [378]:
df1.shape

(103, 3)

In [379]:
# Check the data heds
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [380]:
#Replace the forward slash character with comma fron neighborhood  and check data head.
df1['Neighborhood']=df1['Neighborhood'].str.replace('/', ',')
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [382]:
df1.shape

(103, 3)

In [338]:
#Read geo spatial data
geo = pd.read_csv('http://cocl.us/Geospatial_data')
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [339]:
#Concatenate geography data with toronat data using postal code column
source = pd.concat([df1, geo], axis=1, keys='Postal Code' )
source.head()

Unnamed: 0_level_0,P,P,P,o,o,o
Unnamed: 0_level_1,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [340]:
#Remove postal code of geography data after concatination
source = source.drop(source.columns[[3]], axis=1)
source.head()

Unnamed: 0_level_0,P,P,P,o,o
Unnamed: 0_level_1,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [341]:
#Drop Multilevel
source.columns=source.columns.droplevel()
source.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [342]:
source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 5 columns):
Postal code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
Latitude        103 non-null float64
Longitude       103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.1+ KB


### Clustering the Neighborhoods

#### Use geopy library to get the latitude and longitude values in Toranto.

In [344]:
address = 'ALDER RD , TORONTO, ON'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toranto City is {}, {}.'.format(latitude,longitude))

The geograpical coordinate of Toranto City is 43.7036881, -79.3255013.


In [346]:
# create map of Toranto using latitude and longitude values
map_toranto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(source['Latitude'], source['Longitude'], source['Borough'], source['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toranto)
map_toranto

In [349]:
#Select data for Scarborough
source_data = source[source['Borough'] == 'Scarborough'].reset_index(drop=True)
source_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [350]:
source_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
Postal code     17 non-null object
Borough         17 non-null object
Neighborhood    17 non-null object
Latitude        17 non-null float64
Longitude       17 non-null float64
dtypes: float64(2), object(3)
memory usage: 760.0+ bytes


In [352]:
#Use geopy library to get the latitude and longitude values in Scarborough
address = 'Scarborough, ON'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 43.773077, -79.257774.


In [353]:
# create map of Scarborough using latitude and longitude values
map_eto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(source_data['Latitude'], source_data['Longitude'], source_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_eto)  
    
map_eto

### Lets explorer first neighborhood's name

In [294]:
source_data.loc[0, 'Neighborhood']

'Malvern , Rouge'

In [354]:
neighborhood_latitude = source_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = source_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = source_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Malvern , Rouge are 43.806686299999996, -79.19435340000001.


## 2. Explore Neighborhoods in Scarborough

#### Let's create a function to repeat the same process to all the neighborhoods in Scarborough

In [355]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            "WF2HLZPWOXQ0BDJ4DIO4ESR4QBDE5Q3NFM1Y4KQUQEG2WCR2", 
            "YCZKUFTVKWSJZMIU3P4WZA0ZXPE0R3SEKKSYQXCGB3RE1KZC", 
            "20180604", 
            lat, 
            lng, 
            radius, 
            30)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [356]:
#Extract details of all neighborhood of Scarborough.
eto_venues = getNearbyVenues(names=source_data['Neighborhood'],
                                   latitudes=source_data['Latitude'],
                                   longitudes=source_data['Longitude']
                                  )


Malvern , Rouge
Rouge Hill , Port Union , Highland Creek
Guildwood , Morningside , West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park , Ionview , East Birchmount Park
Golden Mile , Clairlea , Oakridge
Cliffside , Cliffcrest , Scarborough Village West
Birch Cliff , Cliffside West
Dorset Park , Wexford Heights , Scarborough Town Centre
Wexford , Maryvale
Agincourt
Clarks Corners , Tam O'Shanter , Sullivan
Milliken , Agincourt North , Steeles East , L'Amoreaux East
Steeles West , L'Amoreaux West
Upper Rouge


In [298]:
#Size of Data Frame
print(eto_venues.shape)
#eto_venues.head()

(91, 7)
