## Exploring and clustering neighborhoods in Toronto - IBM data science certification

### Part 1 - building a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name

In [3]:
# IMPORTING LIBRARIES
import requests
from bs4 import BeautifulSoup
from csv import writer
import pandas as pd
import numpy as np

In [4]:
# SETTING UP BEAUTIFULL SOUP

response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') # REQUESTING URL
soup = BeautifulSoup(response.text, "lxml")
wiki_table = soup.find("table",{ "class" : "wikitable sortable" } ) #LOCATING WIKIPEDIA TABLE
table_data = wiki_table.tbody.find_all("tr") # LOCATING THE HEADER

In [5]:
#lOCATING AND SAVING Wikipedia COLUMN HEADERS
column = [x.text.strip() for x in table_data[0].find_all("th")]
column # COLUMN HEADERS

['Postcode', 'Borough', 'Neighbourhood']

In [6]:
# FINDING AND SAVING EACH ELEMENT OF TABLE AS PER COLUMNS AND ROWS
data_table = pd.DataFrame(columns = column)
rows = []
for i in range(1,len(table_data)-1):
    rows = [x.text.strip() for x in table_data[i].find_all("td")]
    data_table.loc[i] = rows

data_table.head() # SAMPLE OF TABLE

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
# LOOKING AT UN-ASSIGNED BOROUGH AND NEIGHBOURHOOD 
data_table.reset_index(inplace = True)
data_table = data_table[data_table["Borough"] != "Not assigned"]
data_table["Neighbourhood"][data_table["Neighbourhood"] == "Not assigned"] = data_table["Borough"]

In [8]:
# POSTCODES WITH MULTIPLE NEIGHBOUR HOOD

aggre_postcode = data_table.pivot_table(index = "Postcode", values = "Neighbourhood", aggfunc = "count")
postcode_mult = aggre_postcode.index[aggre_postcode["Neighbourhood"] > 1]
postcode_mult

Index(['M1B', 'M1C', 'M1E', 'M1K', 'M1L', 'M1M', 'M1N', 'M1P', 'M1R', 'M1T',
       'M1V', 'M2J', 'M2L', 'M2M', 'M3C', 'M3H', 'M3J', 'M3K', 'M4B', 'M4K',
       'M4L', 'M4T', 'M4V', 'M4X', 'M5B', 'M5H', 'M5J', 'M5K', 'M5L', 'M5M',
       'M5P', 'M5R', 'M5S', 'M5T', 'M5V', 'M5X', 'M6A', 'M6H', 'M6J', 'M6K',
       'M6L', 'M6M', 'M6N', 'M6P', 'M6R', 'M6S', 'M8V', 'M8W', 'M8X', 'M8Y',
       'M8Z', 'M9B', 'M9C', 'M9M', 'M9R', 'M9V'],
      dtype='object', name='Postcode')

In [9]:
# POSTCODES WITH single NEIGHBOUR HOOD

postcode_single = aggre_postcode.index[aggre_postcode["Neighbourhood"] == 1]
postcode_single

Index(['M1G', 'M1H', 'M1J', 'M1S', 'M1W', 'M1X', 'M2H', 'M2K', 'M2N', 'M2P',
       'M2R', 'M3A', 'M3B', 'M3L', 'M3M', 'M3N', 'M4A', 'M4C', 'M4E', 'M4G',
       'M4H', 'M4J', 'M4M', 'M4N', 'M4P', 'M4R', 'M4S', 'M4W', 'M4Y', 'M5A',
       'M5C', 'M5E', 'M5G', 'M5N', 'M5W', 'M6B', 'M6C', 'M6E', 'M6G', 'M7A',
       'M7R', 'M7Y', 'M9A', 'M9L', 'M9N', 'M9P', 'M9W'],
      dtype='object', name='Postcode')

In [10]:
# TABLE WITH MULTIPLE NEIGHBOURHOOD FOR SINGLE postocde
tab1 = data_table.loc[data_table.Postcode.isin(postcode_mult),:].sort_values(by='Postcode').reset_index(drop=True)
tab1

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,11,M1B,Scarborough,Rouge
1,12,M1B,Scarborough,Malvern
2,29,M1C,Scarborough,Port Union
3,28,M1C,Scarborough,Rouge Hill
4,27,M1C,Scarborough,Highland Creek
5,42,M1E,Scarborough,Guildwood
6,44,M1E,Scarborough,West Hill
7,43,M1E,Scarborough,Morningside
8,91,M1K,Scarborough,East Birchmount Park
9,92,M1K,Scarborough,Ionview


In [11]:
# TABLE WITH single NEIGHBOURHOOD FOR SINGLE postocde
tab2 = data_table.loc[data_table.Postcode.isin(postcode_single),:].sort_values(by='Postcode').reset_index(drop=True)
tab2

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,53,M1G,Scarborough,Woburn
1,62,M1H,Scarborough,Cedarbrae
2,76,M1J,Scarborough,Scarborough Village
3,180,M1S,Scarborough,Agincourt
4,236,M1W,Scarborough,L'Amoreaux West
5,246,M1X,Scarborough,Upper Rouge
6,63,M2H,North York,Hillcrest Village
7,94,M2K,North York,Bayview Village
8,142,M2N,North York,Willowdale South
9,154,M2P,North York,York Mills West


In [12]:
#nEIGHBOUR LIST FROM TABLE 1
codes = []
boroughs = []
neighbours = []

for code in postcode_mult:
    
    table = tab1.loc[tab1.Postcode == code, :] # split 't1' to specific table by 'postcode'
    
    code = np.unique(table.Postcode) # extract unique 'postcode' in the column
    codes.append(code[0])
    
    borough = np.unique(table.Borough) # extract unique 'borough' in the column
    boroughs.append(borough[0])
    
    neighbour = table["Neighbourhood"].tolist() # extract all 'neighbourhood' & convert to list format
    neighbour = ', '.join(neighbour) # use .join() method to combine each 'neighbourhood' with ", "
    neighbours.append(neighbour)



In [13]:
# NEW DF FOR POSTCODE WITH MULTIPLE NEIGHBOURHOODS

table_new = pd.DataFrame({'Postcode': codes, 'Borough': boroughs, 'Neighbourhood': neighbours})
table_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, West Hill, Morningside"
3,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
4,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"


In [14]:
# CREATING NEW TABLE (TREATED) WITH SINGLE TABLE, WITH MULTIPLE NEIGHBOURHOODS ADDED AS SINGLE ROWS

new_data = pd.concat([table_new, tab2], axis=0).sort_values(by="Postcode").reset_index(drop=True)
new_data.drop("index", axis = 1, inplace = True)
new_data.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  This is separate from the ipykernel package so we can avoid doing imports until


(103, 3)

In [15]:
new_data.shape

(103, 3)

 ### PART 2 - getting the latitude and the longitude coordinates of each neighborhood

In [16]:
# importing libraries
import geopy
import geopandas

In [17]:
geo_lati = {}
geo_long = {}

geo = geopy.Nominatim(user_agent="Detector", timeout=20)

for z in np.unique(new_data.Borough):
    
    location = geo.geocode("{}, Toronto, Ontario".format(z))
    
    geo_lati[x] = location.latitude
    geo_long[x] = location.longitude

In [18]:
# latitude
geo_lati

{'Central Toronto': 43.6449033,
 'Downtown Toronto': 43.6563221,
 'East Toronto': 43.626243,
 'East York': 43.699971000000005,
 'Etobicoke': 43.671459150000004,
 'Mississauga': 43.6597818,
 'North York': 43.7543263,
 'Scarborough': 43.773077,
 'West Toronto': 43.6449033,
 'York': 43.67910515}

In [19]:
# Longitude
geo_long

{'Central Toronto': -79.3818364,
 'Downtown Toronto': -79.3809161,
 'East Toronto': -79.396962,
 'East York': -79.33251996261595,
 'Etobicoke': -79.55249206611668,
 'Mississauga': -79.64739965714713,
 'North York': -79.44911696639593,
 'Scarborough': -79.257774,
 'West Toronto': -79.3818364,
 'York': -79.49118414007154}

In [74]:
# Error; Hence going to use csv
geo = geopy.Nominatim(user_agent="Detector", timeout=50)

for x, y in zip(new_data.Postcode[0:5], new_data.Borough[0:5]):
    
    loc = geo.geocode("{}, {}, Toronto, Canada".format(x, y))
    print("{}, {}: latitude {}, longitude {}".format(x, y, loc.latitude, loc.longitude))

M1B, Scarborough: latitude 43.773077, longitude -79.257774
M1C, Scarborough: latitude 43.773077, longitude -79.257774


AttributeError: 'NoneType' object has no attribute 'latitude'

In [21]:
# using the data provided instead, geopy seems to be broken

geo_data = pd.read_csv("http://cocl.us/Geospatial_data")
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# merging to get lat and longitude

geo_df = geo_data.loc[geo_data["Postal Code"].isin(new_data.Postcode.values), :]
print(geo_df.shape)

geo_table = new_data.merge(geo_df, left_on="Postcode", right_on="Postal Code")
geo_table.head()

(103, 3)


Unnamed: 0,Borough,Neighbourhood,Postcode,Postal Code,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",M1B,M1B,43.806686,-79.194353
1,Scarborough,"Port Union, Rouge Hill, Highland Creek",M1C,M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, West Hill, Morningside",M1E,M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,M1H,43.773136,-79.239476


In [23]:
geo_table = geo_table[["Postcode", "Borough", "Neighbourhood", "Latitude", "Longitude"]]
geo_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [24]:
geo_table.shape

(103, 5)

### Part 3 - Explore and cluster the neighborhoods in Toronto

In [28]:
import folium
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files
address = 'Toronto, Ontario'

geolocator = geopy.Nominatim(user_agent="ny_explorer", timeout=30)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto {}, {}.'.format(latitude, longitude))

# create map of Toronto using latitude and longitude values
map_of_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


# add markers to map based on coordinates of postcode
for lat, lng, neighbourhood in zip(geo_table['Latitude'], geo_table['Longitude'], geo_table['Neighbourhood']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='grey',
        fill=True,
        fill_color='#009fe3',
        fill_opacity=0.7,
        parse_html=False).add_to(map_of_toronto)  
    
map_of_toronto

The geograpical coordinate of Toronto 43.653963, -79.387207.


In [29]:
#Foursquare ID
CLIENT_ID = "GPB2DWEOQ2QZ1D1DXQYB1IXWQT0YCXRJWR1OHQ5BCB1BTBBP" 
#Foursquare Secret
CLIENT_SECRET = "ILCEJ32YAXA3E5W3FB1WFAQ2RHXEFX2YW414DY4YQBUXJKGB"
# Foursquare API version
VERSION = "20161225"

In [31]:
LIMIT = 5
radius = 500

location_list = [] # list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(geo_table.Neighbourhood, geo_table.Latitude, geo_table.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [32]:
dat = pd.DataFrame(x for row in location_list for x in row)
dat.columns = ['Neighbourhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','category']
dat.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, West Hill, Morningside",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
3,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop
4,Cedarbrae,43.773136,-79.239476,Federick Restaurant,43.774697,-79.241142,Hakka Restaurant


In [35]:
# create map of Toronto using latitude and longitude values
map_dat = folium.Map(location=[latitude, longitude], zoom_start=10)


# add markers to map based on coordinates of postcode
for lat, lng, neighbourhood in zip(dat['V_Latitude'], dat['V_Longitude'], dat['Venue']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='grey',
        fill=True,
        fill_color='#009fe3',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dat)  
    
map_dat

## K- means clustering based on lat and long of neighbourhood

In [66]:
# setting up packages for clustering
import numpy as np
import pandas as pdb
from matplotlib import pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library

In [67]:
geo_table

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,0,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,0,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,0,M1G,Scarborough,Woburn,43.770992,-79.216917
4,0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,0,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,0,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,2,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,0,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.716316,-79.239476
9,2,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848


In [60]:
# dataframe for clustering

# set number of clusters
kclusters = 5

toronto_grouped_clustering = geo_table.drop(["Postcode","Borough","Neighbourhood"],axis= 1)
#toronto_grouped_clustering

# # run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# # check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 2])

In [61]:
geo_table.insert(0, 'Cluster Labels', kmeans.labels_)


geo_table.head()

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,0,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,0,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,0,M1G,Scarborough,Woburn,43.770992,-79.216917
4,0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [69]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


In [72]:

# add clusters to the map
markers_colors = []
for lat, lon, poi, cluster in zip(geo_table['Latitude'], geo_table['Longitude'], geo_table['Borough'], geo_table['Cluster Labels']):
    
    label = folium.Popup(
        str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters