<a href="https://colab.research.google.com/github/abdurrahmanshidiq/IBM-Capstone-Project/blob/master/Segmenting_and_Clustering_Neighborhoods_in_Toronto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1 align=center><font size = 5>Segmenting and Clustering Neighborhoods in New York City</font></h1>

# Table of Content


1. Chapter 1 : Scrapping Wikipedia to get Toronto Neighborhoods Data 
2. Chapter 2 : Load Geolocation Data, then Merged to Toronto Neighborhoods Data based on Postal Code
3. Chapter 3 : Explore & Cluster the Neighborhoods in Toronto using Foursquare API & K-Means Clustering

# Chapter 1

Scrapping Wikipedia to get *Toronto Neighborhoods Data*

In [44]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

from bs4 import BeautifulSoup
import requests
import json
from geopy.geocoders import Nominatim
from pandas import json_normalize

import folium
from folium import plugins

from sklearn.cluster import KMeans

In [45]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw = requests.get(url).text

In [46]:
soup = BeautifulSoup(raw, 'html5lib')

In [47]:
len(soup.find('table'))

2

In [48]:
table = soup.find_all('table')[0].find_all('td')

post_code = []
borough = []
neigh = []

for i,row in enumerate(table):
  if table[i].span.text == 'Not assigned':
    pass
  else:
    post_code.append(table[i].p.text[:3])
    borough.append(table[i].span.text.split('(')[0])
    neigh.append(table[i].span.text.split('(')[1].replace('/',',').replace(')',''.strip(' ')))

**Alternative Code**
<!--
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
-->

In [49]:
#Checking Unique Values for Borough

df_toronto = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])
df_toronto['PostalCode'] = post_code
df_toronto['Borough'] = borough
df_toronto['Neighborhood'] = neigh

df_toronto['Borough'].unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'East YorkEast Toronto', 'Central Toronto',
       'MississaugaCanada Post Gateway Processing Centre',
       'Downtown TorontoStn A PO Boxes25 The Esplanade',
       'EtobicokeNorthwest',
       'East TorontoBusiness reply mail Processing Centre969 Eastern'],
      dtype=object)

In [50]:
# Fixing Borough Name
# Getting the Data Index
print(df_toronto[df_toronto['Borough']=='East YorkEast Toronto'].index)
print(df_toronto[df_toronto['Borough']=='MississaugaCanada Post Gateway Processing Centre'].index)
print(df_toronto[df_toronto['Borough']=='Downtown TorontoStn A PO Boxes25 The Esplanade'].index)
print(df_toronto[df_toronto['Borough']=='EtobicokeNorthwest'].index)
print(df_toronto[df_toronto['Borough']=='East TorontoBusiness reply mail Processing Centre969 Eastern'].index)

Int64Index([35], dtype='int64')
Int64Index([76], dtype='int64')
Int64Index([92], dtype='int64')
Int64Index([94], dtype='int64')
Int64Index([100], dtype='int64')


In [51]:
# Fixing Borough Name
# Replace wrong `Borough` name with the correct name
df_toronto.loc[35,'Borough'] = df_toronto.loc[35,'Borough'].replace('East YorkEast Toronto', 'East York')
df_toronto.loc[76,'Borough'] = df_toronto.loc[76,'Borough'].replace('MississaugaCanada Post Gateway Processing Centre', 'Mississauga')
df_toronto.loc[92,'Borough'] = df_toronto.loc[92,'Borough'].replace('Downtown TorontoStn A PO Boxes25 The Esplanade', 'Downtown Toronto')
df_toronto.loc[94,'Borough'] = df_toronto.loc[94,'Borough'].replace('EtobicokeNorthwest', 'Etobicoke')
df_toronto.loc[100,'Borough'] = df_toronto.loc[100,'Borough'].replace('East TorontoBusiness reply mail Processing Centre969 Eastern', 'East Toronto')

In [52]:
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don MillsNorth
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [53]:
df_toronto.shape

(103, 3)

**End of Chapter 1**

# Chapter 2

Adding Latitude & Longitude to the Data Frame

In [54]:
# Import csv file to google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
df_geo = pd.read_csv('/content/drive/My Drive/IBM DS Certificate/10. Capstone Project/2. K-Means Clustering/Geospatial_Coordinates.csv')
df_geo.columns = ['PostalCode', 'Latitude', 'Longitude']
df_geo

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [56]:
# Merge `df_toronto` with `df_geo`based on PostalCode
df_merged = df_toronto.merge(df_geo, how='inner', on='PostalCode')
df_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
7,M3B,North York,Don MillsNorth,43.745906,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


**End of Chapter 2**

# Chapter 3

Explore and Cluster the neighborhoods in Toronto

In [57]:
neighborhoods = df_merged.copy()
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [58]:
print('The dataframe has {} borough & {} neighborhoods.'.format(len(neighborhoods['Borough'].unique()), neighborhoods.shape[0]))

The dataframe has 11 borough & 103 neighborhoods.


In [59]:
# Use Geopy to get Lat, Lon of Toronto

address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent='on_explore')
location = geolocator.geocode(address)
lat = location.latitude
lon = location.longitude
print('Geographical Coordinate of Toronto are {}, {}'.format(lat, lon))

Geographical Coordinate of Toronto are 43.6534817, -79.3839347


In [60]:
# Create Map of Toronto
map_toronto = folium.Map(location=[lat,lon], zoom_start=10)

for lat, lon, borough, neigh in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
  label = 'Neighborhood: {} \n\n Borough: {}'.format(neigh, borough)
  label = folium.Popup(label, parse_html=True)
  folium.CircleMarker([lat,lon], radius=5,
                      popup=label, color='blue',
                      fill=True, fill_color='#3186cc',
                      fill_opacity=0.5, parse_html=False).add_to(map_toronto)

map_toronto

Simplify the above map and segment and cluster only the neighborhoods in North York. So let's slice the original dataframe and create a new dataframe of the North York data.

In [61]:
neighborhoods['Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

In [62]:
# Select Only North York Borough

df_NY = neighborhoods[neighborhoods['Borough']=='North York'].reset_index(drop=True)[['Borough', 'Neighborhood', 'Latitude', 'Longitude']]
df_NY.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.753259,-79.329656
1,North York,Victoria Village,43.725882,-79.315572
2,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
3,North York,Don MillsNorth,43.745906,-79.352188
4,North York,Glencairn,43.709577,-79.445073


In [63]:
df_NY.shape

(24, 4)

In [64]:
# Get Coordiates for North York, Toronto, Canada
address = 'North York, ON, Canada'

geolocator = Nominatim(user_agent='on_explore')
location = geolocator.geocode(address)
lat = location.latitude
lon = location.longitude
print('Geographical Coordinate of North York, Toronto are {}, {}'.format(lat, lon))

Geographical Coordinate of North York, Toronto are 43.7543263, -79.44911696639593


In [65]:
# Visualizing North York Map

map_NorthYork = folium.Map(location=[lat,lon], zoom_start=11)

for lat, lon, neigh in zip(df_NY['Latitude'], df_NY['Longitude'], df_NY['Neighborhood']):
  label = '{}'.format(neigh)
  label = folium.Popup(label, parse_html=True)
  folium.CircleMarker([lat,lon], radius=5,
                      popup=label, color='blue',
                      fill=True, fill_color='#3186cc',
                      fill_opacity=0.5, parse_html=False).add_to(map_NorthYork)

map_NorthYork

In [66]:
map_NorthYork = folium.Map(location=[lat,lon], zoom_start=11)

incidents = plugins.MarkerCluster().add_to(map_NorthYork)

for lat, lon, neigh in zip(df_NY['Latitude'], df_NY['Longitude'], df_NY['Neighborhood']):
  label = '{}'.format(neigh)
  label = folium.Popup(label, parse_html=True)
  folium.CircleMarker([lat,lon], radius=5,
                      popup=label, color='blue',
                      fill=True, fill_color='#3186cc',
                      fill_opacity=0.5, parse_html=False).add_to(incidents)

map_NorthYork

## Utilizing the Foursquare API to explore the neighborhoods and segment them

In [67]:
CLIENT_ID = '3UT2CP1TASOWFFEBJL3DGSHWTXORPEA5H1UQLX02M3HGHDS1' # your Foursquare ID
CLIENT_SECRET = 'P4N2G2HR4TKTXLDEEVUT1EWROXQ3ZSRHO1TFGGZ2JAWKEDIS' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3UT2CP1TASOWFFEBJL3DGSHWTXORPEA5H1UQLX02M3HGHDS1
CLIENT_SECRET:P4N2G2HR4TKTXLDEEVUT1EWROXQ3ZSRHO1TFGGZ2JAWKEDIS


In [68]:
df_NY.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.753259,-79.329656
1,North York,Victoria Village,43.725882,-79.315572
2,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
3,North York,Don MillsNorth,43.745906,-79.352188
4,North York,Glencairn,43.709577,-79.445073


In [69]:
df_NY.shape

(24, 4)

EXPLORE `Parkwoods`  neighborhood

In [70]:
# Parkwoods

neigh_name = df_NY.loc[0,'Neighborhood']
neigh_lat = df_NY.loc[0,'Latitude']
neigh_lon = df_NY.loc[0,'Longitude']

print('Coordinate for "{}" neighborhood is = Lat : {}, Lng : {}'.format(neigh_name, neigh_lat, neigh_lon))

Coordinate for "Parkwoods" neighborhood is = Lat : 43.7532586, Lng : -79.3296565


End Point Venue/Explore = Returns a list of recommended venues near the current location.

 Get the top 100 venues that are in `Parkwoods` within a radius of 500 meters.

In [71]:
# Get the top 100 venues that are in Parkwoods within a radius of 500 meters.
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET,
                                                                                                                           VERSION, neigh_lat, neigh_lon,
                                                                                                                           radius, LIMIT)
print(url)

https://api.foursquare.com/v2/venues/explore?client_id=3UT2CP1TASOWFFEBJL3DGSHWTXORPEA5H1UQLX02M3HGHDS1&client_secret=P4N2G2HR4TKTXLDEEVUT1EWROXQ3ZSRHO1TFGGZ2JAWKEDIS&v=20180605&ll=43.7532586,-79.3296565&radius=500&limit=100


### Inspecting Json File ##

In [72]:
res = requests.get(url).json()

In [73]:
res.keys()

dict_keys(['meta', 'response'])

In [74]:
res['response'].keys()

dict_keys(['headerLocation', 'headerFullLocation', 'headerLocationGranularity', 'totalResults', 'suggestedBounds', 'groups'])

In [75]:
res['response']['groups'][0].keys()

dict_keys(['type', 'name', 'items'])

In [76]:
res['response']['groups'][0]['type']

'Recommended Places'

In [77]:
res['response']['groups'][0]['name']

'recommended'

In [78]:
res['response']['groups'][0]['items']

[{'reasons': {'count': 0,
   'items': [{'reasonName': 'globalInteractionReason',
     'summary': 'This spot is popular',
     'type': 'general'}]},
  'referralId': 'e-0-4e8d9dcdd5fbbbb6b3003c7b-0',
  'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_',
      'suffix': '.png'},
     'id': '4bf58dd8d48988d163941735',
     'name': 'Park',
     'pluralName': 'Parks',
     'primary': True,
     'shortName': 'Park'}],
   'id': '4e8d9dcdd5fbbbb6b3003c7b',
   'location': {'address': 'Toronto',
    'cc': 'CA',
    'city': 'Toronto',
    'country': 'Canada',
    'distance': 245,
    'formattedAddress': ['Toronto', 'Toronto ON', 'Canada'],
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.751976046055574,
      'lng': -79.33214044722958}],
    'lat': 43.751976046055574,
    'lng': -79.33214044722958,
    'state': 'ON'},
   'name': 'Brookbanks Park',
   'photos': {'count': 0, 'groups': []},
   'venuePage': {'id': '600917367'}}},
 {'r

In [79]:
len(res['response']['groups'][0]['items'])

4

In [80]:
json_normalize(res['response']['groups'][0]['items']).head()

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,venue.location.distance,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,e-0-4e8d9dcdd5fbbbb6b3003c7b-0,0,"[{'summary': 'This spot is popular', 'type': '...",4e8d9dcdd5fbbbb6b3003c7b,Brookbanks Park,Toronto,43.751976,-79.33214,"[{'label': 'display', 'lat': 43.75197604605557...",245,CA,Toronto,ON,Canada,"[Toronto, Toronto ON, Canada]","[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",0,[],600917367.0
1,e-0-4e6696b6d16433b9ffff47c3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4e6696b6d16433b9ffff47c3,KFC,,43.754387,-79.333021,"[{'label': 'display', 'lat': 43.75438666345904...",298,CA,,,Canada,[Canada],"[{'id': '4bf58dd8d48988d16e941735', 'name': 'F...",0,[],
2,e-0-4cb11e2075ebb60cd1c4caad-2,0,"[{'summary': 'This spot is popular', 'type': '...",4cb11e2075ebb60cd1c4caad,Variety Store,29 Valley Woods Road,43.751974,-79.333114,"[{'label': 'display', 'lat': 43.75197441585782...",312,CA,Toronto,ON,Canada,"[29 Valley Woods Road, Toronto ON, Canada]","[{'id': '4bf58dd8d48988d1f9941735', 'name': 'F...",0,[],
3,e-0-4e1205c0b61c637b97b299cd-3,0,"[{'summary': 'This spot is popular', 'type': '...",4e1205c0b61c637b97b299cd,GreenWin pool,44 Valley woods road,43.756232,-79.333842,"[{'label': 'display', 'lat': 43.7562324728174,...",472,CA,Toronto,ON,Canada,"[44 Valley woods road, Toronto ON, Canada]","[{'id': '4bf58dd8d48988d15e941735', 'name': 'P...",0,[],


### Extracting Nearby `Venues` Information from JSON file

In [81]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):
  neigh_name = []
  neigh_lat = []
  neigh_lon = []
  venue_name = []
  venue_lat = []
  venue_lon = []
  venue_cat_name = []

  for name, lat, lon in zip(names, latitudes, longitudes):
    print(name)

    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET,
                                                                                                                           VERSION, lat, lon,
                                                                                                                           radius, LIMIT)
    results = requests.get(url).json()['response']['groups'][0]['items']

    for val in results:
      neigh_name.append(name)
      neigh_lat.append(lat)
      neigh_lon.append(lon)
      venue_name.append(val['venue']['name'])
      venue_lat.append(val['venue']['location']['lat'])
      venue_lon.append(val['venue']['location']['lng'])
      venue_cat_name.append(val['venue']['categories'][0]['name'])


  nearby_venue = pd.DataFrame({'Neighborhood':neigh_name, 'Neighboorhood Latitude':neigh_lat, 'Neighborhood Longitude':neigh_lon, 
                               'venue':venue_name, 'Venue Latitude':venue_lat, 'Venue Longitude':venue_lon, 'Venue Category':venue_cat_name})
  return nearby_venue



In [82]:
northyork_venues = getNearbyVenues(names=df_NY['Neighborhood'],
                                   latitudes=df_NY['Latitude'],
                                   longitudes=df_NY['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor , Lawrence Heights
Don MillsNorth
Glencairn
Don MillsSouth
Hillcrest Village
Bathurst Manor , Wilson Heights , Downsview North
Fairview , Henry Farm , Oriole
Northwood Park , York University
Bayview Village
DownsviewEast  
York Mills , Silver Hills
DownsviewWest
North Park , Maple Leaf Park , Upwood Park
Humber Summit
Willowdale , Newtonbrook
DownsviewCentral
Bedford Park , Lawrence Manor East
Humberlea , Emery
WillowdaleSouth
DownsviewNorthwest
York Mills West
WillowdaleWest


In [83]:
northyork_venues.head()

Unnamed: 0,Neighborhood,Neighboorhood Latitude,Neighborhood Longitude,venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Parkwoods,43.753259,-79.329656,GreenWin pool,43.756232,-79.333842,Pool
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [84]:
northyork_venues.shape

(239, 7)

In [85]:
northyork_venues.groupby('Neighborhood').count().reset_index()[['Neighborhood','venue']].sort_values('venue', ascending=False).head()

Unnamed: 0,Neighborhood,venue
9,"Fairview , Henry Farm , Oriole",63
20,WillowdaleSouth,33
2,"Bedford Park , Lawrence Manor East",23
0,"Bathurst Manor , Wilson Heights , Downsview North",22
4,Don MillsSouth,20


In [86]:
northyork_venues[northyork_venues['Neighborhood']=='Parkwoods']['venue'].value_counts()

Brookbanks Park    1
GreenWin pool      1
KFC                1
Variety Store      1
Name: venue, dtype: int64