# Assignment 2: Segmenting and clustering Neighborhoods in Toronto.

## 1st: Scraping the web-data (we use 'BeautifulSoup' package.)

In [2]:
# Import basic modules used for this assignment 
import pandas as pd
import numpy as np 

In [3]:
# Import modules for scraping the Wikipedia table (retrieved from https://sateesh110.medium.com/how-to-scrape-wikipedia-table-using-python-beautiful-soup-cd0d8ee1a319)
import requests # for performing HTTP requests 
from bs4 import BeautifulSoup # for xml & html scraping

In [3]:
# The url of Wikipedia page 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Create a soup
soup = BeautifulSoup(requests.get(url).text, 'lxml')

In [5]:
# Title of Wikipedia page
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [6]:
# Find class 'wikitable sortable' in the HTML scrip
table = soup.find('table', {'class':'wikitable sortable'})

In [7]:
# Number of columns in the table
for row in table.findAll("tr"):
    cells = row.findAll("td")

len(cells)

3

In [8]:
# Number of rows in the table including header
rows = table.findAll("tr")
len(rows)

181

In [9]:
# Get Table Header
header = [th.text.rstrip() for th in rows[0].find_all('th')]

print(header)
print(len(header))

['Postal Code', 'Borough', 'Neighbourhood']
3


## 2nd: Making the dataframe

In [10]:
# Scrap the data and append to respective lists
c1 = []
c2 = []
c3 = []

for row in table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells) == 3: # Only extract table body not heading
        c1.append(cells[0].find(text = True))
        c2.append(cells[1].find(text = True))
        c3.append(cells[2].find(text = True))

In [11]:
# Make a dictionary to become a dataframe
d = dict([(x, 0) for x in header])
d

{'Postal Code': 0, 'Borough': 0, 'Neighbourhood': 0}

In [12]:
d['Postal Code'] = c1
d['Borough'] = c2
d['Neighbourhood'] = c3

In [13]:
# Trasform the dictionary to a dataframe
df = pd.DataFrame(d)

In [14]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [15]:
# Remove '\n' from the dataframe
df.replace(r'\n', '', regex = True, inplace = True)

In [16]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [17]:
# Drop rows whose 'Borough' values are 'Not assigned'
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace = True)

In [18]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [19]:
# to make sure whether rows having 'Not assigned ' as 'Neighbourhoods's value
df.loc[df['Neighbourhood'] == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [20]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [21]:
df.shape

(103, 3)

## 3rd: acquiring latitude and longitude of each postal code using Bing Map API

In [22]:
# import geocoder
import geocoder

In [23]:
# add two columns 'Latitude' and 'Longitude'
df['Latitude'] = ''
df['Longitude'] = ''

In [24]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


In [25]:
# fill up the two columns using loop
for i in range(len(df)):
    g = geocoder.bing('{}, Toronto, Ontario'.format(df.iloc[i, 0]), key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
    df.iloc[i, 3] = g.latlng[0]
    df.iloc[i, 4] = g.latlng[1]

Status code Unknown from http://dev.virtualearth.net/REST/v1/Locations: ERROR - HTTPConnectionPool(host='dev.virtualearth.net', port=80): Read timed out. (read timeout=5.0)


TypeError: 'NoneType' object is not subscriptable

In [26]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.756123,-79.329636
3,M4A,North York,Victoria Village,43.72678,-79.310738
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",,
165,M4Y,Downtown Toronto,Church and Wellesley,,
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",,
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",,


In [41]:
df.loc[df["Latitude"] == '']

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
39,M4G,East York,Leaside,,
40,M5G,Downtown Toronto,Central Bay Street,,
41,M6G,Downtown Toronto,Christie,,
45,M1H,Scarborough,Cedarbrae,,
46,M2H,North York,Hillcrest Village,,
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",,
165,M4Y,Downtown Toronto,Church and Wellesley,,
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",,
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",,


In [42]:
df.loc[df["Latitude"] != '']

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.756123,-79.329636
3,M4A,North York,Victoria Village,43.72678,-79.310738
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66452,-79.529076
9,M1B,Scarborough,"Malvern, Rouge",43.806847,-79.201469
11,M3B,North York,Don Mills,43.746311,-79.357605
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706676,-79.30658
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657478,-79.378632


In [49]:
df.iloc[102, :]

Postal Code                                                    M8Z
Borough                                                  Etobicoke
Neighbourhood    Mimico NW, The Queensway West, South of Bloor,...
Latitude                                                          
Longitude                                                         
Name: 178, dtype: object

In [46]:
g_1 = geocoder.bing('M4G, Toronto, Ontario', key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
df.iloc[23, 3] = g_1.latlng[0]
df.iloc[23, 4] = g_1.latlng[1]

In [47]:
df.loc[df["Latitude"] != '']

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.756123,-79.329636
3,M4A,North York,Victoria Village,43.72678,-79.310738
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66452,-79.529076
9,M1B,Scarborough,"Malvern, Rouge",43.806847,-79.201469
11,M3B,North York,Don Mills,43.746311,-79.357605
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706676,-79.30658
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657478,-79.378632


In [50]:
for j in range(24,103):
    g_2 = geocoder.bing('{}, Toronto, Ontario'.format(df.iloc[j, 0]), key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
    df.iloc[j, 3] = g_2.latlng[0]
    df.iloc[j, 4] = g_2.latlng[1]

Status code Unknown from http://dev.virtualearth.net/REST/v1/Locations: ERROR - HTTPConnectionPool(host='dev.virtualearth.net', port=80): Read timed out. (read timeout=5.0)


TypeError: 'NoneType' object is not subscriptable

In [54]:
df.loc[df["Latitude"] != ''].shape

(35, 5)

In [55]:
for k in range(36,103):
    g_3 = geocoder.bing('{}, Toronto, Ontario'.format(df.iloc[k, 0]), key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
    df.iloc[k, 3] = g_3.latlng[0]
    df.iloc[k, 4] = g_3.latlng[1]

Status code Unknown from http://dev.virtualearth.net/REST/v1/Locations: ERROR - HTTPConnectionPool(host='dev.virtualearth.net', port=80): Read timed out. (read timeout=5.0)


TypeError: 'NoneType' object is not subscriptable

In [56]:
df.loc[df["Latitude"] != ''].shape

(60, 5)

In [57]:
for l in range(61,103):
    g_4 = geocoder.bing('{}, Toronto, Ontario'.format(df.iloc[l, 0]), key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
    df.iloc[l, 3] = g_4.latlng[0]
    df.iloc[l, 4] = g_4.latlng[1]

Status code Unknown from http://dev.virtualearth.net/REST/v1/Locations: ERROR - HTTPConnectionPool(host='dev.virtualearth.net', port=80): Read timed out. (read timeout=5.0)


TypeError: 'NoneType' object is not subscriptable

In [58]:
df.loc[df["Latitude"] != ''].shape

(99, 5)

In [59]:
for m in range(100,103):
    g_5 = geocoder.bing('{}, Toronto, Ontario'.format(df.iloc[m, 0]), key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
    df.iloc[m, 3] = g_5.latlng[0]
    df.iloc[m, 4] = g_5.latlng[1]

In [60]:
df.loc[df["Latitude"] != ''].shape

(102, 5)

In [69]:
df.loc[df["Latitude"] == '']

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
57,M4J,East York,"East Toronto, Broadview North (Old East York)",,


In [78]:
df.loc[df["Postal Code"] == "M4J", "Latitude"]

57    
Name: Latitude, dtype: object

In [79]:
g_6 = geocoder.bing('M4J, Toronto, Ontario', key = "AveqYCAVfTS2P8hy16Zdv1JCwo_5CbSiObipo9UsshRtze28C--AYls_MLl0eFLd")
df.loc[df["Postal Code"] == "M4J", "Latitude"] = g_6.latlng[0]
df.loc[df["Postal Code"] == "M4J", "Longitude"] = g_6.latlng[1]

In [81]:
df.loc[df["Postal Code"] == 'M4J']

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
57,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685272,-79.337265


In [83]:
df.reset_index(inplace = True)

In [85]:
df.drop(["index"], axis = 1, inplace = True)

In [86]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.756123,-79.329636
1,M4A,North York,Victoria Village,43.72678,-79.310738
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.652699,-79.511276
99,M4Y,Downtown Toronto,Church and Wellesley,43.666286,-79.382446
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.651894,-79.381714
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.633709,-79.496521


## 4th: explorin and clustering the neighbourhoods in Toronto (we do this based on the Lab session contents)

In [87]:
# import libraries to be used
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

In [89]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [92]:
# import Nominatim which converts an address into latitude and longitude values
from geopy.geocoders import Nominatim

In [93]:
# Use geopy library to get the latitude and longitude values of Toronto, Ontario

address = "Toronto, Ontario"

geolocator = Nominatim(user_agent = "Toronto Explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print("The geographical coordinate of Toronto are {}, {}.".format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347.


In [96]:
# Create a map of Toronto with neighborhoods superimposed on top
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)

# Add markers to map
for lat, lng, borough, neighbourhood in zip(df["Latitude"], df["Longitude"], df["Borough"], df["Neighbourhood"]):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker([lat, lng], radius = 5, popup = label, color = 'blue', fill = True, fill_color = "#3186cc", fill_opacity = 0.7, parse_html = False).add_to(map_toronto)

map_toronto

In [97]:
# Segment and cluster only the neighbourhoods in Downtown Toronto.
# Slice the original dafaframe and create a new dataframe of the Downtown Toronto data.

Downtown_df = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop = True)
Downtown_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657478,-79.378632
3,M5C,Downtown Toronto,St. James Town,43.651112,-79.375732
4,M5E,Downtown Toronto,Berczy Park,43.647018,-79.374084


In [98]:
# Get the geographical coordinates of Downtown Toronto

address = "Downtown Toronto"

geolocator = Nominatim(user_agent = "Toronto Explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print("The geographical coordinate of Downtown Toronto are {}, {}".format(latitude, longitude))

The geographical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409


In [100]:
# Create map of Manhattan using latitude and longitude values

map_Downtown = folium.Map(location = [latitude, longitude], zoom_start = 11)

# Add markers to map
for lat, lng, label in zip(Downtown_df["Latitude"], Downtown_df["Longitude"], Downtown_df["Neighbourhood"]):
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker([lat, lng], radius = 5, popup = label, color = 'blue', fill = True, fill_color = "#3186cc", fill_opacity = 0.7, parse_html = False).add_to(map_Downtown)

map_Downtown

In [101]:
# Define Foursquare Credentials and Version

CLIENT_ID = "H1RCC0M44AYUJUXSK4H3IBJCEAGRS54WJHD3ABHKNUBNGAI0"
CLIENT_SECRET = '4DGTG1NSFOP1KU333FRGDPRHAWB0FW2YC523ZPDKIG1D042B' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: H1RCC0M44AYUJUXSK4H3IBJCEAGRS54WJHD3ABHKNUBNGAI0
CLIENT_SECRET:4DGTG1NSFOP1KU333FRGDPRHAWB0FW2YC523ZPDKIG1D042B


In [102]:
# Get the neighbourhood's name

Downtown_df.loc[0, 'Neighbourhood']

'Regent Park, Harbourfront'

In [103]:
# Get the neighbourhood's latitude and longitude

neighbourhood_latitude = Downtown_df.loc[0, 'Latitude']
neighbourhood_longitude = Downtown_df.loc[0, "Longitude"]

neighbourhood_name = Downtown_df.loc[0, "Neighbourhood"]

print("Latitude and longitude values of {} are {}, {}.".format(neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.65535354614258, -79.36504364013672.


In [105]:
# Get the top 100 venues that are in Regent Park and Harbourfront within a radius of 500 meters.

LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=H1RCC0M44AYUJUXSK4H3IBJCEAGRS54WJHD3ABHKNUBNGAI0&client_secret=4DGTG1NSFOP1KU333FRGDPRHAWB0FW2YC523ZPDKIG1D042B&v=20180605&ll=43.65535354614258,-79.36504364013672&radius=500&limit=100'

In [106]:
# Send the GET request and examine the results

results = requests.get(url).json()
results

        'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4cbdc6784495721ea262617a',
       'name': "Fusaro's",
       'location': {'address': '294 Richmond St. E.',
        'lat': 43.65334679826618,
        'lng': -79.36951731755141,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65334679826618,
          'lng': -79.36951731755141}],
        'distance': 423,
        'cc': 'CA',
        'city': 'Toronto',
        'state': 'ON',
        'country': 'Canada',
        'formattedAddress': ['294 Richmond St. E.', 'Toronto ON', 'Canada']},
       'categories': [{'id': '4bf58dd8d48988d110941735',
         'name': 'Italian Restaurant',
         'pluralName': 'Italian Restaurants',
         'shortName': 'Italian',
         'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/italian_',
          'suffix': '.png'},
         'primary': True}],
       'photos': {'count': 0, 'groups': []}},
      'referralId': 'e-0-4cbdc6784495721ea262617a-8'},
     {

In [108]:
# Borrow the get_category_type function from the Foursqure lab session

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else: return categories_list[0]['name']

In [110]:
# Transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize

In [111]:
# Clean the json and structure it into a pandas dataframe

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

# Filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis = 1)

# Clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
1,Roselle Desserts,Bakery,43.653447,-79.362017
2,Tandem Coffee,Coffee Shop,43.653559,-79.361809
3,Berkeley Church,Event Space,43.655123,-79.365873
4,The Yoga Lounge,Yoga Studio,43.655515,-79.364955


In [112]:
print("{} venues were returned by Foursquare.".format(nearby_venues.shape[0]))

25 venues were returned by Foursquare.


In [120]:
# Create a function to repeat the same process to all the neighbourhoods in Downtown Toronoto

def getNearbyVenues(names, latitudes, longitudes, radius = 500):

    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]["groups"][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

    return(nearby_venues)

In [121]:
Downtown_venues = getNearbyVenues(names = Downtown_df['Neighbourhood'], latitudes = Downtown_df['Latitude'], longitudes = Downtown_df['Longitude'])

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [122]:
print(Downtown_venues.shape)
Downtown_venues.head()

(1269, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.655354,-79.365044,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
1,"Regent Park, Harbourfront",43.655354,-79.365044,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.655354,-79.365044,Tandem Coffee,43.653559,-79.361809,Coffee Shop
3,"Regent Park, Harbourfront",43.655354,-79.365044,Berkeley Church,43.655123,-79.365873,Event Space
4,"Regent Park, Harbourfront",43.655354,-79.365044,The Yoga Lounge,43.655515,-79.364955,Yoga Studio


In [123]:
# Check how many venues were returned for each neighbourhood
Downtown_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,98,98,98,98,98,98
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",58,58,58,58,58,58
Central Bay Street,58,58,58,58,58,58
Christie,21,21,21,21,21,21
Church and Wellesley,79,79,79,79,79,79
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",54,54,54,54,54,54
"Kensington Market, Chinatown, Grange Park",51,51,51,51,51,51


In [125]:
print("There are {} unique categories.".format(len(Downtown_venues['Venue Category'].unique())))

There are 193 unique categories.


## 5: Analyze each neighbourhood

In [127]:
# one hot encoding
Downtown_onehot = pd.get_dummies(Downtown_venues[['Venue Category']], prefix = '', prefix_sep = '')

# add neighbourhood column back to dataframe
Downtown_onehot['Neighbourhood'] = Downtown_venues['Neighbourhood']

# Move neighbourhood column to the first column
fixed_columns = [Downtown_onehot.columns[-1]] + list(Downtown_onehot.columns[:-1])
Downtown_onehot = Downtown_onehot[fixed_columns]

Downtown_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,...,Theater,Theme Restaurant,Trail,Train Station,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [128]:
Downtown_onehot.shape

(1269, 194)

In [129]:
# Group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category

Downtown_grouped = Downtown_onehot.groupby('Neighbourhood').mean().reset_index()
Downtown_grouped

Unnamed: 0,Neighbourhood,Adult Boutique,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,...,Theater,Theme Restaurant,Trail,Train Station,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Berczy Park,0.0,0.010204,0.010204,0.020408,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.010204,0.0,0.0,0.0,0.010204
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.017241,0.0,0.0,0.0,0.017241
2,Central Bay Street,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.012658,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012658,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012658
5,"Commerce Court, Victoria Hotel",0.0,0.04,0.0,0.01,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01
6,"First Canadian Place, Underground city",0.0,0.03,0.0,0.01,0.0,0.0,0.03,0.0,0.0,...,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.019608,0.0,0.019608,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019608,0.078431,0.019608,0.039216,0.019608,0.0


In [130]:
Downtown_grouped.shape

(19, 194)

In [131]:
# Print each neighbourhood along with the top 5 most common values

num_top_venues = 5

for hood in Downtown_grouped['Neighbourhood']:
    print("----" + hood + "----")
    temp = Downtown_grouped[Downtown_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ["venue", 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')

----Berczy Park----
                 venue  freq
0          Coffee Shop  0.08
1                 Café  0.04
2  Japanese Restaurant  0.04
3   Italian Restaurant  0.04
4   Seafood Restaurant  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0         Coffee Shop  0.09
1  Italian Restaurant  0.05
2                Café  0.05
3          Restaurant  0.05
4                Park  0.03


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.19
1  Italian Restaurant  0.07
2      Sandwich Place  0.03
3     Bubble Tea Shop  0.03
4                Café  0.03


----Christie----
               venue  freq
0  Korean Restaurant  0.19
1      Grocery Store  0.14
2               Café  0.14
3        Coffee Shop  0.10
4     Ice Cream Shop  0.05


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.09
1  Japanese Restaurant  0.06
2     Sushi Restaurant  0

In [132]:
# Write a function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)

    return row_categories_sorted.index.values[0:num_top_venues]

In [136]:
# Create the new dataframe and display the top 10 venues for each neighbourhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

## create columns according to number of top venues

columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

## create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns = columns)
neighbourhoods_venues_sorted['Neighbourhood'] = Downtown_grouped['Neighbourhood']

for ind in np.arange(Downtown_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Downtown_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Seafood Restaurant,Beer Bar,Hotel,Restaurant,Cocktail Bar,Breakfast Spot
1,"CN Tower, King and Spadina, Railway Lands, Har...",Coffee Shop,Italian Restaurant,Café,Restaurant,Park,Hotel,Gym,Grocery Store,French Restaurant,Japanese Restaurant
2,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Café,Clothing Store,Restaurant,Department Store,Neighborhood,Furniture / Home Store
3,Christie,Korean Restaurant,Grocery Store,Café,Coffee Shop,Ice Cream Shop,Park,Sandwich Place,Candy Store,Playground,Pizza Place
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Pub,Gay Bar,Café,Hotel,Mediterranean Restaurant,Men's Store


## 6: Cluster Neighbourhoods

In [138]:
# Run k-means to cluster the neighbourhood into 5 clusters

## set number of clusters
kclusters = 5

Downtown_grouped_clustering = Downtown_grouped.drop('Neighbourhood', 1)

## run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(Downtown_grouped_clustering)

## check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 4, 3, 0, 0, 0, 2, 2, 2], dtype=int32)

In [140]:
# Create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood

## add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Downtown_merged = Downtown_df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighbourhood
Downtown_merged = Downtown_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on = 'Neighbourhood')

Downtown_merged.head()

ValueError: cannot insert Cluster Labels, already exists

In [147]:
# Visualize the resulting clusters

## create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

## set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

## add markers to the map
markers_colors = []
for lat, lng, poi, cluster in zip(Downtown_merged['Latitude'], Downtown_merged['Longitude'], Downtown_merged['Neighbourhood'], Downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html = True)
    folium.CircleMarker([lat, lng], radius = 5, popup = label, color = rainbow[cluster-1], fill = True, fill_color = rainbow[cluster-1], fill_opacity = 0.7).add_to(map_clusters)

map_clusters

In [151]:
# Examine Clusters

## Cluster 1

Downtown_merged.loc[Downtown_merged['Cluster Labels'] == 0, Downtown_merged.columns[[1] + list(range(5, Downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Downtown Toronto,0,Coffee Shop,Café,Seafood Restaurant,Cocktail Bar,Italian Restaurant,Restaurant,Gastropub,American Restaurant,Gym,Bakery
4,Downtown Toronto,0,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Seafood Restaurant,Beer Bar,Hotel,Restaurant,Cocktail Bar,Breakfast Spot
7,Downtown Toronto,0,Hotel,Café,Coffee Shop,Restaurant,Gym,Steakhouse,Salad Place,Japanese Restaurant,American Restaurant,Seafood Restaurant
9,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,Gym,American Restaurant,Japanese Restaurant,Asian Restaurant,Seafood Restaurant,Breakfast Spot
10,Downtown Toronto,0,Coffee Shop,Restaurant,Hotel,Café,Italian Restaurant,Gym,American Restaurant,Japanese Restaurant,Gastropub,Seafood Restaurant
15,Downtown Toronto,0,Coffee Shop,Restaurant,Hotel,Beer Bar,Cocktail Bar,Café,Deli / Bodega,Japanese Restaurant,Bakery,Seafood Restaurant
17,Downtown Toronto,0,Coffee Shop,Hotel,Restaurant,Café,Gym,Japanese Restaurant,Deli / Bodega,Steakhouse,Salad Place,Asian Restaurant
18,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Pub,Gay Bar,Café,Hotel,Mediterranean Restaurant,Men's Store


In [152]:
## Cluster 2

Downtown_merged.loc[Downtown_merged['Cluster Labels'] == 1, Downtown_merged.columns[[1] + list(range(5, Downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,1,Pie Shop,Sandwich Place,Japanese Restaurant,Park,Trail,Bank,Coffee Shop,Movie Theater,Museum,Music School


In [153]:
## Cluster 3

Downtown_merged.loc[Downtown_merged['Cluster Labels'] == 2, Downtown_merged.columns[[1] + list(range(5, Downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,2,Coffee Shop,Gym,Sushi Restaurant,Adult Boutique,Diner,Burrito Place,Café,College Cafeteria,College Theater,Creperie
2,Downtown Toronto,2,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Movie Theater,Cosmetics Shop,Café,Italian Restaurant,Japanese Restaurant,Hotel,Ramen Restaurant
8,Downtown Toronto,2,Coffee Shop,Boat or Ferry,Liquor Store,Restaurant,Plaza,Pizza Place,Hotel,Bar,Sporting Goods Shop,Fried Chicken Joint
11,Downtown Toronto,2,Café,Coffee Shop,Bookstore,Restaurant,Museum,Park,College Arts Building,Music School,Social Club,College Theater
12,Downtown Toronto,2,Vegetarian / Vegan Restaurant,Coffee Shop,Café,Farmers Market,Vietnamese Restaurant,Mexican Restaurant,Clothing Store,Bakery,Bar,Grocery Store
13,Downtown Toronto,2,Coffee Shop,Italian Restaurant,Café,Restaurant,Park,Hotel,Gym,Grocery Store,French Restaurant,Japanese Restaurant
16,Downtown Toronto,2,Coffee Shop,Restaurant,Pizza Place,Café,Japanese Restaurant,Italian Restaurant,Chinese Restaurant,Pub,Bakery,Grocery Store


In [154]:
## Cluster 4

Downtown_merged.loc[Downtown_merged['Cluster Labels'] == 3, Downtown_merged.columns[[1] + list(range(5, Downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,3,Korean Restaurant,Grocery Store,Café,Coffee Shop,Ice Cream Shop,Park,Sandwich Place,Candy Store,Playground,Pizza Place


In [155]:
## Cluster 5

Downtown_merged.loc[Downtown_merged['Cluster Labels'] == 4, Downtown_merged.columns[[1] + list(range(5, Downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,4,Coffee Shop,Italian Restaurant,Breakfast Spot,Yoga Studio,Gym / Fitness Center,Diner,Electronics Store,Event Space,Food Truck,Sandwich Place
5,Downtown Toronto,4,Coffee Shop,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Café,Clothing Store,Restaurant,Department Store,Neighborhood,Furniture / Home Store
