## Exploration of  neighbourhoods of Toronto City 


**Question 1**

 **Step 1: Scrape the web page having details of Toronto neighbourhoods** 

In [1]:
# Using pandas.read_html package for web scraping 
import pandas as pd 
import numpy as np 

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df1=pd.read_html(url)
df1=df1[0]   # To get the first table 
df2 = df1[['Postal Code','Borough','Neighbourhood']]
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


**Step 2: Cleaning the dataframe to filter out only ASSIGNED Borough and Neighbourhood values**

In [2]:

#1. Create a Boolean to check against each row
is_borough_assigned= df2['Borough']!='Not assigned'
print(is_borough_assigned.head)


<bound method NDFrame.head of 0      False
1      False
2       True
3       True
4       True
       ...  
175    False
176    False
177    False
178     True
179    False
Name: Borough, Length: 180, dtype: bool>


In [3]:
#2 counting the number of rows with 'Not assigned' values in Borough column

print(df2[df2.Borough == 'Not assigned'].shape[0])

77


In [4]:
#3. Get all records without rows where Borough='Not assigned'. i.e filtering out all the records that have Borough values
df3=df2[df2.Borough != 'Not assigned']
df3.shape


(103, 3)

In [5]:
#4 counting the number of rows with 'Not assigned' values in Borough column. We should get 0 

print(df3[df3.Borough == 'Not assigned'].shape[0])

0


In [6]:
#counting the number of rows with 'Not assigned' values in Neighbourhood column. There are no rows with 'Not assigned' values. Hence no changes made

print(df3[df3.Neighbourhood == 'Not assigned'].shape[0])

0


In [6]:
print(df3.shape)

(103, 3)


**Question 2**

__Step 3  Create a new dataframe with above details and coordinates included.__

In [7]:
#Read csv file containing coordinates
import pandas as pd

df5 = pd.read_csv('http://cocl.us/Geospatial_data')



In [16]:
df5

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


__Step 4: Joining both dataframes__


In [8]:
#using join command to append columns .
#For this converting the postal code of both columns as index , so that they will be joined on basis of that


toronto_neighbourhoods=df3.set_index('Postal Code').join(df5.set_index('Postal Code'))

In [9]:
toronto_neighbourhoods

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...
M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [10]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_neighbourhoods['Borough'].unique()),
        toronto_neighbourhoods.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


**Question3**

# Exploring the Neighbourhoods

In [11]:
#importing dependant libraries

import pandas as pd 
import numpy as np 

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans



In [12]:
!pip install geocoder==1.5.0
from geopy.geocoders import Nominatim

!pip install folium==0.5.0
import folium #map rendering library

print('Libraries imported.')

Collecting geocoder==1.5.0
  Downloading geocoder-1.5.0-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.3 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.5.0 ratelim-0.1.6
Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 9.1 MB/s  eta 0:00:01
[?25hCollecting branca
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=81090681d243a56a87670b53ce3754b9bd77e3d74ce36efe714026b3f71fcc85
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successful

In [13]:
#coordinates of Toronto from google 

latitude = 43.6532
longitude = -79.3832
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_neighbourhoods['Latitude'], toronto_neighbourhoods['Longitude'], toronto_neighbourhoods['Borough'], toronto_neighbourhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto City are 43.6532, -79.3832.


**Step 5: Exploring neighbourhoods using foursquare client API**

In [14]:
# 1: Lets explore the neighbourhoods of only borough North York for simplicity

NorthYork_data = toronto_neighbourhoods[toronto_neighbourhoods['Borough'] == 'North York'].reset_index(drop=True)
NorthYork_data.head(30)

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,North York,Parkwoods,43.753259,-79.329656
1,North York,Victoria Village,43.725882,-79.315572
2,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,North York,Don Mills,43.745906,-79.352188
4,North York,Glencairn,43.709577,-79.445073
5,North York,Don Mills,43.7259,-79.340923
6,North York,Hillcrest Village,43.803762,-79.363452
7,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
8,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
9,North York,"Northwood Park, York University",43.76798,-79.487262


In [15]:
#2. Now Considering Geographical coordinates of North York
latitude=43.7615
longitude=-73.4111

# create map of North York using latitude and longitude values
map_NorthYork = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(NorthYork_data['Latitude'], NorthYork_data['Longitude'], NorthYork_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NorthYork)  
    
map_NorthYork

In [16]:
#Foursquare API Credentials

CLIENT_ID = '1A1N2XCTOL42F0KHJLEZ41NMQBORTNVWB4NMYQED5GUEHFBL' # your Foursquare ID
CLIENT_SECRET = '4XRZR12N5DMBJKXHXX0HSPBF2EMDD1FVIUZVCWXE4IGYVTW0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1A1N2XCTOL42F0KHJLEZ41NMQBORTNVWB4NMYQED5GUEHFBL
CLIENT_SECRET:4XRZR12N5DMBJKXHXX0HSPBF2EMDD1FVIUZVCWXE4IGYVTW0


In [17]:
# function that extracts the category of the venue from foursquare API

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [18]:
# Function to Get all nearby venues of all neighbourhood in North York

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
#Modify the code to run the above function on each neighborhood and create a new dataframe called NorthYork_venues.


NorthYork_venues = getNearbyVenues(names=NorthYork_data['Neighbourhood'],
                                   latitudes=NorthYork_data['Latitude'],
                                   longitudes=NorthYork_data['Longitude']
                                  )
    






Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


In [23]:
#check the size of NorthYork venues dataframe
print(NorthYork_venues.shape)
NorthYork_venues.head()

(239, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,TTC stop #8380,43.752672,-79.326351,Bus Stop
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Parkwoods,43.753259,-79.329656,Corrosion Service Company Limited,43.752432,-79.334661,Construction & Landscaping
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [41]:
#debug where 'York Mills, Silver Hills' getting filtered out

#NorthYork_venues[Neighbourhood]=='York Mills, Silver Hills'

#df.loc[df['First_name'] == 'Bill', 'name_match'] = 'Match'

NorthYork_venues.loc[NorthYork_venues['Neighbourhood']=="York Mills, Silver Hills"]

#NorthYork_venues.loc[NorthYork_venues['Neighbourhood']=="North Park, Maple Leaf Park, Upwood Park"]

#df[df.isnull().any(axis=1)]

#NorthYork_venues[NorthYork_venues.isnull().any(axis=1)]


#NorthYork_venues[NorthYork_venues.isna().any(axis=1)]



Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
162,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074,Rustic Bakery,43.715414,-79.4903,Bakery
163,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074,Rustic Massage Therapy and Health Clinic,43.715798,-79.490644,Massage Studio
164,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074,Maple leaf park,43.716188,-79.493531,Park
165,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074,Mika's Trim,43.714068,-79.496113,Construction & Landscaping


In [29]:
#count the number of venues for each neighbourhood

NorthYork_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",23,23,23,23,23,23
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
Don Mills,24,24,24,24,24,24
Downsview,15,15,15,15,15,15
"Fairview, Henry Farm, Oriole",67,67,67,67,67,67
Glencairn,4,4,4,4,4,4
Hillcrest Village,5,5,5,5,5,5
Humber Summit,2,2,2,2,2,2
"Humberlea, Emery",2,2,2,2,2,2


In [30]:
# Number of unique venue category from above list

print('There are {} uniques categories.'.format(len(NorthYork_venues['Venue Category'].unique())))

There are 99 uniques categories.


In [31]:
#Analyze each neighbourhood using one hot encoding

# one hot encoding
NorthYork_onehot = pd.get_dummies(NorthYork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NorthYork_onehot['Neighbourhood'] = NorthYork_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [NorthYork_onehot.columns[-1]] + list(NorthYork_onehot.columns[:-1])
NorthYork_onehot = NorthYork_onehot[fixed_columns]

NorthYork_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
#size
NorthYork_onehot.shape

(239, 100)

In [33]:
#Group the rows of neighbourhood by taking the mean of frequency of occurence of each category
NorthYork_grouped = NorthYork_onehot.groupby('Neighbourhood').mean().reset_index()
NorthYork_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,...,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.041667,0.0,0.0,0.041667,0.0,0.0,...,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.066667,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Fairview, Henry Farm, Oriole",0.0,0.0,0.014925,0.0,0.0,0.014925,0.0,0.029851,0.029851,...,0.0,0.014925,0.0,0.014925,0.0,0.014925,0.014925,0.014925,0.0,0.014925
6,Glencairn,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Humber Summit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Humberlea, Emery",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#size of grouped dataframe
NorthYork_grouped.shape

(19, 100)

In [35]:
# List each neighbourhood with top 3 common venues
num_top_venues = 3

for hood in NorthYork_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = NorthYork_grouped[NorthYork_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
            venue  freq
0     Coffee Shop  0.09
1            Bank  0.09
2  Sandwich Place  0.04


----Bayview Village----
                venue  freq
0  Chinese Restaurant  0.25
1                Café  0.25
2                Bank  0.25


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.09
1  Italian Restaurant  0.09
2         Coffee Shop  0.09


----Don Mills----
                 venue  freq
0                  Gym  0.12
1          Coffee Shop  0.08
2  Japanese Restaurant  0.08


----Downsview----
            venue  freq
0   Grocery Store  0.20
1            Park  0.13
2  Baseball Field  0.07


----Fairview, Henry Farm, Oriole----
                  venue  freq
0        Clothing Store  0.12
1           Coffee Shop  0.07
2  Fast Food Restaurant  0.06


----Glencairn----
              venue  freq
0               Pub  0.25
1  Asian Restaurant  0.25
2            Bakery  0.25


----Hillcrest Vil

In [36]:
#put the above records in dataframe
#For that lets order the venues in descending order first

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [43]:
#create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = NorthYork_grouped['Neighbourhood']

for ind in np.arange(NorthYork_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()



Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Shopping Mall,Grocery Store,Pizza Place,Park,Ice Cream Shop,Bridal Shop,Intersection,Deli / Bodega
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Discount Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
2,"Bedford Park, Lawrence Manor East",Sandwich Place,Coffee Shop,Italian Restaurant,Greek Restaurant,Café,Butcher,Liquor Store,Juice Bar,Comfort Food Restaurant,Japanese Restaurant
3,Don Mills,Gym,Restaurant,Japanese Restaurant,Beer Store,Coffee Shop,Sandwich Place,Athletics & Sports,Art Gallery,Supermarket,Clothing Store
4,Downsview,Grocery Store,Park,Baseball Field,Discount Store,Liquor Store,Business Service,Hotel,Shopping Mall,Bank,Athletics & Sports


In [42]:
#test debug

#neighbourhoods_venues_sorted[neighbourhoods_venues_sorted.isna().any(axis=1)]

#neighbourhoods_venues_sorted.loc[neighbourhoods_venues_sorted['Neighbourhood']=="York Mills, Silver Hills"]



Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [47]:
#cluster the neighbourhoods into 5 cluters using k means algorithm
# set number of clusters
kclusters = 5

NorthYork_grouped_clustering = NorthYork_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NorthYork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100] 

array([2, 2, 2, 2, 2, 2, 2, 4, 1, 1, 2, 0, 2, 0, 2, 3, 2, 2, 3],
      dtype=int32)

In [45]:
# Debug 
#For some reason the foursquareclient is unable to get the nearby venues of neighbourhood York Mills, Silver Hills and hence gets 
#eliminated subsequently in neighbourhoods_venues_sorted. When dataframes neighbourhoods_venues_sorted and NorthYork gets merged it defines cluster NaN for it
# Manually Removing the row with Nan value in cluster column

#NorthYork_data.loc[NorthYork_data['Neighbourhood']=="York Mills, Silver Hills"]

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
12,North York,"York Mills, Silver Hills",43.75749,-79.374714


In [48]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

NorthYork_merged = NorthYork_data

# merge NorthYork_grouped with NorthYork_data to add latitude/longitude for each neighborhood
NorthYork_merged = NorthYork_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

NorthYork_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,Parkwoods,43.753259,-79.329656,0.0,Bus Stop,Construction & Landscaping,Food & Drink Shop,Park,Women's Store,Dim Sum Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Convenience Store
1,North York,Victoria Village,43.725882,-79.315572,2.0,Grocery Store,Coffee Shop,Hockey Arena,Portuguese Restaurant,Women's Store,Diner,Clothing Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
2,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2.0,Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Event Space,Coffee Shop,Vietnamese Restaurant,Athletics & Sports,Electronics Store,Convenience Store
3,North York,Don Mills,43.745906,-79.352188,2.0,Gym,Restaurant,Japanese Restaurant,Beer Store,Coffee Shop,Sandwich Place,Athletics & Sports,Art Gallery,Supermarket,Clothing Store
4,North York,Glencairn,43.709577,-79.445073,2.0,Japanese Restaurant,Asian Restaurant,Bakery,Pub,Discount Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop


In [55]:
#Debug print all the rows with Neigbourhood York Mills, Silver Hills

#NorthYork_merged.loc[NorthYork_merged['Neighbourhood']=="York Mills, Silver Hills"]



# check if there is any other Null

#NorthYork_merged[NorthYork_merged['Cluster Labels'].isna()]

# Drop the Na values

#NorthYork_merged=NorthYork_merged.dropna()



Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [56]:
# convert column "Cluster Labels" of a DataFrame to int from float
 
NorthYork_merged['Cluster Labels'] = NorthYork_merged['Cluster Labels'].apply(np.int64)

print(NorthYork_merged.dtypes)

#NorthYork_merged.head()

Borough                    object
Neighbourhood              object
Latitude                  float64
Longitude                 float64
Cluster Labels              int64
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object


In [57]:
# create visualization map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NorthYork_merged['Latitude'], NorthYork_merged['Longitude'], NorthYork_merged['Neighbourhood'], NorthYork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [58]:
#Examine Clusters

#Cluster 0

NorthYork_merged.loc[NorthYork_merged['Cluster Labels'] == 0, NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,Bus Stop,Construction & Landscaping,Food & Drink Shop,Park,Women's Store,Dim Sum Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Convenience Store
14,"North Park, Maple Leaf Park, Upwood Park",Construction & Landscaping,Massage Studio,Bakery,Park,Women's Store,Dim Sum Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Convenience Store


In [59]:
#Cluster 1

NorthYork_merged.loc[NorthYork_merged['Cluster Labels'] == 1,NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Humber Summit,Furniture / Home Store,Pizza Place,Women's Store,Diner,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
19,"Humberlea, Emery",Furniture / Home Store,Baseball Field,Women's Store,Diner,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop


In [60]:
#Cluster 2

NorthYork_merged.loc[NorthYork_merged['Cluster Labels'] == 2,NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Victoria Village,Grocery Store,Coffee Shop,Hockey Arena,Portuguese Restaurant,Women's Store,Diner,Clothing Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
2,"Lawrence Manor, Lawrence Heights",Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Event Space,Coffee Shop,Vietnamese Restaurant,Athletics & Sports,Electronics Store,Convenience Store
3,Don Mills,Gym,Restaurant,Japanese Restaurant,Beer Store,Coffee Shop,Sandwich Place,Athletics & Sports,Art Gallery,Supermarket,Clothing Store
4,Glencairn,Japanese Restaurant,Asian Restaurant,Bakery,Pub,Discount Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
5,Don Mills,Gym,Restaurant,Japanese Restaurant,Beer Store,Coffee Shop,Sandwich Place,Athletics & Sports,Art Gallery,Supermarket,Clothing Store
7,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Shopping Mall,Grocery Store,Pizza Place,Park,Ice Cream Shop,Bridal Shop,Intersection,Deli / Bodega
8,"Fairview, Henry Farm, Oriole",Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Japanese Restaurant,Shoe Store,Bakery,Bank,Juice Bar,Convenience Store
9,"Northwood Park, York University",Furniture / Home Store,Vietnamese Restaurant,Caribbean Restaurant,Coffee Shop,Massage Studio,Bar,Diner,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
10,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Discount Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
11,Downsview,Grocery Store,Park,Baseball Field,Discount Store,Liquor Store,Business Service,Hotel,Shopping Mall,Bank,Athletics & Sports


In [61]:
#Cluster 3

NorthYork_merged.loc[NorthYork_merged['Cluster Labels'] == 3,NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,"Willowdale, Newtonbrook",Park,Diner,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
22,York Mills West,Park,Convenience Store,Diner,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop,Deli / Bodega


In [62]:
#Cluster 4

NorthYork_merged.loc[NorthYork_merged['Cluster Labels'] == 4,NorthYork_merged.columns[[1] + list(range(5, NorthYork_merged.shape[1]))]]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Hillcrest Village,Golf Course,Pool,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Dim Sum Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping
