In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [32]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [33]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text


In [34]:
soup = BeautifulSoup(data, 'html.parser')

In [35]:
postalCodeList = []
boroughList = []
neighborhoodList = []

In [36]:
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))

In [37]:
df = pd.DataFrame({"Postal Code": postalCodeList, "Borough": boroughList, "Neighborhood": neighborhoodList})

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


A new dataframe stores only those rows where the boroughs are assigned, or rather not 'not assigned'.

In [38]:
newdf = df[df.Borough != "Not assigned"].reset_index(drop=True)
newdf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


A new dataframe groups by postal code and borough, then aggregates the neighbourhoods that match both of them and joins them together using the lambda function.

In [39]:
groupeddf = newdf.groupby(["Postal Code", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
groupeddf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


A loop iterates through all the rows by their index. If the neighbourhood of that row is not assigned, the neighbourhood of that row will be the borough of that row.

In [40]:
for index, row in groupeddf.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
groupeddf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [41]:
groupeddf.shape

(103, 3)

In [42]:
import numpy as np 
import json 
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium

import folium 

print("Libraries imported.")

Libraries imported.


In [43]:
coord = pd.read_csv('http://cocl.us/Geospatial_data')
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the two dataframes on their common column 'Postal Code'.

In [44]:
mergeddf=groupeddf.merge(coord,on="Postal Code")
mergeddf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Finding the coordinates for Toronto

In [45]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.653963, -79.387207.


Creating a map of Toronto with neighbourhoods superimposed on each other

In [46]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(mergeddf['Latitude'], mergeddf['Longitude'], mergeddf['Borough'], mergeddf['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

Counting how many boroughs and neighbourhoods are there in Toronto

In [47]:
print("Number of boroughs in Toronto is ", mergeddf['Borough'].nunique())
print("Number of neighbourhoods in Toronto is", newdf['Neighborhood'].nunique())

Number of boroughs in Toronto is  11
Number of neighbourhoods in Toronto is 209


Visualising all the neighbourhoods in Scarborough

In [48]:
scardf=newdf.merge(coord,on="Postal Code")
scardf= scardf[scardf['Borough']=="Scarborough"]
scardf

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
21,M1C,Scarborough,Highland Creek,43.784535,-79.160497
22,M1C,Scarborough,Rouge Hill,43.784535,-79.160497
23,M1C,Scarborough,Port Union,43.784535,-79.160497
33,M1E,Scarborough,Guildwood,43.763573,-79.188711
34,M1E,Scarborough,Morningside,43.763573,-79.188711
35,M1E,Scarborough,West Hill,43.763573,-79.188711
39,M1G,Scarborough,Woburn,43.770992,-79.216917
43,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [49]:
scarmap = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(scardf['Latitude'], scardf['Longitude'], scardf['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(scarmap)  
    
scarmap

Using the Foursquare API

In [50]:
CLIENT_ID = 'PBD2ONZGZWMVXXCKG23LKHZ4KSDE3GSIVVB5SOXBTW2LF5HQ' # your Foursquare ID
CLIENT_SECRET = 'WBLFH2G2IRXP0UNNW2ZNWINDAZQ1UVLC2XQ4JQ0PZTZHPASV' # your Foursquare Secret
VERSION = '20180605'
LIMIT=100
radius=500

Finding the coordinates of Toronto Zoo in Scarborough

In [51]:
from geopy.geocoders import Nominatim
address = '2000 Meadowvale Rd, Toronto'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.822937 -79.1776523


Finding landmarks near Toronto Zoo for tourists to explore

In [52]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=PBD2ONZGZWMVXXCKG23LKHZ4KSDE3GSIVVB5SOXBTW2LF5HQ&client_secret=WBLFH2G2IRXP0UNNW2ZNWINDAZQ1UVLC2XQ4JQ0PZTZHPASV&v=20180605&ll=43.822937,-79.1776523&radius=500&limit=100'

In [53]:
results = requests.get(url).json()
print(results)

{'meta': {'code': 200, 'requestId': '5d919833b77c7700394706f1'}, 'response': {'headerLocation': 'Rouge', 'headerFullLocation': 'Rouge, Toronto', 'headerLocationGranularity': 'neighborhood', 'totalResults': 11, 'suggestedBounds': {'ne': {'lat': 43.827437004500005, 'lng': -79.17142678680906}, 'sw': {'lat': 43.8184369955, 'lng': -79.18387781319095}}, 'groups': [{'type': 'Recommended Places', 'name': 'recommended', 'items': [{'reasons': {'count': 0, 'items': [{'summary': 'This spot is popular', 'type': 'general', 'reasonName': 'globalInteractionReason'}]}, 'venue': {'id': '4ad4c05ef964a52093f620e3', 'name': 'Toronto Zoo', 'location': {'address': '361 Old Finch Av', 'crossStreet': 'at Meadowvale & Toronto Zoo Rds', 'lat': 43.82058189639563, 'lng': -79.18155125697548, 'labeledLatLngs': [{'label': 'display', 'lat': 43.82058189639563, 'lng': -79.18155125697548}], 'distance': 408, 'postalCode': 'M1B 5K7', 'cc': 'CA', 'city': 'Scarborough', 'state': 'ON', 'country': 'Canada', 'formattedAddress':

In [54]:

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [55]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Toronto Zoo,Zoo,43.820582,-79.181551
1,Australasia Pavillion,Zoo Exhibit,43.822563,-79.183286
2,Kids Zoo,Zoo Exhibit,43.821714,-79.181755
3,Peacock Café (Toronto Zoo),Café,43.820012,-79.181563
4,Oasis Zoo Run,Other Great Outdoors,43.819933,-79.179242
5,Stingray Bay,Zoo Exhibit,43.821742,-79.182201
6,Greenhouse Gift Shop,Gift Shop,43.820396,-79.181546
7,Zoomobile Tour Entrance,Tram Station,43.819926,-79.181243
8,Zoomobile,Zoo Exhibit,43.819887,-79.181279
9,Great Barrier Reef Exhibit,Zoo Exhibit,43.822767,-79.183243


The first venue is the Toronto Zoo since those were the coordinates we searched for. However, the next venues are ones which a family can visit after a tiring day at the zoo to buy souvenirs or some snacks.

Now clustering all neighbourhoods in Scarborough

In [56]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [57]:
scar_venues = getNearbyVenues(names=scardf['Neighborhood'],
                                   latitudes=scardf['Latitude'],
                                   longitudes=scardf['Longitude']
                                  )



Rouge
Malvern
Highland Creek
Rouge Hill
Port Union
Guildwood
Morningside
West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park
Ionview
Kennedy Park
Clairlea
Golden Mile
Oakridge
Cliffcrest
Cliffside
Scarborough Village West
Birch Cliff
Cliffside West
Dorset Park
Scarborough Town Centre
Wexford Heights
Maryvale
Wexford
Agincourt
Clarks Corners
Sullivan
Tam O'Shanter
Agincourt North
L'Amoreaux East
Milliken
Steeles East
L'Amoreaux West
Upper Rouge


Checking how many values were returned for each neighbourhood

In [58]:
scar_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
Agincourt North,3,3,3,3,3,3
Birch Cliff,4,4,4,4,4,4
Cedarbrae,7,7,7,7,7,7
Clairlea,9,9,9,9,9,9
Clarks Corners,13,13,13,13,13,13
Cliffcrest,3,3,3,3,3,3
Cliffside,3,3,3,3,3,3
Cliffside West,4,4,4,4,4,4
Dorset Park,7,7,7,7,7,7


In [59]:
scar_onehot = pd.get_dummies(scar_venues[['Venue Category']], prefix="", prefix_sep="")

scar_onehot['Neighborhood'] = scar_venues['Neighborhood'] 

fixed_columns = [scar_onehot.columns[-1]] + list(scar_onehot.columns[:-1])
scar_onehot = scar_onehot[fixed_columns]
scar_onehot

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bubble Tea Shop,Bus Line,...,Pizza Place,Playground,Rental Car Location,Sandwich Place,Sculpture Garden,Shopping Mall,Skating Rink,Soccer Field,Thai Restaurant,Vietnamese Restaurant
0,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Malvern,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Highland Creek,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rouge Hill,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Port Union,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Guildwood,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,Guildwood,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Guildwood,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Guildwood,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,Guildwood,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Grouping venues by neighbourhood and taking their mean

In [60]:
scar_grouped = scar_onehot.groupby('Neighborhood').mean().reset_index()
scar_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bubble Tea Shop,Bus Line,...,Pizza Place,Playground,Rental Car Location,Sandwich Place,Sculpture Garden,Shopping Mall,Skating Rink,Soccer Field,Thai Restaurant,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
1,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
2,Birch Cliff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
3,Cedarbrae,0.0,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0
4,Clairlea,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0


Printing each neighbourhood with 5 most common venues

In [61]:
num_top_venues = 5

for hood in scar_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = scar_grouped[scar_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
            venue  freq
0  Clothing Store  0.25
1    Skating Rink  0.25
2          Lounge  0.25
3  Breakfast Spot  0.25
4            Park  0.00


----Agincourt North----
               venue  freq
0               Park  0.33
1   Sculpture Garden  0.33
2         Playground  0.33
3       Noodle House  0.00
4  Korean Restaurant  0.00


----Birch Cliff----
                   venue  freq
0        College Stadium  0.25
1           Skating Rink  0.25
2  General Entertainment  0.25
3                   Café  0.25
4    American Restaurant  0.00


----Cedarbrae----
                venue  freq
0     Thai Restaurant  0.14
1              Bakery  0.14
2                Bank  0.14
3  Athletics & Sports  0.14
4    Hakka Restaurant  0.14


----Clairlea----
                  venue  freq
0                Bakery  0.22
1              Bus Line  0.11
2           Bus Station  0.11
3  Fast Food Restaurant  0.11
4         Metro Station  0.11


----Clarks Corners----
                  venue  freq


In [62]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [63]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scar_grouped['Neighborhood']

for ind in np.arange(scar_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scar_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Clothing Store,Skating Rink,Breakfast Spot,Lounge,Vietnamese Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
1,Agincourt North,Sculpture Garden,Playground,Park,Vietnamese Restaurant,Chinese Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
2,Birch Cliff,Skating Rink,General Entertainment,College Stadium,Café,Vietnamese Restaurant,Clothing Store,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
3,Cedarbrae,Thai Restaurant,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,College Stadium,Grocery Store,General Entertainment
4,Clairlea,Bakery,Soccer Field,Intersection,Fast Food Restaurant,Bus Line,Bus Station,Park,Metro Station,Vietnamese Restaurant,Coffee Shop


Now clustering neighbourhoods

In [64]:
kclusters = 3

scar_grouped_clustering = scar_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scar_grouped_clustering)

kmeans.labels_[0:20]


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1],
      dtype=int32)

In [68]:

scar_merged = scardf.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scar_merged['Cluster Labels'] = pd.to_numeric(scar_merged['Cluster Labels'])
scar_merged= scar_merged.fillna(0)
scar_merged['Cluster Labels'] = scar_merged['Cluster Labels'].astype(int)

scar_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,M1B,Scarborough,Rouge,43.806686,-79.194353,0,Fast Food Restaurant,Vietnamese Restaurant,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store,Department Store,Convenience Store
9,M1B,Scarborough,Malvern,43.806686,-79.194353,0,Fast Food Restaurant,Vietnamese Restaurant,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store,Department Store,Convenience Store
21,M1C,Scarborough,Highland Creek,43.784535,-79.160497,2,Bar,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
22,M1C,Scarborough,Rouge Hill,43.784535,-79.160497,2,Bar,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
23,M1C,Scarborough,Port Union,43.784535,-79.160497,2,Bar,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store


In [69]:
from sklearn.cluster import KMeans

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0.0, 1.0, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(scar_merged['Latitude'], scar_merged['Longitude'], scar_merged['Neighborhood'], scar_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Observation: Since I grouped the neighbourhoods into only 3 clusters and they were all from the same borough, the algorithm grouped most of the neighbourhoods into a single group, with only few neighbourhoods in the other two clusters. This may prove that most neighbourhoods within a borough are similar in terms of the venues and activities they offer. A person looking to shift house shouldn't shift to another one within their borough thinking that the facilities around them would differ widely. 

Examining Clusters

In [70]:
scar_merged.loc[scar_merged['Cluster Labels'] == 0, scar_merged.columns[[2] + list(range(5, scar_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Rouge,0,Fast Food Restaurant,Vietnamese Restaurant,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store,Department Store,Convenience Store
9,Malvern,0,Fast Food Restaurant,Vietnamese Restaurant,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store,Department Store,Convenience Store
188,Upper Rouge,0,0,0,0,0,0,0,0,0,0,0


In [71]:
scar_merged.loc[scar_merged['Cluster Labels'] == 1, scar_merged.columns[[2] + list(range(5, scar_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,Guildwood,1,Intersection,Electronics Store,Rental Car Location,Breakfast Spot,Pizza Place,Medical Center,Mexican Restaurant,Vietnamese Restaurant,Clothing Store,Fried Chicken Joint
34,Morningside,1,Intersection,Electronics Store,Rental Car Location,Breakfast Spot,Pizza Place,Medical Center,Mexican Restaurant,Vietnamese Restaurant,Clothing Store,Fried Chicken Joint
35,West Hill,1,Intersection,Electronics Store,Rental Car Location,Breakfast Spot,Pizza Place,Medical Center,Mexican Restaurant,Vietnamese Restaurant,Clothing Store,Fried Chicken Joint
39,Woburn,1,Coffee Shop,Korean Restaurant,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
43,Cedarbrae,1,Thai Restaurant,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,College Stadium,Grocery Store,General Entertainment
54,Scarborough Village,1,Grocery Store,Playground,Vietnamese Restaurant,Chinese Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
66,East Birchmount Park,1,Discount Store,Hobby Shop,Department Store,Convenience Store,Coffee Shop,Vietnamese Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
67,Ionview,1,Discount Store,Hobby Shop,Department Store,Convenience Store,Coffee Shop,Vietnamese Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
68,Kennedy Park,1,Discount Store,Hobby Shop,Department Store,Convenience Store,Coffee Shop,Vietnamese Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
79,Clairlea,1,Bakery,Soccer Field,Intersection,Fast Food Restaurant,Bus Line,Bus Station,Park,Metro Station,Vietnamese Restaurant,Coffee Shop


In [72]:
scar_merged.loc[scar_merged['Cluster Labels'] == 2, scar_merged.columns[[2] + list(range(5, scar_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,Highland Creek,2,Bar,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
22,Rouge Hill,2,Bar,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
23,Port Union,2,Bar,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store


Analysing the results:
While Cluster 2 has a variety of activities and restaurants, a lot of them focus on kids or younger demographics like pizza places and skating rinks. The three neighbourhoods in Cluster 3 may have an older and more mature audience since the most common venue is a bar and a lot of other venues like electronics and department stores seem catered to them. Cluster 1, on the other hand, seems like a shopping neighbourhood which might have plenty of malls, given the fast food stalls and department stores.