# Part 1: scraping web data & creating clean dataframe

In [1]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np
import itertools
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page=requests.get(url)
doc=lh.fromstring(page.content)
#inspected table elements to find that table data is stored between HTML code <tr>...</tr>
tr_elements=doc.xpath('//tr')
col=[]
i=0
#For each row, store each first element (header) and an empty list, then print the names of the headers
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print((i,name))
    col.append((name,[]))


(1, 'Postal Code\n')
(2, 'Borough\n')
(3, 'Neighbourhood\n')


In [2]:
#first row is header, data is stored on the second row(index=1) onwards
#iterate through rows in the table 
for j in range(1,len(tr_elements)):
    #if row is not 3 columns long, do not iterate through because that means it doesn't belong to this table
    T=tr_elements[j]
    if len(T)!=3:
        break
    i=0
    #Iterate through each element of the row & add to dictionary
    for t in T.iterchildren():
        data=t.text_content()
        col[i][1].append(data)
        i+=1
dict={title:column for (title,column) in col}
#convert dictionary into a dataframe
df=pd.DataFrame(dict)
tor_neighborhoods=pd.DataFrame(dict)
tor_neighborhoods.head()
tor_neighborhoods.tail()
#after inspecting head & tail of data frame, found that last row is not needed, not a postal code
tor_neighborhoods.drop(tor_neighborhoods.tail(1).index,inplace=True)
tor_neighborhoods.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighbourhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [3]:
#clean up data frame by removing '\n' from headers & all data points
tor_neighborhoods.columns=tor_neighborhoods.columns.str.replace('\n','')
tor_neighborhoods.rename(columns={'Neighbourhood':'Neighborhood'},inplace=True)
tor_neighborhoods['Postal Code']=tor_neighborhoods['Postal Code'].str.replace('\n','')
tor_neighborhoods['Borough']=tor_neighborhoods['Borough'].str.replace('\n','')
tor_neighborhoods['Neighborhood']=tor_neighborhoods['Neighborhood'].str.replace('\n','')
print(tor_neighborhoods.shape)
tor_neighborhoods.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
#there were 180 postal codes scraped from the Wikipedia page/table
#drop cells with a borough that is 'Not assigned', per assignment instructions
tor_neighborhoods=tor_neighborhoods[tor_neighborhoods.Borough != 'Not assigned']
tor_neighborhoods = tor_neighborhoods[tor_neighborhoods['Borough'] != 'Not assigned'].reset_index(drop = True)
#If a cell has a borough but a 'Not assigned'  neighborhood, then the neighborhood will be the same as the borough.
tor_neighborhoods['Neighborhood'] = np.where(tor_neighborhoods['Neighborhood']=='Not assigned',tor_neighborhoods['Borough'],tor_neighborhoods['Neighborhood'])
#this ended up not affecting anything as there were no neighborhoods that were 'Not assigned'
tor_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
#after dropping 'Not assigned' boroughs, how many postal codes are left in the dataframe?
tor_neighborhoods.shape

(103, 3)

# Part 2: Getting latitude & longitude of each neighborhood

In [6]:
#for ease of coding & saving time, converted .csv file to dataframe instead of using Geocoder package
geospatialdata=pd.read_csv('https://cocl.us/Geospatial_data')
geospatialdata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
#now merge geospatialdata with tor_neighborhoods dataframe made in part 1 to assign latitude/longitude to each postal code & correstponding neighborhood
tor_geospatial=pd.merge(tor_neighborhoods,geospatialdata,how='inner',on='Postal Code')
tor_geospatial.head()


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3: Exploring & clustering neighborhoods in Toronto

In [8]:
#import all necessary libraries
import numpy as npd
import pandas as pd
import json
!pip install geopy
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
from sklearn.cluster import KMeans
!pip install folium
import folium
print('libraries imported')

libraries imported


In [9]:
#finding geographical coordinates of Toronto
address = 'Toronto, Ontario, Canada'
geolocator=Nominatim(user_agent='tor_explorer')
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('the geographical coordinates of Toronto, ON are {},{}'.format(latitude,longitude))

the geographical coordinates of Toronto, ON are 43.6534817,-79.3839347


In [35]:
#visualizing Toronto & the neighborhoods in it
map_tor=folium.Map(location=[latitude,longitude],zoom_start=11)
#add markers to map
for lat,long,label in zip(tor_geospatial['Latitude'],tor_geospatial['Longitude'],tor_geospatial['Neighborhood']):
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,long],radius=5,popup=label,color='purple',fill=True,fill_color='#b031cc',fill_opacity=0.6,parse_html=False).add_to(map_tor)
map_tor

In [11]:
# The code was removed by Watson Studio for sharing.

In [36]:
#define Foursquare credentials, access token, & version in above cell (credentials hidden)
#I chose to explore North York, specifically
northyork_data=tor_geospatial[tor_geospatial['Borough']=='North York'].reset_index(drop=True)
#finding geographical coordinates of North York
address_nyt = 'North York, Toronto'
geolocator_nyt=Nominatim(user_agent='nyt_explorer')
location_nyt=geolocator_nyt.geocode(address_nyt)
latitude_nyt=location_nyt.latitude
longitude_nyt=location_nyt.longitude
print('the geographical coordinates of North York, Toronto, ON are {},{}'.format(latitude_nyt,longitude_nyt))

#visualizing North York & the neighborhoods in it
map_nyt=folium.Map(location=[latitude_nyt,longitude_nyt],zoom_start=12)
#add markers to map
for lat,long,label in zip(northyork_data['Latitude'],northyork_data['Longitude'],northyork_data['Neighborhood']):
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,long],radius=5,popup=label,color='blue',fill=True,fill_color='#0067A5',fill_opacity=0.6,parse_html=False).add_to(map_nyt)
map_nyt

the geographical coordinates of North York, Toronto, ON are 43.7543263,-79.44911696639593


In [37]:
#getting top 50 venues that are in North York within a radius of 750 meters from Foursquare API
limit=50
radius=750
foursquare_url='https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, radius, limit)
#foursquare_url
#checked url but redacted code to hide credentials
results=requests.get(foursquare_url).json()
results

{'meta': {'code': 200, 'requestId': '5ffdc2ed9661b37fc4ffff22'},
 'notifications': [{'type': 'notificationTray', 'item': {'unreadCount': 0}}],
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 189,
  'suggestedBounds': {'ne': {'lat': 43.660231706750004,
    'lng': -79.37462282089147},
   'sw': {'lat': 43.64673169325, 'lng': -79.39324657910852}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'l

In [14]:
# defining a function that extracts the category of each venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [38]:
#creating list of nearby venues' JSON data
venues = results['response']['groups'][0]['items']
# flatten JSON
nearby_venues = pd.json_normalize(venues) 
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
#the limit for venues to be returned was defined to be limit=50, so there should be no more than 50 venues returned by Foursquare
print('{} venues were returned by Foursquare'.format(nearby_venues.shape[0]))
nearby_venues.head()

50 venues were returned by Foursquare


Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Nathan Phillips Square,Plaza,43.65227,-79.383516
2,Japango,Sushi Restaurant,43.655268,-79.385165
3,Indigo,Bookstore,43.653515,-79.380696
4,Chatime 日出茶太,Bubble Tea Shop,43.655542,-79.384684


In [16]:
#defining a function to get the details of nearby venues for each neighborhood within the borough of North York
def getNearbyVenues(names, latitudes, longitudes, radius=750):
    venues_list=[]
    for name, lat, long in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,lat,long,radius,limit)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    return(nearby_venues)

In [17]:
#convert list into dataframe
northyork_venues=getNearbyVenues(names=northyork_data['Neighborhood'],latitudes=northyork_data['Latitude'],longitudes=northyork_data['Longitude'],radius=750)
northyork_venues=pd.DataFrame(northyork_venues)
print(northyork_venues.shape)
northyork_venues.head()

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West
(379, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,DVP at York Mills,43.758899,-79.334099,Intersection
3,Parkwoods,43.753259,-79.329656,TTC Stop #09083,43.759655,-79.332223,Bus Stop
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [41]:
#from the shape of the dataframe found above, we see that there are a total of 379 nearby venues within 750 meters of each neighborhood's given latitude & longitude
#group venues by neighborhood to see how many venues are in each neighborhood in descending order to see which neighborhoods have the most venues
northyork_venues.groupby('Neighborhood').count().sort_values(by='Venue',ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Willowdale, Willowdale East",79,79,79,79,79,79
Downsview,44,44,44,44,44,44
"Fairview, Henry Farm, Oriole",39,39,39,39,39,39
"Lawrence Manor, Lawrence Heights",35,35,35,35,35,35
"Bedford Park, Lawrence Manor East",32,32,32,32,32,32
Don Mills,32,32,32,32,32,32
"Bathurst Manor, Wilson Heights, Downsview North",25,25,25,25,25,25
Glencairn,18,18,18,18,18,18
Hillcrest Village,13,13,13,13,13,13
Bayview Village,10,10,10,10,10,10


In [19]:
#separate venue categories with one hot encoding & add neighborhood column back to dataframe
northyork_onehot=pd.get_dummies(northyork_venues[['Venue Category']],prefix='',prefix_sep='')
northyork_onehot['Neighborhood']=northyork_venues['Neighborhood']
#move neighborhood column to first column/index
fixed_columns = [northyork_onehot.columns[-1]] + list(northyork_onehot.columns[:-1])
northyork_onehot = northyork_onehot[fixed_columns]
#make sure we didn't lose any venue data by printing shape; should still have 379 rows
print(northyork_onehot.shape)
northyork_onehot.head()

(379, 115)


Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Wings Joint
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#now group by neighborhood & calculate mean of venue occurance per category to see what types of venues are most common in each neighborhood
northyork_grouped=northyork_onehot.groupby('Neighborhood').mean().reset_index()
northyork_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Wings Joint
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.04,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.03125,0.03125,...,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.03125
3,Don Mills,0.0,0.0,0.0,0.03125,0.0,0.0,0.03125,0.0,0.0,...,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.022727,0.0,0.0,0.0,0.0,0.022727,0.0,0.022727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068182,0.0
5,"Fairview, Henry Farm, Oriole",0.0,0.0,0.025641,0.0,0.0,0.025641,0.0,0.0,0.025641,...,0.0,0.0,0.0,0.0,0.025641,0.025641,0.0,0.025641,0.0,0.0
6,Glencairn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Humber Summit,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Humberlea, Emery",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
#find the top 10 types of venues per neighborhood
for neigh in northyork_grouped['Neighborhood']:
    print("------"+neigh+"------")
    temp=northyork_grouped[northyork_grouped['Neighborhood']==neigh].T.reset_index()
    temp.columns=['venue','freq']
    temp=temp.iloc[1:]
    temp['freq']=temp['freq'].astype(float)
    temp=temp.round({'freq':2})
    print(temp.sort_values('freq',ascending=False).reset_index(drop=True).head(10))
    print('\n')
    

------Bathurst Manor, Wilson Heights, Downsview North------
                venue  freq
0                Park  0.08
1                Bank  0.08
2         Coffee Shop  0.08
3         Pizza Place  0.08
4          Restaurant  0.04
5  Chinese Restaurant  0.04
6       Deli / Bodega  0.04
7            Pharmacy  0.04
8    Community Center  0.04
9       Shopping Mall  0.04


------Bayview Village------
                  venue  freq
0                  Bank   0.2
1   Japanese Restaurant   0.2
2          Intersection   0.1
3          Skating Rink   0.1
4                  Café   0.1
5    Chinese Restaurant   0.1
6         Shopping Mall   0.1
7         Grocery Store   0.1
8                Office   0.0
9  Outdoor Supply Store   0.0


------Bedford Park, Lawrence Manor East------
                     venue  freq
0       Italian Restaurant  0.09
1              Coffee Shop  0.09
2           Sandwich Place  0.06
3              Wings Joint  0.03
4              Pizza Place  0.03
5               Restaurant

In [22]:
#convert top 10 types of venues per neighborhood into a dataframe
#sort the venues in descending order
def return_most_common_venues(row,num_top_venues):
    row_categories=row.iloc[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]
#create new dataframe & display top 10 types of venues per neighborhood
num_top_venues=10
indicators=['st','nd','rd']
# create columns according to number of top venues; in this case, 10
columns=['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe for the venues sorted by most common
neighborhoods_venues_sorted=pd.DataFrame(columns=columns)
#add the neighborhood names back in
neighborhoods_venues_sorted['Neighborhood']=northyork_grouped['Neighborhood']
for ind in np.arange(northyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northyork_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Park,Pizza Place,Bank,Ice Cream Shop,Chinese Restaurant,Middle Eastern Restaurant,Community Center,Frozen Yogurt Shop,Pharmacy
1,Bayview Village,Bank,Japanese Restaurant,Chinese Restaurant,Intersection,Café,Shopping Mall,Skating Rink,Grocery Store,Bagel Shop,Fast Food Restaurant
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place,Wings Joint,Liquor Store,Pizza Place,Pharmacy,Park,Café,Comfort Food Restaurant
3,Don Mills,Japanese Restaurant,Gym,Beer Store,Coffee Shop,Restaurant,Intersection,Italian Restaurant,Discount Store,Dim Sum Restaurant,Sandwich Place
4,Downsview,Grocery Store,Vietnamese Restaurant,Coffee Shop,Pizza Place,Pharmacy,Gas Station,Gym / Fitness Center,Fast Food Restaurant,Discount Store,Park


In [23]:
#cluster North York neighborhoods into 7 clusters using KMeans
#I first tried the analysis with k=5 clusters but was unsatisfied with the groupings so I increased the number of clusters to 7 to see if there would any added clarity/better groupings
k=7
#drop the neighborhood names so that KMeans clustering can run on unlabeled data
northyork_grouped_clus=northyork_grouped.drop('Neighborhood',1)
kmeans=KMeans(n_clusters=k,random_state=0).fit(northyork_grouped_clus)
#check cluster labels for each row in the dataframe; commented out for cleaner notebook
#kmeans.labels_[0:10]
#add cluster labels
neighborhoods_venues_sorted.insert(0,'cluster labels',kmeans.labels_)
northyork_merged=northyork_data
#merge northyork_data with neighborhoods_venues_sorted to add back neighborhood names & latitude/longitude info
northyork_merged=northyork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'),on='Neighborhood')
northyork_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Bus Stop,Intersection,Food & Drink Shop,Park,Wings Joint,Electronics Store,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,Park,Intersection,Coffee Shop,French Restaurant,Hockey Arena,Playground,Sporting Goods Shop,Portuguese Restaurant,Dessert Shop,Department Store
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2,Clothing Store,Furniture / Home Store,Fast Food Restaurant,Coffee Shop,Dessert Shop,Restaurant,Vietnamese Restaurant,Gym / Fitness Center,Event Space,Boutique
3,M3B,North York,Don Mills,43.745906,-79.352188,2,Japanese Restaurant,Gym,Beer Store,Coffee Shop,Restaurant,Intersection,Italian Restaurant,Discount Store,Dim Sum Restaurant,Sandwich Place
4,M6B,North York,Glencairn,43.709577,-79.445073,2,Gas Station,Grocery Store,Pizza Place,Japanese Restaurant,Latin American Restaurant,Pub,Coffee Shop,Playground,Ice Cream Shop,Restaurant


In [43]:
#now visualize the 7 clusters made above
#create map of North York
map_clusters=folium.Map(location=[latitude_nyt,longitude_nyt],zoom_start=12)
#set color scheme for clusters
x=np.arange(k)
ys=[i+x+(i*x)**2 for i in range(k)]
colors_array=cm.rainbow(np.linspace(0,1,len(ys)))
rainbow=[colors.rgb2hex(i) for i in colors_array]
#add markers to map with label that includes neighborhood name & assigned cluster
markers_colors=[]
for lat,long,poi,cluster in zip(northyork_merged['Latitude'],northyork_merged['Longitude'],northyork_merged['Neighborhood'],northyork_merged['cluster labels']):
    label=folium.Popup(str(poi)+'\n cluster'+str(cluster),parse_html=True)
    folium.CircleMarker([lat,long],radius=5,popup=label,color=rainbow[cluster-1],fill=True,fill_color=rainbow[cluster-1],fill_opacity=0.8).add_to(map_clusters)
map_clusters

In [25]:
#examine each cluster individually
#cluster 1
northyork_merged.loc[northyork_merged['cluster labels']==0,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

Unnamed: 0,Neighborhood,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,0,Bus Stop,Intersection,Food & Drink Shop,Park,Wings Joint,Electronics Store,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop


In [26]:
#cluster 2
northyork_merged.loc[northyork_merged['cluster labels']==1,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

Unnamed: 0,Neighborhood,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,"York Mills, Silver Hills",1,Pool,Wings Joint,Electronics Store,Coffee Shop,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store


#cluster 3
northyork_merged.loc[northyork_merged['cluster labels']==2,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

In [30]:
#cluster 4
northyork_merged.loc[northyork_merged['cluster labels']==3,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

Unnamed: 0,Neighborhood,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Humber Summit,3,Bakery,Pizza Place,Arts & Crafts Store,Wings Joint,Event Space,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop,Deli / Bodega


In [31]:
#cluster 5
northyork_merged.loc[northyork_merged['cluster labels']==4,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

Unnamed: 0,Neighborhood,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,"Humberlea, Emery",4,Gas Station,Convenience Store,Baseball Field,Discount Store,Wings Joint,Event Space,Comfort Food Restaurant,Community Center,Cosmetics Shop,Deli / Bodega


In [32]:
#cluster 6
northyork_merged.loc[northyork_merged['cluster labels']==5,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

Unnamed: 0,Neighborhood,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,"North Park, Maple Leaf Park, Upwood Park",5,Garden Center,Home Service,Bakery,Wings Joint,Event Space,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop,Deli / Bodega


In [33]:
#cluster 7
northyork_merged.loc[northyork_merged['cluster labels']==6,northyork_merged.columns[[2]+list(range(5,northyork_merged.shape[1]))]]

Unnamed: 0,Neighborhood,cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,York Mills West,6,Park,Gym,Convenience Store,Pet Store,Art Gallery,Event Space,Comfort Food Restaurant,Community Center,Airport,Cosmetics Shop


permalink to Watson Studio notebook for map visualization: https://dataplatform.cloud.ibm.com/analytics/notebooks/v2/675d2df9-a6c1-4469-b73e-d25b8e678652/view?access_token=1e66f4290301d3ffd101995e0f9d2c59f71c6676e0bd78843f647f9e72b1cf9f