# Toronto Clustering Project - Alex Newman

## Question 1 - data  post code information

In [413]:
import json


In [414]:
from sklearn.cluster import KMeans

In [436]:
from pandas.io.json import json_normalize

In [416]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [417]:
!pip install folium
!pip install pgeocode



In [418]:
import requests
import pandas as pd
import numpy as np

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[0]
#print(df)
#df.to_csv('testdata.csv')

In [419]:
#df.head()

In [420]:
df['Borough'].replace("Not assigned", np.nan, inplace=True)
df.dropna(subset=["Borough"], axis=0, inplace=True)

df.loc[df["Neighborhood"]== 'Not assigned']


df.reset_index(drop=True, inplace=True)

In [421]:
#df.head()

In [422]:
missing_data = df.isnull()
#missing_data.head()

## Question 2 - append geocoded post code info

In [423]:
import pgeocode

nomi = pgeocode.Nominatim('ca')
df['Latitude'] = np.nan
df['Longitude'] = np.nan
N = len(df)

for i in range(N):
    #print(df['Postal Code'].values[i])
    df['Latitude'].values[i] = nomi.query_postal_code(df['Postal Code'].values[i]).latitude
    df['Longitude'].values[i] = nomi.query_postal_code(df['Postal Code'].values[i]).longitude
    
#df.head()

In [424]:
#df.dropna(subset=["price"], axis=0, inplace=True)
#for column in missing_data.columns.values.tolist():
    #print(column)
   # print (missing_data[column].value_counts())
   # print("")  

In [425]:
df.dropna(subset=["Latitude"], axis=0, inplace=True)

for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")  

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64



## Question 3 - Toronto Clustering using business info from 4square

In [426]:
# set map paramaeters - using address for Toronto

from geopy.geocoders import Nominatim

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [427]:
# import folium maps
import folium

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Define Foursquare Credentials and Version

In [428]:
#define 4square info

CLIENT_ID = 'RGSEAE3TO55N3VLHU4RYF3C5A5PWCKZUOBQ5JLQAWUTMBNLF' # your Foursquare ID
CLIENT_SECRET = 'TWFMJ0UAR5ILWQ1VGL5LFAEMJ44A4DXAXRM0YQTBG3BY2U5K' # your Foursquare Secret
VERSION = '20180605'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RGSEAE3TO55N3VLHU4RYF3C5A5PWCKZUOBQ5JLQAWUTMBNLF
CLIENT_SECRET:TWFMJ0UAR5ILWQ1VGL5LFAEMJ44A4DXAXRM0YQTBG3BY2U5K


In [429]:
#run a few tests on retrieving information from 4square

df.loc[0, 'Neighborhood']

'Parkwoods'

In [430]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7545, -79.33.


In [431]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url 



'https://api.foursquare.com/v2/venues/explore?&client_id=RGSEAE3TO55N3VLHU4RYF3C5A5PWCKZUOBQ5JLQAWUTMBNLF&client_secret=TWFMJ0UAR5ILWQ1VGL5LFAEMJ44A4DXAXRM0YQTBG3BY2U5K&v=20180605&ll=43.7545,-79.33&radius=500&limit=100'

In [432]:
#verify information being returned from 4square

results = requests.get(url).json()
results


{'meta': {'code': 429,
  'errorType': 'quota_exceeded',
  'errorDetail': 'Quota exceeded',
  'requestId': '5ebba0e9963d29001b3614bd'},
 'response': {}}

In [433]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [437]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

In [438]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


In [439]:
#function that gets near by venue names

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [440]:
#sometimes throws an error on groups

toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Parkwoods


KeyError: 'groups'

In [441]:
print(toronto_venues.shape)
toronto_venues.head()

(2150, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,GTA Restoration,43.753396,-79.333477,Fireworks Store
2,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.7276,-79.3148,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [442]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",6,6,6,6,6,6
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
Berczy Park,89,89,89,89,89,89
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",39,39,39,39,39,39
Business reply mail Processing Centre,14,14,14,14,14,14
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",57,57,57,57,57,57


In [443]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 253 uniques categories.


In [445]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [446]:
toronto_onehot.shape

(2150, 253)

In [447]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,"Alderwood, Long Branch",0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Bayview Village,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Bedford Park, Lawrence Manor East",0.000000,0.000000,0.000000,0.00,0.043478,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Berczy Park,0.011236,0.000000,0.000000,0.00,0.011236,0.022472,0.000000,0.000000,0.000000,...,0.000000,0.011236,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,"Birch Cliff, Cliffside West",0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Brockton, Parkdale Village, Exhibition Place",0.000000,0.025641,0.000000,0.00,0.000000,0.025641,0.000000,0.025641,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Business reply mail Processing Centre,0.071429,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.017544,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.017544,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017544,0.000000


In [448]:
toronto_grouped.shape

(95, 253)

In [449]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0            Badminton Court  0.25
1             Breakfast Spot  0.25
2  Latin American Restaurant  0.25
3               Skating Rink  0.25
4                Yoga Studio  0.00


----Alderwood, Long Branch----
                venue  freq
0         Pizza Place  0.11
1  Athletics & Sports  0.11
2      Sandwich Place  0.11
3                 Gym  0.11
4                 Pub  0.11


----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                Pizza Place  0.17
1   Mediterranean Restaurant  0.17
2        Fried Chicken Joint  0.17
3  Middle Eastern Restaurant  0.17
4              Deli / Bodega  0.17


----Bayview Village----
         venue  freq
0  Flower Shop  0.25
1         Park  0.25
2        Trail  0.25
3  Gas Station  0.25
4  Yoga Studio  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.09
1         Coffee Shop  0.09
2    Sushi Restaurant  

                      venue  freq
0              Home Service   0.5
1       Rental Car Location   0.5
2                    Museum   0.0
3  Mediterranean Restaurant   0.0
4               Men's Store   0.0


----Humberlea, Emery----
                       venue  freq
0                       Café  0.17
1  Latin American Restaurant  0.17
2                  Nightclub  0.17
3                Coffee Shop  0.17
4              Grocery Store  0.17


----Humewood-Cedarvale----
           venue  freq
0   Hockey Arena  0.17
1          Field  0.17
2           Park  0.17
3  Grocery Store  0.17
4  Deli / Bodega  0.17


----India Bazaar, The Beaches West----
                  venue  freq
0  Fast Food Restaurant  0.08
1                  Park  0.08
2            Restaurant  0.08
3        Sandwich Place  0.08
4     Food & Drink Shop  0.04


----Islington Avenue----
           venue  freq
0       Pharmacy  0.33
1           Bank  0.17
2           Park  0.17
3   Skating Rink  0.17
4  Grocery Store  0.17


----

             venue  freq
0  Coworking Space  0.12
1             Park  0.12
2      Coffee Shop  0.12
3              Gym  0.12
4    Garden Center  0.12


----Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park----
                        venue  freq
0          Light Rail Station  0.33
1                 Coffee Shop  0.33
2                Liquor Store  0.17
3                 Supermarket  0.17
4  Modern European Restaurant  0.00


----The Annex, North Midtown, Yorkville----
                 venue  freq
0       Sandwich Place  0.11
1                 Café  0.11
2  American Restaurant  0.07
3         Burger Joint  0.04
4         Liquor Store  0.04


----The Beaches----
               venue  freq
0                Pub  0.14
1             Bakery  0.14
2        Cheese Shop  0.14
3          Gastropub  0.14
4  Health Food Store  0.14


----The Danforth West, Riverdale----
                venue  freq
0    Greek Restaurant  0.23
1      Ice Cream Shop  0.06
2  Italian Restaurant  0.06
3  

In [450]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [451]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Badminton Court,Breakfast Spot,Skating Rink,Latin American Restaurant,Women's Store
1,"Alderwood, Long Branch",Convenience Store,Gym,Pub,Coffee Shop,Dance Studio
2,"Bathurst Manor, Wilson Heights, Downsview North",Middle Eastern Restaurant,Fried Chicken Joint,Pizza Place,Mediterranean Restaurant,Coffee Shop
3,Bayview Village,Flower Shop,Gas Station,Park,Trail,Women's Store
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Restaurant,Sushi Restaurant,Coffee Shop


#### Clustering

In [452]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 0, 0, 8, 3, 3, 3, 3, 3, 3], dtype=int32)

In [453]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,8.0,Food & Drink Shop,Park,Fireworks Store,Women's Store,Doner Restaurant
1,M4A,North York,Victoria Village,43.7276,-79.3148,3.0,Portuguese Restaurant,French Restaurant,Park,Hockey Arena,Pizza Place
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,3.0,Coffee Shop,Breakfast Spot,Restaurant,Theater,Health Food Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,3.0,Clothing Store,Coffee Shop,Restaurant,Women's Store,Toy / Game Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,3.0,Sushi Restaurant,Gym,Italian Restaurant,Hobby Shop,Mexican Restaurant


In [454]:
# drop blank Cluster Labels

toronto_merged.dropna(subset=["Cluster Labels"], axis=0, inplace=True)

# set Cluster labels to int

toronto_merged[["Cluster Labels"]] = toronto_merged[["Cluster Labels"]].astype("int")
toronto_merged



Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.3300,8,Food & Drink Shop,Park,Fireworks Store,Women's Store,Doner Restaurant
1,M4A,North York,Victoria Village,43.7276,-79.3148,3,Portuguese Restaurant,French Restaurant,Park,Hockey Arena,Pizza Place
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,3,Coffee Shop,Breakfast Spot,Restaurant,Theater,Health Food Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,3,Clothing Store,Coffee Shop,Restaurant,Women's Store,Toy / Game Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,3,Sushi Restaurant,Gym,Italian Restaurant,Hobby Shop,Mexican Restaurant
5,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282,8,Pharmacy,Skating Rink,Bank,Grocery Store,Park
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.1930,1,Home Service,Women's Store,Doner Restaurant,Flea Market,Fish Market
7,M3B,North York,Don Mills,43.7450,-79.3590,8,Park,River,Construction & Landscaping,Home Service,Gym
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094,0,Pizza Place,Pharmacy,Gym / Fitness Center,Bank,Intersection
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783,3,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop


In [455]:
print(toronto_merged.dtypes) 

Postal Code               object
Borough                   object
Neighborhood              object
Latitude                 float64
Longitude                float64
Cluster Labels             int64
1st Most Common Venue     object
2nd Most Common Venue     object
3rd Most Common Venue     object
4th Most Common Venue     object
5th Most Common Venue     object
dtype: object


In [476]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=15,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.4).add_to(map_clusters)
       
map_clusters

### Examine Clusters

#### Cluster 0 - Food Related

In [457]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
8,East York,0,Pizza Place,Pharmacy,Gym / Fitness Center,Bank,Intersection
10,North York,0,Pizza Place,Japanese Restaurant,Grocery Store,Fish Market,Italian Restaurant
11,Etobicoke,0,Pizza Place,Tea Room,Coffee Shop,Sandwich Place,Chinese Restaurant
18,Scarborough,0,Pizza Place,Coffee Shop,Fast Food Restaurant,Electronics Store,Bank
28,North York,0,Middle Eastern Restaurant,Fried Chicken Joint,Pizza Place,Mediterranean Restaurant,Coffee Shop
34,North York,0,Sandwich Place,Massage Studio,Sports Bar,Middle Eastern Restaurant,Women's Store
40,North York,0,Discount Store,Coffee Shop,Shopping Mall,Grocery Store,Pizza Place
46,North York,0,Discount Store,Coffee Shop,Shopping Mall,Grocery Store,Pizza Place
51,Scarborough,0,Ice Cream Shop,Pharmacy,Bank,Pizza Place,Coffee Shop
53,North York,0,Discount Store,Coffee Shop,Shopping Mall,Grocery Store,Pizza Place


#### Cluster 1 - Home services and womens fashion

In [458]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Scarborough,1,Home Service,Women's Store,Doner Restaurant,Flea Market,Fish Market
62,Central Toronto,1,Home Service,Women's Store,Doner Restaurant,Flea Market,Fish Market


In [459]:
#### Cluster 2 - High density housing zone

In [460]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
27,North York,2,Residential Building (Apartment / Condo),Park,Women's Store,Falafel Restaurant,Donut Shop
69,West Toronto,2,Residential Building (Apartment / Condo),Park,Women's Store,Falafel Restaurant,Donut Shop


In [461]:
#### Cluster 3 - Gym junkies of Toronto

In [462]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,3,Portuguese Restaurant,French Restaurant,Park,Hockey Arena,Pizza Place
2,Downtown Toronto,3,Coffee Shop,Breakfast Spot,Restaurant,Theater,Health Food Store
3,North York,3,Clothing Store,Coffee Shop,Restaurant,Women's Store,Toy / Game Store
4,Downtown Toronto,3,Sushi Restaurant,Gym,Italian Restaurant,Hobby Shop,Mexican Restaurant
9,Downtown Toronto,3,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop
14,East York,3,Convenience Store,Beer Store,Spa,Video Store,Diner
15,Downtown Toronto,3,Coffee Shop,Café,Cocktail Bar,Seafood Restaurant,Gastropub
17,Etobicoke,3,Convenience Store,Coffee Shop,Liquor Store,Beer Store,Café
19,East Toronto,3,Bakery,Health Food Store,Gastropub,Cheese Shop,Pub
20,Downtown Toronto,3,Coffee Shop,Café,Hotel,Seafood Restaurant,Beer Bar


In [463]:
#### Cluster 4 - Golf freaks with taste for things aquatic

In [464]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
12,Scarborough,4,Golf Course,Women's Store,Doner Restaurant,Flea Market,Fish Market


In [465]:
#### Cluster 5 - Car Rental central

In [466]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 5, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
50,North York,5,Home Service,Rental Car Location,Women's Store,Falafel Restaurant,Donut Shop


In [467]:
#### Cluster 6 - Toronto United Soccer fans

In [468]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 6, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
22,Scarborough,6,Korean Restaurant,Women's Store,Food,Flea Market,Fish Market


In [469]:
#### Cluster 7 - Disillusioned shop owners united cluster

In [470]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 7, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
45,North York,7,Pool,Cafeteria,Women's Store,Doner Restaurant,Fish Market


In [471]:
#### Cluster 8 - PArk and outdoor activity

In [472]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 8, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,8,Food & Drink Shop,Park,Fireworks Store,Women's Store,Doner Restaurant
5,Etobicoke,8,Pharmacy,Skating Rink,Bank,Grocery Store,Park
7,North York,8,Park,River,Construction & Landscaping,Home Service,Gym
13,North York,8,Park,River,Construction & Landscaping,Home Service,Gym
16,York,8,Grocery Store,Trail,Hockey Arena,Park,Field
32,Scarborough,8,Grocery Store,Park,Women's Store,Donut Shop,Eastern European Restaurant
35,East York,8,Park,Convenience Store,Coffee Shop,Farmers Market,Eastern European Restaurant
36,Downtown Toronto,8,Café,Park,Harbor / Marina,Music Venue,Women's Store
39,North York,8,Flower Shop,Gas Station,Park,Trail,Women's Store
61,Central Toronto,8,Photography Studio,Park,Lawyer,Doner Restaurant,Fish Market


In [473]:
#### Cluster 9 - Sports stores

In [474]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 9, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
88,Etobicoke,9,Athletics & Sports,Women's Store,Donut Shop,Flower Shop,Flea Market


### Toronto Clustering Report

I ran with a cluster of 10 based on the 5 top venue types - more work required. Initial clustering of 5 didn't result in optimal visual impact of map - played around with the numbers for a little but was a little directionless in what I was attempting to achieve. If I had more time I probably would have investigated the foursquare information further and prioritised specific venue types: for example gym, korean food, womens fashion etc - would have given the clustering much more meaning. I spent an eternity attempting to work out how to render maps in github... it's like I relunctantly read the relevant forums. I have to remember to tell myself - I am not the first person to do this course. Despite there being a lot of copying and pasting from the labs - I do feel that I have learned a lot from this exercise.