**The goal here is to explore & cluster neighborhoods in Toronto**

In [5]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import numpy as np
import requests
from pathlib import Path
from geopy.geocoders import Nominatim
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage

!conda install -c conda-forge folium=0.5.0 --yes
import folium 

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  10.38 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  39.13 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  38.59 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.89 MB/s
Libraries imported.


**Load the pandas dataframe (created in part 2 of the assignment)**

In [8]:
df_toronto = pd.read_csv('toronto_base.csv')
df_toronto.drop(['Unnamed: 0'], axis = 1, inplace = True)
df_toronto.head(7)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1B,43.806686,-79.194353
1,M2M,North York,"Newtonbrook, Willowdale",M1C,43.784535,-79.160497
2,M1V,Scarborough,"Agincourt North, Milliken",M1E,43.763573,-79.188711
3,M5E,Downtown Toronto,Berczy Park,M1G,43.770992,-79.216917
4,M6B,North York,Glencairn,M1H,43.773136,-79.239476
5,M6H,West Toronto,Dovercourt Village,M1J,43.744734,-79.239476
6,M3H,North York,"Bathurst Manor, Wilson Heights",M1K,43.727929,-79.262029


In [12]:
df_toronto = df_toronto[df_toronto['Neighbourhood'].str.contains('Toronto')] #Use a mask to filter out only neighborhoods that include Toronto
df_toronto

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
19,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",M2K,43.786947,-79.385975
23,M3K,North York,CFB Toronto,M2P,43.752758,-79.400049
36,M8V,Etobicoke,New Toronto,M4C,43.695344,-79.318389
45,M5J,Downtown Toronto,"Toronto Islands, Union Station",M4P,43.712751,-79.390197
62,M4J,East York,East Toronto,M5M,43.733283,-79.41975
76,M5S,Downtown Toronto,University of Toronto,M6H,43.669005,-79.442259


In [14]:
# create map of Toronto using latitude and longitude values
latitude = 43.6532
longitude = -79.3832

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [50]:
# @hidden cell
CLIENT_ID = 'MK51CHNXA04S2LWHNEVGIAIC43GBF1CPZKPPDVNLY35XFGLJ' # your Foursquare ID
CLIENT_SECRET = 'JHFP3TDXYXYBIABED0UBWA53WRGDTO5IYGY0GZTTZBW33QK0' # No peeking at my secret key
VERSION = '20190101' # Foursquare API version

In [53]:
import json
def getNearbyVenues(names, latitudes, longitudes, radius=500): #This function was built by the Coursera lab
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [55]:
#Use the function to pull venues near each neighborhood
LIMIT = 500
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

Design Exchange, Toronto Dominion Centre
CFB Toronto
New Toronto
Toronto Islands, Union Station
East Toronto
University of Toronto


In [56]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 47 uniques categories.


In [57]:
#Start the clustering analysis with one-hot encoding for pre-processing
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.shape

(67, 48)

In [58]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Bakery,Bank,Bar,Beer Store,Breakfast Spot,Brewery,Burger Joint,...,Pool,Pub,Restaurant,Sandwich Place,Skating Rink,Supermarket,Sushi Restaurant,Thai Restaurant,Video Store,Wine Shop
0,CFB Toronto,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Design Exchange, Toronto Dominion Centre",0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,East Toronto,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.043478,0.043478,0.043478,0.0,0.0,0.043478,0.043478,0.0,0.0
3,New Toronto,0.0,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.1,0.0
4,"Toronto Islands, Union Station",0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,...,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0
5,University of Toronto,0.0,0.0,0.1,0.05,0.05,0.0,0.0,0.05,0.0,...,0.05,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.05


In [59]:
#Show the top 5 most frequent venues in our neighborhood analysis
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CFB Toronto----
                 venue  freq
0                 Bank   0.5
1                 Park   0.5
2  American Restaurant   0.0
3             Pharmacy   0.0
4                Hotel   0.0


----Design Exchange, Toronto Dominion Centre----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2  Japanese Restaurant  0.25
3                 Café  0.25
4          Pizza Place  0.00


----East Toronto----
                  venue  freq
0           Coffee Shop  0.09
1  Fast Food Restaurant  0.09
2    Italian Restaurant  0.09
3           Pizza Place  0.04
4        Hardware Store  0.04


----New Toronto----
              venue  freq
0      Skating Rink   0.2
1    Cosmetics Shop   0.1
2       Video Store   0.1
3  Asian Restaurant   0.1
4        Beer Store   0.1


----Toronto Islands, Union Station----
               venue  freq
0                Gym  0.12
1  Food & Drink Shop  0.12
2     Breakfast Spot  0.12
3       Burger Joint  0.12
4     Sandwich Place  0.

In [60]:
#Create a function that sorts the venues by frequency
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [61]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape

(6, 11)

In [62]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:103]

array([2, 1, 3, 4, 0, 3], dtype=int32)

In [65]:
toronto_merged = df_toronto

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postalcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",M2K,43.786947,-79.385975,2,Café,Chinese Restaurant,Bank,Japanese Restaurant,Food & Drink Shop,Fast Food Restaurant,Discount Store,Curling Ice,Cosmetics Shop,Comfort Food Restaurant
23,M3K,North York,CFB Toronto,M2P,43.752758,-79.400049,1,Park,Bank,Café,Food & Drink Shop,Fast Food Restaurant,Discount Store,Curling Ice,Cosmetics Shop,Comfort Food Restaurant,Coffee Shop
36,M8V,Etobicoke,New Toronto,M4C,43.695344,-79.318389,3,Skating Rink,Park,Asian Restaurant,Video Store,Curling Ice,Beer Store,Cosmetics Shop,Pharmacy,Bus Stop,Café
45,M5J,Downtown Toronto,"Toronto Islands, Union Station",M4P,43.712751,-79.390197,4,Gym,Sandwich Place,Hotel,Clothing Store,Burger Joint,Breakfast Spot,Park,Food & Drink Shop,Bar,Fast Food Restaurant
62,M4J,East York,East Toronto,M5M,43.733283,-79.41975,0,Coffee Shop,Fast Food Restaurant,Italian Restaurant,Comfort Food Restaurant,Café,Greek Restaurant,Grocery Store,Butcher,Hardware Store,Indian Restaurant


In [67]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters