In [1]:
from bs4 import BeautifulSoup
import requests


# wikipedia 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# check for status code and headers


#url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

print(result.status_code)
print(result.headers)


200
{'Date': 'Wed, 12 Jun 2019 00:35:30 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '14902', 'Connection': 'keep-alive', 'Server': 'mw1254.eqiad.wmnet', 'X-Content-Type-Options': 'nosniff', 'P3P': 'CP="This is not a P3P policy! See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'X-Powered-By': 'HHVM/3.18.6-dev', 'Content-language': 'en', 'Last-Modified': 'Thu, 06 Jun 2019 10:22:24 GMT', 'Backend-Timing': 'D=108335 t=1560103974168521', 'Content-Encoding': 'gzip', 'Vary': 'Accept-Encoding,Cookie,Authorization,X-Seven', 'X-Varnish': '595197947 275277105, 388672223 914171420', 'Via': '1.1 varnish (Varnish/5.1), 1.1 varnish (Varnish/5.1)', 'Age': '109272', 'X-Cache': 'cp1081 hit/4, cp1085 hit/54', 'X-Cache-Status': 'hit-front', 'Server-Timing': 'cache;desc="hit-front"', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload', 'Set-Cookie': 'WMF-Last-Access=12-Jun-2019;Path=/;HttpOnly;secure;Expires=Sun, 14 Jul 2019 0

### Clean the data

In [2]:
soup = BeautifulSoup(result.content, 'html.parser')
table = soup.find('table')
trs = table.find_all('tr')
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)
        
lst = []
for row in rows:
    postalcode = row[0].text.rstrip()
    borough = row[1].text.rstrip()
    neighborhood = row[2].text.rstrip()
    if borough != 'Not assigned':
        if neighborhood == 'Not assigned':
            neighborhood = borough
        lst.append([postalcode, borough, neighborhood])

lst

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", "Queen's Park"],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 'Etobicoke', 'Martin Grove'],
 ['M9B', 'Etobicoke', 'Princess Gardens'],
 ['M9B', 'Etobicoke', 'West Deane Park'],
 ['M1C', 'Scarborough', 'Highland Creek'],
 ['M1C', 'Scarborough', 'Rouge Hill'],
 ['M1C', 'Scarborough', 'Port Union'],
 ['M3C', 'North 

In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.DataFrame(lst, columns= ['postalcode', 'borough', 'neighborhood'])
print(df.shape)

(211, 3)


Delete rows with "Not assigned"

In [5]:
df_final = df[df.borough != 'Not assigned']
df_final.reset_index(drop = True, inplace = True)

df_final = df.groupby(['postalcode', 'borough'], as_index=False).agg(lambda neighborhoods: ', '.join(neighborhoods))

In [6]:
df_final.head()

Unnamed: 0,postalcode,borough,neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
df_final.shape

(103, 3)

Add columns Latitude & Longitude in df_final

In [8]:
df_final['Latitude'] = 0.00
df_final['Longitude'] = 0.00

In [9]:
df_final.shape

(103, 5)

Get the csv file with the geolocation data

In [10]:
!wget -O Geospatial_Coordinates.csv http://cocl.us/Geospatial_data/

--2019-06-11 20:35:49--  http://cocl.us/Geospatial_data/
Resolving cocl.us... 169.48.113.201
Connecting to cocl.us|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data/ [following]
--2019-06-11 20:35:49--  https://cocl.us/Geospatial_data/
Connecting to cocl.us|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-11 20:35:50--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com... 107.152.27.197
Connecting to ibm.box.com|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-11 20:35:51--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box

Read the data

In [11]:
df_geocord = pd.read_csv('Geospatial_Coordinates.csv')
df_geocord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Pull geolocation data from df_geoform and add to df_final

In [12]:
for idx in df_final.index:
    geocord_idx = df_geocord['Postal Code'] == df_final.loc[idx, 'postalcode']
    df_final.at[idx, 'Latitude'] = df_geocord.loc[geocord_idx, 'Latitude'].values
    df_final.at[idx, 'Longitude'] = df_geocord.loc[geocord_idx, 'Longitude'].values


df_final.head()

Unnamed: 0,postalcode,borough,neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
df_final.shape

(103, 5)

In [14]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df_final['borough'].unique()),
        df_final.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [15]:
from geopy.geocoders import Nominatim
import folium
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors


import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# import k-means from clustering stage
from sklearn.cluster import KMeans
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata: done
Solving environment: \ 
  - anaconda::ca-certificates-2018.03.07-0, anaconda::certifi-2018.11.29-py37_0, anaconda::openssl-1.1.1a-h1de35cc_0
  - anaconda::ca-certificates-2018.03.07-0, anaconda::openssl-1.1.1a-h1de35cc_0, defaults::certifi-2018.11.29-py37_0
  - anaconda::certifi-2018.11.29-py37_0, anaconda::openssl-1.1.1a-h1de35cc_0, defaults::ca-certificates-2018.03.07-0
  - anaconda::openssl-1.1.1a-h1de35cc_0, defaults::ca-certificates-2018.03.07-0, defaults::certifi-2018.11.29-py37_0
  - anaconda::certifi-2018.11.29-py37_0, defaults::ca-certificates-2018.03.07-0, defaults::openssl-1.1.1a-h1de35cc_0
  - defaults::ca-certificates-2018.03.07-0, defaults::certifi-2018.11.29-py37_0, defaults::openssl-1.1.1a-h1de35cc_0
  - anaconda::ca-certificates-2018.03.07-0, anaconda::certifi-2018.11.29-py37_0, defaults::openssl-1.1.1a-h1de35cc_0
  - anaconda::ca-certificates-2018.03.07-0, defaults::certifi-2018.11.29-py37_0, defaults::openssl-1.1.1a-h1de35ccdone

# A

In [16]:
CLIENT_ID = 'VD3ERGIFB2LCDFYRZANSUH3F35RHGE54I53Y1A4KCLEGEYG3' # Foursquare ID
CLIENT_SECRET = 'J2TEKN3YEOXRHWZZUM4HSWAF3NFROX1FVSVDHJBGG2A5YXGU' # Foursquare Secret
VERSION = '20180605' # API version
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VD3ERGIFB2LCDFYRZANSUH3F35RHGE54I53Y1A4KCLEGEYG3
CLIENT_SECRET:J2TEKN3YEOXRHWZZUM4HSWAF3NFROX1FVSVDHJBGG2A5YXGU


In [17]:
df_final.loc[0, 'neighborhood']

'Rouge, Malvern'

## Explore Neighborhoods in Toronto

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [19]:
toronto_venues = getNearbyVenues(names=df_final['neighborhood'],
                                   latitudes=df_final['Latitude'],
                                   longitudes=df_final['Longitude']
                                  )

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [20]:
print(toronto_venues.shape)
toronto_venues.head()

(1328, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
3,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place


Let's check how many venues were returned for each neighborhood

In [21]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",30,30,30,30,30,30
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3,3,3,3,3,3
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",11,11,11,11,11,11
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Downsview North, Wilson Heights",18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
Berczy Park,30,30,30,30,30,30
"Birch Cliff, Cliffside West",4,4,4,4,4,4


#### Let's find out how many unique categories can be curated from all the returned venues

In [22]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 239 uniques categories.


##  Analyze Each Neighborhood

In [23]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
toronto_onehot.shape

(1328, 239)

#### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [25]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,...,0.000000,0.0,0.033333,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.090909,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000
6,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.040000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.033333,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [26]:
toronto_grouped.shape

(100, 239)

#### Let's print each neighborhood along with the top 5 most common venues

In [27]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
              venue  freq
0        Steakhouse  0.10
1       Pizza Place  0.07
2  Asian Restaurant  0.07
3              Café  0.07
4             Hotel  0.07


----Agincourt----
                venue  freq
0              Lounge  0.25
1      Breakfast Spot  0.25
2  Chinese Restaurant  0.25
3      Sandwich Place  0.25
4         Yoga Studio  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
              venue  freq
0        Playground  0.33
1              Park  0.33
2  Asian Restaurant  0.33
3       Yoga Studio  0.00
4     Movie Theater  0.00


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0        Grocery Store  0.18
1  Fried Chicken Joint  0.09
2          Pizza Place  0.09
3          Coffee Shop  0.09
4       Sandwich Place  0.09


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place   0.2
1    Skating Rink   0.1
2

               venue  freq
0        Coffee Shop  0.33
1               Park  0.33
2  Convenience Store  0.33
3      Moving Target  0.00
4     Massage Studio  0.00


----Emery, Humberlea----
                venue  freq
0      Baseball Field   1.0
1         Yoga Studio   0.0
2  Mac & Cheese Joint   0.0
3      Massage Studio   0.0
4      Medical Center   0.0


----Fairview, Henry Farm, Oriole----
            venue  freq
0  Clothing Store  0.17
1     Coffee Shop  0.13
2    Burger Joint  0.03
3    Liquor Store  0.03
4       Juice Bar  0.03


----First Canadian Place, Underground city----
         venue  freq
0         Café  0.13
1  Coffee Shop  0.10
2    Gastropub  0.07
3   Restaurant  0.07
4   Steakhouse  0.07


----Flemingdon Park, Don Mills South----
              venue  freq
0               Gym  0.09
1        Beer Store  0.09
2       Coffee Shop  0.09
3  Asian Restaurant  0.09
4     Grocery Store  0.04


----Forest Hill North, Forest Hill West----
              venue  freq
0             

            venue  freq
0     Coffee Shop  0.13
1  Sandwich Place  0.13
2            Café  0.13
3     Pizza Place  0.09
4        Pharmacy  0.04


----The Beaches----
                       venue  freq
0          Health Food Store  0.25
1                        Pub  0.25
2                      Trail  0.25
3                Yoga Studio  0.00
4  Middle Eastern Restaurant  0.00


----The Beaches West, India Bazaar----
                  venue  freq
0                  Park  0.11
1  Fast Food Restaurant  0.06
2    Italian Restaurant  0.06
3           Pizza Place  0.06
4                   Pub  0.06


----The Danforth West, Riverdale----
                venue  freq
0    Greek Restaurant  0.27
1      Ice Cream Shop  0.07
2  Italian Restaurant  0.07
3     Bubble Tea Shop  0.03
4         Coffee Shop  0.03


----The Junction North, Runnymede----
               venue  freq
0      Grocery Store  0.25
1           Bus Line  0.25
2  Convenience Store  0.25
3        Pizza Place  0.25
4    Warehouse Store 

#### Let's put that into a *pandas* dataframe

In [28]:
# Sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [29]:
# Create a new dataframe and display the top 10 venues for each neighborhoods.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Steakhouse,Café,Pizza Place,Hotel,Asian Restaurant,Concert Hall,Monument / Landmark,Plaza,Sushi Restaurant,Lounge
1,Agincourt,Lounge,Chinese Restaurant,Breakfast Spot,Sandwich Place,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Asian Restaurant,Park,Playground,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Fast Food Restaurant,Coffee Shop,Video Store,Pharmacy,Pizza Place,Beer Store,Liquor Store,Fried Chicken Joint,Sandwich Place
4,"Alderwood, Long Branch",Pizza Place,Pool,Coffee Shop,Skating Rink,Gym,Pharmacy,Pub,Dance Studio,Sandwich Place,Women's Store


## Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [30]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 4, 0, 0, 0, 3, 0, 0, 0], dtype=int32)

In [31]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_venues

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(20) # check the last columns!

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant,1,Fast Food Restaurant,Print Shop,Women's Store,Cupcake Shop,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop,1,Fast Food Restaurant,Print Shop,Women's Store,Cupcake Shop,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping,1,Bar,Construction & Landscaping,Women's Store,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store
3,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar,1,Bar,Construction & Landscaping,Women's Store,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
5,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
6,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
7,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
8,"Guildwood, Morningside, West Hill",43.763573,-79.188711,chatr Mobile,43.765917,-79.191672,Tech Startup,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
9,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping


Visualization of the resulting clusters

In [33]:
address = 'Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.653963 -79.387207


In [34]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Neighborhood Latitude'], toronto_merged['Neighborhood Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Cluster 1

In [35]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,43.763573,-79.189914,Pizza Place,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
5,43.763573,-79.191537,Electronics Store,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
6,43.763573,-79.191000,Spa,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
7,43.763573,-79.190720,Mexican Restaurant,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
8,43.763573,-79.191672,Tech Startup,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
9,43.763573,-79.193406,Rental Car Location,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
10,43.763573,-79.192286,Medical Center,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
11,43.763573,-79.189490,Intersection,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
12,43.763573,-79.190466,Breakfast Spot,0,Spa,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Intersection,Mexican Restaurant,Tech Startup,Pizza Place,Construction & Landscaping
13,43.770992,-79.221156,Coffee Shop,0,Coffee Shop,Korean Restaurant,Convenience Store,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store


#### Cluster 2

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,43.806686,-79.199056,Fast Food Restaurant,1,Fast Food Restaurant,Print Shop,Women's Store,Cupcake Shop,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
1,43.806686,-79.200378,Print Shop,1,Fast Food Restaurant,Print Shop,Women's Store,Cupcake Shop,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
2,43.784535,-79.163742,Construction & Landscaping,1,Bar,Construction & Landscaping,Women's Store,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store
3,43.784535,-79.163085,Bar,1,Bar,Construction & Landscaping,Women's Store,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store
25,43.744734,-79.239336,Playground,1,Playground,Women's Store,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
127,43.75749,-79.370649,Cafeteria,1,Cafeteria,Women's Store,Curling Ice,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
456,43.689574,-79.3829,Tennis Court,1,Tennis Court,Playground,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
457,43.689574,-79.383465,Playground,1,Tennis Court,Playground,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
831,43.711695,-79.411978,Garden,1,Ice Cream Shop,Garden,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
832,43.711695,-79.414301,Ice Cream Shop,1,Ice Cream Shop,Garden,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant


#### Cluster 3

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1278,43.636258,-79.496266,Baseball Field,2,Baseball Field,Women's Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
1301,43.724766,-79.532854,Baseball Field,2,Baseball Field,Women's Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner


#### Cluster 4

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
123,43.786947,-79.381234,Chinese Restaurant,3,Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run
124,43.786947,-79.380367,Bank,3,Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run
125,43.786947,-79.380751,Café,3,Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run
126,43.786947,-79.38109,Japanese Restaurant,3,Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run
1291,43.650943,-79.549748,Bank,3,Bank,Women's Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner


#### Cluster 5

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
74,43.815252,-79.289773,Park,4,Asian Restaurant,Park,Playground,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
75,43.815252,-79.289824,Asian Restaurant,4,Asian Restaurant,Park,Playground,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
76,43.815252,-79.289867,Playground,4,Asian Restaurant,Park,Playground,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
158,43.752758,-79.401004,Bank,4,Park,Convenience Store,Bar,Bank,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
159,43.752758,-79.401393,Convenience Store,4,Park,Convenience Store,Bar,Bank,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
160,43.752758,-79.399717,Park,4,Park,Convenience Store,Bar,Bank,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
161,43.752758,-79.401712,Park,4,Park,Convenience Store,Bar,Bank,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
162,43.752758,-79.405016,Bar,4,Park,Convenience Store,Bar,Bank,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
168,43.753259,-79.33214,Park,4,Park,Fast Food Restaurant,Food & Drink Shop,Women's Store,Curling Ice,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
169,43.753259,-79.333021,Fast Food Restaurant,4,Park,Fast Food Restaurant,Food & Drink Shop,Women's Store,Curling Ice,Dumpling Restaurant,Drugstore,Dog Run,Discount Store,Diner
