# clusterer

Foursquare API keys and version

In [1]:
# Foursquare credentials are taken from the environment so that real keys never
# need to be saved inside the notebook. The original placeholder strings are
# kept as fallbacks when the variables are not set.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'redacted for github')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'redacted for github')
# Version string sent with every Foursquare request.
VERSION = '20180605'

imports

In [2]:
import pandas as pd
import numpy as np
import requests
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

set wiki page URL

In [3]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

read webpage and store as 'tables'

In [4]:
# Parse the HTML tables on the page into a list of DataFrames,
# treating row 0 of each table as its header.
tables = pd.read_html(url, header=0)

since there are multiple tables, set a list of the headings of the table we're interested in

In [5]:
# Column names used to recognise the postcode table among the parsed tables.
headings = ['Postcode', 'Borough', 'Neighbourhood']

loop over the tables on the webpage; when a table's headings match the headings we're interested in, we've found our table, so exit the loop

In [6]:
# Find the first table whose leading column names equal `headings`.
# Slicing by len(headings) rather than a fixed 4 keeps the comparison
# well-defined for tables that have more columns than we are looking for.
for table in tables:
    current_headings = list(table.columns[:len(headings)])
    if current_headings == headings:
        break
else:
    # Without this branch `table` would silently stay bound to the last table
    # on the page and every later cell would operate on the wrong data.
    raise ValueError('no table with headings {} was found'.format(headings))

filter out postcodes without boroughs

In [7]:
# Keep only the rows whose Borough is something other than the literal 'Not assigned'.
codes = table[table.Borough != 'Not assigned']

create a new dataframe grouping by postcode+borough combinations and aggregating neighborhoods into a comma-separated list

In [8]:
# One row per (Postcode, Borough) pair; the pair's neighbourhood names are
# joined with ', ' into one string, and to_frame() turns the resulting Series
# back into a single-column DataFrame.
agg = codes.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).to_frame()

reset the index of the aggregated dataframe and save it to a new dataframe

In [9]:
# Move Postcode and Borough out of the index and back into ordinary columns.
agg2 = agg.reset_index()

##### note
I abandoned geocoder because Anaconda couldn't install it correctly

set URL of lat/lon data

In [10]:
# Location of the CSV that supplies the latitude/longitude values.
backup_url = 'http://cocl.us/Geospatial_data'

read file and store lat/lon data as 'geo_data'

In [11]:
# Fetch the CSV and load it into a DataFrame.
geo_data = pd.read_csv(backup_url)

add the latitude and longitude columns to the aggregated and indexed dataframe

In [12]:
# Column assignment aligns on index labels: each agg2 row receives the
# Latitude/Longitude of the geo_data row carrying the same index label.
# NOTE(review): nothing here matches rows by postcode, so this presumes both
# frames list the same postcodes in the same order -- confirm, or merge on the
# postcode column instead.
agg2['Latitude'] = geo_data['Latitude']
agg2['Longitude'] = geo_data['Longitude']

set search radius (in meters) and response limit for the Foursquare API call

note that the radius was increased from 500 to 1000 because the example neighborhood I tried exploring didn't return any venues with a 500 meter radius

In [13]:
radius = 1000  # search radius around each location, in meters
limit = 100  # maximum number of venues requested per API call

define a function to make the Foursquare API calls for each postcode in the dataframe

In [15]:
def getVenues(names, latitudes, longitudes, radius=1000, limit=100):
    """Collect the venues Foursquare recommends around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of every search centre; they are iterated in
        lockstep, so they should have the same length.
    radius : int, default 1000
        Search radius forwarded to the API.
    limit : int, default 100
        Maximum number of venues requested per location. This used to be read
        from a module-level ``limit`` variable; it is now an explicit argument.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns listed in ``columns`` below. When
        no venue is returned at all the frame is empty but still carries
        those columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = [
        'Neighborhood',
        'Neighborhood latitude',
        'Neighborhood longitude',
        'Venue',
        'Venue latitude',
        'Venue longitude',
        'Venue category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        for item in _explore_venues(lat, lng, radius, limit):
            venue = item['venue']
            # A venue may come back without any category; record None instead
            # of failing on categories[0].
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # whereas assigning `.columns` on an empty frame raises a length mismatch.
    return pd.DataFrame(rows, columns=columns)


def _explore_venues(lat, lng, radius, limit):
    """Return the raw venue items of one Foursquare explore request."""
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        # Let requests build and escape the query string.
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        },
        timeout=30)
    # Surface HTTP errors directly rather than as a KeyError on the payload.
    response.raise_for_status()

    groups = response.json()['response'].get('groups') or []
    return groups[0]['items'] if groups else []

invoke the Foursquare API call function for each row in agg2, the dataframe of postcode+borough combinations

In [16]:
# Pass the configured search radius explicitly; previously the `radius`
# setting defined earlier in the notebook never reached getVenues, which fell
# back to its own default.
toronto_venues = getVenues(names=agg2['Neighbourhood'],
                           latitudes=agg2['Latitude'],
                           longitudes=agg2['Longitude'],
                           radius=radius)

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

display the resultant dataframe

In [17]:
# Bare expression: the frame is rendered as this cell's output.
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood latitude,Neighborhood longitude,Venue,Venue latitude,Venue longitude,Venue category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.802008,-79.198080,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Rouge, Malvern",43.806686,-79.194353,Harvey's,43.800020,-79.198307,Restaurant
3,"Rouge, Malvern",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
4,"Rouge, Malvern",43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
...,...,...,...,...,...,...,...
4908,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,Caribbean Heat 2,43.743186,-79.582367,Caribbean Restaurant
4909,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,46 Martingrove North,43.732211,-79.589618,Bus Line
4910,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,Panorama Park,43.747021,-79.583497,Park
4911,Northwest,43.706748,-79.594054,Tim Hortons,43.714657,-79.593716,Coffee Shop


one-hot encode the venue values

In [18]:
# One indicator column per venue category; the empty prefix makes each column
# name equal to the category name itself.
onehot = pd.get_dummies(toronto_venues[['Venue category']], prefix='', prefix_sep='')

# A venue category that is itself called "Neighborhood" would be overwritten
# by the label column added below, and the reordering step would then move an
# unrelated category column to the front. Move such a column aside first.
if 'Neighborhood' in onehot.columns:
    onehot = onehot.rename(columns={'Neighborhood': 'Neighborhood (venue category)'})

# Append the neighborhood label, then rotate it to be the first column.
onehot['Neighborhood'] = toronto_venues['Neighborhood']
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

group the results by neighborhood and take the mean of each one-hot encoded venue category to measure how frequently each venue type appears in each neighborhood

In [19]:
# Average the indicator columns within each neighborhood, then turn the
# group key back into a regular 'Neighborhood' column.
grouped = onehot.groupby('Neighborhood').mean().reset_index()

write out the top five venue categories for each neighborhood

In [20]:
top_what = 5

# Walk the rows once, keyed by neighborhood name, instead of re-filtering the
# whole frame for every neighborhood. Selecting the label column by name also
# removes the assumption that it is the first column.
for hood, freqs in grouped.set_index('Neighborhood').iterrows():
    print('----'+hood+'----')
    # Build the small venue/freq table in one chain; the earlier version
    # assigned into an iloc slice, which can raise SettingWithCopyWarning.
    temp = (freqs.astype(float)
                 .round(2)
                 .sort_values(ascending=False)
                 .head(top_what)
                 .rename_axis('venue')
                 .reset_index(name='freq'))
    print(temp)
    print('\n')

----Adelaide, King, Richmond----
         venue  freq
0         Café  0.06
1  Coffee Shop  0.06
2        Hotel  0.05
3      Theater  0.04
4   Restaurant  0.04


----Agincourt----
                  venue  freq
0    Chinese Restaurant  0.13
1         Shopping Mall  0.04
2            Restaurant  0.04
3                Bakery  0.04
4  Caribbean Restaurant  0.04


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                venue  freq
0  Chinese Restaurant  0.14
1                Park  0.07
2        Noodle House  0.07
3         Pizza Place  0.07
4              Bakery  0.07


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0           Pizza Place  0.20
1         Grocery Store  0.15
2  Fast Food Restaurant  0.05
3        Discount Store  0.05
4                  Park  0.05


----Alderwood, Long Branch----
            venue  freq
0        Pharmacy  0.12
1  Discount Store  0.12
2

4           Coffee Shop  0.06


----Guildwood, Morningside, West Hill----
                  venue  freq
0           Pizza Place  0.14
1  Fast Food Restaurant  0.09
2           Coffee Shop  0.09
3     Food & Drink Shop  0.05
4             Juice Bar  0.05


----Harbord, University of Toronto----
                           venue  freq
0                           Café  0.09
1  Vegetarian / Vegan Restaurant  0.05
2                            Bar  0.05
3                         Bakery  0.05
4                    Coffee Shop  0.04


----Harbourfront----
         venue  freq
0  Coffee Shop  0.15
1         Café  0.05
2         Park  0.04
3      Theater  0.04
4        Diner  0.04


----Harbourfront East, Toronto Islands, Union Station----
                venue  freq
0         Coffee Shop  0.07
1                Café  0.05
2               Hotel  0.05
3          Restaurant  0.05
4  Italian Restaurant  0.04


----High Park, The Junction South----
                venue  freq
0                Café  0.0

               venue  freq
0   Ramen Restaurant  0.06
1        Coffee Shop  0.06
2        Pizza Place  0.05
3  Korean Restaurant  0.05
4    Bubble Tea Shop  0.05


----Willowdale West----
                         venue  freq
0                     Pharmacy  0.15
1                       Bakery  0.08
2                  Coffee Shop  0.08
3  Eastern European Restaurant  0.08
4                     Bus Line  0.08


----Woburn----
                venue  freq
0                Park   0.2
1         Coffee Shop   0.2
2  Chinese Restaurant   0.1
3   Mobile Phone Shop   0.1
4   Indian Restaurant   0.1


----Woodbine Gardens, Parkview Hill----
            venue  freq
0     Pizza Place  0.10
1         Brewery  0.10
2      Restaurant  0.05
3            Café  0.05
4  Breakfast Spot  0.05


----Woodbine Heights----
            venue  freq
0            Park  0.10
1     Coffee Shop  0.10
2    Skating Rink  0.07
3  Sandwich Place  0.07
4     Pizza Place  0.07


----York Mills West----
          venue  freq


define a function to calculate the most common venues in each neighborhood

In [21]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``, best first.

    The first element of ``row`` is skipped; the remaining entries are ordered
    by descending value and the first ``num_top_venues`` index labels are
    returned as a NumPy array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index
    return ranked_labels.values[:num_top_venues]

invoke the function and create a dataframe of the top ten venue categories in each neighborhood

In [22]:
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also have
    # hidden unrelated errors (and KeyboardInterrupt).
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = grouped['Neighborhood']

# Fill the ranked-venue columns one neighborhood (row) at a time.
for ind in np.arange(grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Hotel,Theater,Restaurant,Sushi Restaurant,Bakery,Pizza Place,Breakfast Spot,Seafood Restaurant
1,Agincourt,Chinese Restaurant,Caribbean Restaurant,Shopping Mall,Restaurant,Bakery,Park,Indian Restaurant,Print Shop,Clothing Store,Shanghai Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,Bakery,Park,Pizza Place,Fast Food Restaurant,Noodle House,Event Space,Malay Restaurant,Shopping Mall,Shop & Service
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Pizza Place,Grocery Store,Pharmacy,Sandwich Place,Discount Store,Liquor Store,Beer Store,Park,Fried Chicken Joint,Japanese Restaurant
4,"Alderwood, Long Branch",Discount Store,Pharmacy,Pizza Place,Coffee Shop,Moroccan Restaurant,Skating Rink,Shopping Mall,Liquor Store,Donut Shop,Sandwich Place


perform k-means clustering on the dataframe to see which neighborhoods are similar

In [23]:
# Number of clusters to fit.
kclusters = 5
# Feature matrix: every frequency column, with the label column dropped by
# name. The positional axis argument (`drop('Neighborhood', 1)`) is rejected
# by pandas 2.x; the `columns=` keyword is accepted by old and new versions.
grouped_clustering = grouped.drop(columns='Neighborhood')
# random_state is fixed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

In [24]:
# Peek at the cluster labels assigned to the first ten rows.
kmeans.labels_[0:10]

array([2, 0, 0, 0, 0, 2, 2, 2, 2, 2])

add the cluster labels as a column in the dataframe

In [25]:
# insert() raises if the column already exists, which made this cell fail when
# it was run a second time; discard a stale copy of the column first.
if 'Cluster labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster labels')
neighborhoods_venues_sorted.insert(0, 'Cluster labels', kmeans.labels_)

# Attach the cluster label and ranked venue columns to each postcode row by
# matching agg2's 'Neighbourhood' against the venue table's 'Neighborhood'.
# join() is a left join by default, so postcodes without a match get NaN.
merged = agg2.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0.0,Fast Food Restaurant,Trail,Coffee Shop,Arts & Crafts Store,Chinese Restaurant,Bakery,Caribbean Restaurant,Fruit & Vegetable Store,Paper / Office Supplies Store,Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,2.0,Breakfast Spot,Playground,Burger Joint,Park,Italian Restaurant,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0,Pizza Place,Fast Food Restaurant,Coffee Shop,Bank,Restaurant,Sports Bar,Food & Drink Shop,Liquor Store,Supermarket,Beer Store
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Park,Coffee Shop,Chinese Restaurant,Indian Restaurant,Fast Food Restaurant,Business Service,Pharmacy,Mobile Phone Shop,Deli / Bodega,Falafel Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Bakery,Coffee Shop,Pharmacy,Gas Station,Indian Restaurant,Sporting Goods Shop,Bank,Fried Chicken Joint,Lounge,Yoga Studio


examine the neighborhoods of each cluster

In [26]:
# Rows whose cluster label is 4, showing column 1 plus every column from
# position 5 onward.
merged.loc[merged['Cluster labels'] == 4, merged.columns[[1] + list(range(5, merged.shape[1]))]]

Unnamed: 0,Borough,Cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,North York,4.0,Vietnamese Restaurant,Thai Restaurant,Baseball Field,Yoga Studio,Event Space,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant


check for null cluster values as the postcode list and lat/lon data may disagree, thus a neighborhood may not have Foursquare data

In [27]:
# Rows that received no cluster label from the join.
merged[merged['Cluster labels'].isnull()]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,,,,,,,,,,,


drop rows with nulls

In [37]:
# Drop every row containing a null in any column.
# Note: this rebinds `merged`, so the unfiltered frame is gone after this cell.
merged = merged.dropna()

In [38]:
# Re-run the null check; an empty result confirms the rows were removed.
merged[merged['Cluster labels'].isnull()]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


cast cluster labels to integers

In [39]:
# astype() with a column mapping returns a new frame, so nothing is assigned
# into the result of dropna() and no SettingWithCopyWarning is raised.
merged = merged.astype({'Cluster labels': int})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


define a map object located on Toronto's lat/lon coordinates

In [41]:
# Base map centred on the given [latitude, longitude] at zoom level 10.
map_clusters = folium.Map(location=[43.65, -79.38], zoom_start=10)

set a color palette based on the number of clusters

In [42]:
# Build one hex colour per cluster by sampling the rainbow colormap evenly.
x = np.arange(kclusters)
# `ys` has kclusters entries; only its length is used below.
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
# Convert each sampled colour to a hex string.
rainbow = [colors.rgb2hex(i) for i in colors_array]

create a list of map markers using each neighborhood's cluster label

In [43]:
# Kept from the original cell; nothing below appends to it.
markers_colors = []
# Add one circle marker per postcode row, coloured by its cluster label.
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Neighbourhood'], merged['Cluster labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so they index the palette directly.
    # `cluster-1` only worked because label 0 wrapped around to rainbow[-1].
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

display the map

In [44]:
# Bare expression: the map is rendered as this cell's output.
map_clusters