In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
import geocoder
import folium
import sklearn
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

I set up two datasets so I can choose to do analysis on all 103 neighborhoods ( _df_geo_all_ ) or only 39 neighborhoods ( _df_geo_tor_only_ ).

In [5]:
df_geo_all = pd.read_pickle("geographical_dataframe.pkl") # Read in saved dataframe from previous notebook

# Select only postal codes with 'Toronto' in the Borough name
df_geo_tor_only = df_geo[df_geo.Borough.str.contains('Toronto')].reset_index(drop = True)

print(f"There are {df_geo_all.shape[0]} postal codes in the overall dataset.")
print(f"There are {df_geo_tor_only.shape[0]} postal codes in the Toronto dataset.")

There are 103 postal codes in the overall dataset.
There are 39 postal codes in the Toronto dataset.


Visualization of the overall dataset (top) and the Toronto only dataset (bottom).

In [38]:
# Used trial and error to determine the best center coordinates for the following maps
map_all_lat = 43.7000
map_all_lon = -79.4200
map_tor_lat = 43.6750
map_tor_lon = -79.3832

In [39]:
map_all = folium.Map(location = [map_all_lat, map_all_lon], zoom_start = 11)

for code, bor, lat, lng in zip(df_geo_all.PostalCode,
                               df_geo_all.Borough,
                               df_geo_all.Latitude,
                               df_geo_all.Longitude):

    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = code,
        color = 'blue',
        fill = True).add_to(map_all)

map_all

In [29]:
map_toronto = folium.Map(location = [map_tor_lat, map_tor_lon], zoom_start = 12)

for code, bor, lat, lng in zip(df_geo_tor_only.PostalCode, 
                               df_geo_tor_only.Borough, 
                               df_geo_tor_only.Latitude, 
                               df_geo_tor_only.Longitude):

    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = code,
        color = 'blue',
        fill = True).add_to(map_toronto)

map_toronto

I use Foursquare to find up to 100 venues within a 500m radius of each postal code. For this analysis, I'm considering the overall dataset (103 postal codes).

In [40]:
with open('foursquare_secrets.json') as f_in: # Load foursquare API credentials
    fsq_secrets = json.load(f_in)

CLIENT_SECRET = fsq_secrets['CLIENT_SECRET']
CLIENT_ID = fsq_secrets['CLIENT_ID']
VERSION = '20180605'
LIMIT = 100

In [43]:
def get_nearby_venues(codes, lats, lons, radius = 500):
    
    # Foursquare API url for venue queries
    base_url = "https://api.foursquare.com/v2/venues/explore"
    
    # Set up query parameters
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '',
        'radius': radius,
        'limit': LIMIT
    } 
    
    results = []
    
    for code, lat, lon in zip(codes, lats, lons):
        
        params['ll'] = str(lat) + ',' + str(lon) # Update latitude and longitude
        query = requests.get(base_url, params = params)
        query_items = query.json()['response']['groups'][0]['items']
        for q_item in query_items:
            new_row = [
                code, # postal code
                lat, # neighborhood latitude
                lon, # neighborhood longitude
                q_item['venue']['name'], # venue name
                q_item['venue']['categories'][0]['name'], # venue category
                q_item['venue']['location']['lat'], # venue latitude
                q_item['venue']['location']['lng'] # venue longitude
            ]
            results.append(new_row) # Create new row for each venue returned 
        
    df_results = pd.DataFrame(results, columns = [
        'PostalCode',
        'nbhood_lat',
        'nbhood_lon',
        'venue_name',
        'categories',
        'venue_lat',
        'venue_lon'
    ])
    
    return df_results

In [44]:
all_nearby = get_nearby_venues(
    df_geo_all['PostalCode'],
    df_geo_all['Latitude'],
    df_geo_all['Longitude']
)

In [61]:
all_nearby.head()

Unnamed: 0,PostalCode,nbhood_lat,nbhood_lon,venue_name,categories,venue_lat,venue_lon
0,M3A,43.753259,-79.329656,Brookbanks Park,Park,43.751976,-79.33214
1,M3A,43.753259,-79.329656,Variety Store,Food & Drink Shop,43.751974,-79.333114
2,M3A,43.753259,-79.329656,Corrosion Service Company Limited,Construction & Landscaping,43.752432,-79.334661
3,M4A,43.725882,-79.315572,Victoria Village Arena,Hockey Arena,43.723481,-79.315635
4,M4A,43.725882,-79.315572,Portugril,Portuguese Restaurant,43.725819,-79.312785


In [76]:
print("{} total venues were returned across {} postal codes in {} categories.".format(
    all_nearby.shape[0],
    len(all_nearby['PostalCode'].unique()),
    len(all_nearby['categories'].unique())
))

2135 total venues were returned across 99 postal codes in 272 categories.


Since the starting dataset had 103 postal codes, here I check what the 4 missing postal codes are and visualize them on a map. They won't be included in the clustering analysis.

In [78]:
returned = all_nearby['PostalCode'].unique()

zero_venues = []
for code in df_geo_all['PostalCode'].unique():
    if code not in returned:
        zero_venues.append(code)
        print(code)

M9A
M2L
M2M
M1X


In [83]:
df_zero_venues = df_geo_all[df_geo_all.PostalCode.isin(zero_venues)]
df_zero_venues # Information about the four postal codes that returned zero values

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
45,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
52,M2M,North York,"Willowdale, Newtonbrook",43.789053,-79.408493
95,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


In [84]:
map_all = folium.Map(location = [map_all_lat, map_all_lon], zoom_start = 11)

for code, bor, lat, lng in zip(df_geo_all.PostalCode,
                               df_geo_all.Borough,
                               df_geo_all.Latitude,
                               df_geo_all.Longitude):
    
    if code in zero_venues:
        color = 'red' # Postal codes with zero returned venues will show up in red
    else:
        color = 'green'

    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = code,
        color = color,
        fill = True).add_to(map_all)

map_all

I format the data to be able to perform k-means clustering.

In [90]:
all_one_hot = pd.get_dummies(all_nearby[['categories']], prefix = "", prefix_sep = "")
print(all_one_hot.shape)

# Add postal code column to one hot dataframe
all_one_hot = pd.concat([all_nearby[['PostalCode']], all_one_hot], axis = 1)
print(all_one_hot.shape)

all_one_hot.head()

(2135, 272)
(2135, 273)


Unnamed: 0,PostalCode,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
all_grouped = all_one_hot.groupby('PostalCode').mean().reset_index() # Aggregate each category over postal code
print(all_grouped.shape)
all_grouped.head()

(99, 273)


Unnamed: 0,PostalCode,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
all_to_cluster = all_grouped.drop(columns = ['PostalCode']) # Remove postal code column
all_to_cluster.head()

Unnamed: 0,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


I create the dataframe _venues_sorted_ which ranks the most common venues seen within the 500m radius of each postal code. 

In [117]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['PostalCode'] = all_grouped['PostalCode']

for ind in np.arange(all_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(all_grouped.iloc[ind, :], num_top_venues)

print(venues_sorted.shape)
venues_sorted.head()

(99, 11)


Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Falafel Restaurant
1,M1C,Home Service,Bar,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio
2,M1E,Electronics Store,Donut Shop,Intersection,Restaurant,Rental Car Location,Bank,Mexican Restaurant,Medical Center,Breakfast Spot,Drugstore
3,M1G,Coffee Shop,Korean BBQ Restaurant,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore
4,M1H,Hakka Restaurant,Gas Station,Bakery,Bank,Thai Restaurant,Athletics & Sports,Caribbean Restaurant,Fried Chicken Joint,Donut Shop,Doner Restaurant


In [137]:
# Takes in a target number of clusters, and calls analysis and plotting functions
def perform_clustering(kclusters, df_cluster = all_to_cluster):
    kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(df_cluster)
    
    w_cluster_labels = pd.concat([pd.DataFrame({'Cluster Labels': kmeans.labels_}), all_grouped], axis = 1)
    cluster_merged = pd.merge(w_cluster_labels[['Cluster Labels', 'PostalCode']], df_geo_all, on = 'PostalCode')
    
    analyze_clusters(kclusters, cluster_merged)
    
    return plot_clusters(kclusters, cluster_merged)

# Displays a dataframe with the most commonly found venues for each cluster
def analyze_clusters(kclusters, cluster_merged):
    df_to_analyze = pd.merge(cluster_merged[['Cluster Labels', 'PostalCode']], venues_sorted, on = 'PostalCode')
    
    display(df_to_analyze.groupby('Cluster Labels').agg(pd.Series.mode).iloc[:, [0, 2, 3, 4, 5]])
    
    return
        
# Plots each postal code as a single point with color corresponding to its k-means assigned cluster
def plot_clusters(kclusters, cluster_merged):
    map_clusters = folium.Map(location = [map_all_lat, map_all_lon], zoom_start = 11)
 
    x = np.arange(kclusters)
    ys = [i + x + (i*x)**2 for i in range(kclusters)]

    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    markers_colors = []
    for lat, lon, code, cluster in zip(cluster_merged['Latitude'], cluster_merged['Longitude'], cluster_merged['PostalCode'], cluster_merged['Cluster Labels']):
        label = folium.Popup(code + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_clusters)

    return map_clusters

In [138]:
perform_clustering(3)

Unnamed: 0_level_0,1st Most Common Venue,3rd Most Common Venue,6th Most Common Venue,8th Most Common Venue,9th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Coffee Shop,Yoga Studio,Diner,Dog Run,Donut Shop
1,"[Baseball Field, Pool]","[Diner, Yoga Studio]","[Diner, Dog Run]","[Distribution Center, Donut Shop]","[Dog Run, Drugstore]"
2,Park,"[Bus Line, Yoga Studio]",Dim Sum Restaurant,Discount Store,Distribution Center


In [139]:
perform_clustering(4)

Unnamed: 0_level_0,1st Most Common Venue,3rd Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Coffee Shop,Yoga Studio,Discount Store,Diner,Distribution Center
1,"[Baseball Field, Pool]","[Diner, Yoga Studio]","[Dim Sum Restaurant, Distribution Center]","[Diner, Dog Run]","[Discount Store, Doner Restaurant]"
2,Park,"[Bus Line, Yoga Studio]",Dessert Shop,Dim Sum Restaurant,Diner
3,Pizza Place,"[Athletics & Sports, Bakery, Chinese Restauran...",Sandwich Place,Coffee Shop,Discount Store


In [140]:
perform_clustering(5)

Unnamed: 0_level_0,1st Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Home Service,Discount Store,Distribution Center,Dog Run,Doner Restaurant
1,Park,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
2,Pizza Place,"[Diner, Electronics Store, Fried Chicken Joint]","[Discount Store, Eastern European Restaurant, ...","[Distribution Center, Dumpling Restaurant, Eas...","[Dog Run, Drugstore, Dumpling Restaurant]"
3,Fast Food Restaurant,Distribution Center,Dog Run,Doner Restaurant,Drugstore
4,Coffee Shop,Diner,Discount Store,Dog Run,Donut Shop
