# Toronto Clustering Exercise


Here we will try to cluster the above Boroughs according to their location

## Load libraries

First lets make sure we have a number of libraries loaded

In [2]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Libraries imported.


## Load some airbnb data

This is available from (tomslee.net)[http://tomslee.net/airbnb-data-collection-get-the-data] but I've made a local copy to make it easier.

In [5]:
import pandas as pd
df_airbnb = pd.read_csv('tomslee_airbnb_toronto_1428_2017-07-10.csv')
df_airbnb.head(2)

Unnamed: 0,room_id,survey_id,host_id,room_type,country,city,borough,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay,last_modified,latitude,longitude,location
0,19767654,1428,70082317,Entire home/apt,,Toronto,,Niagara (82),0,0.0,3,2.0,,96.0,,2017-07-10 14:31:07.859320,43.638055,-79.401659,0101000020E61000003849F3C7B4D953C04F0647C9ABD1...
1,11521420,1428,7994690,Entire home/apt,,Toronto,,Woburn (137),0,0.0,4,2.0,,100.0,,2017-07-10 14:30:30.217267,43.780728,-79.246416,0101000020E6100000834D9D47C5CF53C0278925E5EEE3...


Take the room_type and neighbourhood, normalize (removng the neighborhood ids) and then onehot the type to make something we can KMeans

In [6]:
df_airbnb_use = df_airbnb[['room_type','neighborhood','latitude','longitude']]
df_airbnb_norm = df_airbnb_use.replace({r'(.*)\(.*\).*' : r'\1'}, regex=True)
df_airbnb_onehot = pd.get_dummies(df_airbnb_norm[['room_type']], prefix="", prefix_sep="")
df_airbnb_onehot['latitude'] = df_airbnb_norm['latitude']
df_airbnb_onehot['longitude'] = df_airbnb_norm['longitude']
df_airbnb_onehot['neighborhood'] = df_airbnb_norm['neighborhood']

# move neighborhood column to the first column
fixed_columns = [df_airbnb_onehot.columns[-1]] + list(df_airbnb_onehot.columns[:-1])
df_airbnb_onehot = df_airbnb_onehot[fixed_columns]

df_airbnb_onehot.head()

Unnamed: 0,neighborhood,Entire home/apt,Private room,Shared room,latitude,longitude
0,Niagara,1,0,0,43.638055,-79.401659
1,Woburn,1,0,0,43.780728,-79.246416
2,Willowdale East,1,0,0,43.778519,-79.414452
3,West Humber-Clairville,1,0,0,43.725403,-79.616909
4,Newtonbrook West,1,0,0,43.785733,-79.437599


Now group the results. We want the mean of everything, even latitude and longitude

In [7]:
df_airbnb_grouped = df_airbnb_onehot.groupby('neighborhood').mean().reset_index()
df_airbnb_grouped.head(2)

Unnamed: 0,neighborhood,Entire home/apt,Private room,Shared room,latitude,longitude
0,Agincourt North,0.2,0.733333,0.066667,43.809252,-79.26724
1,Agincourt South-Malvern West,0.269841,0.730159,0.0,43.78943,-79.27971


In [8]:
kclusters = 4
airbnb_clustering = df_airbnb_grouped.drop('neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(airbnb_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 3, 1, 1, 0, 0, 1, 0, 3, 1], dtype=int32)

## Now map the results

The above 4 clusters should be sufficient for display, but first we need to add back to the neighborhoods and the coordinates for the (average) neighborhood location.

In [9]:
df_airbnb_grouped['Latitude']=0.0
df_airbnb_grouped['Longitude']=0.0
df_airbnb_grouped['Cluster']=0

# Add the coordinates
for index, row in df_airbnb_grouped.iterrows():
    df_airbnb_grouped.at[index,'Cluster'] = kmeans.labels_[index]

df_airbnb_grouped.head()

Unnamed: 0,neighborhood,Entire home/apt,Private room,Shared room,latitude,longitude,Latitude,Longitude,Cluster
0,Agincourt North,0.2,0.733333,0.066667,43.809252,-79.26724,0.0,0.0,3
1,Agincourt South-Malvern West,0.269841,0.730159,0.0,43.78943,-79.27971,0.0,0.0,3
2,Alderwood,0.882353,0.117647,0.0,43.605245,-79.542239,0.0,0.0,1
3,Annex,0.716867,0.26506,0.018072,43.670808,-79.402416,0.0,0.0,1
4,Banbury-Don Mills,0.459459,0.513514,0.027027,43.735718,-79.346191,0.0,0.0,0


In [11]:
import numpy as np

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for index, row in df_airbnb_grouped.iterrows():
    cluster = row['Cluster']
    label = folium.Popup(str(row['neighborhood']) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [row['latitude'], row['longitude']],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

And we can conclude that there is little geographical relationship to the natural clustering of the AirBnB room types