In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import math
from sklearn.metrics import silhouette_samples,silhouette_score
from sklearn import preprocessing
from scipy.spatial import distance
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
import random 
# Seed both generators: kmodes (via scikit-learn's check_random_state) draws its
# initial prototypes from NumPy's global RNG, which random.seed() does not touch.
random.seed(33)
np.random.seed(33)

In [2]:
# Load the cleaned listings table; the first CSV column becomes the row index.
listings = pd.read_csv(
    '../Data/listings_clean.csv',
    index_col=0,
)

In [3]:
# Columns treated as numerical by the scaler and by the clustering distance.
numerical_features = [
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
]

In [4]:
# 0/1 flag columns; later grouped with the categorical columns for clustering.
boolean_features = [
    'instant_bookable',
    'is_business_travel_ready',
]

In [5]:
# String-valued columns that are clustered as categories.
categorical_features = ['neighbourhood_cleansed', 'property_type', 'bed_type']

# One-hot view of the categorical columns.
# NOTE(review): no later cell visible in this notebook reads listings_categorical.
listings_categorical = pd.get_dummies(
    listings[categorical_features],
    columns=categorical_features,
    prefix=["Neigh", "Property", "Bed"],
)

In [6]:
# Assemble the clustering table: numerical, then categorical, then boolean columns.
listingsCluster = pd.concat(
    [listings[group] for group in (numerical_features, categorical_features, boolean_features)],
    axis=1,
)

In [7]:
# Preview the first five rows of the clustering table.
listingsCluster.head(n=5)

Unnamed: 0_level_0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,neighbourhood_cleansed,property_type,bed_type,instant_bookable,is_business_travel_ready
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18628,40.424715,-3.698638,2,1.0,0.0,1.0,Justicia,Apartment,Pull-out Sofa,1,0
19864,40.413418,-3.706838,2,1.0,0.0,1.0,Embajadores,Apartment,Pull-out Sofa,1,0
21512,40.42492,-3.713446,2,1.0,0.0,1.0,Argüelles,Apartment,Pull-out Sofa,0,0
23021,40.423417,-3.712456,10,3.0,4.0,5.0,Argüelles,Apartment,Real Bed,0,0
24805,40.422022,-3.703954,3,1.0,0.0,2.0,Universidad,Apartment,Real Bed,0,0


In [8]:
# Missing values per column (isna is the same check as isnull).
listingsCluster.isna().sum()

latitude                    0
longitude                   0
accommodates                0
bathrooms                   0
bedrooms                    0
beds                        0
neighbourhood_cleansed      0
property_type               0
bed_type                    0
instant_bookable            0
is_business_travel_ready    0
dtype: int64

In [9]:
# Positional indices of every column the clustering treats as categorical
# (the string categoricals plus the 0/1 flags).
to_categorical = categorical_features + boolean_features
indCatColumns = list(map(listingsCluster.columns.get_loc, to_categorical))

In [10]:
from sklearn.preprocessing import RobustScaler
# Scaler instance that is fitted on the numerical columns in the next cell.
RS = RobustScaler()

In [11]:
# Overwrite the numerical columns with their RobustScaler-transformed values.
# This mutates listingsCluster in place; `listings` keeps the raw coordinates.
listingsCluster[numerical_features] = RS.fit_transform(listingsCluster[numerical_features])

In [None]:
# Summary statistics after scaling (describe() reports the numeric columns only).
listingsCluster.describe()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,instant_bookable,is_business_travel_ready
count,8586.0,8586.0,8586.0,8586.0,8586.0,8586.0,8586.0,8586.0
mean,0.1526813,0.3135931,0.066562,0.248777,0.479385,0.201025,0.556138,0.063359
std,1.1185,1.670191,0.966366,0.512864,0.957383,0.76655,0.496867,0.243622
min,-5.753218,-12.01677,-1.5,-1.0,-1.0,-0.5,0.0,0.0
25%,-0.4771182,-0.4703298,-0.5,0.0,0.0,-0.5,0.0,0.0
50%,2.397762e-13,-1.997035e-14,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.5228818,0.5296702,0.5,0.0,1.0,0.5,1.0,0.0
max,12.61458,14.25931,6.0,5.0,9.0,7.5,1.0,1.0


In [None]:
! pip install --upgrade kmodes

In [None]:
from kmodes.kprototypes import KPrototypes

In [None]:
# Replace each string categorical column with integer codes from its own LabelEncoder.
for cat_col in categorical_features:
    encoder = preprocessing.LabelEncoder()
    listingsCluster[cat_col] = encoder.fit_transform(listingsCluster[cat_col])

In [None]:
# Plain NumPy matrix handed to k-prototypes; column order follows listingsCluster.
X = listingsCluster.to_numpy()

In [None]:
def distance_dissim(X,Y,gamma,categorical):
    """Mixed-type distance between two rows.

    Positions listed in ``categorical`` are compared by simple matching (one
    unit per mismatch); every other position is cast to float64 and compared
    with the Euclidean distance.

    Parameters
    ----------
    X, Y : sequences of equal length, indexable by position.
    gamma : weight applied to the number of categorical mismatches.
    categorical : collection of positions treated as categorical.

    Returns
    -------
    euclidean(numerical parts) + gamma * (count of categorical mismatches)
    """
    numeric_positions = [pos for pos in range(len(X)) if pos not in categorical]

    def pick(row, positions):
        # Gather the entries of `row` at the requested positions, in order.
        return np.asanyarray([row[pos] for pos in positions])

    x_numeric = pick(X, numeric_positions).astype(np.float64)
    y_numeric = pick(Y, numeric_positions).astype(np.float64)
    mismatches = np.sum(pick(X, categorical) != pick(Y, categorical))

    return distance.euclidean(x_numeric, y_numeric) + gamma * mismatches

In [None]:
# Elbow method: for k = 1..9 fit k-prototypes and record the mean mixed distance
# from every listing to its nearest prototype.
meandist = []
clusters = range(1, 10)

n_features = X.shape[1]
numeric_cols = [col for col in range(n_features) if col not in indCatColumns]

for k in clusters:
    # random_state pins the initial prototypes so the curve is reproducible.
    model = KPrototypes(n_clusters=k, init='Huang', n_init=1, verbose=2, random_state=33)
    model_fit = model.fit(X, categorical = indCatColumns)
    clusassign = model_fit.predict(X, categorical = indCatColumns)

    # Older kmodes releases expose cluster_centroids_ as [numerical, categorical];
    # newer ones return a single 2-D array with the numerical columns first.
    prototypes = model_fit.cluster_centroids_
    if not (isinstance(prototypes, np.ndarray) and prototypes.ndim == 2):
        prototypes = np.hstack(prototypes)

    # Put every prototype value back at the column position it has in X, so the
    # distance compares like with like whatever the column order of X is.
    centroids = np.empty((k, n_features), dtype=np.float64)
    centroids[:, numeric_cols] = prototypes[:, :len(numeric_cols)]
    centroids[:, indCatColumns] = prototypes[:, len(numeric_cols):]

    gamma_k = model_fit.gamma
    nearest = cdist(
        X,
        centroids,
        metric=lambda row, proto: distance_dissim(row, proto, gamma=gamma_k, categorical=indCatColumns),
    ).min(axis=1)
    meandist.append(nearest.sum() / listingsCluster.shape[0])

In [None]:
# Elbow curve: mean distance to the nearest prototype against the number of clusters.
fig, ax = plt.subplots()
ax.plot(clusters, meandist)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Average distance")
ax.set_title("Selecting k with the Elbow method")
plt.show()

In [None]:
#According to the Elbow method, 3 clusters looks like a reasonable choice; 5 and 8 clusters are also candidates

In [None]:
# Fit the 3-cluster k-prototypes model and attach its labels to the listings table.
# random_state pins the initial prototypes; without it every re-run of this cell
# can yield a different partition (random.seed() does not reach kmodes).
model3 = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=2, random_state=33)
model_fit3 = model3.fit(X, categorical = indCatColumns)
clusassign3= model_fit3.predict(X, categorical = indCatColumns)
listings['Cluster3'] = model_fit3.labels_

In [None]:
# Listings coloured by their 3-cluster label (raw, unscaled coordinates).
# np.float was removed in NumPy 1.24; the builtin float is the drop-in replacement.
plt.scatter(listings['latitude'], listings['longitude'], c = model_fit3.labels_.astype(float))
plt.show()

In [None]:
# Fit the 5-cluster k-prototypes model and attach its labels to the listings table.
# random_state pins the initial prototypes; without it every re-run of this cell
# can yield a different partition (random.seed() does not reach kmodes).
model5 = KPrototypes(n_clusters=5, init='Huang', n_init=1, verbose=2, random_state=33)
model_fit5 = model5.fit(X, categorical = indCatColumns)
clusassign5= model_fit5.predict(X, categorical = indCatColumns)
listings['Cluster5'] = model_fit5.labels_

In [None]:
# Listings coloured by their 5-cluster label (raw, unscaled coordinates).
# np.float was removed in NumPy 1.24; the builtin float is the drop-in replacement.
plt.scatter(listings['latitude'], listings['longitude'], c = model_fit5.labels_.astype(float))
plt.show()

In [None]:
# Fit the 8-cluster k-prototypes model and attach its labels to the listings table.
# random_state pins the initial prototypes; without it every re-run of this cell
# can yield a different partition (random.seed() does not reach kmodes).
model8 = KPrototypes(n_clusters=8, init='Huang', n_init=1, verbose=2, random_state=33)
model_fit8 = model8.fit(X, categorical = indCatColumns)
clusassign8= model_fit8.predict(X, categorical = indCatColumns)
listings['Cluster8'] = model_fit8.labels_

In [None]:
# Listings coloured by their 8-cluster label (raw, unscaled coordinates).
# np.float was removed in NumPy 1.24; the builtin float is the drop-in replacement.
plt.scatter(listings['latitude'], listings['longitude'], c = model_fit8.labels_.astype(float))
plt.show()

In [None]:
# Mean silhouette of the 3-cluster solution under the same mixed distance,
# weighted with the gamma stored on the fitted 3-cluster model.
labels = model_fit3.labels_
gamma = model_fit3.gamma
Silhoutte3 = silhouette_score(
    X,
    labels,
    metric=lambda row_a, row_b: distance_dissim(row_a, row_b, gamma=gamma, categorical=indCatColumns),
)
print(Silhoutte3)

In [None]:
# Mean silhouette of the 5-cluster solution under the same mixed distance,
# weighted with the gamma stored on the fitted 5-cluster model.
labels = model_fit5.labels_
gamma = model_fit5.gamma
Silhoutte5 = silhouette_score(
    X,
    labels,
    metric=lambda row_a, row_b: distance_dissim(row_a, row_b, gamma=gamma, categorical=indCatColumns),
)
print(Silhoutte5)

In [None]:
# Mean silhouette of the 8-cluster solution under the same mixed distance.
# The score is stored in Silhoutte8: the original assigned it to Silhoutte5
# (clobbering the k=5 score) and then printed an undefined name.
labels = model_fit8.labels_
gamma = model_fit8.gamma
Silhoutte8 = silhouette_score(X, labels, metric= lambda X,Y: distance_dissim(X,Y,gamma = gamma,categorical = indCatColumns))
print(Silhoutte8)

In [None]:
#The Silhouette coefficient should be close to 1 when the clusters are well separated, so these do not appear to be good clusters
#Even so, we test the stability of the clusters by recalculating them with K=3 (highest coefficient)

In [None]:
# Numerical part of the 3-cluster prototypes, projected onto two principal components.
# Older kmodes releases expose cluster_centroids_ as [numerical, categorical]; newer
# ones return one 2-D array with the numerical columns first, so handle both layouts.
_prototypes3 = model_fit3.cluster_centroids_
if isinstance(_prototypes3, np.ndarray) and _prototypes3.ndim == 2:
    centroids = _prototypes3[:, :X.shape[1] - len(indCatColumns)].astype(np.float64)
else:
    centroids = np.asarray(_prototypes3[0], dtype=np.float64)
centroids_PCA = PCA(n_components=2).fit_transform(centroids)

In [None]:
# Entry 0 of each list holds the projected coordinates of the three reference
# centroids; later runs are appended after it.
x_coord = [[centroids_PCA[i][0] for i in range(3)]]
y_coord = [[centroids_PCA[i][1] for i in range(3)]]

In [None]:
# Stability check: refit k=3 nine times from different (but fixed) seeds and project
# every run's numerical prototypes with ONE PCA fitted on the reference model
# (model_fit3). Fitting a fresh PCA per run would place each run in its own basis,
# so their coordinates could not be compared on a single plot.
def _numeric_prototypes(fitted):
    """Return the numerical part of a fitted KPrototypes model's prototypes."""
    prototypes = fitted.cluster_centroids_
    if isinstance(prototypes, np.ndarray) and prototypes.ndim == 2:
        # Newer kmodes: single 2-D array, numerical columns first.
        return prototypes[:, :X.shape[1] - len(indCatColumns)].astype(np.float64)
    # Older kmodes: [numerical, categorical] pair.
    return np.asarray(prototypes[0], dtype=np.float64)

reference_pca = PCA(n_components=2).fit(_numeric_prototypes(model_fit3))

for k in range(1,10):
    # A distinct seed per run keeps the runs different yet reproducible.
    model = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=False, random_state=k)
    model_fit = model.fit(X, categorical = indCatColumns)
    centroids = _numeric_prototypes(model_fit)
    centroids_PCA = reference_pca.transform(centroids)
    x_coord.append(list(centroids_PCA[:, 0]))
    y_coord.append(list(centroids_PCA[:, 1]))

In [None]:
# One format string per run (entry 0 is the reference fit); each combines a colour
# with the 'o' marker. Several colours repeat, so some runs share a colour.
run_formats = ['ro', 'bo', 'yo', 'go', 'co', 'mo', 'ro', 'bo', 'go', 'go']
for run, fmt in enumerate(run_formats):
    plt.plot(x_coord[run], y_coord[run], fmt)
plt.show()

In [None]:
# Show the first-component coordinates of every run for a numeric comparison.
x_coord

In [None]:
#We can see that although the Silhouette coefficient is not high, the cluster centroids are stable 
#(at least on the numerical variables), as we get almost the same ones after running the algorithm multiple times

In [None]:
#We export the listings with their cluster labels in order to study the clusters in another tool.
#The listing id lives in the DataFrame index (the table was read with index_col=0),
#so the index is written as well; index=False would drop the ids from the export.
listings.to_csv('../Data/listings_cluster.csv', index=True)