# Preparing data for clustering

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import math
import random 
# Seed both the stdlib RNG and NumPy's global RNG. The clustering and
# embedding libraries used below (kmodes, scikit-learn) draw from NumPy's
# global generator when no random_state is given, so seeding only `random`
# would leave their results non-reproducible.
random.seed(33)
np.random.seed(33)

In [2]:
# Load the cleaned listings table; the first CSV column is used as the index.
listings = pd.read_csv(
    './Data/listings_clean.csv',
    index_col=0,
)

In [3]:
# Base set of numerical columns; the review-score columns are appended in the
# following cell.
numerical_features = [
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
]

In [4]:
# Every column whose name matches 'review_scores.*' is treated as numerical.
reviewScoresColums = listings.filter(regex='review_scores.*').columns
# Append only names that are not already present, so re-running this cell
# does not duplicate entries in numerical_features (a plain extend() would,
# and duplicated labels break the later column selections/assignments).
numerical_features.extend(
    [col for col in reviewScoresColums if col not in numerical_features]
)

In [5]:
# Base set of boolean columns; the 'Amen_*' amenity flags are appended in the
# following cell.
boolean_features = [
    'instant_bookable',
    'is_business_travel_ready',
]

In [6]:
# Every column whose name matches 'Amen_.*' is treated as a boolean feature.
AmenColums = listings.filter(regex='Amen_.*').columns
# Append only names that are not already present, so re-running this cell
# does not duplicate entries in boolean_features (a plain extend() would).
boolean_features.extend(
    [col for col in AmenColums if col not in boolean_features]
)

In [7]:
# Nominal columns, plus a one-hot encoded copy of them in which each source
# column gets its own short prefix.
categorical_features = ['neighbourhood_cleansed', 'property_type', 'bed_type']
listings_categorical = pd.get_dummies(
    listings[categorical_features],
    columns=categorical_features,
    prefix=["Neigh", "Property", "Bed"],
)

In [8]:
# Clustering input: numerical, then raw categorical, then boolean columns,
# joined side by side. Column positions are looked up by name afterwards.
listingsCluster = pd.concat(
    [
        listings[numerical_features],
        listings[categorical_features],
        listings[boolean_features],
    ],
    axis=1,
)

In [9]:
# Cast the boolean columns to object dtype. These columns are listed among the
# categorical columns (to_categorical) passed to the clustering step below.
listingsCluster[boolean_features]=listingsCluster[boolean_features].astype(object)

In [10]:
# Names of all columns to be handled as categorical, and their integer
# positions inside listingsCluster (passed as `categorical=` to fit_predict).
to_categorical = categorical_features + boolean_features
indCatColumns = list(map(listingsCluster.columns.get_loc, to_categorical))

In [11]:
# Scaler instance applied to the numerical columns in the next cell.
# With its default settings RobustScaler centres on the median and scales by
# the interquartile range.
from sklearn.preprocessing import RobustScaler
RS = RobustScaler()

In [12]:
# Fit the scaler on the numerical columns and overwrite them in
# listingsCluster with the scaled values (the assignment targets
# listingsCluster only).
listingsCluster[numerical_features] = RS.fit_transform(listingsCluster[numerical_features])
# Show the first rows as a quick check of the scaled frame.
listingsCluster.head()

Unnamed: 0_level_0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,...,Amen_Waterfront,Amen_Well-lit path to entrance,Amen_Wheelchair accessible,Amen_Wide clearance to bed,Amen_Wide clearance to shower,Amen_Wide doorway,Amen_Wide entryway,Amen_Wide hallway clearance,Amen_Window guards,Amen_Wireless Internet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18628,0.46635,0.29356,-1.0,0.0,-1.0,-0.5,-0.625,-1.0,-1.0,-1.0,...,0,1,0,0,0,0,0,0,0,1
19864,-0.297475,-0.4455,-1.0,0.0,-1.0,-0.5,-0.375,-1.0,-1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
21512,0.480205,-1.041027,-1.0,0.0,-1.0,-0.5,-1.875,-1.0,-2.0,-1.0,...,0,0,1,0,0,0,0,0,0,1
23021,0.378545,-0.951813,3.0,2.0,3.0,1.5,-1.75,-2.0,-2.0,-1.0,...,0,0,1,0,0,0,0,0,0,1
24805,0.28424,-0.185544,-0.5,0.0,-1.0,0.0,0.75,-2.0,-2.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [18]:
! pip install --upgrade kmodes

Requirement already up-to-date: kmodes in /home/dsc/anaconda3/lib/python3.5/site-packages
Collecting numpy>=1.10.4 (from kmodes)
  Downloading https://files.pythonhosted.org/packages/7b/61/11b05cc37ccdaabad89f04dbdc2a02905cf6de6f9b05816dba843beed328/numpy-1.14.3-cp35-cp35m-manylinux1_x86_64.whl (12.1MB)
[K    100% |████████████████████████████████| 12.1MB 27kB/s  eta 0:00:01   62% |████████████████████            | 7.6MB 9.3MB/s eta 0:00:01
[?25hRequirement already up-to-date: scikit-learn<0.20.0,>=0.19.0 in /home/dsc/anaconda3/lib/python3.5/site-packages (from kmodes)
Collecting scipy>=0.13.3 (from kmodes)
  Downloading https://files.pythonhosted.org/packages/51/3d/494e1a81121c12233cb2f511e31b0dae3944008c81bbfa0218ec2d0038a8/scipy-1.0.1-cp35-cp35m-manylinux1_x86_64.whl (49.6MB)
[K    100% |████████████████████████████████| 49.7MB 5.7kB/s eta 0:00:01  5% |█▊                              | 2.6MB 9.2MB/s eta 0:00:06    35% |███████████▍                    | 17.6MB 19.6MB/s eta 0:00:02

In [19]:
! pip install --upgrade pip

Collecting pip
  Downloading https://files.pythonhosted.org/packages/0f/74/ecd13431bcc456ed390b44c8a6e917c1820365cbebcb6a8974d1cd045ab4/pip-10.0.1-py2.py3-none-any.whl (1.3MB)
[K    100% |████████████████████████████████| 1.3MB 217kB/s ta 0:00:011
[?25hInstalling collected packages: pip
  Found existing installation: pip 9.0.2
    Uninstalling pip-9.0.2:
      Successfully uninstalled pip-9.0.2
Successfully installed pip-10.0.1


In [13]:
from kmodes.kprototypes import KPrototypes
# NumPy array of the frame's values; the positions stored in indCatColumns
# refer to the columns of this array.
X = listingsCluster.values

In [14]:
# Cluster the mixed numerical/categorical matrix with k-prototypes into
# 5 clusters and keep the label assigned to each row.
# random_state pins the (otherwise random) initialisation so the labels are
# reproducible across runs; 33 matches the seed set at the top of the notebook.
# NOTE: the variable name says "10" although n_clusters is 5; the name is kept
# because later cells reference it.
clustersPrototypes10 = KPrototypes(
    n_clusters=5,
    init='Huang',
    n_init=1,
    verbose=2,
    random_state=33,
).fit_predict(X, categorical=indCatColumns)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 3662, ncost: 118409.44763413515
Run: 1, iteration: 2/100, moves: 2062, ncost: 110778.5306904483
Run: 1, iteration: 3/100, moves: 731, ncost: 109094.46659636685
Run: 1, iteration: 4/100, moves: 380, ncost: 108847.83873440792
Run: 1, iteration: 5/100, moves: 206, ncost: 108807.73093671104
Run: 1, iteration: 6/100, moves: 69, ncost: 108798.70128860779
Run: 1, iteration: 7/100, moves: 44, ncost: 108794.6661356242
Run: 1, iteration: 8/100, moves: 26, ncost: 108794.01414962336
Run: 1, iteration: 9/100, moves: 7, ncost: 108793.94201098538
Run: 1, iteration: 10/100, moves: 1, ncost: 108793.93973100351
Run: 1, iteration: 11/100, moves: 0, ncost: 108793.93973100351


In [15]:
# Attach the cluster label of each row to the original listings frame
# (labels are assigned positionally, in row order).
listings['Cluster'] = clustersPrototypes10

In [128]:
# Persist the listings, including the new 'Cluster' column and the index.
listings.to_csv(
    './Data/listingsCluster.csv',
    index=True,
)

In [16]:
# One-hot encode the categorical columns (one prefix per source column) for
# use in the numeric-only feature matrix built next.
listings_categorical = pd.get_dummies(
    listings[categorical_features],
    columns=categorical_features,
    prefix=["Neigh", "Property", "Bed"],
)

In [20]:
# Numeric-only feature matrix for the embedding: scaled numerical columns,
# boolean flags and one-hot encoded categoricals.
# The numerical columns are passed through the already-fitted RobustScaler so
# that the embedding works in the same scaled space the clusters were computed
# in; taking listings[numerical_features] directly would feed raw, unscaled
# values and let large-range columns dominate the distances.
listingsCluster = pd.concat(
    [
        pd.DataFrame(
            RS.transform(listings[numerical_features]),
            index=listings.index,
            columns=numerical_features,
        ),
        listings[boolean_features],
        listings_categorical,
    ],
    axis=1,
)

In [18]:
%pylab inline
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [21]:
# Inputs for the embedding: X is the feature frame (this rebinds the name X
# used earlier for the clustering array) and y is the cluster label of each
# listing, used to colour the points.
X = listingsCluster
y = listings['Cluster']

In [22]:
from sklearn.manifold import TSNE
# 2-D t-SNE embedding of the feature matrix with perplexity 5.
# t-SNE is stochastic: random_state fixes the initialisation so the same
# picture is produced on every run (33 matches the seed used at the top of
# the notebook).
tsne5 = TSNE(n_components=2, perplexity=5, random_state=33)
X_enc5 = tsne5.fit_transform(X)
# Scatter of the two embedding coordinates, one small marker per listing,
# coloured by cluster label.
py.iplot([
    go.Scatter(
        x=X_enc5[:, 0],
        y=X_enc5[:, 1],
        mode='markers',
        marker=dict(color=y, size=2),
    )
])