# Preparing data for t-SNE

Build a feature matrix from the cleaned listings and embed it in 2-D with t-SNE.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import math

In [2]:
# Load the cleaned listings table; the first CSV column is used as the row index.
listings = pd.read_csv(
    './Data/listings_clean.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows of the loaded table.
listings.head(n=5)

Unnamed: 0_level_0,latitude,longitude,host_response_rate,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,security_deposit,...,Amen_Wireless Internet,host_is_superhost,host_identity_verified,host_has_profile_pic,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,weekly_price,monthly_price,is_business_travel_ready
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18628,40.424715,-3.698638,0.6,1.0,2,1.0,0.0,1.0,54.0,100.0,...,1,0,0,1,1,0,0,1,1,0
19864,40.413418,-3.706838,1.0,1.0,2,1.0,0.0,1.0,65.0,250.0,...,1,0,1,1,1,0,0,0,1,0
21512,40.42492,-3.713446,1.0,12.0,2,1.0,0.0,1.0,40.0,100.0,...,1,0,0,1,0,0,0,1,1,0
23021,40.423417,-3.712456,1.0,12.0,10,3.0,4.0,5.0,90.0,300.0,...,1,0,0,1,0,0,0,1,1,0
24805,40.422022,-3.703954,0.83,3.0,3,1.0,0.0,2.0,55.0,200.0,...,1,0,1,1,0,0,0,1,1,0


In [5]:
# Base columns used as clustering features; later cells append more names to this list.
for_cluster = [
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'instant_bookable',
    'is_business_travel_ready',
]

In [6]:
# Collect the columns of `listings` whose name matches the 'review_scores.*' pattern.
reviewScoresColums = listings.filter(regex='review_scores.*').columns
# Append only the names that are not in `for_cluster` yet: a plain `extend` would add
# them again every time this cell is re-run, producing duplicated feature columns.
new_review_columns = [col for col in reviewScoresColums if col not in for_cluster]
for_cluster.extend(new_review_columns)

In [7]:
# Collect the columns of `listings` whose name matches the 'Amen_.*' pattern.
AmenColums = listings.filter(regex='Amen_.*').columns
# Append only the names that are not in `for_cluster` yet: a plain `extend` would add
# them again every time this cell is re-run, producing duplicated feature columns.
new_amenity_columns = [col for col in AmenColums if col not in for_cluster]
for_cluster.extend(new_amenity_columns)

In [8]:
# Categorical columns to one-hot encode, and the prefix given to each one's dummy columns.
categorical_features = ['neighbourhood_cleansed', 'property_type', 'bed_type']
dummy_prefixes = {
    'neighbourhood_cleansed': "Neigh",
    'property_type': "Property",
    'bed_type': "Bed",
}
categorical_subset = listings[categorical_features]
listings_categorical = pd.get_dummies(
    categorical_subset,
    columns=categorical_features,
    prefix=dummy_prefixes,
)

In [9]:
# Build the clustering table: the selected feature columns placed side by side
# with the one-hot encoded categorical columns.
feature_parts = [listings[for_cluster], listings_categorical]
listingsCluster = pd.concat(feature_parts, axis='columns')
listingsCluster.head(n=5)

Unnamed: 0_level_0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,instant_bookable,is_business_travel_ready,review_scores_rating,review_scores_accuracy,...,Property_Other,Property_Serviced apartment,Property_Townhouse,Property_Vacation home,Property_Villa,Bed_Airbed,Bed_Couch,Bed_Futon,Bed_Pull-out Sofa,Bed_Real Bed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18628,40.424715,-3.698638,2,1.0,0.0,1.0,1,0,89.0,9.0,...,0,0,0,0,0,0,0,0,1,0
19864,40.413418,-3.706838,2,1.0,0.0,1.0,1,0,91.0,9.0,...,0,0,0,0,0,0,0,0,1,0
21512,40.42492,-3.713446,2,1.0,0.0,1.0,0,0,79.0,9.0,...,0,0,0,0,0,0,0,0,1,0
23021,40.423417,-3.712456,10,3.0,4.0,5.0,0,0,80.0,8.0,...,0,0,0,0,0,0,0,0,0,1
24805,40.422022,-3.703954,3,1.0,0.0,2.0,0,0,100.0,8.0,...,0,0,0,0,0,0,0,0,0,1


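## t-SNE

The feature table is embedded in two dimensions with t-SNE at several perplexity values (5, 10, 30, 50, 100 and 200) to compare the resulting structure.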

In [10]:
# Plotting setup for the t-SNE section.
# `%pylab inline` populates the interactive namespace from numpy and matplotlib
# (see the message printed below this cell).
%pylab inline
# pandas is imported again here under the same `pd` alias used in the first cell.
import pandas as pd
# plotly's offline module is aliased as `py`, and its graph objects as `go`.
import plotly.offline as py
import plotly.graph_objs as go
# NOTE(review): presumably needed so the later `py.iplot(...)` calls render in the notebook — confirm.
py.init_notebook_mode(connected=True)

Populating the interactive namespace from numpy and matplotlib


In [11]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of `listingsCluster` at perplexity 5.
# t-SNE is stochastic: fixing `random_state` makes the embedding (and therefore the
# plot) reproducible across kernel restarts.
# NOTE(review): the features are passed in unscaled — confirm whether they should be standardised first.
tsne5 = TSNE(n_components=2, perplexity=5, random_state=42)
X_enc5 = tsne5.fit_transform(listingsCluster)
# Scatter plot of the two embedding coordinates, one small marker per row.
py.iplot([go.Scatter(x=X_enc5[:, 0], y=X_enc5[:, 1], mode='markers', marker=dict(size=2))])

In [12]:
# 2-D t-SNE embedding of `listingsCluster` at perplexity 10.
# t-SNE is stochastic: fixing `random_state` makes the embedding (and therefore the
# plot) reproducible across kernel restarts.
tsne10 = TSNE(n_components=2, perplexity=10, random_state=42)
X_enc10 = tsne10.fit_transform(listingsCluster)
# Scatter plot of the two embedding coordinates, one small marker per row.
py.iplot([go.Scatter(x=X_enc10[:, 0], y=X_enc10[:, 1], mode='markers', marker=dict(size=2))])

In [13]:
# 2-D t-SNE embedding of `listingsCluster` at perplexity 30.
# t-SNE is stochastic: fixing `random_state` makes the embedding (and therefore the
# plot) reproducible across kernel restarts.
tsne30 = TSNE(n_components=2, perplexity=30, random_state=42)
X_enc30 = tsne30.fit_transform(listingsCluster)
# Scatter plot of the two embedding coordinates, one small marker per row.
py.iplot([go.Scatter(x=X_enc30[:, 0], y=X_enc30[:, 1], mode='markers', marker=dict(size=2))])

In [14]:
# 2-D t-SNE embedding of `listingsCluster` at perplexity 50.
# t-SNE is stochastic: fixing `random_state` makes the embedding (and therefore the
# plot) reproducible across kernel restarts.
tsne50 = TSNE(n_components=2, perplexity=50, random_state=42)
X_enc50 = tsne50.fit_transform(listingsCluster)
# Scatter plot of the two embedding coordinates, one small marker per row.
py.iplot([go.Scatter(x=X_enc50[:, 0], y=X_enc50[:, 1], mode='markers', marker=dict(size=2))])

In [15]:
# 2-D t-SNE embedding of `listingsCluster` at perplexity 100.
# t-SNE is stochastic: fixing `random_state` makes the embedding (and therefore the
# plot) reproducible across kernel restarts.
tsne100 = TSNE(n_components=2, perplexity=100, random_state=42)
X_enc100 = tsne100.fit_transform(listingsCluster)
# Scatter plot of the two embedding coordinates, one small marker per row.
py.iplot([go.Scatter(x=X_enc100[:, 0], y=X_enc100[:, 1], mode='markers', marker=dict(size=2))])

In [16]:
# 2-D t-SNE embedding of `listingsCluster` at perplexity 200.
# t-SNE is stochastic: fixing `random_state` makes the embedding (and therefore the
# plot) reproducible across kernel restarts.
tsne200 = TSNE(n_components=2, perplexity=200, random_state=42)
X_enc200 = tsne200.fit_transform(listingsCluster)
# Scatter plot of the two embedding coordinates, one small marker per row.
py.iplot([go.Scatter(x=X_enc200[:, 0], y=X_enc200[:, 1], mode='markers', marker=dict(size=2))])