In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import RobustScaler

In [2]:
# Load the cleaned listings table; the path is relative to the notebook's working directory.
listings = pd.read_csv('../Data/listings_cleanTotal.csv')

In [35]:
# Feature groups used to compare listings.
numerical_features = [
    'latitude', 'longitude', 'accommodates',
    'bathrooms', 'bedrooms', 'minimum_nights',
]
boolean_features = ['instant_bookable', 'is_business_travel_ready', 'Parking', 'Wifi']
categorical_features = ['neighbourhood_cleansed', 'property_type', 'cancellation_policy']

# One-hot version of the categorical columns (not referenced again in the visible cells).
listings_categorical = pd.get_dummies(
    listings[categorical_features],
    columns=categorical_features,
    prefix=["Neigh", "Property", "CancPol"],
)

# Working table: id, room_type, then numerical, categorical and boolean features in that order.
# Later cells rely on this column order when they compute column positions.
listingsCluster = listings[
    ['id', 'room_type'] + numerical_features + categorical_features + boolean_features
]

In [36]:
# One table per room type; room_type is constant inside each table, so it is dropped.
listingsHome = listingsCluster.query("room_type == 'Entire home/apt'").drop(columns=['room_type'])
listingsRoom = listingsCluster.query("room_type == 'Private room'").drop(columns=['room_type'])
listingsSharedRoom = listingsCluster.query("room_type == 'Shared room'").drop(columns=['room_type'])

In [37]:
# Renumber every subset from 0 so row labels coincide with row positions
# (the neighbour indices produced later are positional).
listingsHome, listingsRoom, listingsSharedRoom = (
    subset.reset_index(drop=True)
    for subset in (listingsHome, listingsRoom, listingsSharedRoom)
)

In [87]:
# Columns compared by equality (categorical + boolean) versus by Euclidean distance.
# Positions are relative to the feature columns only, i.e. after skipping the first
# two columns of listingsCluster (id and room_type).
to_categorical = categorical_features + boolean_features
feature_columns = listingsCluster.columns[2:]
indCatColumns = [feature_columns.get_loc(name) for name in to_categorical]
no_categorical = [pos for pos in range(len(feature_columns)) if pos not in indCatColumns]

K nearest neighbors for entire homes/apartments

In [88]:
# Scale the numerical columns of the entire-home subset with a scaler fitted on that subset only.
RS = RobustScaler().fit(listingsHome[numerical_features])
listingsHome[numerical_features] = RS.transform(listingsHome[numerical_features])

In [89]:
# Preview of the scaled table.
# NOTE(review): the saved output below already shows integer-coded categorical columns,
# although the encoding cell comes after this one; it was captured from an out-of-order
# run. On a fresh Run All this preview shows the raw category labels instead.
listingsHome.head()

Unnamed: 0,id,latitude,longitude,accommodates,bathrooms,bedrooms,minimum_nights,neighbourhood_cleansed,property_type,cancellation_policy,instant_bookable,is_business_travel_ready,Parking,Wifi
0,18628,0.464673,0.292232,-1.0,0.0,-1.0,26.0,61,0,2,1,0,0,1
1,19864,-0.297783,-0.445287,-1.0,0.0,-1.0,0.0,48,0,2,1,0,0,1
2,21512,0.478503,-1.039572,-1.0,0.0,-1.0,0.0,16,0,0,0,0,0,1
3,23021,0.377025,-0.950544,3.0,2.0,3.0,0.0,16,0,1,0,0,0,1
4,24805,0.282889,-0.185874,-0.5,0.0,-1.0,1.0,109,0,2,0,0,0,1


In [90]:
# Replace each categorical column by integer codes from a LabelEncoder fitted on that column.
# The codes are only compared for equality by the distance function defined further down.
listingsHome = listingsHome.assign(**{
    column: preprocessing.LabelEncoder().fit_transform(listingsHome[column])
    for column in categorical_features
})

In [114]:
# Feature matrix for entire homes: every column except the listing id, in table order.
X = listingsHome.drop(columns='id').to_numpy()

In [92]:
# gamma is the cost of one categorical mismatch: half the standard deviation
# taken over all numerical entries pooled together (a single scalar, not per column).
Xnum = np.take(X, no_categorical, axis=1)
gamma = np.std(Xnum) / 2

In [93]:
def distance_dissim(X,Y,gamma,categorical):
    """Mixed-type dissimilarity between two feature vectors.

    The result is the Euclidean distance over the non-categorical positions
    plus ``gamma`` times the number of categorical positions whose values differ.

    Parameters
    ----------
    X, Y : 1-D array-like of equal length
        The two feature vectors (lists are accepted as well as arrays).
    gamma : float
        Cost added for every categorical position where X and Y disagree.
    categorical : sequence of int
        Positions in X / Y that are compared by equality instead of by distance.

    Returns
    -------
    float
        ``euclidean(X_num, Y_num) + gamma * count(X_cat != Y_cat)``.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    # This runs once per pair of points, so the split is done with a boolean
    # mask rather than rebuilding a Python list of numeric positions each call.
    is_categorical = np.zeros(X.shape[0], dtype=bool)
    is_categorical[np.asarray(categorical, dtype=int)] = True

    numeric_distance = distance.euclidean(X[~is_categorical], Y[~is_categorical])
    mismatches = np.count_nonzero(X[is_categorical] != Y[is_categorical])

    return numeric_distance + gamma * mismatches

In [94]:
# Bind gamma and the categorical positions when the model is built (via metric_params).
# A lambda would look up the global `gamma` at query time, and `gamma` is reassigned
# for the other room types further down, so re-running cells out of order could
# silently apply another subset's gamma to this model.
neigh = NearestNeighbors(
    n_neighbors=11,
    metric=distance_dissim,
    metric_params={'gamma': gamma, 'categorical': indCatColumns},
)

In [98]:
# Index the entire-home feature matrix; fit() hands back the estimator it was called on.
neigh_fit = neigh.fit(X)

In [99]:
# Row positions of the 11 nearest rows for every entire-home listing (distances discarded).
# NOTE(review): the queries are the fitted rows themselves, so each row is normally its own
# nearest neighbour; 11 presumably means "self + 10 others" - confirm.
vecinos = neigh.kneighbors(X,11, return_distance=False)

In [111]:
# Long-format table for entire homes: one row per (listing, neighbour) pair with columns
#   [id of the query listing, id of the neighbour, rank of the neighbour (0 = closest)].
# Built in one vectorised step, because stacking row by row with np.vstack copies the
# whole array on every iteration.
# The query id comes from the row that was queried, not from the first returned
# neighbour: with tied or duplicate listings the row itself is not guaranteed to be first.
# Ids are read straight from the 'id' column so they keep that column's dtype
# (fetching them through a whole mixed-dtype row upcasts them to float).
home_ids = listingsHome['id'].to_numpy()
n_queries, n_found = vecinos.shape
vecinos_id_home = np.column_stack([
    np.repeat(home_ids, n_found),            # query id, repeated once per neighbour
    home_ids[vecinos].ravel(),               # neighbour ids, row by row
    np.tile(np.arange(n_found), n_queries),  # rank within each query
])

K nearest neighbors for private rooms

In [15]:
# Scale the numerical columns of the private-room subset with a scaler fitted on that subset only.
RS = RobustScaler().fit(listingsRoom[numerical_features])
listingsRoom[numerical_features] = RS.transform(listingsRoom[numerical_features])

In [16]:
# Replace each categorical column by integer codes from a LabelEncoder fitted on that column.
listingsRoom = listingsRoom.assign(**{
    column: preprocessing.LabelEncoder().fit_transform(listingsRoom[column])
    for column in categorical_features
})

In [17]:
# Feature matrix for private rooms: every column except the listing id, in table order.
X = listingsRoom.drop(columns='id').to_numpy()

In [18]:
# gamma for the private-room subset: half the standard deviation pooled over
# all numerical entries (a single scalar, not per column).
Xnum = np.take(X, no_categorical, axis=1)
gamma = np.std(Xnum) / 2

In [19]:
# Bind gamma and the categorical positions when the model is built (via metric_params).
# A lambda would look up the global `gamma` at query time, and `gamma` is reassigned
# for each room type, so re-running cells out of order could silently apply
# another subset's gamma to this model.
neigh = NearestNeighbors(
    n_neighbors=11,
    metric=distance_dissim,
    metric_params={'gamma': gamma, 'categorical': indCatColumns},
)

In [20]:
# Index the private-room feature matrix; fit() hands back the estimator it was called on.
neigh_fit = neigh.fit(X)

In [21]:
# Row positions of the 11 nearest rows for every private-room listing (distances discarded).
# NOTE(review): the queries are the fitted rows themselves, so each row is normally its own
# nearest neighbour; 11 presumably means "self + 10 others" - confirm.
vecinos = neigh.kneighbors(X,11, return_distance=False)

In [22]:
# Long-format table for private rooms: one row per (listing, neighbour) pair with columns
#   [id of the query listing, id of the neighbour, rank of the neighbour (0 = closest)].
# Built in one vectorised step, because stacking row by row with np.vstack copies the
# whole array on every iteration.
# The query id comes from the row that was queried, not from the first returned
# neighbour: with tied or duplicate listings the row itself is not guaranteed to be first.
# Ids are read straight from the 'id' column so they keep that column's dtype
# (fetching them through a whole mixed-dtype row upcasts them to float).
room_ids = listingsRoom['id'].to_numpy()
n_queries, n_found = vecinos.shape
vecinos_id_room = np.column_stack([
    np.repeat(room_ids, n_found),            # query id, repeated once per neighbour
    room_ids[vecinos].ravel(),               # neighbour ids, row by row
    np.tile(np.arange(n_found), n_queries),  # rank within each query
])

K nearest neighbors for shared rooms

In [23]:
# Scale the numerical columns of the shared-room subset with a scaler fitted on that subset only.
RS = RobustScaler().fit(listingsSharedRoom[numerical_features])
listingsSharedRoom[numerical_features] = RS.transform(listingsSharedRoom[numerical_features])

In [24]:
# Replace each categorical column by integer codes from a LabelEncoder fitted on that column.
listingsSharedRoom = listingsSharedRoom.assign(**{
    column: preprocessing.LabelEncoder().fit_transform(listingsSharedRoom[column])
    for column in categorical_features
})

In [25]:
# Feature matrix for shared rooms: every column except the listing id, in table order.
X = listingsSharedRoom.drop(columns='id').to_numpy()

In [26]:
# gamma for the shared-room subset: half the standard deviation pooled over
# all numerical entries (a single scalar, not per column).
Xnum = np.take(X, no_categorical, axis=1)
gamma = np.std(Xnum) / 2

In [27]:
# Bind gamma and the categorical positions when the model is built (via metric_params).
# A lambda would look up the global `gamma` at query time, and `gamma` is reassigned
# for each room type, so re-running cells out of order could silently apply
# another subset's gamma to this model.
neigh = NearestNeighbors(
    n_neighbors=11,
    metric=distance_dissim,
    metric_params={'gamma': gamma, 'categorical': indCatColumns},
)

In [28]:
# Index the shared-room feature matrix; fit() hands back the estimator it was called on.
neigh_fit = neigh.fit(X)

In [29]:
# Row positions of the 11 nearest rows for every shared-room listing (distances discarded).
# NOTE(review): the queries are the fitted rows themselves, so each row is normally its own
# nearest neighbour; 11 presumably means "self + 10 others" - confirm.
vecinos = neigh.kneighbors(X,11, return_distance=False)

In [30]:
# Long-format table for shared rooms: one row per (listing, neighbour) pair with columns
#   [id of the query listing, id of the neighbour, rank of the neighbour (0 = closest)].
# Built in one vectorised step, because stacking row by row with np.vstack copies the
# whole array on every iteration.
# The query id comes from the row that was queried, not from the first returned
# neighbour: with tied or duplicate listings the row itself is not guaranteed to be first.
# Ids are read straight from the 'id' column so they keep that column's dtype
# (fetching them through a whole mixed-dtype row upcasts them to float).
shared_ids = listingsSharedRoom['id'].to_numpy()
n_queries, n_found = vecinos.shape
vecinos_id_shared = np.column_stack([
    np.repeat(shared_ids, n_found),          # query id, repeated once per neighbour
    shared_ids[vecinos].ravel(),             # neighbour ids, row by row
    np.tile(np.arange(n_found), n_queries),  # rank within each query
])

In [31]:
# Stack the three per-room-type neighbour tables: entire homes, private rooms, shared rooms.
vecinos_id = np.concatenate(
    (vecinos_id_home, vecinos_id_room, vecinos_id_shared),
    axis=0,
)

In [43]:
# Sanity check: expect (total number of listing/neighbour pairs, 3).
vecinos_id.shape

(190521, 3)

In [41]:
# Write the neighbour table to disk. The frame's default integer index is written too,
# as an unnamed first column.
header_n = ['idNeighbourSelection', 'id', 'NeighbourNum']
neighbours_frame = pd.DataFrame(vecinos_id, columns=header_n)
neighbours_frame.to_csv('../Data/kneighbors_file2.csv')