In [151]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.spatial import distance

In [152]:
# Load the listings table used by every later cell (the file name suggests it
# has already been cleaned upstream). The path is relative, so it resolves
# against the notebook's working directory.
listings = pd.read_csv('../Data/listings_clean.csv')

In [153]:
# Feature groups used to describe a listing.
numerical_features = [
    'latitude', 'longitude',
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
]
boolean_features = ['instant_bookable', 'is_business_travel_ready']
categorical_features = ['neighbourhood_cleansed', 'property_type', 'bed_type']

# One-hot encoding of the categorical columns.
# NOTE(review): not referenced by any of the cells visible here; the frame used
# downstream is listingsCluster, which keeps the raw categorical columns.
listings_categorical = pd.get_dummies(
    listings[categorical_features],
    columns=categorical_features,
    prefix=["Neigh", "Property", "Bed"],
)

# Working frame with column order: numeric, then categorical, then boolean.
listingsCluster = pd.concat(
    [listings[group] for group in (numerical_features, categorical_features, boolean_features)],
    axis=1,
)

In [154]:
# Columns handled by simple matching instead of Euclidean distance:
# the categorical columns plus the boolean flags.
to_categorical = [*categorical_features, *boolean_features]

# Positions of those columns inside listingsCluster.
indCatColumns = list(map(listingsCluster.columns.get_loc, to_categorical))

In [155]:
from sklearn.preprocessing import RobustScaler

# Fit a RobustScaler on the numeric columns and overwrite them with the
# scaled values; categorical and boolean columns are left untouched.
RS = RobustScaler()
scaled_numeric = RS.fit_transform(listingsCluster[numerical_features])
listingsCluster[numerical_features] = scaled_numeric
#Optional: to put more weight on these two variables, scale them up after normalisation
#listingsCluster['latitude'] = listingsCluster['latitude'] * 3
#listingsCluster['longitude'] = listingsCluster['longitude'] * 3

In [156]:
# Replace each categorical column with integer codes. A fresh encoder is
# created per column, so the codes are independent between columns.
for cat_col in categorical_features:
    encoder = preprocessing.LabelEncoder()
    encoded_column = encoder.fit_transform(listingsCluster[cat_col])
    listingsCluster[cat_col] = encoded_column

In [157]:
# Plain array view of the working frame. Columns keep the order of
# listingsCluster.columns, which is what the positions in indCatColumns refer to.
X = listingsCluster.values

In [158]:
from sklearn.neighbors import NearestNeighbors

In [159]:
# Positions of the columns that are NOT categorical/boolean, i.e. the numeric
# features. indCatColumns holds *column* positions, so the range must run over
# the number of columns (X.shape[1]); len(X) would be the number of rows.
no_categorical = [i for i in range(X.shape[1]) if i not in indCatColumns]

# Numeric block only: every row, numeric columns. The cast is needed because X
# may carry a generic dtype when numeric and boolean columns are mixed.
Xnum = X[:, no_categorical].astype(np.float64)

# Weight applied to each categorical mismatch in distance_dissim:
# half the standard deviation of the numeric features.
gamma = 0.5 * Xnum.std()

In [160]:
def distance_dissim(X, Y, gamma, categorical):
    """Mixed-type dissimilarity between two feature vectors.

    The result is the Euclidean distance over the non-categorical positions
    plus ``gamma`` times the number of categorical positions whose values
    differ.

    Parameters
    ----------
    X, Y : sequence
        Two indexable feature vectors of the same length.
    gamma : float
        Weight given to every categorical mismatch.
    categorical : sequence of int
        Positions in ``X``/``Y`` that are compared by equality; all other
        positions are cast to float64 and compared with Euclidean distance.

    Returns
    -------
    float
        The combined dissimilarity.
    """
    def _take(row, positions):
        # Gather the requested positions of ``row`` into an array.
        return np.asanyarray([row[p] for p in positions])

    numeric_positions = [p for p in range(len(X)) if p not in categorical]

    numeric_gap = distance.euclidean(
        _take(X, numeric_positions).astype(np.float64),
        _take(Y, numeric_positions).astype(np.float64),
    )
    mismatches = np.sum(_take(X, categorical) != _take(Y, categorical))

    return numeric_gap + gamma * mismatches

In [161]:
def _mixed_metric(X, Y):
    """Pairwise metric for NearestNeighbors: distance_dissim bound to the
    notebook-level gamma and categorical column positions."""
    return distance_dissim(X, Y, gamma=gamma, categorical=indCatColumns)


# 11 neighbours are requested; the cell that builds vecinos_id discards the
# first hit of every row and keeps the remaining 10.
neigh = NearestNeighbors(n_neighbors=11, metric=_mixed_metric)

In [162]:
# Index every row of X for neighbour search. The later query cell calls
# kneighbors on `neigh` directly; `neigh_fit` is presumably the same fitted
# estimator (scikit-learn convention) - confirm before relying on it.
neigh_fit = neigh.fit(X)

In [None]:
# For every row of X, fetch the indices of its 11 nearest rows (indices only,
# no distances). The query set is the fitted set itself, so each row is
# presumably returned among its own neighbours - the next cell accounts for that.
vecinos = neigh.kneighbors(X,11, return_distance=False)

In [None]:
# Map every listing id to the ids of its nearest neighbours.
#
# Row `position` of `vecinos` holds the neighbour indices of row `position` of
# X, so the query listing is identified by its row number rather than by the
# first returned neighbour: when several listings are at distance 0 from each
# other, the first hit is not guaranteed to be the query itself.
#
# kneighbors returns positional indices, so ids are looked up by position in a
# plain array instead of by index label on the DataFrame.
listing_ids = listings['id'].values
vecinos_id = {}

for position, neighbours in enumerate(vecinos):
    # Drop the listing itself wherever it appears, then cap the list at one
    # less than the number requested so every row has the same length even if
    # the listing was not among its own hits.
    keep = len(neighbours) - 1
    others = [idx for idx in neighbours if idx != position][:keep]
    vecinos_id[listing_ids[position]] = [listing_ids[idx] for idx in others]

In [None]:
# Preview a handful of entries instead of echoing the whole mapping, which
# holds one entry per listing and would flood the notebook output.
dict(list(vecinos_id.items())[:5])

In [None]:
# Column labels Neighbor_1 ... Neighbor_10, one per stored neighbour.
header_n = [f"Neighbor_{rank}" for rank in range(1, 11)]

# One row per listing id (the dict keys become the index), written as CSV.
neighbours_table = pd.DataFrame.from_dict(data=vecinos_id, orient='index')
neighbours_table.to_csv('dict_file.csv', header=header_n)