In [12]:
import numpy as np
import pandas as pd
from scipy.spatial import distance 
from sklearn.utils import validation
from sklearn.metrics import pairwise
from scipy.sparse import issparse
import random

In [None]:


def _return_float_dtype(X, Y):
    """
    1. If dtype of X and Y is float32, then dtype float32 is returned.
    2. Else dtype float is returned.
    """
    if not issparse(X) and not isinstance(X, np.ndarray):
        X = np.asarray(X)

    if Y is None:
        Y_dtype = X.dtype
    elif not issparse(Y) and not isinstance(Y, np.ndarray):
        Y = np.asarray(Y)
        Y_dtype = Y.dtype
    else:
        Y_dtype = Y.dtype

    if X.dtype == Y_dtype == np.float32:
        dtype = np.float32
    elif X.dtype == np.object and not issparse(X):
        dtype = np.float
        for col in range(X.shape[1]):
            if not np.issubdtype(type(X[0, col]), np.number):
                dtype = np.object
                break
    else:
        dtype = np.float

    return X, Y, dtype


def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
    X, Y, dtype_float = _return_float_dtype(X, Y)

    warn_on_dtype = dtype is not None
    estimator = 'check_pairwise_arrays'
    if dtype is None:
        dtype = dtype_float


    if Y is X or Y is None:
        X = Y = validation.check_array(X, accept_sparse='csr', dtype=dtype,
                            warn_on_dtype=warn_on_dtype, estimator=estimator)
    else:
        X = validation.check_array(X, accept_sparse='csr', dtype=dtype,
                        warn_on_dtype=warn_on_dtype, estimator=estimator)
        Y = validation.check_array(Y, accept_sparse='csr', dtype=dtype,
                        warn_on_dtype=warn_on_dtype, estimator=estimator)

    if precomputed:
        if X.shape[1] != Y.shape[0]:
            raise ValueError("Precomputed metric requires shape "
                             "(n_queries, n_indexed). Got (%d, %d) "
                             "for %d indexed." %
                             (X.shape[0], X.shape[1], Y.shape[0]))
    elif X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices: "
                         "X.shape[1] == %d while Y.shape[1] == %d" % (
                             X.shape[1], Y.shape[1]))

    return X, Y

In [None]:
def gower_distances(X, Y=None, w=None, categorical_features=None):
    """
    Computes the gower distances between X and Y

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Y : array-like, shape (n_samples, n_features)

    w:  array-like, shape (n_features)
    According the Gower formula, w is an attribute weight.

    categorical_features: array-like, shape (n_features)
    Indicates with True/False wheter a column is a categorical attribute.
    This is useful when categorical atributes are represented as integer
    values.

    Returns
    -------
    similarities : ndarray, shape (n_samples, )

    Notes
    ------
    Gower is a similarity measure for categorical, boolean and numerical mixed
    data.

    """

    X, Y = check_pairwise_arrays(X, Y, dtype=(np.object, None)[issparse(X) or
                                                               issparse(Y)])

    rows, cols = X.shape

    if categorical_features is None:
        categorical_features = []
        for col in range(cols):
            if np.issubdtype(type(X[0, col]), np.number):
                categorical_features.append(False)
            else:
                categorical_features.append(True)
    # Calculates the normalized ranges and max values of numeric values
    ranges_of_numeric = [0.0] * cols
    max_of_numeric = [0.0] * cols
    for col in range(cols):
        if not categorical_features[col]:
            max = None
            min = None
            if issparse(X):
                col_array = X.getcol(col)
                max = col_array.max() + 0.0
                min = col_array.min() + 0.0
            else:
                col_array = X[:, col].astype(np.double)
                max = np.nanmax(col_array)
                min = np.nanmin(col_array)

            if np.isnan(max):
                max = 0.0
            if np.isnan(min):
                min = 0.0
            max_of_numeric[col] = max
            ranges_of_numeric[col] = (1 - min / max) if (max != 0) else 0.0

    if w is None:
        w = [1] * cols

    yrows, ycols = Y.shape

    dm = np.zeros((rows, yrows), dtype=np.double)

    for i in range(0, rows):
        j_start = i

        # for non square results
        if rows != yrows:
            j_start = 0

        for j in range(j_start, yrows):
            sum_sij = 0.0
            sum_wij = 0.0
            for col in range(cols):
                value_xi = X[i, col]
                value_xj = Y[j, col]

                if not categorical_features[col]:
                    if (max_of_numeric[col] != 0):
                        value_xi = value_xi / max_of_numeric[col]
                        value_xj = value_xj / max_of_numeric[col]
                    else:
                        value_xi = 0
                        value_xj = 0

                    if ranges_of_numeric[col] != 0:
                        sij = abs(value_xi - value_xj) / ranges_of_numeric[col]
                    else:
                        sij = 0
                    wij = (w[col], 0)[np.isnan(value_xi) or np.isnan(value_xj)]
                else:
                    sij = (1.0, 0.0)[value_xi == value_xj]
                    wij = (w[col], 0)[value_xi is None and value_xj is None]
                sum_sij += (wij * sij)
                sum_wij += wij

            if sum_wij != 0:
                dm[i, j] = (sum_sij / sum_wij)
                if j < rows and i < yrows:
                    dm[j, i] = dm[i, j]

    return dm




In [None]:
def gower_distances(X, Y=None, w=None, categorical_features=None):
    """
    Computes the gower distances between X and Y

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Y : array-like, shape (n_samples, n_features)

    w:  array-like, shape (n_features)
    According the Gower formula, w is an attribute weight.

    categorical_features: array-like, shape (n_features)
    Indicates with True/False wheter a column is a categorical attribute.
    This is useful when categorical atributes are represented as integer
    values.

    Returns
    -------
    similarities : ndarray, shape (n_samples, )

    Notes
    ------
    Gower is a similarity measure for categorical, boolean and numerical mixed
    data.

    """

    X, Y = check_pairwise_arrays(X, Y, dtype=(np.object, None)[issparse(X) or
                                                               issparse(Y)])

    rows, cols = X.shape

    if categorical_features is None:
        categorical_features = []
        for col in range(cols):
            if np.issubdtype(type(X[0, col]), np.number):
                categorical_features.append(False)
            else:
                categorical_features.append(True)
    # Calculates the normalized ranges and max values of numeric values
    ranges_of_numeric = [0.0] * cols
    max_of_numeric = [0.0] * cols
    for col in range(cols):
        if not categorical_features[col]:
            max = None
            min = None
            if issparse(X):
                col_array = X.getcol(col)
                max = col_array.max() + 0.0
                min = col_array.min() + 0.0
            else:
                col_array = X[:, col].astype(np.double)
                max = np.nanmax(col_array)
                min = np.nanmin(col_array)

            if np.isnan(max):
                max = 0.0
            if np.isnan(min):
                min = 0.0
            max_of_numeric[col] = max
            ranges_of_numeric[col] = (1 - min / max) if (max != 0) else 0.0

    if w is None:
        w = [1] * cols

    yrows, ycols = Y.shape

    dm = np.zeros((rows, yrows), dtype=np.double)

    for i in range(0, rows):
        j_start = i

        # for non square results
        if rows != yrows:
            j_start = 0

        for j in range(j_start, yrows):
            sum_sij = 0.0
            sum_wij = 0.0
            for col in range(cols):
                value_xi = X[i, col]
                value_xj = Y[j, col]

                if not categorical_features[col]:
                    if (max_of_numeric[col] != 0):
                        value_xi = value_xi / max_of_numeric[col]
                        value_xj = value_xj / max_of_numeric[col]
                    else:
                        value_xi = 0
                        value_xj = 0

                    if ranges_of_numeric[col] != 0:
                        sij = abs(value_xi - value_xj) / ranges_of_numeric[col]
                    else:
                        sij = 0
                    wij = (w[col], 0)[np.isnan(value_xi) or np.isnan(value_xj)]
                else:
                    sij = (1.0, 0.0)[value_xi == value_xj]
                    wij = (w[col], 0)[value_xi is None and value_xj is None]
                sum_sij += (wij * sij)
                sum_wij += wij

            if sum_wij != 0:
                dm[i, j] = (sum_sij / sum_wij)
                if j < rows and i < yrows:
                    dm[j, i] = dm[i, j]

    return dm




In [13]:
HotelUnclustered= pd.read_csv("Hotel_Reviews.csv",usecols=[5,7,8,10,11,12,14,15,16])

In [14]:
import re
HotelUnclustered['days_since_review'] = HotelUnclustered['days_since_review'].apply(lambda x: int(''.join(filter(str.isdigit, x))))
HotelUnclustered

Unnamed: 0,Reviewer_Nationality,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,days_since_review,lat,lng
0,Russia,397,1403,11,7,2.9,0,52.360576,4.915968
1,Ireland,0,1403,105,7,7.5,0,52.360576,4.915968
2,Australia,42,1403,21,9,7.1,3,52.360576,4.915968
3,United Kingdom,210,1403,26,1,3.8,3,52.360576,4.915968
4,New Zealand,140,1403,8,3,6.7,10,52.360576,4.915968
5,Poland,17,1403,20,1,6.7,10,52.360576,4.915968
6,United Kingdom,33,1403,18,6,4.6,17,52.360576,4.915968
7,United Kingdom,11,1403,19,1,10.0,17,52.360576,4.915968
8,Belgium,34,1403,0,3,6.5,25,52.360576,4.915968
9,Norway,15,1403,50,1,7.9,26,52.360576,4.915968


In [None]:
HotelUnclustered

In [None]:
def kMedoids(D, k, tmax=100):
    # determine dimensions of distance matrix D
    m, n = D.shape

    if k > n:
        raise Exception('too many medoids')
    # randomly initialize an array of k medoid indices
    M = np.arange(n)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        np.sort(Mnew)
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # final update of cluster memberships
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]

    # return results
    return M, C

In [15]:
HotelUnclustered_100_indices=np.random.choice(len(HotelUnclustered), 500,replace=False)


In [16]:
selected_reviews=HotelUnclustered.loc[HotelUnclustered_100_indices,:]

In [17]:
selected_reviews

Unnamed: 0,Reviewer_Nationality,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,days_since_review,lat,lng
462084,United Kingdom,12,2027,14,7,8.8,548,52.361381,4.883259
182816,Japan,131,768,7,2,5.0,459,51.512676,-0.186783
4176,United Kingdom,0,3150,40,1,9.6,582,51.522622,-0.125160
508669,United Kingdom,13,1193,19,31,8.3,527,51.509133,-0.130189
353141,United Kingdom,5,1375,0,8,9.0,177,48.202110,16.372084
2375,India,51,4380,16,2,9.5,352,51.514218,-0.180903
130287,United Kingdom,29,1114,10,30,7.9,449,51.498882,-0.106289
289214,United Kingdom,10,2692,24,6,8.3,382,48.203568,16.356445
122755,Hong Kong,49,1008,12,2,7.9,692,51.517330,-0.118097
347083,Maldives,6,2586,4,5,9.2,438,52.358881,4.988604


In [18]:
selected_reviews.isnull().any()

Reviewer_Nationality                          False
Review_Total_Negative_Word_Counts             False
Total_Number_of_Reviews                       False
Review_Total_Positive_Word_Counts             False
Total_Number_of_Reviews_Reviewer_Has_Given    False
Reviewer_Score                                False
days_since_review                             False
lat                                            True
lng                                            True
dtype: bool

In [None]:
D = gower_distances(selected_reviews)
print(D)

In [None]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1])
plt.axis('equal');
plt.show()

In [None]:
M, C = kMedoids(D, 2)
selected_reviews.iloc[89,:]
for point_idx in M:
    print( selected_reviews.iloc[point_idx,:] )





In [None]:
print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print('label {0}:　{1}'.format(label, selected_reviews.iloc[point_idx,:]))

In [1]:
#!pip3 install requests
import requests
import json
import pandas as pd

In [2]:
Hotel_Reviews=pd.read_csv("Hotel_Reviews.csv",usecols=['Hotel_Name','lat','lng']).drop_duplicates()
Hotel_Reviews
 

Unnamed: 0,Hotel_Name,lat,lng
0,Hotel Arena,52.360576,4.915968
405,K K Hotel George,51.491888,-0.194971
971,Apex Temple Court Hotel,51.513734,-0.108751
2008,The Park Grand London Paddington,51.514218,-0.180903
3778,Monhotel Lounge SPA,48.874348,2.289733
3813,Kube Hotel Ice Bar,48.886570,2.358833
3832,The Principal London,51.522622,-0.125160
5257,Park Plaza County Hall London,51.501400,-0.116009
7480,One Aldwych,51.511783,-0.119417
7616,Splendid Etoile,48.874707,2.293676


In [10]:
Hotel_Reviews['landmark_count']='-'

In [None]:
ex=Hotel_Reviews.lng[500179]
np.isnan(ex)

In [7]:
def landmarksCount(lat,lng):
    url='https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox='+str(lat)+','+str(lng)+',500'
    r = requests.get(url)
    jsonobj=json.loads(r.text)
    if len(jsonobj['Response']['View'])>0:
        return(len(jsonobj['Response']['View'][0]['Result']))
    else:
        return 0

In [9]:
landmarksCount(52.360576,4.915968)

https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=52.360576,4.915968,500


6

In [19]:

for i in Hotel_Reviews.index:
    lat = Hotel_Reviews.lat[i]
    lng = Hotel_Reviews.lng[i]
    if np.isnan(lat) or np.isnan(lng):
           continue
    else:  
            try:       
                Hotel_Reviews['landmark_count'][i]=landmarksCount(lat,lng)
            except:
                print("error at",i,lat,lng)
            
      

https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=52.3605759,4.9159683,500


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=51.4918878,-0.1949706,500
https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=51.5137335,-0.1087512,500
https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=51.5142184,-0.1809032,500
https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=48.8743481,2.2897334,500
https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6P8bp4So17PsN2isu0RY&app_code=uXbjRVdq9Bd0k2RsH641SA&gen=8&mode=retrieveLandmarks&prox=48.88657,2.3588332,500
https://reverse.geocoder.cit.api.here.com/6.2/reversegeocode.json?app_id=6

In [21]:
Hotel_Reviews.to_csv("landmarks_count.csv")