# Hotel Embedding Notebook

In [None]:
import pandas as pd
import numpy as np
import random
random.seed(100)

from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model
from keras.models import load_model

#import umap as UMAP
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 15

In [None]:
# Load the hotel table, keep only rows that carry a hotel ID, replace every
# remaining missing value with 0 and renumber the surviving rows 0..n-1.
hotel = (
    pd.read_csv('hotel_embedding.csv')
    .loc[lambda frame: frame["DWHotelID"].notna()]
    .fillna(0)
    .reset_index(drop = True)
)

In [None]:
# Set the descriptive columns aside; they are dropped from the frame that
# feeds the embedding model.
# The original bound `city` to HotelName and `name` to CityName (swapped).
name = hotel["HotelName"]
city = hotel["CityName"]
rooms = hotel["#rooms"]

# `df` keeps the hotel ID plus every remaining column.
df = hotel.drop(["HotelName", "CityName", "#rooms"], axis = 1)
# The ID column is cast to int so it can be used as a dictionary key and
# compared against integer literals.
df["DWHotelID"] = df["DWHotelID"].astype(int)

In [None]:
# Hotel IDs in row order.
item_id = df.loc[:, 'DWHotelID'].values
# Every column after the first is treated as a property column
# (presumably DWHotelID is the first column of df -- TODO confirm).
cols = df.columns[1:].tolist()
# Property values as a 2-D array: one row per hotel, one column per entry of `cols`.
properties = df.loc[:, cols].values

In [None]:
# Hotel ID -> row position, and the reverse mapping.
hotel_index = {hid: row for row, hid in enumerate(item_id)}
index_hotel = {row: hid for hid, row in hotel_index.items()}
# Row position -> that row's property values.
index_properties = dict(enumerate(properties))
# Property column name -> column position within `cols`.
properties_index = {column: position for position, column in enumerate(cols)}

In [None]:
# Positive pairs: one (row index, property column NAME) tuple for every
# property whose value is 1 for that hotel.
pairs = []
for item in item_id:
    row_idx = hotel_index[item]
    # Fetch the row once per hotel instead of once per (hotel, column) test.
    row = df.loc[row_idx]
    pairs.extend((row_idx, col) for col in cols if row[col] == 1)

# Set form of the same tuples for constant-time membership tests.
pairs_set = set(pairs)

In [None]:
def generate_batch(pairs, n_positive = 30, negative_ratio = 1.0, classification = False):
    """Yield training batches of (hotel, property) examples forever.

    Each batch holds `n_positive` positive examples sampled from `pairs`
    plus `n_positive * negative_ratio` (rounded) randomly drawn negatives.

    Args:
        pairs: sequence of (hotel row index, property column name) tuples.
        n_positive: number of positive examples per batch.
        negative_ratio: negatives per positive; may be an int or a float.
        classification: if True negatives are labelled 0, otherwise -1.

    Yields:
        A tuple `(inputs, labels)` where `inputs` is a dict with the keys
        'hotel_id' and 'property' (arrays of row / column positions) and
        `labels` is the matching array of 1 / negative-label values.

    Raises:
        ValueError: from `random.sample` when `n_positive` exceeds
            `len(pairs)`.

    Reads the module-level `df`, `cols`, `properties_index` and `pairs_set`.
    """
    # np.zeros needs an integer shape; negative_ratio is a float by default.
    n_negative = int(round(n_positive * negative_ratio))
    batch_size = n_positive + n_negative
    batch = np.zeros((batch_size, 3))

    # Adjust label based on task
    neg_label = 0 if classification else -1

    # This creates a generator
    while True:
        # Randomly choose positive examples
        for idx, (hotel_id, prop) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (hotel_id, properties_index[prop], 1)

        # Negatives start right after the positives. Set explicitly so the
        # name is bound even when n_positive is 0.
        idx = n_positive

        # Add negative examples until the batch is full
        while idx < batch_size:
            random_idx = random.randrange(df.shape[0])
            random_prop = random.randrange(len(cols))

            # Positive pairs are stored as (row index, column NAME), so the
            # candidate must be compared by name, not by column position.
            if (random_idx, cols[random_prop]) not in pairs_set:
                batch[idx, :] = (random_idx, random_prop, neg_label)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)

        # Yield copies: `batch` is reused and overwritten on the next
        # iteration, which would corrupt batches still held by the consumer.
        yield ({'hotel_id': batch[:, 0].copy(), 'property': batch[:, 1].copy()},
               batch[:, 2].copy())

In [None]:
# Draw a single batch (2 positive pairs, negative_ratio of 2) to inspect the generator output.
next(generate_batch(pairs, n_positive = 2, negative_ratio = 2))

In [None]:
def hotel_embedding_model(embedding_size = 200, classification = False):
    """Build and compile the hotel/property embedding model (functional API).

    Hotels and properties each get their own embedding table of width
    `embedding_size`; the two looked-up vectors are combined with a
    normalized dot product.

    Args:
        embedding_size: width of both embedding tables.
        classification: if True a sigmoid Dense(1) layer is appended and the
            model is compiled with binary cross-entropy and an accuracy
            metric; otherwise the dot product is the output and the loss is
            mean squared error.

    Returns:
        The compiled model (Adam optimizer in both cases).

    Table sizes come from the module-level `df` (row count) and `cols`.
    """
    # Two scalar inputs: hotel position and property position
    hotel_input = Input(name = 'hotel_id', shape = [1])
    prop_input = Input(name = 'property', shape = [1])

    # Look-ups, each of shape (None, 1, embedding_size)
    hotel_vectors = Embedding(name = 'hotel_embedding',
                              input_dim = df.shape[0],
                              output_dim = embedding_size)(hotel_input)
    prop_vectors = Embedding(name = 'prop_embedding',
                             input_dim = len(cols),
                             output_dim = embedding_size)(prop_input)

    # Normalized dot product along the embedding axis, shape (None, 1, 1)
    similarity = Dot(name = 'dot_product', normalize = True, axes = 2)([hotel_vectors, prop_vectors])

    # Collapse to one number per sample, shape (None, 1)
    output = Reshape(target_shape = [1])(similarity)

    # Only the head and the loss settings depend on the task
    if classification:
        output = Dense(1, activation = 'sigmoid')(output)
        compile_options = {'loss': 'binary_crossentropy', 'metrics': ['accuracy']}
    else:
        compile_options = {'loss': 'mse'}

    model = Model(inputs = [hotel_input, prop_input], outputs = output)
    model.compile(optimizer = 'Adam', **compile_options)
    return model

In [None]:
# Instantiate model and show parameters
# Build the model with its default arguments (mean-squared-error variant)
# and print the layer summary.
model = hotel_embedding_model()
model.summary()

# Number of positive pairs drawn for each training batch.
n_positive = 2048

# Batch generator used for training; negative_ratio = 2 requests two
# negatives for every positive.
gen = generate_batch(pairs, n_positive = n_positive, negative_ratio = 2)

In [None]:
# Train
# Train
# `fit_generator` is deprecated in newer Keras releases and missing from the
# latest ones, where `fit` accepts generators directly. Use it when it
# exists (unchanged behaviour on older versions) and fall back to `fit`.
_fit = getattr(model, 'fit_generator', None) or model.fit
h = _fit(gen, epochs = 15,
         # One epoch draws roughly as many positives as there are pairs.
         steps_per_epoch = len(pairs) // n_positive,
         verbose = 2)

# Persist the trained model to disk.
model.save('./hotel_embedding.h5')

In [None]:
# Pull the trained hotel embedding table out of the model.
hotel_layer = model.get_layer(name = 'hotel_embedding')
# The first weight array of the layer is taken as the embedding matrix.
hotel_weights = hotel_layer.get_weights()[0]
# Last expression of the cell: the matrix shape.
hotel_weights.shape

In [None]:
def find_similar(hotel_id, weights, index_name = 'hotel_id', n = 11, least = False, return_dist = False, plot = False):
    """Find the n items most (or least) similar to `hotel_id` by cosine similarity.

    Args:
        hotel_id: key of the query item: a hotel ID when `index_name` is
            'hotel_id', a property column name when it is 'prop'.
        weights: 2-D embedding matrix with one row per item. Rows are
            L2-normalized here so the dot products are cosine similarities.
        index_name: 'hotel_id' or 'prop'; selects the lookup tables.
        n: number of items to report.
        least: report the least similar items instead of the most similar.
        return_dist: return `(similarities, selected_indexes)` instead of
            printing.
        plot: draw a bar chart of the n // 2 least and n most similar items
            (query excluded) and return None.

    Returns:
        None if the query is not found or `plot` is True; a tuple
        `(similarities, selected_indexes)` if `return_dist` is True;
        otherwise the list of selected item keys, most similar first.

    Raises:
        ValueError: if `index_name` is not 'hotel_id' or 'prop'.

    Reads the module-level `hotel_index`, `index_hotel`, `properties_index`,
    `cols` and `hotel`.
    """
    # Select index and reverse index
    if index_name == 'hotel_id':
        index, rindex = hotel_index, index_hotel
    elif index_name == 'prop':
        index, rindex = properties_index, cols
    else:
        raise ValueError(f"index_name must be 'hotel_id' or 'prop', got {index_name!r}")

    def display_name(key):
        """Return the label shown for an item: hotel name, or the property key itself."""
        if index_name == 'hotel_id':
            return str(hotel[hotel["DWHotelID"] == key]["HotelName"].values[0])
        return str(key)

    # Look the query up in the index that matches `index_name`
    try:
        query_idx = index[hotel_id]
    except KeyError:
        print(f'{hotel_id} Not Found.')
        return

    # Normalize every row so the dot product is a cosine similarity;
    # all-zero rows are left untouched instead of dividing by zero.
    weights = np.asarray(weights)
    norms = np.linalg.norm(weights, axis = 1, keepdims = True)
    norms[norms == 0] = 1
    unit = weights / norms
    dists = np.dot(unit, unit[query_idx])

    # Sort similarity indexes from smallest to largest
    sorted_dists = np.argsort(dists)

    # Plot results if specified
    if plot:
        # Drop the query itself explicitly rather than assuming it sorts last
        others = sorted_dists[sorted_dists != query_idx]
        furthest = others[:(n // 2)]
        closest = others[max(len(others) - n, 0):]
        shown = list(furthest) + list(closest)

        labels = [display_name(rindex[c]) for c in shown]
        distances = [dists[c] for c in shown]
        # Red for the least similar items, green for the most similar
        colors = ['r'] * len(furthest) + ['g'] * len(closest)

        data = pd.DataFrame({'distance': distances}, index = labels)

        # Horizontal bar chart
        data['distance'].plot.barh(color = colors, figsize = (10, 8),
                                   edgecolor = 'k', linewidth = 2)
        plt.xlabel('Cosine Similarity')
        plt.axvline(x = 0, color = 'k')

        # Title uses mathtext for italics; raw string keeps the backslash
        # without an invalid escape sequence.
        name_str = f'{index_name.capitalize()}s Most and Least Similar to'
        name_str += r' $\it{' + display_name(hotel_id) + '}$'
        plt.title(name_str, x = 0.2, size = 28, y = 1.05)

        return None

    # First n of the ascending order are the least similar, last n the most
    closest = sorted_dists[:n] if least else sorted_dists[-n:]

    # Callers that want the raw numbers get them without any printing
    if return_dist:
        return dists, closest

    if least:
        print(f'{index_name.capitalize()}s furthest from {hotel_id}.\n')
    else:
        print(f'{index_name.capitalize()}s closest to {hotel_id}.\n')

    # Print formatting
    max_width = max((len(str(rindex[c])) for c in closest), default = 0)

    # Print the selected items and their similarities
    closest_ids = []
    for c in reversed(closest):
        closest_ids.append(rindex[c])
        print(f'{index_name.capitalize()}: {rindex[c]:{max_width + 2}} Similarity: {dists[c]:.{2}}')
    return closest_ids

In [None]:
# Announce the query hotel by name (the message is in Turkish), list its
# nearest neighbours, then print ID, name and city for each of them.
query_name = hotel.loc[hotel["DWHotelID"] == 2708, "HotelName"].values[0]
print("{} oteli için en benzer otellerin bulunması :".format(query_name))
closest = find_similar(2708, hotel_weights)
print("\n")
summary_columns = ["DWHotelID", "HotelName", "CityName"]
for similar_id in closest:
    print(hotel.loc[hotel["DWHotelID"] == similar_id, summary_columns].values)

In [None]:
# Draw the similarity bar chart for hotel 2708. With plot=True find_similar
# returns None, so `closest` is rebound to None here.
closest = find_similar(2708, hotel_weights, n = 5, plot=True)

In [None]:
# Compare hotels 2708 and 10477 feature by feature.
# Only real feature columns are compared. The original compared every column
# and started the counter at -3, which miscounts whenever the hotel name,
# city or room count differ between the two hotels.
metadata_columns = ["DWHotelID", "HotelName", "CityName", "#rooms"]
feature_columns = [column for column in hotel.columns if column not in metadata_columns]

arr_1 = hotel.loc[hotel["DWHotelID"] == 2708, feature_columns].values[0]
arr_2 = hotel.loc[hotel["DWHotelID"] == 10477, feature_columns].values[0]

count = 0
# Names of the features whose values differ (the original stored these in a
# list misleadingly called `same`).
different = []
for column, value_1, value_2 in zip(feature_columns, arr_1, arr_2):
    if value_1 == value_2:
        count += 1
    else:
        different.append(column)
# The total is derived from the data instead of the hard-coded 335.
print(f"Number of same features out of {len(feature_columns)} is : ", count)

print("Features that differ:")
for column in different:
    print(column)