## Hotel_description_generation_with_RNN SimpleRNN



Machine Learning - Homework #4 - University of Memphis, Fall 2022 <br>
Last updated: Nov 15, 2022 <br>
Author: Bereket Kebede, Graduate Student <br>


## Getting Started 
----

In [15]:
##############################################################################
## import necessary libraries

from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN, GRU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import pandas as pd
import numpy as np
import string, os 
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [16]:
# Load the Seattle hotel dataset (the CSV is decoded as latin-1).
hotel_df = pd.read_csv(
    'Seattle_Hotels_address_description.csv',
    encoding="latin-1",
)
# Keep only the free-text description column, as a plain Python list.
all_descriptions = list(hotel_df["desc"].values)

In [4]:
# Number of hotel descriptions loaded from the CSV.
len(all_descriptions)

152

In [5]:
# Work on a shallow copy of the descriptions so the original list is left alone.
corpus = list(all_descriptions)
# Show the first description as a sanity check.
corpus[:1]

['Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. Non-Smoking\nHotel is 100% non-smoking, including e-cigarettes, in all guest rooms and public areas. A fee of up to $250 USD will be assessed for smoking in a non-smoking room. Please ask the Front Desk for locations of designated outdoor smoking areas. Check-in: 4:00 pm. Check-out: 12:00 pm. Cancellation policies may vary depending on the rate or dates of your reservation. Please refer to your reservation confirmation to verify your cancellation policy.\n']

In [6]:
# Word-level tokenizer: lower-cases the text, removes the characters listed in
# `filters`, and splits on spaces. The apostrophe is not in the filter list, so
# contractions and possessives stay as single tokens.
t = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)
# Build the vocabulary from the hotel descriptions.
t.fit_on_texts(corpus)

In [7]:
# Inspect what the fitted tokenizer `t` learned from the corpus.

# A dictionary of words and their counts.
print(t.word_counts)

# A dictionary of words and how many documents each appeared in.
print(t.word_docs)

# An integer count of the total number of documents that were used to fit the Tokenizer (i.e. total number of documents)
print(t.document_count)

# A dictionary of words and their uniquely assigned integers.
print(t.word_index)

OrderedDict([('located', 106), ('on', 128), ('the', 1236), ('southern', 1), ('tip', 1), ('of', 526), ('lake', 40), ('union', 31), ('hilton', 11), ('garden', 11), ('inn', 81), ('seattle', 463), ('downtown', 131), ('hotel', 293), ('is', 279), ('perfectly', 6), ('for', 213), ('business', 84), ('and', 1044), ('leisure', 18), ('non', 19), ('smoking', 29), ('100', 10), ('including', 48), ('e', 2), ('cigarettes', 2), ('in', 460), ('all', 104), ('guest', 61), ('rooms', 106), ('public', 9), ('areas', 19), ('a', 610), ('fee', 7), ('up', 46), ('to', 474), ('250', 3), ('usd', 2), ('will', 50), ('be', 49), ('assessed', 2), ('room', 81), ('please', 5), ('ask', 2), ('front', 12), ('desk', 12), ('locations', 2), ('designated', 6), ('outdoor', 24), ('check', 41), ('4', 15), ('00', 19), ('pm', 11), ('out', 36), ('12', 11), ('cancellation', 3), ('policies', 5), ('may', 5), ('vary', 3), ('depending', 2), ('rate', 8), ('or', 159), ('dates', 2), ('your', 184), ('reservation', 5), ('refer', 1), ('confirmatio

In [8]:
# Report the vocabulary size (one entry per distinct word in word_index).
print('Found %s unique tokens.' % (len(t.word_index),))

Found 3428 unique tokens.


In [9]:
# Tokenization
# A fresh Tokenizer is bound to `t` here (same settings as the one fitted
# above), so the fit performed inside get_sequence_of_tokens starts from an
# unfitted instance.
t = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

def get_sequence_of_tokens(corpus):
    """Expand every text in *corpus* into all of its token-id prefixes.

    The module-level tokenizer ``t`` is fitted on *corpus* as a side effect.
    Each text is converted to a list of token ids, and every prefix of that
    list with at least two tokens becomes one training sequence.

    Returns:
        A tuple ``(input_sequences, total_words)``: the list of prefix
        sequences and ``len(t.word_index) + 1``.
    """
    t.fit_on_texts(corpus)
    # One more than the number of known words; presumably leaves index 0 free
    # for the padding value used later.
    vocabulary_size = len(t.word_index) + 1

    prefixes = []
    for text in corpus:
        token_ids = t.texts_to_sequences([text])[0]
        # Prefixes of length 2, 3, ..., len(token_ids).
        prefixes.extend(
            token_ids[:end] for end in range(2, len(token_ids) + 1)
        )

    return prefixes, vocabulary_size
# Build the n-gram prefix sequences and the vocabulary size for the whole corpus.
input_sequences, total_words = get_sequence_of_tokens(corpus)

In [10]:
# Peek at the first ten prefix sequences.
input_sequences[:10]

[[24, 22],
 [24, 22, 1],
 [24, 22, 1, 1750],
 [24, 22, 1, 1750, 1751],
 [24, 22, 1, 1750, 1751, 4],
 [24, 22, 1, 1750, 1751, 4, 83],
 [24, 22, 1, 1750, 1751, 4, 83, 114],
 [24, 22, 1, 1750, 1751, 4, 83, 114, 1],
 [24, 22, 1, 1750, 1751, 4, 83, 114, 1, 334],
 [24, 22, 1, 1750, 1751, 4, 83, 114, 1, 334, 335]]

In [11]:
# Vocabulary size returned by get_sequence_of_tokens (len(t.word_index) + 1).
total_words

3429

In [12]:
# pad sequences 
def generate_padded_sequences(input_sequences):
    """Pad the prefix sequences to equal length and split inputs from targets.

    All sequences are padded at the front (``padding='pre'``) to the length of
    the longest one. The last column of the padded matrix is the word to
    predict; the columns before it are the model input.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)`` where ``label`` is
        the one-hot encoding of the last column over ``total_words`` classes
        (``total_words`` is read from module scope).
    """
    longest = max(map(len, input_sequences))
    padded = np.array(
        pad_sequences(input_sequences, maxlen=longest, padding='pre')
    )

    # Everything but the last token is context; the last token is the target.
    context = padded[:, :-1]
    targets = padded[:, -1]

    one_hot_targets = ku.to_categorical(targets, num_classes=total_words)

    return context, one_hot_targets, longest

# Model inputs (all tokens but the last), one-hot targets (the last token),
# and the padded sequence length.
predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

In [17]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: Embedding (10 dimensions) -> SimpleRNN (100 units) ->
    Dropout (0.1) -> Dense softmax over the vocabulary.

    Args:
        max_sequence_len: padded sequence length including the target token;
            the network input length is one less.
        total_words: vocabulary size, used as the embedding input dimension
            and as the width of the softmax output.

    Returns:
        The Sequential model, compiled with categorical cross-entropy loss and
        the Adam optimizer.
    """
    context_len = max_sequence_len - 1

    layers = [
        # Input embedding layer
        Embedding(total_words, 10, input_length=context_len),
        # Hidden layer: simple recurrent layer followed by dropout
        SimpleRNN(100),
        Dropout(0.1),
        # Output layer: one softmax score per vocabulary index
        Dense(total_words, activation='softmax'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(loss='categorical_crossentropy', optimizer='adam')

    return network

# Instantiate the network for this corpus and print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 505, 10)           34290     
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               11100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 3429)              346329    
                                                                 
Total params: 391,719
Trainable params: 391,719
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train for 50 epochs on the padded predictors and one-hot labels.
# NOTE(review): Keras documents `verbose` as 0, 1 or 2; with verbose=5 the
# recorded output shows only the "Epoch i/50" headers and no loss values --
# confirm this is intended (verbose=2 would report the loss per epoch).
model.fit(predictors, label, epochs=50, verbose=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x11ee1146620>

In [19]:
def generate_text(seed_text, next_words, model, max_seq_len):
    """Extend *seed_text* by greedily predicting *next_words* more words.

    On every step the running text is tokenized with the module-level
    tokenizer ``t``, padded at the front to ``max_seq_len - 1`` tokens and
    passed to ``model.predict``; the index with the highest score (argmax) is
    mapped back to its word and appended to the text.

    Args:
        seed_text: starting phrase.
        next_words: number of words to append; 0 returns the capitalised seed.
        model: trained model exposing ``predict``.
        max_seq_len: padded sequence length used for training, including the
            target token.

    Returns:
        The seed text followed by the generated words, with the first letter
        of each whitespace-separated word capitalised and the rest lower-cased.
        Runs of whitespace collapse to single spaces and leading/trailing
        whitespace is removed (``string.capwords`` semantics).
    """
    for _ in range(next_words):
        token_ids = t.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_ids], maxlen=max_seq_len - 1, padding='pre')

        # verbose=0 keeps Keras from printing a progress bar for every word.
        scores = model.predict(padded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # Direct reverse lookup instead of scanning word_index on every step.
        # An index with no associated word yields '', as the scan did.
        next_word = t.index_word.get(predicted_index, '')

        seed_text = seed_text + " " + next_word

    # str.title() also capitalises after apostrophes and digits, which turned
    # "don't" into "Don'T" and "1960s" into "1960S"; capwords only touches the
    # first character of each whitespace-separated word.
    return string.capwords(seed_text)

In [20]:
# Generate 100 words continuing the seed phrase.
desc_1 = generate_text("hilton seattle downtown", 100, model, max_sequence_len)



In [21]:
# Show the generated description.
print(desc_1)

Hilton Seattle Downtown Seattle Airport And A Variety Of Programming And Catering Options And Toiletries Furnishings For Up On A Lumpy Old Mattress A Variety Of A Kind Room With A Bath Of The Most Comprehensive Smoke Free Hot Breakfast Buffet Every Morning And A Seating Area In Our Ultra Fitness Center And A Great Stay At The Master And Building In The City And Northwest Folklife Festivals And Don'T Forget That The Best Western This Hotel Is A Unique Location To Enjoy A Ten Minute Drive From The Space Needle Pike Place Market And The University Of Washington Campus And The Uw


In [22]:
# Three sample generations of increasing length, separated by blank lines.
print(generate_text("hilton seattle downtown", 100, model, max_sequence_len), end="\n\n")
print(generate_text("best western seattle airport hotel", 200, model, max_sequence_len), end="\n\n")
print(generate_text('located in the heart of downtown seattle', 300, model, max_sequence_len))

Hilton Seattle Downtown Seattle Airport And A Variety Of Programming And Catering Options And Toiletries Furnishings For Up On A Lumpy Old Mattress A Variety Of A Kind Room With A Bath Of The Most Comprehensive Smoke Free Hot Breakfast Buffet Every Morning And A Seating Area In Our Ultra Fitness Center And A Great Stay At The Master And Building In The City And Northwest Folklife Festivals And Don'T Forget That The Best Western This Hotel Is A Unique Location To Enjoy A Ten Minute Drive From The Space Needle Pike Place Market And The University Of Washington Campus And The Uw

Best Western Seattle Airport Hotel In The Heart Of The City And Necessitated Major Renovation Fortunately Bacon Mansion'S Massive Walls Spared The Glass And Wood Pocket Doors Found In The Quirky Carriage And A Glistening Room Inn At The Market Of The Universe Of Downtown Seattle And The Uw Hospital Plus The City At The Market'S Community And Financial And Multi Ultimate Panoramic And Parklands Beckon Nature Lover

Located In The Heart Of Downtown Seattle And Puget Sound Opt For A Decadent Suite With A Microwave And Refrigerator And Furnishings And Shower Private Service And A Fridge A Large Mosaic Pineapple In The Heart Of The City At The Seattle Parks Are Mere Steps From The City Of The Seattle Famous Seattle Has A Quick Escape To A Business And A Business Center With A Range Of A Seating Area In The Late 1960S Renamed The Mason House The Inn Operated And The Seahawks Music Camp Boutiques And The Pacific Northwest With A Variety Of Programming And Activities And The Rooftop Fireplace For A Variety Of A Garden Room With A Bath And Shower Emerald City And Sleeping And The Retail Core The 97 Conveniently Located To The Best And Activities At The Pacific Northwest And Breweries In The Mix Of The Pacific Northwest And Breweries In The Heart Of The Emerald City Of Downtown Seattle Downtown Seattle And A Variety Of Programming And Activities And The Rooftop Fireplace For A Variety Of A Garden Room W

### References: <br>

[1] SimpleRNN: https://www.tensorflow.org/api_docs/python/tf/keras/layers/SimpleRNN