In [1]:
import numpy as np
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.models import save_model
import csv 
import pandas as pd
from random import shuffle

In [2]:
# Load the first 100,000 reviews and coerce the text column to str, since
# process() runs a regex substitution over every entry.
df = pd.read_csv('hotel-reviews.csv', nrows=100000).assign(
    REVIEW_TEXT=lambda frame: frame['REVIEW_TEXT'].astype(str)
)
print('Number of rows ', len(df))

Number of rows  100000


In [3]:
# Class balance of the target. The column is selected by name (the same name
# used when building Y_train) rather than by position, so a reordered CSV
# cannot silently report the wrong column.
print('Sentiment distribution')
print(df['REVIEW_SENTIMENT'].value_counts())

Sentiment distribution
0    56627
1    43373
Name: REVIEW_SENTIMENT, dtype: int64


In [4]:
# Inputs are the raw review strings; targets are the sentiment labels.
X_train, Y_train = df['REVIEW_TEXT'], df['REVIEW_SENTIMENT']

# tokenize() reads the corpus through the module-level name `data_text`.
data_text = X_train

In [5]:
import re
def process(txt):
    """Normalise a text into a list of lowercase alphanumeric tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed (not replaced by a space), the remainder is split on whitespace,
    and each token is lowercased.

    Args:
        txt: the string to clean.

    Returns:
        list of lowercase tokens; empty if nothing survives the cleaning.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    return [token.lower() for token in cleaned.split()]

In [6]:
def tokenize(thresh = 5, data = None):
    """Build a word -> integer-id vocabulary from a corpus.

    Every text is cleaned with process() and word occurrences are counted
    across the whole corpus; only words seen at least `thresh` times are kept.

    Args:
        thresh: minimum number of occurrences a word needs to be kept.
        data: iterable of raw text strings to scan. When None (the default),
            the module-level `data_text` is used, so existing `tokenize()`
            calls behave as before.

    Returns:
        dict mapping each kept word to a unique id. Ids start at 1 and follow
        the order in which the words first appear in the corpus; 0 is never
        assigned, leaving it free for padding / unknown words.
    """
    if data is None:
        data = data_text
    count = dict()
    for txt in data:
        for word in process(txt):
            count[word] = count.get(word, 0) + 1
    # Iterating the dict yields words in first-seen order, which fixes the ids.
    kept = (word for word in count if count[word] >= thresh)
    return {word: idx for idx, word in enumerate(kept, start=1)}

In [7]:
# Build the vocabulary from the training corpus. num_words is the vocabulary
# size and later determines the embedding layer's input dimension.
word_index = tokenize()
num_words = len(word_index)
print('length of the dictionary ', num_words)

length of the dictionary  7688


In [8]:
def getMax(data):
    """Return the largest whitespace-separated token count among the texts.

    Args:
        data: iterable of strings.

    Returns:
        int: the maximum of len(txt.split()) over `data`, or 0 when `data`
        is empty.
    """
    return max((len(txt.split()) for txt in data), default=0)
# Longest review measured in whitespace-separated tokens; create_sequences()
# and the embedding layer use this as the fixed sequence length.
max_tokens = getMax(X_train)
print('statement with the largest number of words ', max_tokens)

statement with the largest number of words  399


In [9]:
def create_sequences(data):
    """Convert texts into fixed-length, left-padded sequences of word ids.

    Each text is cleaned with process(); every word is replaced by its id from
    the module-level `word_index` (words not in it become 0), and the row is
    left-padded with 0 to exactly `max_tokens` entries. A text with more than
    `max_tokens` words keeps only its last `max_tokens` words.

    Progress is printed every 10,000 rows and once at the end.

    Args:
        data: iterable of raw text strings.

    Returns:
        np.ndarray with one row of `max_tokens` integer ids per input text.
    """
    tokens = []
    counter = 0
    for txt in data:
        counter += 1
        words = process(txt)
        # Truncate from the front: without this, a text longer than max_tokens
        # yields a negative offset, so early words are written through negative
        # indices into the tail of the row (or raise IndexError).
        if len(words) > max_tokens:
            words = words[len(words) - max_tokens:]
        start = max_tokens - len(words)
        seq = [0] * max_tokens
        for i, word in enumerate(words):
            # process() already lowercases; unknown words keep the padding id 0.
            seq[start + i] = word_index.get(word, 0)
        tokens.append(seq)
        if counter % 10000 == 0:
            print('Rows processed ', counter)
    print('Complete ', counter)
    return np.array(tokens)

In [10]:
# Smoke test: encode one short review and show the resulting left-padded row.
print(create_sequences(['Bed was comfy']))

Complete  1
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0 

In [11]:
# Encode every training review as a fixed-length row of word ids.
X_train_tokens = create_sequences(X_train)

Rows processed  10000
Rows processed  20000
Rows processed  30000
Rows processed  40000
Rows processed  50000
Rows processed  60000
Rows processed  70000
Rows processed  80000
Rows processed  90000
Rows processed  100000
Complete  100000


In [12]:
# Architecture: embedding -> three stacked GRUs of shrinking width -> one
# sigmoid unit for the binary sentiment label.
embedding_size = 8
model = Sequential([
    # One extra embedding row because word ids start at 1 and 0 is used for
    # padding / unknown words.
    Embedding(input_dim=num_words + 1,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding'),
    # The first two GRUs emit the full sequence so the next GRU can consume it.
    GRU(units=16, return_sequences=True, name="gru_1"),
    GRU(units=8, return_sequences=True, name="gru_2"),
    GRU(units=4, name="gru_3"),
    Dense(1, activation='sigmoid', name="dense_1"),
])
optimizer = Adam(lr=0.01)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

W0819 13:56:40.976882 140568097670976 deprecation.py:506] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0819 13:56:41.013174 140568097670976 deprecation.py:506] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0819 13:56:41.951501 140568097670976 deprecation.py:323] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.o

In [13]:
# Train for 5 epochs, holding out 20% of the rows for validation.
# np.asarray is used instead of np.array because X_train_tokens is already an
# ndarray (create_sequences returns one); np.array would copy the whole matrix.
model.fit(np.asarray(X_train_tokens), np.asarray(Y_train),
          validation_split=0.2, epochs=5, batch_size=128)

Train on 80000 samples, validate on 20000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd8228a9a90>

In [14]:
# Hand-written reviews used to sanity-check the trained model.
txt = [
    "Rooms are tastefully decorated to a high standard and the beds are very comfortable",
    "Only 2 lifts and 1 was not operating broken",
    "The bathroom was nice and spacious",
]
# The model has a single sigmoid output unit, so column 0 holds the score
# for each review.
pred = model.predict(create_sequences(txt))
print('\n prediction for \n', pred[:, 0])

Complete  3

 prediction for 
 [0.00231358 0.9839129  0.00533617]


In [15]:
# Persist the trained model, including the optimizer state
# (include_optimizer=True), to hotel-reviews-model.h5, overwriting any
# existing file of that name.
save_model(
    model,
    "hotel-reviews-model.h5",
    overwrite=True,
    include_optimizer=True
)

In [16]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 399, 8)            61512     
_________________________________________________________________
gru_1 (GRU)                  (None, 399, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 399, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 63,473
Trainable params: 63,473
Non-trainable params: 0
_________________________________________________________________


In [None]:
def create_csv(file):
    """Write the module-level `word_index` vocabulary to a CSV file.

    Each row has the form `word,index`; no header row is written. An existing
    file at `file` is overwritten.

    Args:
        file: path of the CSV file to create.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' would emit '\r\r\n' and show blank rows.
    with open(file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(word_index.items())

In [None]:
# Export the vocabulary to dict.csv so the same word -> id mapping can be reused outside this notebook.
create_csv('dict.csv')