# Sentiment Analysis with GRU

In [8]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM,GRU, Dense, Dropout, Bidirectional, BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
import numpy as np
from keras.callbacks import ModelCheckpoint
import keras.backend.tensorflow_backend as KTF
import tensorflow as tf

In [2]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True   
sess = tf.Session(config=config)

KTF.set_session(sess)

# Utils

Calculate F1 Manually

In [3]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


## Preparing data

Separate data and create sequences of the same length, dictionaries from word to id and viceversa

In [4]:
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
tokenizer = Tokenizer(nb_words=vocabulary_size)


word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)



We are going to get the initial values with Glove embedding

In [5]:
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
tokenizer = Tokenizer(nb_words=vocabulary_size)


word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

embedding_size=100
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((len(word2id) + 1, embedding_size))
counter = 0
for word, i in word2id.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        counter = counter +1
        embedding_matrix[i] = embedding_vector
        
print("found: "+str(counter/len(word2id)))

found: 0.6790165266865348


## Create model

In [10]:
model=Sequential()
model.add(Embedding(len(word2id) + 1,embedding_size,weights=[embedding_matrix],input_length=max_words,trainable=True))
#model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
#model.load_weights("GRU.dropout.glove.hdf5")
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy',f1])

batch_size = 64
num_epochs = 3
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

filepath="GRU.dropout.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs, callbacks=[checkpoint],verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 100)          8858500   
_________________________________________________________________
gru_4 (GRU)                  (None, 100)               60300     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 8,919,301
Trainable params: 8,919,101
Non-trainable params: 200
_________________________________________________________________
None

Epoch 00001: val_acc improved from -inf to 0.68750, saving model to GRU.dropout.hdf5

Epoch 00002: val_acc improved from 0.68750 to 0.90625, saving model to GRU.dropout.hdf5

Epoch 00003: val_acc improved from 0.90625 to 0.93750, saving model to GRU.dropou

<keras.callbacks.History at 0x1c3074da860>

## Testing model

we are going to obtain the general score with test dataset and then we are going to test the model with mock data 

In [15]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("accuracy: "+str(scores[1])+" f1: "+str(scores[2]))

accuracy: 0.875 f1: 0.866256873626709


In [13]:
sequences = []
texts = []
texts.append("I like this movie")
texts.append("I do not like this movie")
texts.append("love this movie")
texts.append("hate movie")
texts.append("do not see this movie")
for text in texts:
    sequence = []
    for word in text.split(" "):
        sequence.append(word2id.get(word,0))
    sequences.append(sequence)
data = pad_sequences(sequences, maxlen=max_words)
print(sequences)
results = model.predict(data)
results

[[0, 37, 11, 17], [0, 78, 21, 37, 11, 17], [116, 11, 17], [781, 17], [78, 21, 64, 11, 17]]


array([[0.8378732 ],
       [0.1483976 ],
       [0.56098115],
       [0.33535677],
       [0.07895444]], dtype=float32)