<a href="https://colab.research.google.com/github/atta007/Sentiment-Analysis-with-RNN/blob/master/Sentiment_Analysis_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from keras.datasets import imdb #Keras has a built-in IMDb movie reviews data set that we can use

Using TensorFlow backend.


In [0]:
#Downloading  and Set the Vocabulary size and load in training and test data

vocab_size = 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words= vocab_size)

print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Loaded dataset with 25000 training samples, 25000 test samples


In [0]:
#Inspect a sample review and its label

print('---review---')
print(X_train[6])

print('---label---')
print(y_train[6])

#the label is an integer(0 for negative, 1 for positive)


---review---
[1, 2, 365, 1234, 5, 1156, 354, 11, 14, 2, 2, 7, 1016, 2, 2, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 2, 2, 1117, 1831, 2, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 2, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 2, 2, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 2, 180, 6, 227, 11, 94, 2494, 2, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 2, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
---label---
1


In [0]:
#we can use the dictionary returned by imdb.get_word_index() to map the review back to the original words.

word2id = imdb.get_word_index()
id2word = {i: word for word,i in word2id.items()}

print('---review with word---')
print([id2word.get(i, '') for i in X_train[6]])
print('---label---')
print(y_train[6])

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
---review with word---
['the', 'and', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'and', 'and', 'br', 'villain', 'and', 'and', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'and', 'and', 'concept', 'issue', 'and', 'to', "god's", 'he', 'is', 'and', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'and', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', 'and', 'and', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'and', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'and', 'things', 'is', 'far', 'this', 'make', 'mistakes', 'and', 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'and', 'movies', 'get', 'are', 'and', 'br', 'yes', 'female', 'just', 'i

In [0]:
print('Maximum review length: {}', format(len(max((X_train + X_test), key=len))))

Maximum review length: {} 2697


In [0]:
print('Minimum review length: {}'.format(len(min((X_test + X_test), key=len))))

Minimum review length: 14


In [0]:
#In order to feed this data into our RNN, all input documents must have the same length. 
#We will limit the maximum review length to max_words by truncating longer reviews and paddins shorter reviews with a null value(0).
#We can accomplish this using the pad_sequence() functionms in keras. For now, set max_words to 500.


from keras.preprocessing.sequence import pad_sequences
max_words=500

X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)



In [0]:
from keras import Sequential 
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size = 32

model=Sequential()
model.add(Embedding(vocab_size,embedding_size, input_length= max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
          
          

Instructions for updating:
Colocations handled automatically by placer.


In [0]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
from keras.callbacks import EarlyStopping
from tensorboardcolab import *
tbc = TensorBoardColab()
es = EarlyStopping(monitor = "val_loss", patience = 10)

Wait for 8 seconds...
TensorBoard link:
https://c5eca4f8.ngrok.io


In [0]:
batch_size = 64
num_epochs = 10

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs, callbacks = [es, TensorBoardColabCallback(tbc)])

Instructions for updating:
Use tf.cast instead.
Train on 24936 samples, validate on 64 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f779c90b7b8>

In [0]:
scores = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', scores[1])

Test accuracy: 0.86212
