NLP Sentiment Classification

In [1]:
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import numpy as np

Import the data


*   Get train and test data.
*   Take 10000 most frequent words.



In [2]:
VocabSize = 10000
(X_train,y_train),(X_test,y_test) = imdb.load_data(num_words=VocabSize)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
print("The shape of X_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of X_test is:", X_test.shape)
print("The shape of y_test is:", y_test.shape)

The shape of X_train is: (25000,)
The shape of y_train is: (25000,)
The shape of X_test is: (25000,)
The shape of y_test is: (25000,)


Pad each sentence to be of same length


*   Take maximum sequent length as 300




In [4]:
seq_length = 300
X_train = sequence.pad_sequences(X_train, maxlen = seq_length)
X_test = sequence.pad_sequences(X_test, maxlen = seq_length)

Print shape of features and labels

Number of review, number of words in each review

In [5]:
print("The number of reviews in training data set is:", X_train.shape[0])
print("The number of words in each review in training data set is:", X_train.shape[1])

The number of reviews in training data set is: 25000
The number of words in each review in training data set is: 300


In [6]:
print("The number of reviews in testing data set is:", X_test.shape[0])
print("The number of words in each review in testing data set is:", X_test.shape[1])

The number of reviews in testing data set is: 25000
The number of words in each review in testing data set is: 300


Number of labels

In [7]:
print("The number of labels in training data set is:", len(np.unique(y_train)))

The number of labels in training data set is: 2


In [8]:
print("The number of labels in testing data set is:", len(np.unique(y_test)))

The number of labels in testing data set is: 2


Print value of any one feature and its label

In [9]:
i = 0

Feature value

In [10]:
X_train[i]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,   14,   22,   16,   43,  530,
        973, 1622, 1385,   65,  458, 4468,   66, 3941,    4,  173,   36,
        256,    5,   25,  100,   43,  838,  112,   50,  670,    2,    9,
         35,  480,  284,    5,  150,    4,  172,  112,  167,    2,  336,
        385,   39,    4,  172, 4536, 1111,   17,  546,   38,   13,  447,
          4,  192,   50,   16,    6,  147, 2025,   19,   14,   22,    4,
       1920, 4613,  469,    4,   22,   71,   87,   

Label value

In [11]:
y_train[i]

1

Decode the feature value to get original sentence

First, retrieve a dictionary that contains mapping of words to their index in the IMDB dataset

In [12]:
map_word = keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


Now, use the dictionary to get the original words from the encodings, for a particular sentence

In [13]:
id_to_word = {value:key for key,value in map_word.items()}
decoded = ""
for w in X_train[i]:
  if w != 0:
    word = id_to_word.get(w)
    decoded += ' %s' % (word)
print(decoded)

 the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over landed for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but when from one bit then have t

Get the sentiment for the above sentence


*   positive(1)
*   negative(0)



In [14]:
if y_train[i] == 1:
  print("Positive review")
else:
  print("Negative Review")

Positive review


Define Model

In [15]:
embed_dim = 100
model = Sequential()
model.add(Embedding(VocabSize, embed_dim, input_length=X_train.shape[1]))
model.add(LSTM(100,dropout = 0.2, recurrent_dropout = 0.2, return_sequences=True))
model.add(TimeDistributed(Dense(100)))
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))

Compile the model

In [16]:
model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])

Print model summary

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 100)          1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 300, 100)          80400     
_________________________________________________________________
time_distributed (TimeDistri (None, 300, 100)          10100     
_________________________________________________________________
flatten (Flatten)            (None, 30000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 30001     
Total params: 1,120,501
Trainable params: 1,120,501
Non-trainable params: 0
_________________________________________________________________


Fit the model

In [18]:
model.fit(X_train, np.array(y_train), batch_size=32, epochs=5, validation_data=(X_test,np.array(y_test)), verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8e13cfdc88>

In [19]:
test_pred = model.predict(X_test, batch_size=32)

Evaluate the model

In [20]:
score,acc = model.evaluate(X_test, y_test, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("Accuracy: %.2f" % (acc))

782/782 - 49s - loss: 0.7398 - accuracy: 0.8632
score: 0.74
Accuracy: 0.86


The model has an accuracy score of 86% on test data

In [21]:
from sklearn.metrics import classification_report,confusion_matrix

In [22]:
report=classification_report(y_test, test_pred.round(),target_names=['Positive','Negative'])
print(report)

              precision    recall  f1-score   support

    Positive       0.84      0.90      0.87     12500
    Negative       0.89      0.82      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.87      0.86      0.86     25000
weighted avg       0.87      0.86      0.86     25000



The precision and recall values are,

*   Postiive review

      *   Precision - 0.84
      *   Recall - 0.90

*   Negative review

      *   Precision - 0.89
      *   Recall - 0.82


Predict on one example

In [23]:
if y_test[1] == 1:
  print("The actual review is Positive")
else:
  print("The actual review is Negative")

if test_pred[1].round() == 1:
  print("The predicted review is Positive")
else:
  print("The predicted review is Negative")


The actual review is Positive
The predicted review is Positive
