# Sentiment Classification


## Loading the dataset

In [0]:
from keras.datasets import imdb

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.

In [0]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 20  #number of word used from each review

## Train test split

In [0]:
#load dataset as a list of ints
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [37]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(25000, 20)
(25000,)
(25000, 20)
(25000,)


In [0]:
word_to_id = imdb.get_word_index()
word_to_id = {k: (v+3) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

In [39]:
id_to_word = {value:key for key, value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in x_train[6]))
print('The sentiment is: ', y_train[6])

too much on <UNK> of head shots like most other films of the 80s and 90s do very good results
('The sentiment is: ', 1)


## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [40]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
word2vec_file = open('/content/drive/My Drive/NLP/Week 1 & 2/word2vec.glove.6B.50d.txt')

for line in word2vec_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
word2vec_file.close()

In [0]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
embedding_matrix = zeros((vocab_size, 20))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

Classification with Simple Neural Network

In [43]:
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten
model = Sequential()
embedding_layer = Embedding(vocab_size, 20, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 20)            200000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 401       
Total params: 200,401
Trainable params: 401
Non-trainable params: 200,000
_________________________________________________________________
None


In [44]:
model.fit(x_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f1e4293f290>

In [45]:
score = model.evaluate(x_test, y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

('Test Score:', 0.6931486254119873)
('Test Accuracy:', 0.5)


Classification with a Convolutional Neural Network

In [46]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
model1 = Sequential()

embedding_layer = Embedding(vocab_size, 20, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model1.add(embedding_layer)

model1.add(Conv1D(128, 5, activation='relu'))
model1.add(GlobalMaxPooling1D())
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model1.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 20)            200000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 16, 128)           12928     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 213,057
Trainable params: 13,057
Non-trainable params: 200,000
_________________________________________________________________
None


In [47]:
model1.fit(x_train, y_train, batch_size=32, epochs=6, verbose=1, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f1e41f64fd0>

In [48]:
score1 = model1.evaluate(x_test, y_test, verbose=1)
print("Test Score:", score1[0])
print("Test Accuracy:", score1[1])

('Test Score:', 0.693197723312378)
('Test Accuracy:', 0.5)


Classification with Recurrent Neural Network (LSTM)

In [49]:
model2 = Sequential()
embedding_layer = Embedding(vocab_size, 20, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model2.add(embedding_layer)
model2.add(LSTM(128))

model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model2.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20, 20)            200000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               76288     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 276,417
Trainable params: 76,417
Non-trainable params: 200,000
_________________________________________________________________
None


In [50]:
batch_size = 32
model2.fit(x_train, y_train, epochs = 6, batch_size=batch_size, verbose = 2)

Train on 25000 samples
Epoch 1/6
25000/25000 - 8s - loss: 0.6933 - acc: 0.5021
Epoch 2/6
25000/25000 - 6s - loss: 0.6932 - acc: 0.5036
Epoch 3/6
25000/25000 - 7s - loss: 0.6932 - acc: 0.4995
Epoch 4/6
25000/25000 - 7s - loss: 0.6932 - acc: 0.4990
Epoch 5/6
25000/25000 - 6s - loss: 0.6932 - acc: 0.4974
Epoch 6/6
25000/25000 - 6s - loss: 0.6932 - acc: 0.4966


<tensorflow.python.keras.callbacks.History at 0x7f1e41ce05d0>

In [51]:
score2 = model2.evaluate(x_test, y_test, verbose=1)
print("Test Score:", score2[0])
print("Test Accuracy:", score2[1])

('Test Score:', 0.6931525472640991)
('Test Accuracy:', 0.5)


We have seen very low accuracy scores for both training & test set with models based on Word2Vec models

LSTM Models (Indepenedent of Word2Vec Model)

In [0]:
max_features = 20000
model3 = Sequential()
model3.add(Embedding(vocab_size, 128,input_length = x_train.shape[1]))
model3.add(LSTM(128, dropout = 0.2, recurrent_dropout= 0.2))
model3.add(Dense(1, activation= 'sigmoid'))
model3.compile(loss = 'binary_crossentropy', optimizer= 'adam', metrics = ['accuracy'])

In [53]:
batch_size = 32
model3.fit(x_train, y_train, epochs = 6, batch_size=batch_size, verbose = 2)

Train on 25000 samples
Epoch 1/6
25000/25000 - 50s - loss: 0.5274 - accuracy: 0.7306
Epoch 2/6
25000/25000 - 47s - loss: 0.4043 - accuracy: 0.8125
Epoch 3/6
25000/25000 - 46s - loss: 0.3259 - accuracy: 0.8522
Epoch 4/6
25000/25000 - 46s - loss: 0.2580 - accuracy: 0.8883
Epoch 5/6
25000/25000 - 46s - loss: 0.1976 - accuracy: 0.9182
Epoch 6/6
25000/25000 - 46s - loss: 0.1516 - accuracy: 0.9402


<tensorflow.python.keras.callbacks.History at 0x7f1de0c544d0>

In [54]:
score3 = model3.evaluate(x_test, y_test, verbose=1)
print("Test Score:", score3[0])
print("Test Accuracy:", score3[1])

('Test Score:', 0.8656493499469757)
('Test Accuracy:', 0.74676)


We can see that both validation accuracy & test accuracy have improved and loss has gone down considerably when compared to previous models

## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [55]:
instance = ' '.join(id_to_word[id] for id in x_test[9000])
print(instance)
print('The sentiment is:', y_test[9000])

don't need to treat the audience as idiots overall there is just too much emotional melodrama in the whole movie
('The sentiment is:', 0)


In [0]:
instance = tokenizer.texts_to_sequences(instance)

In [57]:
flat_list = []
for sublist in instance:
    for item in sublist:
        flat_list.append(item)

flat_list = [flat_list]

instance = pad_sequences(flat_list, maxlen=maxlen)

print(model.predict(instance))
print(model1.predict(instance))
print(model2.predict(instance))
print(model3.predict(instance))

[[0.50085634]]
[[0.5050242]]
[[0.50163823]]
[[0.00122061]]


Positive outputs is mapped to 1 and negative outputs is mapped to 0. However, the sigmoid function floats output between 1 & 0. If the value is greater than 0.5, the sentiment is considered positive.The sentiment value for our single instance is greater than 0.5 for the 1st 3 models. That means the sentiment is positive. However, model 4 has predicted 0.0. This means that our sentiment is negative which actually is the case.

In [62]:
from keras import backend as K
inp = model3.input
outputs = [layer.output for layer in model3.layers]
functor = K.function([inp, K.learning_phase()], outputs)
# Prediction_input_padded
layer_outs = functor([instance, 0])
print(layer_outs)

[array([[[ 0.0861014 , -0.06323227,  0.02154035, ..., -0.00451315,
          0.04316535,  0.09930071],
        [ 0.0861014 , -0.06323227,  0.02154035, ..., -0.00451315,
          0.04316535,  0.09930071],
        [ 0.0861014 , -0.06323227,  0.02154035, ..., -0.00451315,
          0.04316535,  0.09930071],
        ...,
        [ 0.0861014 , -0.06323227,  0.02154035, ..., -0.00451315,
          0.04316535,  0.09930071],
        [ 0.0861014 , -0.06323227,  0.02154035, ..., -0.00451315,
          0.04316535,  0.09930071],
        [ 0.0861014 , -0.06323227,  0.02154035, ..., -0.00451315,
          0.04316535,  0.09930071]]], dtype=float32), array([[ 0.644007  ,  0.11638042, -0.12205642, -0.034178  , -0.00887403,
        -0.46166834,  0.00509549,  0.46002036,  0.15542595,  0.02199194,
         0.32282138, -0.7592026 , -0.23874262,  0.09091439, -0.39348122,
         0.08801773,  0.06043807, -0.33857736,  0.08257023,  0.25960174,
         0.01355452, -0.60856134,  0.03751541,  0.26973736, -0.2