# Sentiment Analysis - IMDB Dataset

In [None]:
import os
import pickle
import numpy as np

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout, Conv1D, MaxPool1D
from tensorflow.keras.optimizers import Adam

In [None]:
# Word -> integer id lookup for the IMDB vocabulary (Keras downloads the index on first use).
word2idx = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
# Vocabulary words ordered by ascending id; peek at the ten lowest ids.
idx_arr = [word for word, _ in sorted(word2idx.items(), key=lambda item: item[1])]
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [None]:
# Reverse lookup: integer id -> word.
idx2word = {word_id: word for word, word_id in word2idx.items()}

In [None]:
# Fetch the pickled full IMDB dataset and keep its local path;
# `md5_hash` pins the expected file contents.
path = keras.utils.get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl


In [None]:
# Show where the dataset file was stored locally.
print(path)

/root/.keras/datasets/imdb_full.pkl


In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code, so only unpickle files
# from a trusted source (this one is the download pinned by `md5_hash` above).
# The context manager closes the file handle, which a bare open() never did.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

## Exploring Data

In [None]:
# Number of training reviews.
len(x_train)

25000

Here's the 1st review. As you see, the words have been replaced by ids. 
The ids can be looked up in idx2word.

In [None]:
# First training review rendered as a comma-separated list of word ids.
', '.join(str(word_id) for word_id in x_train[0])

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

The first word of the first review is 23022. Let's see what that is.



In [None]:
# Word behind id 23022, the first id of the first review.
idx2word[23022]

'bromwell'

Here's the whole review, mapped from ids to words.

In [None]:
# Decode the first training review back into space-separated words.
' '.join(idx2word[word_id] for word_id in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

The labels are 1 for positive, 0 for negative.

In [None]:
# Labels of the first ten training reviews.
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Reduce the vocabulary size by mapping every rare word (any id at or above `vocab_size - 1`) to a single shared id, the maximum index.

In [None]:
vocab_size = 5000

# Every id at or above vocab_size - 1 collapses onto that one shared "rare word" id;
# smaller ids pass through unchanged.
trn = [np.minimum(np.asarray(s), vocab_size - 1) for s in x_train]
test = [np.minimum(np.asarray(s), vocab_size - 1) for s in x_test]

Look at distribution of lengths of sentences.

In [None]:
# Review lengths in tokens, then their (longest, shortest, average).
lens = np.array([len(review) for review in trn])
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

Pad (with zeros) or truncate each review so that all of them have the same length, `seq_len`.

In [None]:
seq_len = 500

# Force every review to exactly seq_len ids, filling short ones with id 0.
# With no `padding`/`truncating` arguments, pad_sequences works at the front of
# each sequence (Keras default 'pre'), so the end of a long review is what survives.
trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [None]:
# Sanity check: one row per review, seq_len columns.
trn.shape

(25000, 500)

## Single hidden layer NN

The simplest model that tends to give reasonable results is a single hidden layer net. So let's try that. Note that we can't expect to get any useful results by feeding word ids directly into a neural net - so instead we use an embedding to replace them with a vector of 32 (initially random) floats for each word in the vocab.

In [None]:
# Baseline classifier: learned 32-d word embeddings, flattened into one dense
# hidden layer, heavy dropout, and a sigmoid unit for the positive/negative score.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=seq_len))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; Adam runs with its defaults.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 16000)             0         
_________________________________________________________________
dense (Dense)                (None, 100)               1600100   
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for two epochs.
# NOTE(review): the test split is passed as validation data, so the reported
# validation accuracy is not an independent hold-out estimate.
model.fit(trn, np.array(labels_train), validation_data=(test, np.array(labels_test)), epochs=2, batch_size=64)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f928e176190>

## Pre-trained vectors

In [None]:
# Fetch and unpack the GloVe 6B vectors (~822 MB). Both commands are safe to re-run:
# `-nc` skips the download when the zip already exists, and `-n` stops unzip from
# prompting to overwrite files that were extracted earlier.
!wget -nc https://nlp.stanford.edu/data/glove.6B.zip
!unzip -q -n glove.6B.zip

--2021-06-10 16:30:03--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-06-10 16:30:03--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-06-10 16:30:04--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# Peek at the file format: one token per line followed by its vector components.
!head -5 glove.6B.50d.txt

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353

In [None]:
# Size of each pre-trained word vector; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 50

In [None]:
# store all the pre-trained word vectors

print('Loading word vectors...')
word2vec = {}

# GloVe ships as UTF-8 text, so name the encoding instead of relying on the
# platform default (which is not UTF-8 everywhere).
with open('glove.6B.%sd.txt' % EMBEDDING_DIM, encoding='utf-8') as f:
    # each line is space-separated:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        if not values:
            # tolerate a stray blank line rather than failing on values[0]
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [None]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
hits = 0
misses = 0

# Rows that are never assigned below stay all-zero. That covers row 0 (the
# padding value), every word GloVe does not know, and the last row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

# The last id is the shared bucket that all rare words were mapped to, so it must
# not inherit the vector of the one word that happens to own that id.
rare_word_id = vocab_size - 1

for word, i in word2idx.items():
    if i >= rare_word_id:
        continue

    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        misses += 1
        continue

    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

Filling pre-trained embeddings...
Converted 4920 words (79 misses)


### Conv1D Model

In [None]:
# Conv1D classifier on top of the pre-trained GloVe vectors.
model = Sequential([
    # Seed the embedding table through the documented initializer argument (the
    # legacy `weights=` kwarg is not accepted by every Keras release) and keep it
    # frozen for this first training run.
    Embedding(vocab_size, EMBEDDING_DIM, input_length=seq_len,
              embeddings_initializer=keras.initializers.Constant(embedding_matrix),
              trainable=False),
    # 64 filters, each spanning 5 consecutive tokens; 'same' padding keeps the length.
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.25),
    MaxPool1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           250000    
_________________________________________________________________
dropout (Dropout)            (None, 500, 50)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 500, 64)           16064     
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 250, 64)           0         
_________________________________________________________________
flatten (Flatten)            (None, 16000)             0         
_________________________________________________________________
dense (Dense)                (None, 100)               1

In [None]:
# Same loss/optimizer/metric setup as the dense baseline.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
# Train for ten epochs while the embedding layer is frozen (trainable=False).
# NOTE(review): the test split is reused as validation data here as well.
model.fit(trn, np.array(labels_train), validation_data=(test, np.array(labels_test)), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f928def42d0>

But let's fine-tune the embedding weights - especially since the words we couldn't find in GloVe have no pre-trained vector at all (their rows in the embedding matrix are still all zeros).

In [None]:
# Unfreeze the embedding layer (layer 0) so its vectors can be updated by training.
# NOTE: Keras only guarantees that a changed `trainable` flag is honoured once the
# model is compiled again.
model.layers[0].trainable=True

In [None]:
# Re-compile so the `trainable` change made above actually takes effect, and use a
# small learning rate so fine-tuning only nudges the pre-trained vectors.
# (This starts a fresh Adam optimizer; `learning_rate` is the supported argument,
# whereas assigning `optimizer.lr` relies on a legacy attribute.)
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-4), metrics=['accuracy'])

In [None]:
# One more epoch, run after unfreezing the embedding layer and lowering the learning rate.
model.fit(trn, np.array(labels_train), validation_data=(test, np.array(labels_test)), epochs=1, batch_size=64)



<tensorflow.python.keras.callbacks.History at 0x7f9206431a10>