In [1]:
%matplotlib inline
from utils import *
from model import *
import numpy as np

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters


### Get Input Data

In [2]:
# Read the raw corpus through the project helper `preprocess` (star-imported above).
# data_X holds the texts and data_Y the matching labels; the third return value `m`
# is not used in this cell -- presumably the example count, TODO confirm in utils.py.
filepath = "./data/data.txt"
data_X, data_Y, m = preprocess(filepath)

# Report the size of the corpus, then show the first (text, label) pair as a smoke test.
print("Total number of examples:", data_X.shape)
first_text, first_label = data_X[0], data_Y[0]
print("Sanity Check: ", first_text, first_label)

Total number of examples: (7086,)
Sanity Check:  we're gonna like watch Mission Impossible or Hoot.(
 1


### Prepare Input data
<p> Prepare the training, validation (dev) and test sets. Here we use a 60-20-20 split, produced by the <b>split_dataset(data_X, data_Y)</b> function defined in utils.py. </p>

In [3]:
# Partition the corpus into train / dev / test with the project helper `split_dataset`.
train_X, train_Y, dev_X, dev_Y, test_X, test_Y = split_dataset(data_X, data_Y)

# Print the shape of the inputs and labels of every split so mismatches are obvious.
for size_label, split_X, split_Y in (
    ("Train Set Size:", train_X, train_Y),
    ("Dev Set Size:", dev_X, dev_Y),
    ("Test Set Size:", test_X, test_Y),
):
    print(size_label, split_X.shape, split_Y.shape)

Train Set Size: (4251,) (4251,)
Dev Set Size: (1417,) (1417,)
Test Set Size: (1418,) (1418,)


## Preprocess Text

### Tokenise the text

In [4]:
%%time
num_words=10000
tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(data_X)

CPU times: user 162 ms, sys: 2.24 ms, total: 164 ms
Wall time: 163 ms


In [5]:
# Convert the texts of each split into lists of integer word indices,
# using the vocabulary learned by the fitted tokenizer.
x_train_tokens, x_dev_tokens, x_test_tokens = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_X, dev_X, test_X)
)

### Pad the sequences

In [6]:
# Length (in tokens) of every sequence, across train, dev and test in that order.
num_tokens = np.array(list(map(len, x_train_tokens + x_dev_tokens + x_test_tokens)))

# Common sequence length: mean + 2 standard deviations, truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())

# Fraction of sequences that are strictly shorter than the chosen length.
print((num_tokens < max_tokens).sum() / num_tokens.size)

# The same mode is used for both padding and truncation: zeros are added at the
# front of short sequences and long sequences are cut from the front.
pad = 'pre'

# Bring every split to shape (n_examples, max_tokens).
x_train_pad, x_dev_pad, x_test_pad = [
    pad_sequences(token_lists, maxlen=max_tokens, padding=pad, truncating=pad)
    for token_lists in (x_train_tokens, x_dev_tokens, x_test_tokens)
]

# Show the resulting shape and one example before/after padding.
print(x_train_pad.shape)
print("Non-padded tokenized sequence: ", np.array(x_train_tokens[1]))
print("Padded tokenized sequence: ", np.array(x_train_pad[1]))

0.9294383290996331
(4251, 25)
Non-padded tokenized sequence:  [  1 108   2 102  16  17]
Padded tokenized sequence:  [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   1 108   2 102  16  17]


### Create the model using Keras

In [7]:
# Build the network with the project helper `sentiment_analysis` (star-imported
# above, presumably from model.py -- confirm). It receives the vocabulary cap and
# the padded sequence length; how they are used is defined in that helper.
model = sentiment_analysis(num_words, max_tokens)

In [8]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 25, 8)             80000     
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 16)          1600      
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 8)           800       
_________________________________________________________________
lstm_3 (LSTM)                (None, 4)                 208       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 82,613
Trainable params: 82,613
Non-trainable params: 0
_________________________________________________________________


<h4> Train the model </h4>

In [9]:
# Train for 5 epochs with mini-batches of 64 examples.
# The returned History object is kept in `history` so the per-epoch metrics stay
# available for inspection/plotting (and so its repr is not echoed as cell output).
# The dev split is passed as validation data so each epoch also reports metrics on
# held-out examples; validation data is never used to update the weights.
history = model.fit(x_train_pad, train_Y, epochs=5, batch_size=64,
                    validation_data=(x_dev_pad, dev_Y))

Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



<tensorflow.python.keras._impl.keras.callbacks.History at 0x123acd3c8>

### Evaluate the model on the development and test sets

In [10]:
# Score the trained model on the development split.
# `evaluate` returns a list of values; index 1 is reported as the accuracy here
# (presumably [loss, accuracy] -- the order depends on how model.py compiles the model).
result = model.evaluate(x_dev_pad, dev_Y)
dev_accuracy = result[1]
print("Accuracy: {0:.2%}".format(dev_accuracy))


Accuracy: 93.37%


In [11]:
# Score the trained model on the held-out test split.
# `evaluate` returns a list of values; index 1 is reported as the accuracy here
# (presumably [loss, accuracy] -- the order depends on how model.py compiles the model).
result = model.evaluate(x_test_pad, test_Y)
test_accuracy = result[1]
print("Accuracy: {0:.2%}".format(test_accuracy))


Accuracy: 94.50%


In [12]:
# Run the trained model on a new, unseen sentence.
# The text goes through the same pipeline as the training data: the fitted
# tokenizer, then padding/truncation to `max_tokens` with the same `pad` mode.
text = ["I loved the film"]
tokens = tokenizer.texts_to_sequences(text)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens, padding=pad, truncating=pad)
print(tokens_pad.shape)

# One row of output per input text (presumably the positive-sentiment score --
# confirm against the output layer defined in model.py).
result = model.predict(tokens_pad)
print(result)

(1, 25)
[[0.9073302]]
