In [1]:
# This code is a modification of Siraj Raval's sentiment-analysis code for
# movie reviews: https://www.youtube.com/watch?v=si8zZHkufRY
# Please check out the useful links in the video description.

import numpy as np
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import tensorflow as tf

In [3]:
# Load the IMDB movie-review data into two working splits (this takes a while).
# n_words=10000 presumably limits the vocabulary to 10,000 word ids, and
# valid_portion=0.1 presumably holds out 10% of the training reviews -- confirm.
# NOTE(review): tflearn's imdb.load_data appears to return (train, valid, test);
# if so, `test` below is really the held-out validation split and the official
# test split is the value discarded as `_` -- confirm against the tflearn docs.
# Data source: http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl
train, test, _ = imdb.load_data(
    path='imdb.pkl',
    n_words=10000,
    valid_portion=0.1,
)

In [61]:
# Peek at the first ten labels of the `test` split (not the training data).
tmp_testX, tmp_testY = test
tmp_testY[:10]

[0, 1, 1, 1, 0, 0, 1, 1, 1, 1]

In [5]:
# Unpack each split into the inputs we train on (X) and their labels (Y).
(trainX, trainY), (testX, testY) = train, test

In [6]:
# Pre-process the inputs: bring every review to a fixed length of 100 word ids,
# filling unused positions with 0 (the padded rows shown later in this notebook
# end in runs of zeros). Reviews longer than 100 ids are presumably truncated --
# confirm against the tflearn pad_sequences docs.
trainX, testX = (
    pad_sequences(reviews, maxlen=100, value=0.)
    for reviews in (trainX, testX)
)

In [70]:
import numpy as np

In [74]:
# Demo: how to turn raw text into numeric word ids.
# This will be useful for this week's assignment!
x_text = ['This is a cat', 'I like icecream', 'This is a a dog']

# The longest document, counted in space-separated tokens, sets the row width.
max_document_length = max(len(doc.split(" ")) for doc in x_text)
print(max_document_length)

# Create the vocabulary processor object, setting the max length of the documents.
vocab_processor = tflearn.data_utils.VocabularyProcessor(max_document_length)

# Fit the vocabulary on the documents and transform each one into a row of ids.
xx = np.array([row for row in vocab_processor.fit_transform(x_text)])
print(xx)

# Extract the word -> id mapping from the object. `_mapping` is a private
# attribute, so this access may not be stable across tflearn versions.
vocab_dict = vocab_processor.vocabulary_._mapping
vocab_dict  # ids come from this toy corpus only -- they are not universal numbers


[[1 2 3 4 0]
 [5 6 7 0 0]
 [1 2 3 3 8]]


{'<UNK>': 0,
 'I': 5,
 'This': 1,
 'a': 3,
 'cat': 4,
 'dog': 8,
 'icecream': 7,
 'is': 2,
 'like': 6}

In [52]:
# Inspect the padded training inputs: the container type and one example row.
# A notebook cell only echoes the value of its LAST expression, so the original
# bare `type(trainX)` was evaluated and silently discarded. Print it explicitly
# so the type is actually shown, then let the row be the cell's displayed value.
print(type(trainX))
trainX[10]


array([ 922,   17,   25,   10, 1162,    4,   10,  406,    3,   67, 6427,
          3,   96, 2922, 2166,  474,    4,   33,  184,   40,   40,   40,
         40,   40,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [8]:
# Convert the integer class labels into 2-class categorical form
# (nb_classes=2, i.e. one column per class).
# Encode from the untouched splits (`train[1]`, `test[1]`) instead of from
# trainY/testY themselves: the original overwrote trainY/testY in place, so
# re-running this cell would feed already-encoded labels back into
# to_categorical. Reading from the raw splits makes the cell safe to re-run.
trainY = to_categorical(train[1], nb_classes=2)
testY = to_categorical(test[1], nb_classes=2)
print(testY[1:6])  # Q: what did this function do?

In [9]:
# Build the network, starting with the input layer.
# shape=[None, 100]: the first dimension is the batch size (left open as None),
# the second is our maximum sequence length of 100.
net = tflearn.input_data(shape=[None, 100])
# Embedding layer: input_dim=10000 matches the n_words=10000 used when loading
# the data; output_dim=128 is the size of each resulting embedding.
net = tflearn.embedding(net, input_dim=10000, output_dim=128)

In [10]:
# Next layer: an LSTM with 128 units that remembers sequence information from
# the earlier layer; dropout is enabled so some nodes are dropped.
# NOTE(review): in tflearn the `dropout` value looks like a KEEP probability
# (0.8 would keep 80% of activations) -- confirm against the tflearn lstm docs.
net = tflearn.lstm(net, n_units=128, dropout=0.8)


In [11]:
# Fully connected layer on top of the previous layer; softmax turns its two
# outputs into class probabilities.
net = tflearn.fully_connected(net, n_units=2, activation='softmax')
# Training op: the Adam optimizer minimises categorical cross-entropy with a
# learning rate of 1e-4.
net = tflearn.regression(
    net,
    optimizer='adam',
    loss='categorical_crossentropy',
    learning_rate=.0001,
)

In [12]:
# Wrap the network graph in a DNN model object; no training happens in this
# cell -- the fit call comes in the next one.
model = tflearn.DNN(network=net, tensorboard_verbose=0)

In [13]:
# Train the model, validating on (testX, testY) and reporting accuracy.
# No n_epoch is passed, so tflearn's default number of epochs applies.
# NOTE: calling tf.reset_default_graph() did not fix the problem here;
# the kernel had to be restarted instead.
model.fit(
    trainX,
    trainY,
    validation_set=(testX, testY),
    show_metric=True,
    batch_size=32,
)

Training Step: 7039  | total loss: [1m[32m0.16292[0m[0m | time: 881.089s
| Adam | epoch: 010 | loss: 0.16292 - acc: 0.9445 -- iter: 22496/22500
Training Step: 7040  | total loss: [1m[32m0.15076[0m[0m | time: 915.186s
| Adam | epoch: 010 | loss: 0.15076 - acc: 0.9500 | val_loss: 0.67268 - val_acc: 0.7940 -- iter: 22500/22500
--


In [14]:
# Save the trained model under a path relative to the current working directory.
model.save(model_file='imdb_trainedDNN_10epochs_save')

INFO:tensorflow:/Users/ykarpievitch/yuliyaFiles/UWA/teaching/AI/2019/code/sentiment/imdb_trainedDNN_10epochs_save is not in all_model_checkpoint_paths. Manually adding it.


In [19]:
# Attempt to persist the whole session with dill (install with: pip install dill).
import dill
filename = 'imdb_trainedDNN_10epochs.pkl'
# Disabled: dumping the session failed with too much recursion in the model.
# dill.dump_session(filename)
# To load the session again in the future:
# dill.load_session(filename)

In [31]:
# Predict the probability of each class for every review in testX
# (one row per review, one column per class).
pred_testX_prob = model.predict(X=testX)

In [33]:
# Show the predicted class probabilities for the first ten reviews.
pred_testX_prob[:10]

array([[0.9962196 , 0.00378047],
       [0.04559765, 0.9544024 ],
       [0.0066211 , 0.9933789 ],
       [0.00837002, 0.99163   ],
       [0.9962393 , 0.00376067],
       [0.99621326, 0.00378666],
       [0.00721604, 0.9927839 ],
       [0.01658944, 0.98341054],
       [0.00888421, 0.9911158 ],
       [0.00767686, 0.99232316]], dtype=float32)

In [28]:
# Re-bind the raw (un-padded, un-encoded) inputs and labels of the `test` split.
(tmp_testX, tmp_testY) = test

In [34]:
# Compute predicted labels for testX so they can be compared with the true labels.
# NOTE(review): judging by the outputs displayed in this notebook, each row seems
# to list class indices ordered from most to least probable (probabilities
# [0.996, 0.004] give [0, 1]) rather than a one-hot vector, so column 0 would be
# the predicted class -- confirm against the tflearn predict_label docs.
pred_testY = model.predict_label(X=testX)


In [35]:
# Show the predicted label rows for the first ten reviews.
pred_testY[:10]

array([[0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0]])

In [36]:
# Q: do these true labels match what the model predicted?
# Explore this a bit further: compute how many mismatches we have, ...
tmp_testY[:10]

[0, 1, 1, 1, 0, 0, 1, 1, 1, 1]