In [1]:
import os
os.environ['KERAS_BACKEND'] = 'plaidml.keras.backend'

import keras
import pandas as pd
import string
import re
import numpy as np

Using plaidml.keras.backend backend.


In [2]:
os.getcwd()

'C:\\Users\\antra\\OneDrive\\kaggle\\sentiment_analysis\\keras'

In [3]:
os.listdir(os.getcwd())

['.ipynb_checkpoints',
 '.Rhistory',
 'imdb_binary_10000_words.csv',
 'imdb_binary_20000_words.csv',
 'imdb_binary_30000_words.csv',
 'imdb_binary_representation.ipynb',
 'imdb_keras.Rmd',
 'imdb_keras2.Rmd',
 'imdb_word_embedding_128batch.csv',
 'imdb_word_embedding_128batch_200words.csv',
 'imdb_word_embedding_128batch_500words.csv',
 'imdb_word_embedding_128batch_own_embedding.csv',
 'imdb_word_embedding_128batch_own_embedding_300d.csv',
 'imdb_word_embedding_32batch.csv',
 'imdb_word_embedding_512batch.csv',
 'imdb_word_embedding_glove.ipynb',
 'labeledTrainData.tsv',
 'NN_processed_words_tf.csv',
 'NN_processed_words_tfidf.csv',
 'NN_processed_words_tfidf_10739words.csv',
 'NN_processed_words_tfidf_10739words_binarized.csv',
 'NN_processed_words_tfidf_4714words.csv',
 'NN_processed_words_tfidf_4714words_binarized.csv',
 'NN_processed_words_tfidf_6023words.csv',
 'NN_processed_words_tfidf_7164words.csv',
 'NN_processed_words_tf_4716words_binarized.csv',
 'NN_processed_words_tf_6023

In [4]:
# load file using relative path
train = pd.read_csv('labeledTrainData.tsv', sep='\t')
test = pd.read_csv('testData.tsv', sep='\t')

In [5]:
train

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


Let's do some **cleaning** in our texts: lower case; remove punctuations, remove numbers, replace abbreviations, etc. Because keras **tokenizer** does the punctuations removal; we don't have to manually do it.

In [6]:
# lowercase all words in text
def to_lower(docs):
    docs = [doc.lower() for doc in docs]
    return docs

In [7]:
# check for non-ascii character
def is_alpha(char):
    return char in string.ascii_lowercase

# only keep numbers and ascii characters. replace non-ascii characters with space
def keep_alphanumeric(doc):
    doc = [char for char in doc]
    out = ''
    for char in doc:
        good = is_alpha(char) or char.isnumeric() # or char in [' ', '.']
        if good:
            out += char
        else:
            out += ' '
    return out

In [8]:
# remove special characters
def remove_special(docs):
    docs = [keep_alphanumeric(doc) for doc in docs]
    return docs

In [9]:
# remove numbers
def remove_numbers(docs):
    docs = [re.sub('\d+', ' ', doc) for doc in docs]
    return docs

In [10]:
# removes extra whitespace for a list of text strings
def remove_whitespace(docs):
    docs = [' '.join(doc.split()) for doc in docs]
    docs = [doc.rstrip() for doc in docs]
    docs = [doc.lstrip() for doc in docs]
    return docs

In [11]:
train['review'] = to_lower(train['review'])

In [12]:
train['review'] = remove_special(train['review'])

In [13]:
train['review'] = remove_numbers(train['review'])

In [14]:
train['review'] = remove_whitespace(train['review'])

In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

max_words = 10000 # considers only the top 10000 words in the dataset
maxlen = 500

tokenizer = Tokenizer(num_words=max_words) # creates a tokenizer, takes only the first 10000 common words
tokenizer.fit_on_texts(train['review']) # build the word index
sequences = tokenizer.texts_to_sequences(train['review']) # turns strings into lists of integer indices

word_index = tokenizer.word_index # how to recover the word index that was computed
print('Found %s unique tokens.' % len(word_index)) # how many tokens are there in total?

Found 73276 unique tokens.


Let's decode the indices of one of the **sequences** back into original text.

In [16]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in sequences[0]])
decoded_review

'with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the and watched again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent is part biography part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay br br visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an for to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him br br the actual feature film bit when it finally starts is only on for minutes or so the smooth criminal sequence an

Let's pad our word indices with 0's.

In [17]:
data = pad_sequences(sequences, maxlen=maxlen) # truncate texts after "maxlen", or fill in with 0's if not long enough

print('Shape of data tensor:', data.shape)

Shape of data tensor: (25000, 500)


In [18]:
data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   17,   31,   11,  529,
        169,  180,   32,    1,  546,   17, 8964,   10,  138,  635, 2600,
          5,   27,  223,  148,    1, 1009,  644,  127,    2,   40,  293,
          1,    2,  293,  173,  278,   10,   43,  181,    5,   75,    3,
        794, 2601,   83,   11,  225,   36,   10,  195,   14,   65,  627,
          9,    1, 4220,   43,    5,  278,   95,   56,   60,  327,  711,
         25,    6, 2476,   42, 1331,    6,  172, 4972,  172,  768,   20,
         62,   10,  371,  169,    5,   66,   32,   

Let's load the GloVe word embeddings with 100 dimension.

In [19]:
# glove_dir = 'C:\\Users\\antra\\OneDrive\\kaggle\\sentiment_analysis\\word_embedding\\glove.6B'

# embeddings_index = {}
# f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), errors='ignore', encoding='utf8')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

Next, build an embedding matrix that can load into Embedding layer. It must be a matrix of shape (max_words, embedding_dim). **Note**: index 0 isn't supposed to stand for any word or token; it's simply a placeholder.

In [20]:
embedding_dim = 500

# embedding_matrix = np.zeros((max_words, embedding_dim))
# for word, i in word_index.items():
#     if i < max_words:
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector # words not found in embedding index will be all 0's.

In [21]:
# embedding_matrix.shape

Defining the model architecture.

In [22]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen)) # vocabulary size; embedding dimensions; length of each document
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

INFO:plaidml:Opening device "opencl_amd_ellesmere.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 500)          5000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 250000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                4000016   
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 9,000,305
Trainable params: 9,000,305
Non-trainable params: 0
_________________________________________________________________


The **Embedding** layer has a single weight matrix: a 2D float matrix where each entry is the word vector associated with index i. Let's load the GloVe matrix that we prepared into the Embedding layer, the first layer in the model. We will freeze the Embedding layer (*trainable* to *False*) because pre-trained parts shouldn't be updated during training.

If we do not want to use GloVe pre-trained word embeddings; we can disable the pre-trained weights and let the model learn the embedding weights on their own for this task specific problem.

In [23]:
# model.layers[0].set_weights([embedding_matrix])
# model.layers[0].trainable = False

In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 500)          5000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 250000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                4000016   
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 9,000,305
Trainable params: 9,000,305
Non-trainable params: 0
_________________________________________________________________


In order to evaluate how well our model performs on unseen data; we need to have a validation set. Let's use **20%** of our **training data** as **validation data**.

In [25]:
x_val = data[:5000]
partial_x_train = data[5000:]

y_val = np.asarray(train['sentiment'][:5000]) # convert labels/outputs to arrays because it's faster
partial_y_train = np.asarray(train['sentiment'][5000:])

Let's compile and train the model.

In [26]:
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy'])
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=7,
                    batch_size=128,
                    validation_data=(x_val, y_val))
# model.save_weights('pre_trained_glove.h5')

Train on 20000 samples, validate on 5000 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


Now let's plot the model's performance overtime.

In [27]:
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

The model starts overfitting after around 7 epochs. Let's re-train the model again for only 7 epochs using all data. First, let's redefine the network architecture.

In [28]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen)) # vocabulary size; embedding dimensions; length of each document
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 500)          5000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 250000)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                4000016   
_________________________________________________________________
dense_5 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total params: 9,000,305
Trainable params: 9,000,305
Non-trainable params: 0
_________________________________________________________________


Because we do not want to use GloVe pre-trained word embeddings; let's disable the pre-trained weights and let the model learn the embedding weights on their own for this task specific problem.

In [29]:
# model.layers[0].set_weights([embedding_matrix])
# model.layers[0].trainable = False
# model.summary()

In [30]:
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy'])
history = model.fit(data,
                    np.asarray(train['sentiment']),
                    epochs=5,
                    batch_size=128)
# model.save_weights('pre_trained_glove.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Now that we have our finalized training model; let's use it to **predict** the **test set**. First, we are going to **clean up** the **test set** exactly like we did with the **training set**.

In [31]:
test = pd.read_csv('testData.tsv', sep='\t')

test['review'] = to_lower(test['review'])
test['review'] = remove_special(test['review'])
test['review'] = remove_numbers(test['review'])
test['review'] = remove_whitespace(test['review'])

Now that we have the texts in our test set cleaned up; let's **tokenize** the texts based on the **Tokenizer on training data**.

In [34]:
sequences_test = tokenizer.texts_to_sequences(test['review'])
test_data = pad_sequences(sequences_test, maxlen=maxlen) # truncate texts after "maxlen", or fill in with 0's if not long enough

In [35]:
# model.load_weights('pre_trained_glove.h5') # load the pre-trained weights before

predictions = model.predict(test_data) # these are probabilities that it belongs to positive class

In [36]:
sentiment = np.where(predictions >= 0.5, 1, np.where(predictions < 0.5, 0, predictions)) # if probabilities > 0.5; sentiment = 1

In [37]:
sentiment

array([[1.],
       [0.],
       [0.],
       ...,
       [0.],
       [1.],
       [1.]], dtype=float32)

In [38]:
test['sentiment'] = sentiment

In [39]:
del test['review']

In [40]:
test.to_csv('imdb_word_embedding_128batch_own_embedding_500d.csv', index=False)

In [41]:
# model.save_weights('own_embedding_128batch_500d.h5') # save model weights