In [1]:
import re
import itertools
from collections import Counter
import numpy as np


import mxnet as mx
from mxnet import gluon, nd, autograd
from mxnet.gluon import nn, rnn
 

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Device (GPU with id 0) used below for the embedding matrix, the labels and the model parameters.
context = mx.gpu(0)

In [2]:
# Here we use Stanford's Large Movie Review Dataset, available at this
# link: http://ai.stanford.edu/~amaas/data/sentiment/

def read_files(foldername):
    """Read every file in a folder and return the contents as a list of strings.

    Args:
        foldername: Path of the folder to read. Relative and absolute paths
            both work, with or without a trailing slash.

    Returns:
        A list holding one string per file, with newline characters removed.
        Files are read in sorted filename order so the result (and anything
        derived from its order, such as a seeded train/test split) is
        reproducible across runs and platforms.
    """
    import os

    sentiments = []
    # os.listdir() returns entries in arbitrary order, hence the sort.
    # The same path is used for listing and for opening the files.
    for filename in sorted(os.listdir(foldername)):
        path = os.path.join(foldername, filename)
        with open(path, "r", encoding="utf8") as review_file:
            # NOTE(review): newlines are dropped, not replaced by a space, so
            # words on adjacent lines end up joined together.
            sentiments.append(review_file.read().replace('\n', ''))
    return sentiments
    
    
# Raw review texts, one string per file, for each sentiment class.
foldername = "aclImdb/train/pos/"
postive_sentiment = read_files(foldername)

foldername = "aclImdb/train/neg/"
negative_sentiment = read_files(foldername)

# Labels aligned with the two lists above: 1 for every positive review,
# 0 for every negative review.
positive_labels = [1] * len(postive_sentiment)
negative_labels = [0] * len(negative_sentiment)

In [3]:
#some string preprocessing
def clean_str(string):
    """Normalise a raw review string.

    The text is lower-cased, every "<br />" tag is replaced by a single space,
    and every character other than a letter, a digit or a space is removed.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-case text.
    """
    lowered = string.lower()
    # Lower-casing first means upper-case variants of the tag are caught too.
    without_breaks = lowered.replace("<br />", " ")
    return re.sub("[^A-Za-z0-9 ]+", "", without_breaks)

In [4]:
# Create a dict mapping each word to its count over the entire dataset: {word: count}

# Running tally of how often each cleaned word occurs; filled in by create_count().
word_counter = Counter()


def create_count(sentiments):
    """Add the word frequencies of every text in `sentiments` to `word_counter`.

    Each text is normalised with clean_str() and split on whitespace; every
    resulting word increments its entry in the module-level `word_counter`.

    Args:
        sentiments: Iterable of raw text strings.
    """
    for text in sentiments:
        word_counter.update(clean_str(text).split())

# Assigns a unique number to each word (in descending order of frequency of occurrence)
# and returns the resulting word_dict.
def create_word_index():
    """Build the word -> id mapping from the module-level `word_counter`.

    Ids start at 1 and follow the order of word_counter.most_common(), so the
    most frequent word gets id 1.

    Returns:
        Dict mapping each counted word to its integer id.
    """
    ranked = word_counter.most_common()
    return {word: rank for rank, (word, _count) in enumerate(ranked, start=1)}
    

# Corpus and labels in matching order: positive reviews first, then negative ones.
all_sentiments = postive_sentiment + negative_sentiment
all_labels = positive_labels + negative_labels

# Count every word in the corpus, then give each word its frequency-ranked id.
create_count(all_sentiments)
word_dict = create_word_index()

# Reverse lookup table: id -> word.
idx2word = {index: word for word, index in word_dict.items()}

In [5]:
# Creates encoded sentences:
# replaces each word of a sentence with its unique id from word_dict.
def encoded_sentences(input_file,word_dict):
    """Encode each text as the list of ids of its words.

    Every text is normalised with clean_str() and split on whitespace. Words
    that are not keys of `word_dict` are left out of the encoding.

    Args:
        input_file: Iterable of raw text strings.
        word_dict: Mapping from word to id.

    Returns:
        A list with one list of ids per input text, in input order.
    """
    return [
        [word_dict[token] for token in clean_str(text).split() if token in word_dict]
        for text in input_file
    ]

def decode_sentences(input_file,word_dict):
    """Turn id-encoded sentences back into text.

    Args:
        input_file: Iterable of sentences, each an iterable of word ids.
        word_dict: Mapping from word to id (the same mapping that was used to
            encode the sentences).

    Returns:
        A list with one string per sentence. Every decoded word is followed
        by a single space, so non-empty results end with a space.

    Raises:
        KeyError: If a sentence contains an id that is not a value of
            `word_dict`.
    """
    # Invert the mapping that was passed in rather than reading a
    # module-level idx2word, so the function honours its word_dict argument
    # and works with any vocabulary.
    index_to_word = {index: word for word, index in word_dict.items()}
    return [
        ''.join(index_to_word[idx] + ' ' for idx in line)
        for line in input_file
    ]

# Pad the sequences to maxlen.
# If a sentence is longer than maxlen, it is truncated.
# If a sentence is shorter than maxlen, it is padded with `value` (0 by default; word ids start at 1, so 0 is not the id of any word).
def pad_sequences(sentences,maxlen=500,value=0):
    """
    Bring every sentence to length `maxlen`.

    Sentences longer than `maxlen` are cut to their first `maxlen` items
    (a plain slice, so the input type is kept). All other sentences are
    extended at the end with `value` via np.append and therefore come back
    as numpy arrays.

    Returns the list of resulting sentences, in input order.
    """
    def fit_to_length(sen):
        # Truncate when too long, otherwise append the missing padding.
        if len(sen) > maxlen:
            return sen[:maxlen]
        return np.append(sen, [value] * (maxlen - len(sen)))

    return [fit_to_length(sen) for sen in sentences]

In [44]:
# Convert every review into its sequence of word ids. Positive reviews stay
# ahead of negative ones so the order matches all_labels.
positive_encoded, negative_encoded = (
    encoded_sentences(reviews, word_dict)
    for reviews in (postive_sentiment, negative_sentiment)
)

all_encoded = positive_encoded + negative_encoded

In [45]:
vocab_size = 50000  # Total number of word ids that are tracked

# Clamp the ids: every id at or above vocab_size-1 is replaced by vocab_size-1,
# so all words outside the tracked range share the last position.
t_data = [
    np.array([min(word_id, vocab_size - 1) for word_id in sentence])
    for sentence in all_encoded
]


In [46]:
# Loads Stanford's Global Vector for Word Representation (GloVe) embedding
# We specifically used glove.42B.300d.zip available at this link:
# https://nlp.stanford.edu/projects/glove/

num_embed = 300 # Embedding vector length: column count of the matrix built in create_emb() and output size of the Embedding layer

def load_glove_index(loc):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        loc: Path of the UTF-8 encoded embeddings file.

    Returns:
        Dict mapping each word to a float32 numpy array of its components.
    """
    embeddings_index = {}
    # The with-block closes the file even when a malformed line raises.
    with open(loc, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

def create_emb():
    """Build the (vocab_size, num_embed) embedding matrix on `context`.

    Reads the module-level `word_dict`, `embeddings_index`, `vocab_size`,
    `num_embed` and `context`. Row i receives the vector stored in
    `embeddings_index` for the word whose id is i; words with an id of
    vocab_size or more are ignored, and rows without a vector stay all-zero.

    NOTE(review): row vocab_size-1 receives the vector of the single word
    with that id, although t_data maps every rarer word to the same id as
    well -- confirm that this is intended.

    Returns:
        The matrix as an NDArray placed on `context`.
    """
    matrix = np.zeros((vocab_size, num_embed))
    for word, row in word_dict.items():
        if row < vocab_size:
            vector = embeddings_index.get(word)
            if vector is not None:
                matrix[row] = vector
    return nd.array(matrix).as_in_context(context)

# Parse the GloVe vectors into a word -> vector dict (the path is relative to the working directory).
embeddings_index = load_glove_index('glove.42B.300d.txt')
# Build the embedding matrix for the words in word_dict from those vectors.
embedding_matrix = create_emb()

In [32]:
# Split into (train + validation) and test: 30% of the encoded reviews and
# their labels are held out as the test set. random_state is fixed so the
# split is the same on every run.
X_train_val, X_test, y_train_val, y_test_set = train_test_split(t_data, all_labels, test_size=0.3, random_state=42)

In [33]:
# Split the non-test data into train and validation: 30% of it becomes the
# validation set, again with a fixed random_state.
X_train, X_val, y_train, y_validation = train_test_split(X_train_val, y_train_val, test_size=0.3, random_state=42)

In [34]:
# Sentence-length statistics of the encoded reviews before any padding.
min_len = min(len(sentence) for sentence in t_data)
max_len = max(len(sentence) for sentence in t_data)
avg_len = sum(len(sentence) for sentence in t_data) / len(t_data)

print("the minimum length is:", min_len)
print("the maximum length is:", max_len)
print("the average length is:", avg_len)

the minimum length is: 10
the maximum length is: 2459
the average length is: 230.51952


In [35]:
seq_len = 500  # Every review is truncated or padded to this many word ids

# Padded inputs. No context is given here, so they are not explicitly placed
# on `context`; the training loop moves each batch over before using it.
trn, val, test = (
    nd.array(pad_sequences(split, maxlen=seq_len, value=0))
    for split in (X_train, X_val, X_test)
)

# Labels, placed directly on `context`.
y_trn, y_val, y_test = (
    nd.array(labels).as_in_context(context)
    for labels in (y_train, y_validation, y_test_set)
)

In [36]:
# Hyper-parameters
num_classes = 2        # size of the final Dense layer's output
num_hidden = 64        # hidden size passed to the LSTM
learning_rate = 0.001
epochs = 10
batch_size = 24

# Network: embedding -> LSTM (batch-major 'NTC' layout) -> dense layer.
model = nn.Sequential()

with model.name_scope():
    # NOTE(review): the embedding is attached by attribute assignment instead
    # of model.add(); this presumably relies on Gluon registering blocks that
    # are assigned as attributes as children -- confirm that it is applied
    # before the LSTM in the forward pass.
    model.embed = nn.Embedding(vocab_size, num_embed)
    model.add(rnn.LSTM(num_hidden, layout='NTC'))
    model.add(nn.Dense(num_classes))

In [37]:
def evaluate_accuracy(x,y,batch_size):
    """Compute the classification accuracy of the module-level `model` on (x, y).

    Args:
        x: NDArray of inputs; it is sliced along its first axis into batches.
        y: NDArray of labels aligned with the rows of x.
        batch_size: Number of rows fed to the model per forward pass.

    Returns:
        The accuracy value (second element of mx.metric.Accuracy.get())
        accumulated over all rows of x.
    """
    acc = mx.metric.Accuracy()

    # Step through every row, including a final batch that is smaller than
    # batch_size, so no sample is left out of the metric.
    for start in range(0, x.shape[0], batch_size):
        stop = start + batch_size
        # The inputs may live on a different device than the model's
        # parameters; move each batch to `context` before the forward pass.
        data = x[start:stop].as_in_context(context)
        target = y[start:stop].as_in_context(context)

        output = model(data)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=target)

    return acc.get()[1]

In [38]:

# Initialise all parameters on `context`.
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

# Overwrite the freshly initialised embedding weights with the GloVe matrix.
model.embed.weight.set_data(embedding_matrix.as_in_context(context))

trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': learning_rate})

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

for epoch in range(epochs):

    # Only full batches are used for training; rows beyond the last multiple
    # of batch_size are skipped.
    for b in range(trn.shape[0] // batch_size):
        data = trn[b*batch_size:(b*batch_size + batch_size),]
        target = y_trn[b*batch_size:(b*batch_size + batch_size),]

        # Make sure the batch lives on the same device as the parameters.
        data = data.as_in_context(context)
        target = target.as_in_context(context)

        with autograd.record():
            output = model(data)
            L = softmax_cross_entropy(output, target)
        # Back-propagate once the forward pass has been recorded.
        L.backward()
        trainer.step(data.shape[0])

    # Each accuracy is computed on the dataset its name refers to (the two
    # were swapped before, so the printed labels were wrong). The inputs are
    # moved to `context` here so evaluation does not mix devices.
    train_accuracy = evaluate_accuracy(trn.as_in_context(context), y_trn, batch_size)
    test_accuracy = evaluate_accuracy(test.as_in_context(context), y_test, batch_size)
    print("Epoch %s. Train_acc %s, Test_acc %s" %
          (epoch, train_accuracy, test_accuracy))

MXNetError: [14:33:43] src/c_api/c_api_ndarray.cc:128: Check failed: ndinputs[i].ctx().dev_mask() == ctx.dev_mask() (1 vs. 2) All inputs must live on the same context. But the first argument is on gpu(0) while the 3-th argument is on cpu(0)

Stack trace returned 10 entries:
[bt] (0) /usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f03f6eb3f0c]
[bt] (1) /usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/libmxnet.so(_Z10SetContextPN5mxnet7ContextERKN4nnvm9NodeAttrsERKSt6vectorINS_7NDArrayESaIS7_EESB_RKS0_+0x37b) [0x7f03f7f0d43b]
[bt] (2) /usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/libmxnet.so(_Z20ImperativeInvokeImplRKN5mxnet7ContextERKN4nnvm9NodeAttrsEPSt6vectorINS_7NDArrayESaIS8_EESB_+0x15e) [0x7f03f7f1226e]
[bt] (3) /usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/libmxnet.so(MXImperativeInvoke+0x217) [0x7f03f7f12cd7]
[bt] (4) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c) [0x7f0427e73e20]
[bt] (5) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call+0x2eb) [0x7f0427e7388b]
[bt] (6) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(_ctypes_callproc+0x49a) [0x7f0427e6e01a]
[bt] (7) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(+0x9fcb) [0x7f0427e61fcb]
[bt] (8) /usr/bin/python3(PyObject_Call+0x47) [0x5b5da7]
[bt] (9) /usr/bin/python3(PyEval_EvalFrameEx+0x4eb6) [0x528956]
