In [1]:
import numpy as np
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import Counter
import mxnet as mx

Using TensorFlow backend.
  from numpy.testing.decorators import setastest


In [2]:
CLEANING_PATTERN = re.compile("[\s\n\r\t.,:;\-_\'\"?!#&()\/%\[\]\{\}\<\>\\$@\!\*\+\=]")
VOCAB_SIZE = 5800
MAX_TIME = 15
LSTM_HIDDEN = 256
MAX_NORM = 10

  CLEANING_PATTERN = re.compile("[\s\n\r\t.,:;\-_\'\"?!#&()\/%\[\]\{\}\<\>\\$@\!\*\+\=]")


In [3]:
x_train = []
x_test = []
with open('train.txt') as f:
    for line in f:
        x_train.append(line)

with open('test.txt') as f:
    for line in f:
        x_test.append(line)

In [4]:
tokenizer = Tokenizer(VOCAB_SIZE)

In [5]:
tokenizer.fit_on_texts(x_train)

In [6]:
train_x = tokenizer.texts_to_sequences(x_train)
test_x = tokenizer.texts_to_sequences(x_test)

In [7]:
sent_lenghts_train = [len(sent) for sent in train_x]
sent_lenghts_test = [len(sent) for sent in test_x]

In [8]:
train_x = pad_sequences(train_x,maxlen=MAX_TIME,padding='post')
test_x = pad_sequences(test_x,maxlen=MAX_TIME,padding='post')

In [9]:
train_y = np.zeros((len(train_x),MAX_TIME))
test_y = np.zeros((len(test_x),MAX_TIME))
train_y[:,:-1] = train_x[:,1:]
test_y[:,:-1] = test_x[:,1:]

In [10]:
train_x = mx.nd.array(train_x)
train_y = mx.nd.array(train_y)

In [11]:
wordIndex = tokenizer.word_index
wI = {}
for k,v in wordIndex.items():
    if v < VOCAB_SIZE:
        wI[k] = v
wordIndex = wI

In [12]:
vocabulary = mx.contrib.text.vocab.Vocabulary(Counter(tokenizer.word_counts))
my_indexing = [0]
for i in range(1,VOCAB_SIZE):
    myword = tokenizer.index_word[i]
    my_indexing.append(vocabulary.token_to_idx[myword])

In [13]:
EMBEDDINGS_FILE = 'glove.6B.200d.txt'
embeddings = mx.contrib.text.embedding.GloVe(EMBEDDINGS_FILE,embedding_root='./',vocabulary=vocabulary)
all_tokens = vocabulary.to_tokens(list(range(len(vocabulary))))
weight_matrix = embeddings.get_vecs_by_tokens(all_tokens)
weight_matrix = weight_matrix[my_indexing]

In [14]:
def _clip_by_global_norm(_module,max_norm):
        assert _module.binded and _module.params_initialized \
               and _module.optimizer_initialized
        grad_array = []
        for grad in _module._exec_group.grad_arrays:
            grad_array += grad
        return mx.gluon.utils.clip_global_norm(grad_array, max_norm)

In [15]:
vocab_size = VOCAB_SIZE
embedding_dim = 200
input_data = mx.sym.Variable('data')
label = mx.sym.Variable('softmax_label')
input_embed = mx.sym.Embedding(data=input_data,input_dim=vocab_size,output_dim=embedding_dim,\
                                   name='embed')
lstm_cell = mx.rnn.LSTMCell(num_hidden=LSTM_HIDDEN)
begin_state = lstm_cell.begin_state()
output, states = lstm_cell.unroll(MAX_TIME,input_embed, begin_state,merge_outputs=True)
pred = mx.sym.Reshape(output, shape=(-1, LSTM_HIDDEN))
pred = mx.sym.FullyConnected(data=pred, num_hidden=vocab_size, name='pred')
pred = mx.sym.Reshape(pred, shape=(-1, vocab_size))

label = mx.sym.Reshape(label, shape=(-1,))
loss = mx.sym.SoftmaxOutput(data=pred,label=label)
#logits = mx.sym.log_softmax(pred, axis=-1)
#loss = -mx.sym.pick(logits, label, axis=-1, keepdims=True)
#loss = mx.sym.mean(loss, axis=0, exclude=True)
#loss = mx.sym.make_loss(loss, name='nll')

In [16]:
batch_size=100
model = mx.module.Module(loss)

In [17]:
model.bind(data_shapes=[('data',(batch_size,MAX_TIME))], label_shapes=[('softmax_label', (batch_size,MAX_TIME))])
model.init_params(initializer=mx.initializer.Uniform(0.1))
model.set_params({'embed_weight' : weight_matrix},{},allow_missing=True)
model.init_optimizer('rmsprop',optimizer_params=(('learning_rate', 0.005), ))

In [18]:
NUM_EPOCHS = 20
num_batches = int(len(train_x)/batch_size)
for epoch in range(NUM_EPOCHS):
    train_loss = 0
    eval_metric = mx.metric.CrossEntropy()
    for i in range(num_batches):
        bt = mx.io.DataBatch(data=[train_x[i*batch_size:(i+1)*batch_size]], label=[train_y[i*batch_size:(i+1)*batch_size]])
        model.forward(bt)
        model.backward()
        _clip_by_global_norm(model,MAX_NORM*batch_size*MAX_TIME)
        model.update()
        outputs = model.get_outputs(merge_multi_context=True)
        eval_metric.update(train_y[i*batch_size:(i+1)*batch_size].reshape(-1,),outputs[0])
        train_loss += eval_metric.get()[1]
    print("Epoch %d : Train Loss : %f"%(epoch+1,train_loss))
    model.save_checkpoint("model",10,True)

Epoch 1 : Train Loss : 243.957742
Epoch 2 : Train Loss : 171.993836
Epoch 3 : Train Loss : 167.193367
Epoch 4 : Train Loss : 163.276642
Epoch 5 : Train Loss : 160.208192
Epoch 6 : Train Loss : 157.950610
Epoch 7 : Train Loss : 156.119504
Epoch 8 : Train Loss : 154.597980
Epoch 9 : Train Loss : 153.323816
Epoch 10 : Train Loss : 152.226113
Epoch 11 : Train Loss : 151.265862
Epoch 12 : Train Loss : 150.419106
Epoch 13 : Train Loss : 149.665493
Epoch 14 : Train Loss : 148.988650
Epoch 15 : Train Loss : 148.375703
Epoch 16 : Train Loss : 147.816301
Epoch 17 : Train Loss : 147.302244
Epoch 18 : Train Loss : 146.827192
Epoch 19 : Train Loss : 146.386278
Epoch 20 : Train Loss : 145.975675


In [19]:
#Test for last word

test_accuracy_1 = 0
num_batches = int(len(test_x)/batch_size)
for i in range(num_batches):
    bt = mx.io.DataBatch(data=[train_x[i*batch_size:(i+1)*batch_size]], label=[train_y[i*batch_size:(i+1)*batch_size]])
    model.forward(bt)
    test_outs = model.get_outputs(merge_multi_context=True)[0].asnumpy()
    test_outs = test_outs.reshape((batch_size,MAX_TIME,-1))
    test_preds = np.argmax(test_outs,axis=-1)
    batch_y = np.array(test_y[i*batch_size:(i+1)*batch_size],dtype='int64')
    idxing = range(batch_size),np.array(sent_lenghts_test[i*batch_size:(i+1)*batch_size])-1
    acc = np.sum(test_preds[idxing] == batch_y[idxing])
    test_accuracy_1 += acc

In [20]:
print("Test Accuracy for Task 1 : ",test_accuracy_1/(num_batches*batch_size))

Test Accuracy for Task 1 :  0.845


In [21]:
#Test for 2nd half of sentence

test_accuracy_2 = 0
num_preds = 0
num_batches = int(len(test_x)/batch_size)
for i in range(num_batches):
    bt = mx.io.DataBatch(data=[train_x[i*batch_size:(i+1)*batch_size]], label=[train_y[i*batch_size:(i+1)*batch_size]])
    model.forward(bt)
    test_outs = model.get_outputs(merge_multi_context=True)[0].asnumpy()
    test_outs = test_outs.reshape((batch_size,MAX_TIME,-1))
    test_preds = np.argmax(test_outs,axis=-1)
    batch_y = np.array(test_y[i*batch_size:(i+1)*batch_size],dtype='int64')
    ct = 0
    
    for j in range(i*batch_size,(i+1)*batch_size):
        baselen = sent_lenghts_test[j]
        encodelen = int(baselen/2)
        sent_preds = test_preds[ct,range(encodelen-1,baselen)]
        sent_batch = batch_y[ct,range(encodelen-1,baselen)]
        test_accuracy_2 += np.sum(sent_batch == sent_preds)
        num_preds += len(sent_preds)
        ct += 1

In [22]:
print("Test Accuracy for Task 2 : ",test_accuracy_2/num_preds)

Test Accuracy for Task 2 :  0.18640712636093698
