In [1]:
## All neccessary imports and context initialization

from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd
import numpy as np
import string
import re
import mxnet
mx.random.seed(1)
ctx = mx.cpu()

## Function for cleaning the text

In [2]:
def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = string.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0];
    return doco_clean;

## Function to convert an indexed sentence to one hot vector

In [3]:
def one_hots(numerical_list, vocab_size):
    result = nd.zeros((len(numerical_list), vocab_size), ctx=ctx)
    for i, idx in enumerate(numerical_list):
        if(idx == -1):
            continue
        result[i, idx] = 1.0
    return result

## Function to add padding to a sentence

In [4]:
def padding(dataset):
    maxlen = max([len(row) for row in dataset])
    for i in range(len(dataset)):
        dataset[i] = dataset[i]+[-1]*(maxlen - len(dataset[i]))
#         for j in range(maxlen - len(dataset[i])):
#             dataset[i].append(-1)
    return dataset

## Softmax function

In [5]:
def softmax(y_linear):
    if(len(y_linear.shape) == 1):
        lin = (y_linear-nd.max(y_linear))
        exp = nd.exp(lin)
        partition = nd.sum(exp)
        return exp / partition
    lin = (y_linear-nd.max(y_linear, axis=1).reshape((-1,1))) # shift each row of y_linear by its max
    exp = nd.exp(lin)
    partition =nd.sum(exp, axis=1).reshape((-1,1))
    return exp / partition

## Cross Entropy Function

In [6]:
def cross_entropy(yhat, y):
    return - nd.mean(nd.sum(y * nd.log(yhat), axis=0, exclude=True))

## Average Cross Entropy loss function

In [7]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

### SGD with gradient clipping implemented

In [8]:
def SGD(params, lr):
    # Gradient clipping
    for i in range(len(params)):
        mxnet.ndarray.clip(params[i].grad, a_min = -10, a_max = 10, out=params[i].grad)
    # Back Prop
    for param in params:
        param[:] = param - lr * param.grad

# Part A - With one hot vectors

In [9]:
# Loading the dataset

with open("train.txt", "r") as file:
    train_data = file.readlines()
    train_data = [clean_document(doc) for doc in train_data]
    
with open("test.txt", "r") as file:
    test_data = file.readlines()
    test_data = [clean_document(doc) for doc in test_data]

AttributeError: type object 'str' has no attribute 'maketrans'

In [10]:
# Adding the SOS and EOS delimiters to the data

for i in range(len(train_data)):
    train_data[i] = ["<SOS>"]+train_data[i]
    train_data[i] = train_data[i]+["<EOS>"]

for i in range(len(test_data)):
    test_data[i] = ["<SOS>"]+test_data[i]
    test_data[i] = test_data[i]+["<EOS>"]

In [11]:
total_words = []
for row in train_data:
    total_words += row
for row in test_data:
    total_words += row
    
word_list = list(set(total_words))
vocab_size = len(word_list)
print("Length of vocab: %s" % vocab_size)

Length of vocab: 6692


## Generating dictionary (Don't run it for simply testing without training)

In [12]:
word_dict = {}
for e, word in enumerate(word_list):
    word_dict[word] = e
np.save('./word_dict_model_1.npy', word_dict)

## Loading Dictionary for simply testing without training

In [None]:
word_dict = np.load('./weights/word_dict_model_1.npy').item()

In [13]:
X_train = []
for row in train_data:
    X_train.append([word_dict[x] for x in row])

X_train_list = X_train.copy()
X_test = []
for row in test_data:
    X_test.append([word_dict[x] for x in row])


X_train = padding(X_train)

In [14]:
dataset = []
for row in X_train:
    dataset.append(one_hots(row[:-1], vocab_size))

temp = nd.zeros((len(dataset), dataset[0].shape[0], dataset[0].shape[1]), ctx=ctx)
for i in range(len(temp)):
    temp[i] = dataset[i]
dataset = temp

In [15]:
batch_size = 64
seq_length = len(dataset[0])
print('# of sequences in dataset: ', len(dataset))
num_batches = len(dataset) // batch_size
print('# of batches: ', num_batches)
train_data = dataset[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, vocab_size))
train_data = nd.swapaxes(train_data, 0, 1)
train_data = nd.swapaxes(train_data, 1, 2)
print('Shape of data set: ', train_data.shape)

# of sequences in dataset:  3610
# of batches:  56
Shape of data set:  (56, 19, 64, 6692)


In [16]:
X_label = []
for i in X_train:
    X_label.append(i[1:])
    
labels = []
for row in X_label:
    labels.append(one_hots(row, vocab_size))

temp = nd.zeros((len(labels), labels[0].shape[0], labels[0].shape[1]), ctx=ctx)
for i in range(len(temp)):
    temp[i] = labels[i]
labels = temp

In [17]:
train_labels = labels[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, vocab_size))
train_labels = nd.swapaxes(train_labels, 0, 1)
train_labels = nd.swapaxes(train_labels, 1, 2)
print('Shape of data set: ', train_labels.shape)

Shape of data set:  (56, 19, 64, 6692)


## For training model, run this cell (For testing only, run below cell)

In [18]:
num_inputs = vocab_size
num_hidden = 128
num_outputs = vocab_size
Wxh_model_1 = nd.random_normal(shape=(num_inputs,num_hidden), ctx=ctx) * .01
Whh_model_1 = nd.random_normal(shape=(num_hidden,num_hidden), ctx=ctx) * .01
bh_model_1 = nd.random_normal(shape=num_hidden, ctx=ctx) * .01
Why_model_1 = nd.random_normal(shape=(num_hidden,num_outputs), ctx=ctx) * .01
by_model_1 = nd.random_normal(shape=num_outputs, ctx=ctx) * .01

params_model_1 = [Wxh_model_1, Whh_model_1, bh_model_1, Why_model_1, by_model_1]

for param in params_model_1:
    param.attach_grad()


## For simply testing without training run this cell instead of above

In [None]:
# Loading the model
num_inputs = vocab_size
num_hidden = 128
num_outputs = vocab_size

Wxh_model_1 = mxnet.ndarray.load("./weights/Wxh_model_1")[0]
Whh_model_1 = mxnet.ndarray.load("./weights/Whh_model_1")[0]
bh_model_1 = mxnet.ndarray.load("./weights/bh_model_1")[0]
Why_model_1 = mxnet.ndarray.load("./weights/Why_model_1")[0]
by_model_1 = mxnet.ndarray.load("./weights/by_model_1")[0]

params_model_1 = [Wxh_model_1, Whh_model_1, bh_model_1, Why_model_1, by_model_1]

In [19]:
def simple_rnn_model_1(inputs, state):
    outputs = []
    h = state
    for X in inputs:
        h_linear = nd.dot(X, Wxh_model_1) + nd.dot(h, Whh_model_1) + bh_model_1
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why_model_1) + by_model_1
        yhat = softmax(yhat_linear)
        outputs.append(yhat)
    return (outputs, h)

In [20]:
def test_last_word_1(in_dataset, ground_data):
    true = 0
    count = 0
    for i in range(len(in_dataset)):
        if(len(in_dataset[i]) <= 3):
            continue
        state = nd.zeros(shape=(num_hidden), ctx=ctx)
        data_one_hot = in_dataset[i]
        with autograd.record():
            outputs, state = simple_rnn_model_1(data_one_hot, state)
        true += int(outputs[-3].asnumpy().argmax() == ground_data[i][-3])
        count+=1


    print("Result: ", true, count, true/float(count)*100)

## Training the model

### (For simply testing the pretrained model, please run below cells)

In [23]:
test_dataset = []
for row in X_test:
    test_dataset.append(one_hots(row[:-1], vocab_size))

In [None]:
epochs = 125

learning_rate = 0.9
for e in range(epochs):
    over_loss = 0
    state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
    for i in range(num_batches):
        data_one_hot = train_data[i]
        label_one_hot = train_labels[i]
        with autograd.record():
            outputs, state = simple_rnn_model_1(data_one_hot, state)
            loss = average_ce_loss(outputs, label_one_hot)
            loss.backward()
        SGD(params_model_1, learning_rate)
        over_loss += np.mean(loss.asnumpy()[0])
    print("Epoch %s. Loss: %s" % (e, over_loss))
    if ((e+1) % 25 == 0):
        test_last_word_1(test_dataset, X_test)

Epoch 0. Loss: 173.75654649734497
Epoch 1. Loss: 173.3448269367218
Epoch 2. Loss: 172.903146982193
Epoch 3. Loss: 172.48332262039185
Epoch 4. Loss: 172.10226702690125
Epoch 5. Loss: 171.70755910873413
Epoch 6. Loss: 171.3070297241211
Epoch 7. Loss: 170.9063436985016
Epoch 8. Loss: 170.50190210342407


## Results for training accuracy

In [None]:
train_dataset = []
for row in X_train_list:
    train_dataset.append(one_hots(row[:-1], vocab_size))
    
print("Train Results: (True, number_of_samples, percentage)")
test_last_word_1(train_dataset, X_train_list)

## Results for test accuracy

In [None]:
test_dataset = []
for row in X_test:
    test_dataset.append(one_hots(row[:-1], vocab_size))

print("Test Results: (True, number_of_samples, percentage)")
test_last_word_1(test_dataset, X_test)

### Saving model parameters (please don't run while testing)

In [None]:
mxnet.ndarray.save("Wxh_model_1", Wxh_model_1)
mxnet.ndarray.save("Whh_model_1", Whh_model_1)
mxnet.ndarray.save("bh_model_1", bh_model_1)
mxnet.ndarray.save("Why_model_1", Why_model_1)
mxnet.ndarray.save("by_model_1", by_model_1)

# Part -B: With pretrained Fasttext embeddings

In [44]:
with open("train.txt", "r") as file:
    train_data = file.readlines()
    train_data = [clean_document(doc) for doc in train_data]
    
with open("test.txt", "r") as file:
    test_data = file.readlines()
    test_data = [clean_document(doc) for doc in test_data]

for i in range(len(train_data)):
    train_data[i] = ["<SOS>"]+train_data[i]
    train_data[i] = train_data[i]+["<EOS>"]

for i in range(len(test_data)):
    test_data[i] = ["<SOS>"]+test_data[i]
    test_data[i] = test_data[i]+["<EOS>"]
    
file = open("cleaned_data.txt", "w")
for sent in train_data+test_data:
    for word in sent:
        file.write(word+" ")
    file.write("\n")

In [10]:
# import fasttext
# embed_size = 300
# model = fasttext.skipgram('cleaned_data.txt', 'model', dim=embed_size)

In [11]:
# import io
# embed_size = 300
# def load_vectors(fname):
#     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
#     n, d = map(int, fin.readline().split())
#     data = {}
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         data[tokens[0]] = map(float, tokens[1:])
#     return data

# model = load_vectors("wiki-news-300d-1M.vec")
# model_words = model.keys()

In [22]:
from numpy import asarray
model = dict()
embed_size = 300
f = open('wiki-news-300d-1M.vec')
for line in f:
    values = line.split()
    word = values[0].lower()
    coefs = asarray(values[1:], dtype='float32')
    model[word] = coefs
    
model_words = list(model.keys())

In [45]:
total_words = []
for row in train_data:
    total_words += row
for row in test_data:
    total_words += row
    
word_list = list(set(total_words))
vocab_size = len(word_list)
print("Length of vocab: %s" % vocab_size)

Length of vocab: 6694


## Generating dictionary (Don't run it for simply testing without training)

In [24]:
word_dict = {}
for e, word in enumerate(word_list):
    word_dict[word] = e

temp = []
for i in model_words:
    if(i in word_dict):
        temp.append(i)
        

model_words = temp
np.save('./word_dict_model_2.npy', word_dict)

## Loading Dictionary for simply testing without training

In [46]:
word_dict = np.load('./weights/word_dict_model_2.npy').item()

In [47]:
inv_word_dict = {v: k for k, v in word_dict.items()}

X_train = []
for row in train_data:
    X_train.append([word_dict[x] for x in row])

X_train_list = X_train[:]
X_test = []
for row in test_data:
    X_test.append([word_dict[x] for x in row])

X_train = padding(X_train)

In [48]:
def fast_embed(numerical_list, embed_size):
    result = nd.zeros((len(numerical_list), embed_size), ctx=ctx)
    for i, idx in enumerate(numerical_list):
        if(idx == -1):
            continue
        if(inv_word_dict[idx] not in model_words):
            result[i] = nd.random_normal(shape=(1,embed_size), ctx=ctx) * .01
            continue
        if(inv_word_dict[idx] == '<SOS>'):
            result[i] = nd.ones((1, embed_size), ctx=ctx)
            continue
        if(inv_word_dict[idx] == '<EOS>'):
            result[i] = -nd.ones((1, embed_size), ctx=ctx)
            continue
        result[i] = model[inv_word_dict[idx]]
    return result

In [49]:
dataset = []
for row in X_train:
    dataset.append(fast_embed(row[:-1], embed_size))

temp = nd.zeros((len(dataset), dataset[0].shape[0], dataset[0].shape[1]), ctx=ctx)
for i in range(len(temp)):
    temp[i] = dataset[i]
dataset = temp

In [50]:
batch_size = 64
seq_length = len(dataset[0])
print('# of sequences in dataset: ', len(dataset))
num_batches = len(dataset) // batch_size
print('# of batches: ', num_batches)
train_data = dataset[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, embed_size))
train_data = nd.swapaxes(train_data, 0, 1)
train_data = nd.swapaxes(train_data, 1, 2)
print('Shape of data set: ', train_data.shape)

# of sequences in dataset:  3610
# of batches:  56
Shape of data set:  (56L, 19L, 64L, 300L)


In [51]:
X_label = []
for i in X_train:
    X_label.append(i[1:])
    
labels = []
for row in X_label:
    labels.append(one_hots(row, vocab_size))

temp = nd.zeros((len(labels), labels[0].shape[0], labels[0].shape[1]), ctx=ctx)
for i in range(len(temp)):
    temp[i] = labels[i]
labels = temp

In [52]:
train_labels = labels[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, vocab_size))
train_labels = nd.swapaxes(train_labels, 0, 1)
train_labels = nd.swapaxes(train_labels, 1, 2)
print('Shape of data set: ', train_labels.shape)

Shape of data set:  (56L, 19L, 64L, 6694L)


## For training model, run this cell (For testing only, run below cell)

In [33]:
num_inputs = embed_size
num_hidden = 128
num_outputs = vocab_size
Wxh_model_2 = nd.random_normal(shape=(num_inputs,num_hidden), ctx=ctx) * .01
Whh_model_2 = nd.random_normal(shape=(num_hidden,num_hidden), ctx=ctx) * .01
bh_model_2 = nd.random_normal(shape=num_hidden, ctx=ctx) * .01
Why_model_2 = nd.random_normal(shape=(num_hidden,num_outputs), ctx=ctx) * .01
by_model_2 = nd.random_normal(shape=num_outputs, ctx=ctx) * .01

params_model_2 = [Wxh_model_2, Whh_model_2, bh_model_2, Why_model_2, by_model_2]

for param in params_model_2:
    param.attach_grad()


## For simply testing without training run this cell instead of above

In [53]:
# Loading the model
num_inputs = embed_size
num_hidden = 128
num_outputs = vocab_size

Wxh_model_2 = mxnet.ndarray.load("./weights/Wxh_model_2")[0]
Whh_model_2 = mxnet.ndarray.load("./weights/Whh_model_2")[0]
bh_model_2 = mxnet.ndarray.load("./weights/bh_model_2")[0]
Why_model_2 = mxnet.ndarray.load("./weights/Why_model_2")[0]
by_model_2 = mxnet.ndarray.load("./weights/by_model_2")[0]

params_model_2 = [Wxh_model_2, Whh_model_2, bh_model_2, Why_model_2, by_model_2]

In [54]:
def simple_rnn_model_2(inputs, state):
    outputs = []
    h = state
    for X in inputs:
        h_linear = nd.dot(X, Wxh_model_2) + nd.dot(h, Whh_model_2) + bh_model_2
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why_model_2) + by_model_2
        yhat = softmax(yhat_linear)
        outputs.append(yhat)
    return (outputs, h)

In [55]:
def test_last_word_2(in_dataset, ground_data):
    true = 0
    count = 0
    predicted = []
    for i in range(len(in_dataset)):
        if(len(in_dataset[i]) <= 3):
            continue
        state = nd.zeros(shape=(num_hidden), ctx=ctx)
        data_one_hot = in_dataset[i]
        with autograd.record():
            outputs, state = simple_rnn_model_2(data_one_hot, state)
        true += int(outputs[-3].asnumpy().argmax() == ground_data[i][-3])
        count+=1

    print(true, count, true/float(count)*100)

## Training the model

### (For simply testing the pretrained model, please run below cells)

In [56]:
test_dataset = []
for row in X_test:
    test_dataset.append(fast_embed(row[:-1], embed_size))

In [37]:
epochs = 150

learning_rate = 0.9

# state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
for e in range(epochs):
    over_loss = 0
    state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
    for i in range(num_batches):
        data_one_hot = train_data[i]
        label_one_hot = train_labels[i]
        with autograd.record():
            outputs, state = simple_rnn_model_2(data_one_hot, state)
            loss = average_ce_loss(outputs, label_one_hot)
            loss.backward()
        SGD(params_model_2, learning_rate)
        over_loss += np.mean(loss.asnumpy()[0])
    print("Epoch %s. Loss: %s" % (e, over_loss))
    if ((e+1) % 25 == 0):
        test_last_word_2(test_dataset, X_test)

Epoch 0. Loss: 236.27257013320923
Epoch 1. Loss: 201.95642232894897
Epoch 2. Loss: 197.75856757164001
Epoch 3. Loss: 195.5913918018341
Epoch 4. Loss: 194.40653133392334
Epoch 5. Loss: 194.1512849330902
Epoch 6. Loss: 193.5118727684021
Epoch 7. Loss: 192.55285239219666
Epoch 8. Loss: 192.4698624610901
Epoch 9. Loss: 192.3498260974884
Epoch 10. Loss: 191.37185883522034
Epoch 11. Loss: 191.59667539596558
Epoch 12. Loss: 189.77130699157715
Epoch 13. Loss: 189.27198696136475
Epoch 14. Loss: 187.38207125663757
Epoch 15. Loss: 187.6641161441803
Epoch 16. Loss: 184.74856209754944
Epoch 17. Loss: 184.12769317626953
Epoch 18. Loss: 182.73721504211426
Epoch 19. Loss: 181.81715846061707
Epoch 20. Loss: 180.76193833351135
Epoch 21. Loss: 179.65906429290771
Epoch 22. Loss: 179.03691339492798
Epoch 23. Loss: 178.38765382766724
Epoch 24. Loss: 177.73265075683594
35 684 5.11695906433
Epoch 25. Loss: 177.01154327392578
Epoch 26. Loss: 176.49986004829407
Epoch 27. Loss: 175.9004726409912
Epoch 28. Loss: 

## Results for training accuracy

In [57]:
train_dataset = []
for row in X_train_list:
    train_dataset.append(fast_embed(row[:-1], embed_size))
    
print("Train Results: (True, number_of_samples, percentage)")
test_last_word_2(train_dataset, X_train_list)


Train Results: (True, number_of_samples, percentage)
612 3608 16.9623059867


## Results for test accuracy

In [58]:
test_dataset = []
for row in X_test:
    test_dataset.append(fast_embed(row[:-1], embed_size))

print("Test Results: (True, number_of_samples, percentage)")
test_last_word_2(test_dataset, X_test)

Test Results: (True, number_of_samples, percentage)
40 684 5.84795321637


### Saving model parameters (please don't run while testing)

In [40]:
mxnet.ndarray.save("Wxh_model_2", Wxh_model_2)
mxnet.ndarray.save("Whh_model_2", Whh_model_2)
mxnet.ndarray.save("bh_model_2", bh_model_2)
mxnet.ndarray.save("Why_model_2", Why_model_2)
mxnet.ndarray.save("by_model_2", by_model_2)

In [41]:
word_dict["n't"]

KeyError: "n't"