# Language model example

## Step 1: Load gluon

In [1]:
import argparse
import collections
import time
import math
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import data, text
from mxnet.gluon.model_zoo.text.lm import StandardRNN, AWDRNN

  from ._conv import register_converters as _register_converters


## Step 2: Load language model parameters

In [4]:
## Pretrained LM parameter set
mode = 'lstm'
emsize = 400
nhid = 1150
nlayers = 3
dropout = 0.4
dropout_h = 0.3
dropout_i = 0.4
dropout_e = 0.1
weight_dropout = 0.65
tied = True
## SA parameter set
nclass = 2
## Hyper parameters
lr = 0.01
epochs = 10
bptt = 35
## Batching
batch_size = 32
## Settings the model-building and training cells read from `args` but that
## were never defined in this notebook.
## NOTE(review): these values follow the defaults of MXNet's
## word_language_model example script -- confirm they suit this experiment.
clip = 0.2
log_interval = 200
save = 'model.params'
gctype = 'none'
gcthreshold = 0.5

# The later cells were lifted from a command-line script and access their
# settings as `args.<name>`. Bundle the notebook-level values into a namespace
# so those references resolve without having to parse a command line.
args = argparse.Namespace(
    weight_dropout=weight_dropout,
    lr=lr,
    epochs=epochs,
    bptt=bptt,
    batch_size=batch_size,
    clip=clip,
    log_interval=log_interval,
    save=save,
    gctype=gctype,
    gcthreshold=gcthreshold,
)

## Step 3: Load training, validation, and test datasets

In [9]:
# Keep the compute device(s) in a list: the evaluation and training code
# indexes (`context[0]`) and iterates (`for ctx in context`) this value and
# hands it to `gluon.utils.split_and_load`, so a bare Context object would fail.
# NOTE(review): GPU index 2 is hard-coded -- adjust to the devices available.
context = [mx.gpu(2)]

# Training split of WikiText-2, requested with samples of `bptt` tokens and
# '<eos>' as the end-of-sentence token.
train_dataset = data.text.WikiText2(segment='train', seq_len=bptt, eos='<eos>')

def get_frequencies(dataset):
    """Count how often each token occurs in the first field of every sample.

    Parameters
    ----------
    dataset : iterable
        Iterable of indexable samples; only element ``[0]`` of each sample is
        read, and it must itself be iterable.

    Returns
    -------
    collections.Counter
        Occurrence count per item, with falsy items (e.g. empty strings)
        left out.
    """
    counts = collections.Counter()
    for sample in dataset:
        # Only the first field of each sample contributes to the counts.
        counts.update(token for token in sample[0] if token)
    return counts

# Build the vocabulary from the training-split token counts, reserving
# entries for the end-of-sentence and padding tokens.
vocab = text.vocab.Vocabulary(
    get_frequencies(train_dataset),
    reserved_tokens=['<eos>', '<pad>'],
)

In [11]:
# Number of samples in the training split (shown as this cell's output).
len(train_dataset)

59306

In [7]:
# Persist the vocabulary as JSON so it can be reloaded later.
# The file must be opened for writing: the default mode is read-only ('r'),
# in which `write` raises and a missing file cannot be created.
with open('./wikitext2_vocab.json', 'w') as vocab_file:
    vocab_file.write(vocab.json_serialize())

In [10]:
def index_tokens(data, label):
    """Look up `data` and `label` in the module-level `vocab`.

    Used as the per-sample transform for the datasets, so the loaders yield
    whatever `vocab[...]` returns for each field instead of the raw values.

    Returns
    -------
    tuple
        ``(vocab[data], vocab[label])``.
    """
    data_ids = vocab[data]
    label_ids = vocab[label]
    return data_ids, label_ids

# Validation and test splits, loaded with the same settings as the training split.
val_dataset, test_dataset = [
    data.text.WikiText2(segment=segment, seq_len=bptt, eos='<eos>')
    for segment in ['val', 'test']
]


def _make_loader(dataset):
    """Build the batch count and DataLoader for one dataset split.

    The three splits previously used three copy-pasted constructions that
    differed only in the dataset; this helper keeps them in sync.

    Parameters
    ----------
    dataset
        A dataset supporting ``len()`` and ``.transform(fn)``.

    Returns
    -------
    tuple
        ``(nbatch, loader)`` where ``nbatch = len(dataset) // batch_size`` and
        ``loader`` yields index-encoded batches of ``batch_size`` samples,
        dropping the final incomplete batch.
    """
    nbatch = len(dataset) // batch_size
    # The sampler is given the dataset length and `nbatch` as its interval,
    # exactly as in the original per-split code.
    sampler = gluon.contrib.data.IntervalSampler(len(dataset), nbatch)
    loader = gluon.data.DataLoader(dataset.transform(index_tokens),
                                   batch_size=batch_size,
                                   sampler=sampler,
                                   last_batch='discard')
    return nbatch, loader


nbatch_train, train_data = _make_loader(train_dataset)
nbatch_val, val_data = _make_loader(val_dataset)
nbatch_test, test_data = _make_loader(test_dataset)

AssertionError: Interval 59306 must be smaller than length 59306

In [12]:
# NOTE(review): this cell re-creates `index_tokens`, `nbatch_train` and
# `train_data`, which the earlier data-loading cell also defines -- it looks
# like a leftover debugging copy; consider removing it.
def index_tokens(data, label):
    """Return ``(vocab[data], vocab[label])`` using the module-level `vocab`."""
    return vocab[data], vocab[label]


# Number of full batches in the training split.
nbatch_train = len(train_dataset) // batch_size

# Training loader: index-encoded samples, interval sampling, and the last
# incomplete batch discarded.
train_data = gluon.data.DataLoader(
    train_dataset.transform(index_tokens),
    batch_size=batch_size,
    sampler=gluon.contrib.data.IntervalSampler(len(train_dataset), nbatch_train),
    last_batch='discard',
)

AssertionError: Interval 59306 must be smaller than length 59306

## Step 4: Build language model

In [None]:
# Vocabulary size (kept for reference; not used further in this cell).
ntokens = len(vocab)

# Pick the model class from the names this notebook actually imports
# (`AWDRNN`, `StandardRNN`); the previous `AWDLSTM` / `RNNModel` names were
# never imported or defined here and raised NameError.
# NOTE(review): the positional argument order is carried over unchanged --
# confirm it against the constructors' signatures.
# NOTE(review): this cell reads its settings from `args`, which must be
# defined before it runs.
if args.weight_dropout:
    model = AWDRNN(mode, vocab, emsize, nhid, nlayers,
                   dropout, dropout_h, dropout_i, dropout_e, weight_dropout,
                   tied)
else:
    model = StandardRNN(mode, vocab, emsize, nhid,
                        nlayers, dropout, tied)

# Xavier-initialise all parameters on the configured device(s).
model.initialize(mx.init.Xavier(), ctx=context)


# Gradient compression is enabled only when a compression type other than
# 'none' is requested.
compression_params = None if args.gctype == 'none' else {'type': args.gctype, 'threshold': args.gcthreshold}
# Plain SGD: no momentum, no weight decay.
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args.lr,
                         'momentum': 0,
                         'wd': 0},
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

## Step 5: Train and evaluate language model

In [None]:
def detach(hidden):
    """Call ``.detach()`` on a hidden state or on each element of one.

    Parameters
    ----------
    hidden
        Either a single object exposing ``.detach()`` or a tuple/list of such
        objects.

    Returns
    -------
    The result of ``hidden.detach()`` for a single object, or a new ``list``
    of detached elements when `hidden` is a tuple or list (a tuple input
    therefore comes back as a list).
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [state.detach() for state in hidden]

def eval(data_source):
    """Compute the average loss of `model` over `data_source`.

    Runs on the first entry of `context`. The hidden state starts at zeros
    and is carried from one batch to the next.

    Parameters
    ----------
    data_source
        Iterable yielding ``(data, target)`` batches; each is moved to the
        evaluation device and transposed before being fed to the model.

    Returns
    -------
    float
        Sum of all loss values divided by the number of loss elements.
    """
    device = context[0]
    state = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=device)
    loss_sum, n_elements = 0.0, 0
    for data, target in data_source:
        inputs = data.as_in_context(device).T
        labels = target.as_in_context(device).T
        output, state = model(inputs, state)
        # Merge the leading two output axes and flatten the labels so the
        # loss is computed per element.
        batch_loss = loss(mx.nd.reshape(output, (-3, -1)),
                          mx.nd.reshape(labels, (-1,)))
        loss_sum += mx.nd.sum(batch_loss).asscalar()
        n_elements += batch_loss.size
    return loss_sum / n_elements

def train():
    """Train `model` for `args.epochs` epochs over `train_data`.

    For every batch the data is split across the devices in `context`, the
    loss is computed and back-propagated per device, gradients are clipped,
    and `trainer` takes one step. Running loss/perplexity is printed every
    `args.log_interval` batches and throughput is printed for every batch.

    After each epoch the model is evaluated on `val_data`; whenever the
    validation loss improves on the best seen so far, the model is also
    evaluated on `test_data` and its parameters are saved to `args.save`.

    Reads the module-level `model`, `trainer`, `loss`, `context`, `args`,
    `train_data`, `val_data`, `test_data` and `nbatch_train`. Returns nothing.
    """
    best_val = float("Inf")
    start_train_time = time.time()
    for epoch in range(args.epochs):
        total_L = 0.0
        start_epoch_time = time.time()
        # One zero-initialised hidden state per device, reset at every epoch.
        hiddens = [model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=ctx) for ctx in context]
        for i, (data, target) in enumerate(train_data):
            start_batch_time = time.time()
            # Transpose both tensors before splitting them across devices.
            data = data.T
            target= target.T
            data_list = gluon.utils.split_and_load(data, context, even_split=False)
            target_list = gluon.utils.split_and_load(target, context, even_split=False)
            # Detach the state carried over from the previous batch so this
            # batch's backward pass does not reach into earlier batches.
            hiddens = [detach(hidden) for hidden in hiddens]
            Ls = []
            with autograd.record():
                for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
                    output, h = model(X, h)
                    Ls.append(loss(mx.nd.reshape(output, (-3, -1)), mx.nd.reshape(y, (-1,))))
                    # Keep the new state for the next batch on this device.
                    hiddens[j] = h
            for L in Ls:
                L.backward()
            for ctx in context:
                grads = [p.grad(ctx) for p in model.collect_params().values()]
                # The clip threshold is scaled by bptt * batch_size --
                # presumably because the loss is summed rather than averaged;
                # TODO confirm.
                gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)

            total_L += sum([mx.nd.sum(L).asscalar() for L in Ls])

            # Periodic report; the very first batch (i == 0) is skipped.
            if i % args.log_interval == 0 and i > 0:
                # Average over the batches accumulated since the last report.
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

            # NOTE(review): this prints on every batch, not only at the log
            # interval -- confirm that this much output is intended.
            print('[Epoch %d Batch %d] throughput %.2f samples/s'%(
                    epoch, i, args.batch_size / (time.time() - start_batch_time)))

        # Wait for all outstanding MXNet operations before timing the epoch.
        mx.nd.waitall()

        print('[Epoch %d] throughput %.2f samples/s'%(
                    epoch, (args.batch_size * nbatch_train) / (time.time() - start_epoch_time)))
        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_epoch_time, val_L, math.exp(val_L)))

        # Checkpoint only when validation loss improves on the best so far.
        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))

    print('Total training throughput %.2f samples/s'%(
                            (args.batch_size * nbatch_train * args.epochs) / (time.time() - start_train_time)))

## Step 6: Train and report language model performance

In [None]:
# Start the wall clock here: `start_pipeline_time` was read by the final
# print but never assigned anywhere in the notebook.
start_pipeline_time = time.time()

train()

# Reload the parameters saved at the best validation loss and score them on
# both held-out splits.
model.collect_params().load(args.save, context)
val_L = eval(val_data)
test_L = eval(test_data)
# The validation line previously labelled its perplexity as "test ppl".
print('Best validation loss %.2f, val ppl %.2f'%(val_L, math.exp(val_L)))
print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
print('Total time cost %.2fs'%(time.time()-start_pipeline_time))