In [1]:
import mxnet as mx
import numpy as np
import random
import bisect

In [2]:
import logging
import importlib
# Reload the logging module before configuring it: logging.basicConfig does
# nothing when the root logger already has handlers, which is common in a
# long-lived notebook kernel. importlib.reload replaces imp.reload because the
# `imp` module was removed in Python 3.12.
importlib.reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

# A glance at the LSTM structure and embedding layer


<img src="http://webdocs.cs.ualberta.ca/~bx3/char-rnn_1.png">


<img src="http://webdocs.cs.ualberta.ca/~bx3/char-rnn_2.png">

In [3]:
from lstm import lstm_unroll, lstm_inference_symbol
from bucket_io import BucketSentenceIter
from rnn_model import LSTMInferenceModel

# Helper functions


In [4]:
# Read the whole training document
def read_content(path):
    """Return the entire contents of the text file at ``path`` as one string.

    The file is opened in text mode with the platform default encoding and
    is closed again before the function returns.
    """
    with open(path) as handle:
        return handle.read()

def build_vocab(path):
    """Build a character-level vocabulary from the text file at ``path``.

    Every distinct character in the file is assigned an integer id in order
    of first appearance. Ids start at 1; 0 is left for zero-padding.

    Returns:
        dict mapping each character to its integer id.
    """
    the_vocab = {}
    # Iterating over a string yields one-character strings, so there is no
    # empty token to skip.
    for char in read_content(path):
        if char not in the_vocab:
            # The next free id is always one more than the current size,
            # which keeps id 0 unused.
            the_vocab[char] = len(the_vocab) + 1
    return the_vocab

def text2id(sentence, the_vocab):
    """Encode ``sentence`` as a list of vocabulary ids.

    ``sentence`` is iterated element by element (character by character for
    a string); empty elements are skipped. Raises ``KeyError`` when an
    element is missing from ``the_vocab``.
    """
    ids = []
    for token in sentence:
        if len(token) > 0:
            ids.append(the_vocab[token])
    return ids

In [5]:
# Evaluation metric
def Perplexity(label, pred):
    """Perplexity of the predictions ``pred`` against the targets ``label``.

    For every row ``i`` of ``pred`` the probability at column
    ``int(label[i])`` is looked up, floored at 1e-10 so the logarithm stays
    finite, and its negative log is accumulated. The result is the
    exponential of that sum divided by ``label.size``.
    """
    floor = 1e-10
    neg_log_likelihood = 0.
    for row in range(pred.shape[0]):
        target = int(label[row])
        neg_log_likelihood -= np.log(max(floor, pred[row][target]))
    return np.exp(neg_log_likelihood / label.size)

# LSTM Hyperparameters

In [6]:
# Batch size: used for the data iterator, the initial-state shapes and the
# Speedometer callback.
batch_size = 32
# Bucket lengths handed to BucketSentenceIter; a single bucket of length 129.
buckets = [129]
# Second dimension of every initial LSTM state, shape (batch_size, num_hidden).
num_hidden = 512
# Forwarded as num_embed to lstm_unroll / LSTMInferenceModel
# (presumably the embedding width — confirm in lstm.py).
num_embed = 256
# Number of LSTM layers; one init_c and one init_h entry is created per layer.
num_lstm_layer = 3

In [7]:
# Optimisation settings forwarded to mx.model.FeedForward.
# Number of training epochs (the training log shows Epoch[0] and Epoch[1]).
num_epoch = 2
learning_rate = 0.01
momentum = 0.0

In [8]:
# Training devices: range(1) yields a one-element list containing only GPU 0.
devs = [mx.context.gpu(i) for i in range(1)]

In [9]:
# Character -> integer id mapping built from the training text
# (ids start at 1; 0 is left for zero-padding).
vocab = build_vocab("./obama.txt")

In [10]:
# Symbol factory for the training network
def sym_gen(seq_len):
    """Return the symbol produced by ``lstm_unroll`` for length ``seq_len``.

    Both the input size and the number of labels are ``len(vocab) + 1``:
    the extra slot accounts for id 0, which the vocabulary leaves free for
    zero-padding. Layer count, hidden size and embedding size come from the
    notebook-level hyperparameters; dropout is fixed at 0.2.
    """
    vocab_size = len(vocab) + 1
    return lstm_unroll(num_lstm_layer, seq_len, vocab_size,
                       num_hidden=num_hidden, num_embed=num_embed,
                       num_label=vocab_size, dropout=0.2)

In [11]:
# Initial states for the LSTM: one (name, shape) pair per layer for the "c"
# state and one for the "h" state, each of shape (batch_size, num_hidden).
init_c = [('l%d_init_c' % layer, (batch_size, num_hidden))
          for layer in range(num_lstm_layer)]
init_h = [('l%d_init_h' % layer, (batch_size, num_hidden))
          for layer in range(num_lstm_layer)]
# All "c" entries first, followed by all "h" entries.
init_states = init_c + init_h

In [12]:
# Training data iterator over "./obama.txt", using the vocabulary, bucket
# lengths, batch size and initial LSTM states defined in earlier cells.
# The notebook's own read_content / text2id helpers are passed in for reading
# and encoding the text. Presumably the text is split into sequences on '\n'
# (from seperate_char) — confirm in bucket_io.
# NOTE: `seperate_char` is the keyword name used in this call; it is an
# identifier, so its spelling must not be "corrected" here.
data_train = BucketSentenceIter("./obama.txt", vocab, buckets, batch_size,
                                init_states, seperate_char='\n',
                                text2id=text2id, read_content=read_content)

bucket of len 129 : 8290 samples


  bucket_plan = np.hstack([np.zeros(n, int)+i for i, n in enumerate(bucket_n_batches)])


In [13]:
# Build the training symbol for the first (and only) bucket length.
symbol = sym_gen(buckets[0])

# Train model

In [14]:
# Training model: wraps the LSTM symbol in mx.model.FeedForward, running on
# the devices in `devs` with the optimisation settings from the cells above.
# Weights are initialised with Xavier(factor_type="in", magnitude=2.34).
model = mx.model.FeedForward(ctx=devs,
                             symbol=symbol,
                             num_epoch=num_epoch,
                             learning_rate=learning_rate,
                             momentum=momentum,
                             wd=0.0001,
                             initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))

In [15]:
# Train on data_train, reporting the Perplexity function above as the metric.
# The Speedometer callback produces the periodic "Speed: ... samples/sec" lines
# (every 50 batches in the log), and do_checkpoint("obama") saves a checkpoint
# at the end of each epoch (the log shows "obama-0001.params").
model.fit(X=data_train,
          eval_metric = mx.metric.np(Perplexity),
          batch_end_callback=mx.callback.Speedometer(batch_size, 50),
          epoch_end_callback=mx.callback.do_checkpoint("obama"))

09:10:19 INFO:Start training with [gpu(0)]
09:10:39 INFO:Epoch[0] Batch [50]	Speed: 88.50 samples/sec	Train-Perplexity=31.854990
09:10:57 INFO:Epoch[0] Batch [100]	Speed: 86.90 samples/sec	Train-Perplexity=27.432798
09:11:15 INFO:Epoch[0] Batch [150]	Speed: 86.80 samples/sec	Train-Perplexity=25.817704
09:11:34 INFO:Epoch[0] Batch [200]	Speed: 86.90 samples/sec	Train-Perplexity=24.971366
09:11:52 INFO:Epoch[0] Batch [250]	Speed: 86.85 samples/sec	Train-Perplexity=24.455250
09:11:56 INFO:Epoch[0] Resetting Data Iterator
09:11:56 INFO:Epoch[0] Train-Perplexity=24.386388
09:11:56 INFO:Epoch[0] Time cost=95.259
09:11:56 INFO:Saved checkpoint to "obama-0001.params"
09:12:14 INFO:Epoch[1] Batch [50]	Speed: 88.60 samples/sec	Train-Perplexity=22.404177
09:12:33 INFO:Epoch[1] Batch [100]	Speed: 86.79 samples/sec	Train-Perplexity=22.363506
09:12:51 INFO:Epoch[1] Batch [150]	Speed: 86.87 samples/sec	Train-Perplexity=22.321544
09:13:09 INFO:Epoch[1] Batch [200]	Speed: 86.84 samples/sec	Train-Perple

# Inference from model

In [16]:
# helper structure for prediction
def MakeRevertVocab(vocab):
    """Return the inverse of ``vocab``: a dict mapping each id back to its key.

    If several keys share one id, the key that comes last in iteration order
    is kept.
    """
    return {index: char for char, index in vocab.items()}

In [17]:
# Encode one character into the model's input buffer
def MakeInput(char, vocab, arr):
    """Overwrite ``arr`` in place with the vocabulary id of ``char``.

    The id is wrapped in a one-element float64 NumPy array and assigned
    through ``arr[:]``, so ``arr`` keeps its identity. Raises ``KeyError``
    when ``char`` is not in ``vocab``. Returns ``None``.
    """
    arr[:] = np.array([vocab[char]], dtype=np.float64)

In [18]:
# helper function for random sample 
def _cdf(weights):
    total = sum(weights)
    result = []
    cumsum = 0
    for w in weights:
        cumsum += w
        result.append(cumsum / total)
    return result

def _choice(population, weights):
    assert len(population) == len(weights)
    cdf_vals = _cdf(weights)
    x = random.random()
    idx = bisect.bisect(cdf_vals, x)
    return population[idx]

def MakeOutput(prob, vocab, sample=False, temperature=1.):
    """Turn a probability array into the next output character.

    Args:
        prob: 2-D array of probabilities; only row 0 is used, indexed by id.
        vocab: mapping from integer id to character (the reverted vocabulary).
        sample: when falsy, return the character with the highest
            probability; when truthy, draw a character at random from the
            temperature-rescaled distribution.
        temperature: divisor applied to the log-probabilities before
            re-normalising; only used when ``sample`` is truthy.

    Returns:
        The chosen character, or '' when the chosen id has no entry in
        ``vocab`` (id 0 is the padding slot).
    """
    if sample:
        # Position 0 is the padding id and maps to the empty string; the
        # remaining positions follow the ids 1..len(vocab).
        fix_dict = [""] + [vocab[i] for i in range(1, len(vocab) + 1)]
        # Clip away exact 0 and 1 so the logarithm stays finite.
        scale_prob = np.clip(prob, 1e-6, 1 - 1e-6)
        rescale = np.exp(np.log(scale_prob) / temperature)
        rescale[:] /= rescale.sum()
        return _choice(fix_dict, rescale[0, :])

    idx = np.argmax(prob, axis=1)[0]
    try:
        return vocab[idx]
    except (KeyError, IndexError):
        # Only a missing id is treated as "no character"; any other error
        # (for example a wrong type passed as vocab) is allowed to surface.
        return ''

In [19]:
# Load the parameters saved under the "obama" checkpoint prefix at epoch 75.
# Only the second element of the returned triple (arg_params) is kept.
# NOTE(review): this notebook trains for num_epoch = 2 only, so an epoch-75
# checkpoint is not produced by running the cells above; it presumably comes
# from an earlier, longer training run — confirm the file is available.
_, arg_params, __ = mx.model.load_checkpoint("obama", 75)

In [20]:
# Inference model built with the same layer count, sizes and vocabulary size
# as the training symbol, using the parameters loaded from the checkpoint.
# NOTE: this rebinds `model`, which until now referred to the FeedForward
# training model.
model = LSTMInferenceModel(num_lstm_layer, len(vocab) + 1,
                           num_hidden=num_hidden, num_embed=num_embed,
                           num_label=len(vocab) + 1, arg_params=arg_params, dropout=0.2)

In [21]:
# generate a sequence
# Number of prediction steps to run after the seed character.
seq_length = 600
# One-element buffer holding the id of the character fed to the model.
input_ndarray = mx.nd.zeros((1,))
# id -> character mapping used to decode the predictions.
revert_vocab = MakeRevertVocab(vocab)
# Seed text; generation continues from its last character.
output ='T'
MakeInput(output[-1], vocab, input_ndarray)
for i in range(seq_length):
    # The second argument is True only on the first step — presumably it asks
    # the model to start a fresh sequence; confirm in rnn_model.
    prob = model.forward(input_ndarray, i==0)
    # sample=False: always take the most probable next character.
    next_char = MakeOutput(prob, revert_vocab, False)
    output += next_char
    # Feed the last character of the text so far back in as the next input.
    MakeInput(output[-1], vocab, input_ndarray)

    

In [22]:
# Show the generated text: the seed character followed by the predicted characters.
print(output)

The American people have been talking about how to see what we can afford - to give them the chance to start by the conversation that it was the moment when they come home. It's time to start by ally reduced to the oress of our hardiners of oure scandals and his debate about what we can do to change the world as the oned and the onection wo ha the hardineres of tich their school and communism and anounces in the ones and the oure shoression. I will be able to go to college. They say they are going to be a part of the oure as about th
