### CharCNN

Code source: https://github.com/L1aoXingyu/Char-RNN-Gluon

#### Data Preprocessing

In [4]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import codecs
import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet import gluon as g


class TextConverter(object):
    def __init__(self, text_path, max_vocab=5000):
        ## Read in data(abstract):
        with codecs.open(text_path, mode='r', encoding='utf-8') as f:
            text_file = f.readlines()
        word_list = [v for s in text_file for v in s]
        vocab = set(word_list)
        # get word frequency
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in word_list:
            vocab_count[word] += 1
        vocab_count_list = []
        for word in vocab_count:
            vocab_count_list.append((word, vocab_count[word]))
        vocab_count_list.sort(key=lambda x: x[1], reverse=True)
        # if exceeds max word length, delete words with least word frequency 
        if len(vocab_count_list) > max_vocab:
            vocab_count_list = vocab_count_list[:max_vocab]
        vocab = [x[0] for x in vocab_count_list]
        self.vocab = vocab

        self.word_to_int_table = {c: i for i, c in enumerate(self.vocab)}
        self.int_to_word_table = dict(enumerate(self.vocab))

    @property
    def vocab_size(self):
        return len(self.vocab) + 1

    def word_to_int(self, word):
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)

    def int_to_word(self, index):
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('Unknow index!')

    def text_to_arr(self, text):
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)

    def arr_to_text(self, arr):
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return "".join(words)


class TextData(g.data.Dataset):
    def __init__(self, text_path, n_step, arr_to_idx):
        self.n_step = n_step

        with codecs.open(text_path, mode='r', encoding='utf-8') as f:
            data = f.readlines()
        text = [v for s in data for v in s]
        num_seq = int(len(text) / n_step)
        self.num_seq = num_seq
        text = text[:num_seq * n_step]  # cut th
        arr = arr_to_idx(text)
        arr = arr.reshape((num_seq, -1))
        self.arr = arr

    def __getitem__(self, index):
        x = self.arr[index, :]
        y = np.zeros(x.shape)
        y[:-1], y[-1] = x[1:], x[0]
        return nd.array(x), nd.array(y)

    def __len__(self):
        return self.num_seq

  from ._conv import register_converters as _register_converters


In [7]:
import mxnet as mx
from mxnet import nd
from mxnet import gluon as g


class CharRNN(g.Block):
    def __init__(self, num_classes, embed_dim, hidden_size, num_layers,
                 dropout):
        super(CharRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        with self.name_scope():
            self.word_to_vec = g.nn.Embedding(num_classes, embed_dim)
            self.rnn = g.rnn.GRU(hidden_size, num_layers, dropout=dropout)
            self.proj = g.nn.Dense(num_classes)

    def forward(self, x, hs=None):
        batch = x.shape[0]
        if hs is None:
            hs = nd.zeros(
                (self.num_layers, batch, self.hidden_size), ctx=mx.gpu())
        word_embed = self.word_to_vec(x)  # batch x len x embed
        word_embed = word_embed.transpose((1, 0, 2))  # len x batch x embed
        out, h0 = self.rnn(word_embed, hs)  # len x batch x hidden
        le, mb, hd = out.shape
        out = out.reshape((le * mb, hd))
        out = self.proj(out)
        out = out.reshape((le, mb, -1))
        out = out.transpose((1, 0, 2))  # batch x len x hidden
        return out.reshape((-1, out.shape[2])), h0

In [9]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import os
import sys
import time

import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet import gluon as g



def train_epoch(ctx, model, dataloader, criterion, optimizer, clip):
    running_loss = 0.0
    n_total = 0.0
    for batch in dataloader:
        x, y = batch
        y = y.astype('float32').as_in_context(ctx).swapaxes(0, 1)
        x = x.as_in_context(ctx).swapaxes(0, 1)
        mb_size = x.shape[0]
        with g.autograd.record():
            out = model(x)
            batch_loss = criterion(out, y)
        batch_loss.backward()

        grads = [i.grad(ctx) for i in model.collect_params().values()]
        total_norm = g.utils.clip_global_norm(grads, clip * y.shape[0] * y.shape[1])

        if np.isfinite(total_norm):
            optimizer.step(mb_size)
            running_loss += nd.sum(batch_loss).asscalar()
            n_total += mb_size
        else:
            raise UserWarning('nan/inf detected. skipping batch')
    return running_loss / n_total


def train(ctx, n_epoch, model, dataloader, optimizer, criterion, clip):
    for e in range(n_epoch):
        print('{}/{}'.format(e + 1, n_epoch))
        since = time.time()
        loss = train_epoch(ctx, model, dataloader, criterion, optimizer, clip)
        print('Loss: {:.6f}, Time: {:.3} s'.format(loss, time.time() - since))
        if (e + 1) % 1000 == 0:
            if not os.path.exists('./checkpoints'):
                os.mkdir('./checkpoints')
            model.save_params('./checkpoints/model_{}.params'.format(e + 1))


def pick_top_n(preds, top_n=5):
    top_pred_prob, top_pred_label = nd.topk(preds, axis=2, k=top_n, ret_typ='both')
    top_pred_label = top_pred_label.asnumpy()
    top_pred_prob /= nd.sum(top_pred_prob, axis=2, keepdims=True)
    top_pred_prob = top_pred_prob.asnumpy().reshape((-1, ))
    top_pred_label = top_pred_label.reshape((-1, ))
    c = np.random.choice(top_pred_label, size=1, p=top_pred_prob)
    return c


def sample(ctx, model, checkpoint, convert, arr_to_text, prime, text_len=20):
    '''
    将载入好权重的模型读入，指定开始字符和长度进行生成，将生成的结果保存到txt文件中
    checkpoint: 载入的模型
    convert: 文本和下标转换
    prime: 起始文本
    text_len: 生成文本长度
    '''
    model.load_params(checkpoint, ctx=ctx)
    samples = [convert(c) for c in prime]
    input_txt = nd.array(samples).reshape((-1 ,1)).as_in_context(ctx)
    embed = model[0](input_txt)
    hs = nd.zeros(model[1].state_info(1)[0]['shape'], ctx=ctx)
    _, init_state = model[1](embed, hs)

    result = samples
    model_input = input_txt[:, input_txt.shape[1] - 1].reshape((-1, 1))
    for i in range(text_len):
        # out是输出的字符，大小为1 x vocab
        # init_state是RNN传递的hidden state
        with mx.autograd.predict_mode():
            embed = model[0](model_input)
            out, init_state = model[1](embed, init_state)
            out = model[2](out)
        pred = pick_top_n(out)
        model_input = nd.array(pred).reshape((-1, 1)).as_in_context(ctx)
        result.append(pred[0])
    return arr_to_text(result)


def main():
    '''main function'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--state', required=True, help='训练还是预测, train or eval')
    parser.add_argument('--txt', required=True, help='进行训练的txt文件')
    parser.add_argument('--batch', default=128, type=int, help='训练的batch size')
    parser.add_argument('--epoch', default=5000, type=int, help='跑多少个epoch')
    parser.add_argument('--len', default=100, type=int, help='输入模型的序列长度')
    parser.add_argument(
        '--max_vocab', default=5000, type=int, help='最多存储的字符数目')
    parser.add_argument('--embed', default=512, type=int, help='词向量的维度')
    parser.add_argument('--hidden', default=512, type=int, help='RNN的输出维度')
    parser.add_argument('--n_layer', default=2, type=int, help='RNN的层数')
    parser.add_argument(
        '--dropout', default=0.5, type=float, help='RNN中drop的概率')
    parser.add_argument('--begin', default='我', type=lambda s: unicode(s, 'utf8'), help='给出生成文本的开始')
    parser.add_argument('--pred_len', default=20, type=int, help='生成文本的长度')
    parser.add_argument('--checkpoint', help='载入模型的位置')
    parser.add_argument('--clip', default=0.2, type=float, help='权重上限')
    parser.add_argument('--use-gpu', default=True, help='是否使用的GPU')
    opt = parser.parse_args()
    print(opt)

    convert = TextConverter(opt.txt, max_vocab=opt.max_vocab)
    model = g.nn.Sequential()
    with model.name_scope():
        model.add(g.nn.Embedding(convert.vocab_size, opt.embed))
        model.add(g.rnn.GRU(opt.hidden, opt.n_layer, dropout=opt.dropout))
        model.add(g.nn.Dense(convert.vocab_size, flatten=False))

    ctx = mx.gpu(0) if opt.use_gpu else mx.cpu()
    model.initialize(ctx=ctx)

    if opt.state == 'train':
        dataset = TextData(opt.txt, opt.len, convert.text_to_arr)
        dataloader = g.data.DataLoader(dataset, opt.batch, shuffle=True)
        lr_sch = mx.lr_scheduler.FactorScheduler(
            int(1000 * len(dataloader)), factor=0.1)
        optimizer = g.Trainer(model.collect_params(), 'adam', {
            'learning_rate': 1e-3,
            'clip_gradient': 3,
            'lr_scheduler': lr_sch
        })
        cross_entropy = g.loss.SoftmaxCrossEntropyLoss()
        train(ctx, opt.epoch, model, dataloader, optimizer, cross_entropy, opt.clip)

    elif opt.state == 'eval':
        pred_text = sample(ctx, model, opt.checkpoint, convert.word_to_int,
                           convert.arr_to_text, opt.begin, opt.pred_len)
        print(pred_text)
        with open('./generate.txt', 'a') as f:
            f.write(pred_text)
            f.write('\n')
    else:
        print('Error state, must choose from train and eval!')


if __name__ == '__main__':
    main()

NameError: name 'unicode' is not defined