# 模仿《安娜卡列尼娜》的文本生成代码，构建一个平凡的世界的生成模型

In [1]:
import time
from collections import namedtuple
import numpy as np
import pandas as pd
import jieba
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Instructions for updating:
non-resource variables are not supported in the long term


## 1 数据加载与预处理

In [39]:
with open ('./mini-data.txt') as f:
    text=f.read()
text=text.split('\n\n')
vocab=[]
# 对文本进行结巴分词处理
for sentence in text:
    seg=jieba.lcut(sentence)
    vocab.append(seg)
#     降维
vocab=sum(vocab,[])
vocab_set=set(vocab)

In [40]:
vocab_to_int={c:i for i,c in enumerate(vocab_set)}
int_to_vocab=dict(enumerate(vocab_set))
# 将分词后的文本转化为整数
encoded=np.array([vocab_to_int[c] for c in vocab],dtype=np.int32)

In [89]:
print(vocab_to_int.get('.'))

None


## 2 数据集划分

#### 我们定义一个batch中的序列个数为N（batch_size），定义单个序列长度为M（也就是我们的steps）。那么实际上我们每个batch是一个N x M的数组。在这里我们重新定义batch_size为一个N x M的数组，而不是batch中序列的个数。在上图中，当我们设置N=2， M=3时，我们可以得到每个batch的大小为2 x 3 = 6个字符，整个序列可以被分割成12 / 6 = 2个batch。

In [49]:
def get_batches(arr,n_seqs,n_steps):
    '''
    对已有的数组进行mini-batch划分
    arr:带划分的数组
    n_seqs:一个batch中的序列个数
    n_steps:单个序列包含的字符数
    '''
    batch_size=n_seqs * n_steps
    n_batches=int(len(arr) / batch_size)
#     保留完整的batch
    arr=arr[:batch_size * n_batches]
    arr=arr.reshape(n_seqs,-1)
    
    for n in range(0,arr.shape[1],n_steps):
    # inputs
        x = arr[:, n:n+n_steps]
        # targets
        y = np.zeros_like(x)
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
        yield x, y  

In [50]:
batches=get_batches(encoded,10,50)
x,y=next(batches)

In [51]:
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])
print(x.shape)
print(y.shape)

x
 [[1514 1122 1122 1122 1122  339  161 1931  372  398]
 [1256 1902 1825  529  328 1721  358  795  473 1991]
 [1907  208  747 2031  953 1066 1939 1240 1943 1506]
 [1994  148 1768  152 2422 1452 1074 2334  152 1359]
 [1415  441 2012  358  534 1290 1638  723  590  445]
 [2031 1148 1640 2123 1109 2436 1823 1493  168  420]
 [1940  839 1122 1122 1122 1122 1531 2437  549 1176]
 [ 843 1646 1673 1832  358  118 1991  669 1480 1266]
 [1531 2386 2031  657 2001  469 1067 1411  839 1531]
 [ 358 1060  152  358 1722  803 1726 2334  152  548]]

y
 [[1122 1122 1122 1122  339  161 1931  372  398 2192]
 [1902 1825  529  328 1721  358  795  473 1991 2427]
 [ 208  747 2031  953 1066 1939 1240 1943 1506  732]
 [ 148 1768  152 2422 1452 1074 2334  152 1359  346]
 [ 441 2012  358  534 1290 1638  723  590  445 1498]
 [1148 1640 2123 1109 2436 1823 1493  168  420  358]
 [ 839 1122 1122 1122 1122 1531 2437  549 1176 2334]
 [1646 1673 1832  358  118 1991  669 1480 1266 2334]
 [2386 2031  657 2001  469 1067 1411  

## 3 模型构建
####  模型构建部分主要包括了输入层、LSTM层、输出层、loss、optimize等部分的构建

### 3.1 输入层

In [52]:
def build_inputs(num_seqs,num_steps):
    '''
    构建输入层
    
    num_seqs:每个batch中的序列个数
    num_steps:每个序列包含的字符数
    '''
    inputs=tf.placeholder(tf.int32,shape=(num_seqs,num_steps),name='inputs')
    targets=tf.placeholder(tf.int32,shape=(num_seqs,num_steps),name='targets')  
    
#     加入keep_prob：dropout中保留的概率
    keep_prob=tf.placeholder(tf.float32,name='keep_prob')
    
    return inputs,targets,keep_prob    

### 3.2 LSTM层

In [60]:
def build_lstm(lstm_size,num_layers,batch_size,keep_prob):
    '''
    构建lstm层
    
    keep_prob:dropout保留的概率
    num_layers:lstm的隐藏层的数目
    batch_size
    '''
    def get_a_cell(lstm_size,keep_prob):
        lstm=tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
        drop=tf.nn.rnn_cell.DropoutWrapper(lstm,output_keep_prob=keep_prob)
        return drop
    
#     构建一个基本的lstm单元
    lstm=tf.nn.rnn_cell.BasicLSTMCell(lstm_size) 
    
#     添加dropout
    drop=tf.nn.rnn_cell.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    
#     堆叠:构建多隐层神经网络
# 使用RNN堆叠函数将前面构造的lstm_cell多层堆叠得到cell，堆叠次数为lstm中隐层数目-num_layers
    cell=tf.nn.rnn_cell.MultiRNNCell([get_a_cell(lstm_size,keep_prob) for _ in range(num_layers)])
#     设置LSTM单元的初始化状态为0
    initial_state=cell.zero_state(batch_size,tf.float32)
    
    return cell,initial_state

### 3.3 输出层

In [62]:
def build_output(lstm_output,in_size,out_size):
    '''
    构造输出层
    
    lstm_output:lstm层的输出结果
    in_size:lstm输出层重塑后的size
    out_size:softmax层的size
    '''
    
    seq_output=tf.concat(lstm_output,1)
    x=tf.reshape(seq_output,[-1,in_size])
    
#     将lstm层和softmax层全连接起来
    # 将lstm层与softmax层全连接
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(out_size))
    
    # 计算logits
    logits = tf.matmul(x, softmax_w) + softmax_b
    
    # softmax层返回概率分布
    out = tf.nn.softmax(logits, name='predictions')
    
    return out, logits

### 3.4 训练误差计算

In [55]:
def build_loss(logits, targets, lstm_size, num_classes):
    '''
    根据logits和targets计算损失
    
    logits: 全连接层的输出结果（不经过softmax）
    targets: targets
    lstm_size
    num_classes: vocab_size
        
    '''
    
    # One-hot编码
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped = tf.reshape(y_one_hot, logits.get_shape())
    
    # Softmax cross entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    
    return loss 

### 3.5 Optimizer

In [56]:
def build_optimizer(loss, learning_rate, grad_clip):
    ''' 
    构造Optimizer
   
    loss: 损失
    learning_rate: 学习率
    
    '''
    
    # 使用clipping gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

### 3.6 模型组合
使用tf.nn.dynamic_run来运行RNN序列

In [71]:
class CharRNN:
    
    def __init__(self, num_classes, batch_size=64, num_steps=50, 
                       lstm_size=128, num_layers=2, learning_rate=0.001, 
                       grad_clip=5, sampling=False):
    
        # 如果sampling是True，则采用SGD
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps

        tf.reset_default_graph()
        
        # 输入层
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # LSTM层
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)
#         print('@@@@@@@@@@',cell)
#         print('##########',self.initial_state)
        # 对输入进行one-hot编码
        x_one_hot = tf.one_hot(self.inputs, num_classes)
#         print('$$$$$$$$$$',x_one_hot)
        # 运行RNN
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
#         print('&&&&&&&&&&',state)
        print('output',outputs)
        self.final_state = state
        
        # 预测结果
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        # Loss 和 optimizer (with gradient clipping)
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

## 4 模型训练
参数设置
在模型训练之前，我们首先初始化一些参数，我们的参数主要有：

num_seqs: 单个batch中序列的个数

num_steps: 单个序列中字符数目

lstm_size: 隐层结点个数

num_layers: LSTM层个数

learning_rate: 学习率

keep_prob: dropout层中保留结点比例

In [72]:
batch_size = 100         # Sequences per batch
num_steps = 100          # Number of sequence steps per batch
lstm_size = 512         # Size of hidden layers in LSTMs
num_layers = 2          # Number of LSTM layers
learning_rate = 0.001    # Learning rate
keep_prob = 0.5         # Dropout keep probability
epochs = 20
# 每n轮进行一次变量保存
save_every_n = 200

In [121]:
model = CharRNN(len(vocab_set), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
#         print('@@@@@@@@@@@')
        loss = 0
#         print('@@@@@@@@@@@')
        for x, y in get_batches(encoded, batch_size, num_steps):
            print('counter:',counter)
            print('@@@@@@@@@@@')
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            
            end = time.time()
            # control the print lines
            print('counter:',counter)
            if counter % 100 == 0:
                print('轮数: {}/{}... '.format(e+1, epochs),
                      '训练步数: {}... '.format(counter),
                      '训练误差: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)))

            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

output Tensor("rnn/transpose_1:0", shape=(100, 100, 512), dtype=float32)


In [116]:
# 查看checkpoints
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints/i0_l512.ckpt"
all_model_checkpoint_paths: "checkpoints/i0_l512.ckpt"

## 5 文本生成

In [75]:
def pick_top_n(preds, vocab_size, top_n=5):
    """
    从预测结果中选取前top_n个最可能的字符
    
    preds: 预测结果
    vocab_size
    top_n
    """
    p = np.squeeze(preds)
    # 将除了top_n个预测值的位置都置为0
    p[np.argsort(p)[:-top_n]] = 0
    # 归一化概率
    p = p / np.sum(p)
    # 随机选取一个字符
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [109]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="同村 "):
    """
    生成新文本
    
    checkpoint: 某一轮迭代的参数文件
    n_sample: 新闻本的字符长度
    lstm_size: 隐层结点数
    vocab_size
    prime: 起始文本
    """
    # 将输入的单词转换为单个字符组成的list
#     samples = [c for c in prime]
    samples=prime
    # sampling=True意味着batch的size=1 x 1
    model = CharRNN(len(vocab_set), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # 加载模型参数，恢复训练
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
#         for c in prime:
#             x = np.zeros((1, 1))
#             # 输入单个字符
#             x[0,0] = vocab_to_int[c]
#             feed = {model.inputs: x,
#                     model.keep_prob: 1.,
#                     model.initial_state: new_state}
#             preds, new_state = sess.run([model.prediction, model.final_state], 
#    
        x = np.zeros((1, 1))
        # 输入单个字符
        x[0,0] = vocab_to_int[samples]
        feed = {model.inputs: x,
                model.keep_prob: 1.,
                model.initial_state: new_state}
        preds, new_state = sess.run([model.prediction, model.final_state], 
                                     feed_dict=feed)

        c = pick_top_n(preds, len(vocab_set))
        # 添加字符到samples中
        samples=samples+int_to_vocab[c]
        
#         samples.append(int_to_vocab[c])
        
        # 不断生成字符，直到达到指定数目
        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab_set))
#             samples.append(int_to_vocab[c])
            samples=samples+int_to_vocab[c]
        
    return ''.join(samples)

In [110]:
tf.train.latest_checkpoint('checkpoints')

'checkpoints/i0_l512.ckpt'

In [122]:
# 选用最终的训练参数作为输入进行文本生成
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab_set), prime="同村")
print(samp)

output Tensor("rnn/transpose_1:0", shape=(1, 1, 512), dtype=float32)
INFO:tensorflow:Restoring parameters from checkpoints/i0_l512.ckpt
同村痼疾树木细长细长人长人长这天这天生抽瞥这天笑大众大众父母亲父母亲一枝优越感天地烂脏老丈人烂脏珍爱老丈人咬着牙相互之间黄瘦碗语言地黑只能只能这个二十里卓娅晚上该手润叶姐二十里交往更加更加或者先更加先得书籍甚至而且低下润叶全部剩水全部最住心情开朗住波澜除好饭上面惶恐不安上面两个出山灯谈恋爱没少心疼监察天天行走怒火县立县立美好县立剩剩剩叫化子敏捷地不要紧肮肮脏脏东西往不要紧春天大门口经历拒绝说不出说不出常膝盖满天膝盖找润叶找润叶不会那够蓝莹莹那生气大方养成这学肚子这学走到反动这个这个味道被润叶姐欧洲自有一位一位屙兄弟更加兄弟抬起大众这种影响趔趔趄趄一会村子交到回到交到村子体面武装部带水弥漫着扫视扫视细扫视当然细细笑扫视笑了笑细停止式停止柳树小房一段样子县城高层次和高层次县城铺天盖地铺天盖地铺天盖地相当内心队伍尿尿爽快开身开身仙女仙女非洲1975般的仙女仙女转暖1975弯腰脚长长的一堆弯腰脚默默地脚默默地默默地不体面日子离村不体面上这么这学同龄茶杯茶杯理解砍大方声音指着指着草木使得高更大满脸偷偷很象抚摸抚摸早年间抚摸吃晚饭一段路瞧见成份成份下蚀化下蚀化敏感走出一路泪珠错落看作推荐空中脸色脸色早年间盖早年间盖实在脚上脚上脚上优越感：女生自然自然自然学会来自自然个混合混合混合省粮省粮混合表明表明校园内表明孩子孩子经不住校园内去处去处去处爱读按说莫测本书许多莫测本书本书一下子朦胧朦胧发笑当然鞋样鞋样大厦大厦心理心理枝条什么样蜂涌沟里伙食提着专政惊异不想胳膊窝思维思维一种铁勺小山吃得起原来自己本书浮浅本书仍然四月笑笑笑潦倒哪里脑门农乙菜乙菜无能铁勺亲密一路老染得染得泥水泥水正在不光泥水滑落先不光线绳菜菜借给借给菜菜菜不一会伙食送到泥土通过到来到来门外桌子黄瘦关注光彩踏泥断定省钱空无一人踏泥这个善意善意好人好事边沿发展晚上晚上轰象刚微小一种一点迹象一点三月整天背呆等等背纷纷人物仍然包围既见共骑各个各个哥共骑克制克制留有破烂砖墙明晃晃耿直如同如同影响影响血涌村子情感村子哥哥、同校只能各个各个低人一等不体面念家境念欧