# 基于Seq2Seq生成对联

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Preprocess the data: split every couplet line on a space and write the first
# half to source.txt and the second half to target.txt.
# All three files are managed by one `with` statement so the two output files
# are flushed and closed before any later code reads them back.
with open('对联.txt','r') as f, \
        open('source.txt','w') as source, \
        open('target.txt','w') as target:
    for line in f:
        up_down = line.strip().split(' ')
        if len(up_down) < 2:
            # Blank or malformed line without both halves: skip it so the two
            # output files stay aligned line by line.
            continue
        source.write(up_down[0]+'\n')
        target.write(up_down[1]+'\n')

In [3]:
# Read both halves of the corpus back from disk.
with open('source.txt','r') as f:
    source_data = f.read()
with open('target.txt','r') as f:
    target_data = f.read()

# Separate the first and second halves into one list entry per line.
source = source_data.split('\n')
target = target_data.split('\n')

# A file whose last line ends with '\n' makes split('\n') produce a trailing
# empty string; drop only that final entry so no empty sample is created and
# the two lists stay aligned.
if source[-1] == '':
    source.pop()
if target[-1] == '':
    target.pop()

In [7]:
# A seq2seq model cannot take characters as input; the input must be numeric,
# so the text is represented with integer ids.
# First build the character-to-id and id-to-character dictionaries.
def extract_character_vocab(data):
    '''
    Build the character vocabulary for a collection of text lines.

    Args:
        data: iterable of strings; every character of every string becomes
            a vocabulary entry.

    Returns:
        (voc_to_int, int_to_voc): a dict mapping each token to its integer id
        and the inverse dict mapping each id back to its token. Ids 0-3 are
        the special tokens; the remaining ids follow sorted character order.
    '''
    special_words = ['<PAD>','<UNK>','<GO>','<EOS>']
    # <PAD> is used for padding; <UNK> replaces unseen or low-frequency
    # characters; <GO> and <EOS> are used on the decoder side.
    # Sort the unique characters: plain set iteration order can differ between
    # interpreter runs, which would make the ids irreproducible.
    set_words = sorted({character for line in data for character in line})
    int_to_voc = dict(enumerate(special_words + set_words))
    voc_to_int = {word: idx for idx, word in int_to_voc.items()}
    return voc_to_int, int_to_voc

# Both sides use a vocabulary built from the combined first and second halves;
# each side still gets its own pair of dictionaries.
_all_lines = source + target
source_letter_to_int, source_int_to_letter = extract_character_vocab(_all_lines)
target_letter_to_int, target_int_to_letter = extract_character_vocab(_all_lines)

In [8]:
# Represent every character by its integer id.
# dict.get(a, b) returns dict[a] when key a exists and b otherwise, so any
# character missing from the vocabulary falls back to the <UNK> id.
def _encode_lines(lines, letter_to_int):
    '''Map each string in `lines` to the list of ids of its characters.'''
    unk_id = letter_to_int['<UNK>']
    return [[letter_to_int.get(ch, unk_id) for ch in text] for text in lines]

source_int = _encode_lines(source, source_letter_to_int)

target_int = _encode_lines(target, target_letter_to_int)

# 按步骤建立Encoder和Decoder模型

In [9]:
# 1. First, build the encoder layer.
def get_encoder_layer(input_data,rnn_size,num_layers,source_sequence_length,source_vocab_size,encoding_embedding_size):
    '''
    Embed the source token ids and run them through a stacked LSTM encoder.

    Args:
        input_data: source token ids passed to the embedding layer
            (presumably an int32 tensor of shape [batch, time] - confirm
            against the caller).
        rnn_size: number of units in each LSTM cell.
        num_layers: number of stacked LSTM cells.
        source_sequence_length: per-example lengths forwarded to
            tf.nn.dynamic_rnn as `sequence_length`.
        source_vocab_size: vocabulary size used for the embedding table.
        encoding_embedding_size: dimension of each embedding vector.

    Returns:
        (encoder_output, encoder_state) as returned by tf.nn.dynamic_rnn.
    '''
    # tf.contrib.layers.embed_sequence represents every token with an
    # embedding vector. The sizes come from this function's own parameters.
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data,source_vocab_size,encoding_embedding_size)

    def get_LSTMCell(rnn_size):
        # LSTMCell accepts a weight initializer; BasicLSTMCell does not.
        return tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))

    cell = tf.contrib.rnn.MultiRNNCell([get_LSTMCell(rnn_size) for _ in range(num_layers)])

    encoder_output,encoder_state = tf.nn.dynamic_rnn(cell,encoder_embed_input,sequence_length=source_sequence_length,dtype = tf.float32)

    return encoder_output,encoder_state


In [10]:
# 2. Next, build the decoder layer.
# The decoder has two stages, training and prediction, which share parameters.
# Each target sentence needs a <GO> token prepended to mark its start, and its
# last token has to be removed.
def process_decoder_input(data,voc_to_int,batch_size):
    '''
    Turn target id sequences into decoder inputs.

    Args:
        data: 2-D tensor of target ids; rows [0, batch_size) are used.
        voc_to_int: token-to-id dict that contains the '<GO>' key.
        batch_size: number of rows to slice and to prepend <GO> to.

    Returns:
        Tensor with a <GO> column concatenated in front of `data` minus its
        last column.
    '''
    # Drop the last token of every target row.
    without_last = tf.strided_slice(data,[0,0],[batch_size,-1],[1,1])
    # tf.fill takes (shape, fill value): one <GO> id per row.
    go_column = tf.fill([batch_size,1],voc_to_int['<GO>'])
    # Prepend the <GO> column along the time axis.
    return tf.concat([go_column,without_last],1)

In [11]:
def get_decoder_layer(target_letter_to_int,decoding_embedding_size,num_layers,rnn_size,
                   target_sequence_length,max_target_sequence_length,encoder_state,decoder_input):
    '''
    Build the training decoder and the prediction decoder, which share weights.

    Args:
        target_letter_to_int: token-to-id dict; must contain '<GO>' and '<EOS>'.
        decoding_embedding_size: dimension of the decoder embedding vectors.
        num_layers: number of stacked LSTM cells.
        rnn_size: number of units in each LSTM cell.
        target_sequence_length: per-example target lengths for the training helper.
        max_target_sequence_length: upper bound on decoding iterations.
        encoder_state: initial state for both decoders.
        decoder_input: target ids already prefixed with <GO>; its first
            dimension is taken as the batch size.

    Returns:
        (training_decoder_output, predicting_decoder_output): the first
        element returned by tf.contrib.seq2seq.dynamic_decode for each decoder.
    '''
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size,decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,decoder_input)

    # 2. Build the RNN cells used by the decoder
    def get_decode_cell(rnn_size):
        # LSTMCell accepts a weight initializer; BasicLSTMCell does not.
        return tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))

    cell = tf.contrib.rnn.MultiRNNCell([get_decode_cell(rnn_size) for _ in range(num_layers)])

    # 3. Fully connected output layer projecting onto the target vocabulary
    output_layer = tf.layers.Dense(target_vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.1,stddev=0.1))

    # Take the batch size from the decoder input instead of relying on a
    # module-level global.
    batch_size = tf.shape(decoder_input)[0]

    # 4. Training decoder
    with tf.variable_scope('decode'):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,training_helper,encoder_state,output_layer)
        training_decoder_output,_,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,impute_finished=True,
                                                                        maximum_iterations=max_target_sequence_length)

    # 5. Predicting decoder
    # Shares its parameters with the training decoder (reuse=True).
    with tf.variable_scope('decode',reuse=True):
        # One <GO> id per example starts greedy decoding.
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']],dtype=tf.int32),[batch_size],name='start_token')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,start_tokens,
                                                                     target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,predicting_helper,encoder_state,output_layer)
        predicting_decoder_output,_,_ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,impute_finished=True,
                                                                          maximum_iterations=max_target_sequence_length)

    return training_decoder_output,predicting_decoder_output

In [12]:
# The encoder and decoder are built above; the seq2seq model connects them.
def seq2seq_model(input_data,targets,lr,target_sequence_length,
                  max_target_sequence_length,source_sequence_length,
                  source_vocab_size,target_vocab_size,encoder_embedding_size,decoder_embedding_size,rnn_size,num_layers):
    '''
    Connect the encoder and the decoder into one seq2seq graph.

    Args:
        input_data: source token ids for the encoder.
        targets: target token ids; converted into the decoder input.
        lr: learning rate (not used inside this function).
        target_sequence_length: per-example target lengths.
        max_target_sequence_length: upper bound on decoding iterations.
        source_sequence_length: per-example source lengths.
        source_vocab_size: size of the source vocabulary.
        target_vocab_size: size of the target vocabulary (not used inside
            this function; the decoder takes it from target_letter_to_int).
        encoder_embedding_size: embedding dimension on the encoder side.
        decoder_embedding_size: embedding dimension on the decoder side.
        rnn_size: number of units in each LSTM cell.
        num_layers: number of stacked LSTM cells.

    Returns:
        (training_decoder_output, predicting_decoder_output) from
        get_decoder_layer.
    '''
    _,encoder_state = get_encoder_layer(input_data,
                                        rnn_size,
                                        num_layers,
                                        source_sequence_length,
                                        source_vocab_size,
                                        encoder_embedding_size)
    # Take the batch size from the fed targets instead of relying on a
    # module-level global.
    batch_size = tf.shape(targets)[0]
    # Use the `targets` tensor parameter here, not the module-level `target`.
    decoder_input = process_decoder_input(targets,target_letter_to_int,batch_size)
    training_decoder_output,predicting_decoder_output = get_decoder_layer(target_letter_to_int,
                                                                          decoder_embedding_size,
                                                                          num_layers,
                                                                          rnn_size,
                                                                          target_sequence_length,
                                                                          max_target_sequence_length,
                                                                          encoder_state,
                                                                          decoder_input)
    return training_decoder_output,predicting_decoder_output

In [13]:
# Define the function that creates the model inputs.
def get_inputs():
    '''
    Create the placeholders that feed the model.

    Returns:
        (inputs, targets, learning_rate, target_sequence_length,
        max_target_sequence_length, source_sequence_length), where
        max_target_sequence_length is the reduce_max of
        target_sequence_length and every other element is a placeholder.
    '''
    inputs = tf.placeholder(tf.int32,[None,None],name = 'inputs')
    targets = tf.placeholder(tf.int32,[None,None],name = 'targets')
    learning_rate = tf.placeholder(tf.float32,name = 'learning_rate')

    target_sequence_length = tf.placeholder(tf.int32,(None,),name = 'target_sequence_length')
    # Longest target in the current batch.
    max_target_sequence_length = tf.reduce_max(target_sequence_length,name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32,(None,),name = 'source_sequence_length')
    return inputs,targets,learning_rate,target_sequence_length,max_target_sequence_length,source_sequence_length

In [None]:
#定义loss和optimizer
train_graph = tf.Graph()

with train_graph.as_default():
    
    inputs,targets,lr,target_sequence_length,max_target_sequence_length,source_sequence_length = get_inputs()
    training_decoder_output,predicting_decoder_output = seq2seq_model(  inputs,
                                                                        targets,
                                                                        lr,
                                                                        target_sequence_length,
                                                                        max_target_sequence_length,
                                                                        source_sequence_length,
                                                                        len(source_letter_to_int),
                                                                        len(target_letter_to_int),
                                                                        encoding_embedding_size,
                                                                        decoding_embedding_size,
                                                                        rnn_size,
                                                                        num_layers)
    train_logits = tf.identity(training_decoder_output,rnn_output,'logits')
    predict_logits = tf.identity(predicting_decoder_output.sample_id,'predictions')
    masks = tf.sequence_mask(target_sequence_length,max_target_sequence_length,dtype = tf.float32,name='masks')
    with tf.name_scope('optimization'):
        