## Seq2Seq (with Attention)

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [30]:
tf.reset_default_graph()
# Build the actual translation seq2seq model.


# Special tokens:
#   P pads sentences that are not long enough
#   S marks the start of the decoder input
#   E marks the end of the decoder output
# sentences = [encoder input, decoder input, decoder target]
sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

# Build the vocabulary from every token in the data. Sorting the de-duplicated
# tokens keeps the word <-> id mapping identical from run to run; iterating a
# bare set of strings gives an order that can change between interpreter runs.
words = sorted(set(' '.join(sentences).split()))
word_num = {w: i for i, w in enumerate(words)}  # word -> id
num_word = dict(enumerate(words))  # id -> word

# Hyperparameters
n_step = 5  # number of tokens per sentence
n_hidden = 128  # number of hidden units in each RNN cell
n_class = len(word_num)  # vocabulary size

def get_batch(sentences):
    """Turn the three training sentences into a single-example batch.

    sentences[0] is the encoder input, sentences[1] the decoder input and
    sentences[2] the decoder target. Words are looked up in the module-level
    ``word_num`` table.

    Returns a tuple ``(input_batch, output_batch, target_batch)``:
      * input_batch:  list holding one one-hot array, [n_tokens, n_class]
      * output_batch: list holding one one-hot array, [n_tokens, n_class]
      * target_batch: list holding one list of word ids (not one-hot)
    """
    def to_ids(sentence):
        # Map each whitespace-separated token to its vocabulary id.
        return [word_num[token] for token in sentence.split()]

    one_hot_rows = np.eye(n_class)  # row k is the one-hot vector for id k

    encoder_ids = to_ids(sentences[0])
    decoder_ids = to_ids(sentences[1])
    target_ids = to_ids(sentences[2])

    input_batch = [one_hot_rows[encoder_ids]]
    output_batch = [one_hot_rows[decoder_ids]]
    target_batch = [target_ids]
    return input_batch, output_batch, target_batch

# Graph inputs. Encoder and decoder inputs are one-hot encoded; the target
# holds integer word ids (not one-hot).
enc_input = tf.placeholder(tf.float32,[None,None,n_class])  # [batch_size, n_step, n_class]
dec_input = tf.placeholder(tf.float32,[None,None,n_class])  # [batch_size, n_step, n_class]
target = tf.placeholder(tf.int64,[1, n_step])  # [batch_size(=1), n_step], word ids

# Trainable parameters.
attn = tf.Variable(tf.random_normal([n_hidden,n_hidden]))  # multiplies the encoder state in get_att_score
out = tf.Variable(tf.random_normal([n_hidden*2,n_class]))  # projects concat(decoder output, context) to logits

def get_att_score(decoder_hidden,encoder_hidden):
    """Return the scalar attention score for one encoder time step.

    decoder_hidden: decoder output for the current step; its first two axes
        are squeezed away, so both must have size 1.
    encoder_hidden: encoder output for a single time step, [1, n_hidden]
        (axis 0 is squeezed away).

    The score is dot(decoder_hidden, encoder_hidden x attn).
    """
    projected = tf.matmul(encoder_hidden, attn)  # [1, n_hidden]
    encoder_vec = tf.squeeze(projected, 0)  # [n_hidden]
    decoder_vec = tf.squeeze(decoder_hidden, [0, 1])  # [n_hidden]
    return tf.tensordot(decoder_vec, encoder_vec, 1)
    
def get_att_weight(decoder_hidden,encoder_hidden):
    """Return softmax-normalised attention weights over the encoder steps.

    decoder_hidden: decoder output for the current step (passed through to
        get_att_score unchanged).
    encoder_hidden: all encoder outputs, [batch_size, n_step, n_hidden].

    Returns a tensor reshaped to [1, 1, n_step].
    """
    # Put time on the leading axis so that step t can be picked by indexing.
    steps_first = tf.transpose(encoder_hidden, [1, 0, 2])  # [n_step, batch_size, n_hidden]
    step_scores = [get_att_score(decoder_hidden, steps_first[t]) for t in range(n_step)]
    weights = tf.nn.softmax(step_scores)
    return tf.reshape(weights, [1, 1, -1])

Attention = []  # one attention-weight tensor per decoder step (n_step entries of length n_step)
model = []  # one logits tensor per decoder step, appended by the decoder loop

with tf.variable_scope('encoder'):
    encoder = tf.contrib.rnn.BasicRNNCell(n_hidden)
    # NOTE(review): the keep probability is a constant, so this dropout is
    # also applied when the graph is run for prediction - confirm intended.
    encoder = tf.contrib.rnn.DropoutWrapper(encoder,output_keep_prob=0.5)
    # dynamic_rnn returns (outputs, final_state). Mind the naming used here:
    #   enc_input      [batch_size, n_step, n_class]
    #   encoder_hidden [batch_size, n_step, n_hidden]  (outputs of every step)
    #   encoder_output [batch_size, n_hidden]          (final state)
    encoder_hidden,encoder_output = tf.nn.dynamic_rnn(encoder,enc_input,dtype=tf.float32)
    
with tf.variable_scope('decoder'):
    decoder = tf.contrib.rnn.BasicRNNCell(n_hidden)
    # Dropout randomly deactivates units.
    # NOTE(review): the keep probability is a constant, so this dropout is
    # also applied when the graph is run for prediction - confirm intended.
    decoder = tf.contrib.rnn.DropoutWrapper(decoder,output_keep_prob=0.5)
    # Move time to the leading axis: [n_step, batch_size, n_class].
    inputs = tf.transpose(dec_input,[1,0,2])
    # The decoder starts from the encoder's final state.
    hidden = encoder_output
    for i in range(n_step):
        # Run the decoder for a single step, feeding only the i-th time step
        # of dec_input. tf.expand_dims inserts an axis at the given position;
        # time_major=True means the first axis of the input is the time axis.
        # NOTE(review): the squeezes below and in get_att_score assume a
        # batch size of 1 - confirm before feeding larger batches.
        decoder_hidden,hidden = tf.nn.dynamic_rnn(decoder,tf.expand_dims(inputs[i],1),initial_state=hidden,
                                                          dtype=tf.float32,time_major = True)
        att_weights = get_att_weight(decoder_hidden,encoder_hidden)  # [1, 1, n_step]
        Attention.append(tf.squeeze(att_weights))  # [n_step]
        # Weighted sum of the encoder outputs:
        # [1, 1, n_step] x [1, n_step, n_hidden] = [1, 1, n_hidden]
        context = tf.matmul(att_weights,encoder_hidden)
        decoder_hidden = tf.squeeze(decoder_hidden,0)  # [1, n_hidden]
        context = tf.squeeze(context, 1)  # [1, n_hidden]
        # Logits for this step, [batch_size(=1), n_class]; after the loop
        # `model` holds n_step of them.
        model.append(tf.matmul(tf.concat((decoder_hidden,context),1),out))

# Stack the per-step attention vectors into a single matrix with one row per
# decoder step. Stacking the whole list (instead of indexing entries 0..4 by
# hand) keeps this correct when n_step is changed.
train_att = tf.stack(Attention, 0)  # [n_step (decoder), n_step (encoder)]
# `model` is a list of n_step tensors of shape [batch_size(=1), n_class];
# tf.transpose packs it and moves the batch axis to the front.
model = tf.transpose(model,[1,0,2])  # [batch_size(=1), n_step, n_class]
prediction = tf.argmax(model,2)  # predicted word id for every step
# `target` holds integer word ids, hence the sparse cross-entropy variant.
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=target))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    # The training data never changes, so build the batch and its feed dict
    # once instead of rebuilding them on every epoch.
    input_batch, output_batch, target_batch = get_batch(sentences)
    train_feed = {enc_input: input_batch, dec_input: output_batch, target: target_batch}

    for epoch in range(2000):
        # `attention` holds the attention matrix of the latest training step.
        _, loss, attention = sess.run([optimizer, cost, train_att], feed_dict=train_feed)

        if (epoch + 1) % 400 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    # Prediction: the decoder input is filled with the padding token 'P'.
    # `target` is not fed because `prediction` does not depend on it.
    # NOTE(review): the dropout wrappers in the graph use a constant keep
    # probability, so dropout is still applied during this run.
    predict_batch = [np.eye(n_class)[[word_num[n] for n in 'P P P P P'.split()]]]
    result = sess.run(prediction, feed_dict={enc_input: input_batch, dec_input: predict_batch})
    print(sentences[0].split(), '->', [num_word[n] for n in result[0]])

Epoch: 0400 cost = 0.000012
Epoch: 0800 cost = 0.067272
Epoch: 1200 cost = 0.000000
Epoch: 1600 cost = 0.000000
Epoch: 2000 cost = 0.000000
['ich', 'mochte', 'ein', 'bier', 'P'] -> ['i', 'want', 'a', 'beer', 'E']
