In [2]:
# 示例代码运行环境
%load_ext watermark
%watermark -p tensorflow,numpy -v -m

CPython 2.7.6
IPython 5.2.2

tensorflow 1.0.0
numpy 1.12.0

compiler   : GCC 4.8.4
system     : Linux
release    : 4.4.43-boot2docker
machine    : x86_64
processor  : x86_64
CPU cores  : 1
interpreter: 64bit


In [3]:
import tensorflow as tf
from tensorflow.contrib.legacy_seq2seq import basic_rnn_seq2seq, embedding_rnn_seq2seq, sequence_loss, embedding_attention_seq2seq
from tensorflow.python.ops import variable_scope

In [4]:
%matplotlib inline
from matplotlib import pyplot as plt
import os
import time
from collections import Counter
import math
import jieba
from jieba import posseg as pseg
import numpy as np
import datetime
import pickle

In [5]:
LINE_LIMIT = 0
VOCAB_SIZE = 5000

In [6]:
DICT_FILE = './corpus/dict_2.pkl'
RAW_FILE = './corpus/ssa.out'

In [7]:
def load_dict(file_name):
    wdict = {}
    wcnt = Counter()
    with open(file_name, 'rb') as f:
        (wdict, wcnt) = pickle.load(f)

    rdict = dict(zip(wdict.values(), wdict.keys())) 
    return wcnt, wdict, rdict

In [8]:
t0 = time.time()
word_cnt, train_dict, train_reverse_dict = load_dict(DICT_FILE)
t1 = time.time()
print(t1-t0)

0.0677170753479


In [9]:
def load_train_data(file_name):
    with open(file_name, 'rb') as f:
        lines = f.readlines()
        sentences = []
        for line in lines:
            line = line.decode('utf8').rstrip()
            sentences.append(line)
    return sentences

In [10]:
raw_sentences = load_train_data(RAW_FILE)
print(len(raw_sentences))

20000


In [11]:
LINE_BREAK = u'<Break>'
WORD_DELIMITER = u'/'
UNK_WORD = u'<UNK>'
PADDING_WORD = u'<PAD>'
START_WORD = u'<GO>'
END_WORD = u'<EOS>'

In [12]:

START_ID = train_dict[START_WORD]
END_ID = train_dict[END_WORD]
PAD_ID = train_dict[PADDING_WORD]
UNK_ID = train_dict[UNK_WORD]
print(UNK_ID)
print(START_ID)
print(END_ID)
print(PAD_ID)


4997
4998
4999
0


In [13]:
def words2id(words, wdict):
    out_ = []
    for w in words:
        if w in wdict.keys():           
            out_.append(wdict[w])
        else:
            out_.append(wdict[UNK_WORD])
    return out_

In [14]:
def build_dataset(sentences, wdict):    
    out_ = []
    for sentence in sentences:
        cur_sentence = words2id(sentence, wdict)          
        out_.append(cur_sentence)         
    return out_

In [19]:
t0 = time.time()
train_sentences = build_dataset(raw_sentences, train_dict)
t1 = time.time()
print(t1-t0)
print(len(train_sentences))


16.6768779755
20000


In [20]:
print(train_sentences[0])

[3, 2, 2, 534, 787, 84, 551, 2, 2, 2, 247, 808, 816]


In [15]:
def pad_sentence(data, length, pad_index, start_index, end_index, is_encode=True):
    result_ = []
    data_len = len(data)
    if (data_len >= length):
        result_ = data[:length] #长句做截断处理
        if not is_encode:
            result_[length-1] = end_index
    else:
        pad_len = length - data_len
        padding = [pad_index] * pad_len
        if is_encode:
            result_ = padding + data
        else:
            result_ = [start_index] + data + [end_index] + padding[:-2]
    
    return result_

In [16]:
def get_batch_data2(offset, size, input_data, input_len, output_len):
    total_len = len(input_data)    
    if (offset) > total_len:
        offset = 0
    if (offset+size>total_len):
        size = total_len - offset
    
    
    input_ = []
    output_ = []
    
    index = offset
    while(len(input_) < size):
        if (index >= total_len-1):
            index = 0
        if (len(input_data[index])>1):
            encode_data = pad_sentence(input_data[index], input_len, 0, START_ID, END_ID)
            decode_data = pad_sentence(input_data[index+1], output_len, 0, START_ID, END_ID, False)
            input_.append(encode_data)
            output_.append(decode_data)
        index += 1    

    return input_, output_  

In [23]:
a,b = get_batch_data2(0,3, train_sentences, 15, 20)
print(len(a))
print(len(b))

print(a)
print(b)

3
3
[[0, 0, 3, 2, 2, 534, 787, 84, 551, 2, 2, 2, 247, 808, 816], [0, 18, 1284, 199, 1698, 198, 924, 2, 5, 857, 392, 1, 174, 116, 86], [0, 878, 13, 774, 51, 135, 316, 315, 128, 245, 1, 944, 423, 140, 55]]
[[4998, 18, 1284, 199, 1698, 198, 924, 2, 5, 857, 392, 1, 174, 116, 86, 4999, 0, 0, 0, 0], [4998, 878, 13, 774, 51, 135, 316, 315, 128, 245, 1, 944, 423, 140, 55, 4999, 0, 0, 0, 0], [4998, 117, 5, 101, 463, 6, 315, 128, 1, 3, 4999, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [17]:
def left_shift(decoder_inputs, pad_idx):
    # for generating targets
    return [list(input_[1:]) + [pad_idx] for input_ in decoder_inputs]

In [18]:
def generate_feed_dict(batch_encoder_inputs, batch_decoder_inputs, pad_index):
    encoder_inputs_ = list(zip(*batch_encoder_inputs))  

    target_inputs_ = list(zip(*left_shift(batch_decoder_inputs, pad_index)))
    decoder_inputs_ = list(zip(*batch_decoder_inputs))

    feed_dict = dict()
    # Prepare input data    
    for (i, placeholder) in enumerate(encoder_placeholders):
        # 这里用 placeholder 或者 placeholder.name 都可以       
        feed_dict[placeholder.name] = np.asarray(encoder_inputs_[i], dtype=int)       
        for i in range(len(decoder_placeholders)):           
            feed_dict[decoder_placeholders[i].name] = np.asarray(decoder_inputs_[i], dtype=int)
            feed_dict[target_placeholders[i].name] = np.asarray(target_inputs_[i], dtype=int)        
            # 这里使用 weights 把 <PAD> 的损失屏蔽了
            feed_dict[target_weights_placeholders[i].name] = np.asarray([float(idx != pad_index) for idx in target_inputs_[i]],
                                                              dtype=float)
    return feed_dict

In [19]:
#Attenion
tf.reset_default_graph()

RNN_CELL_TYPE = 'LSTMCell_Attention'
learning_rate = 1.0

encoder_length = 15
decoder_length = 20
embed_dim = 128

cell = tf.contrib.rnn.LSTMCell(embed_dim)
num_encoder_symbols = VOCAB_SIZE
num_decoder_symbols = VOCAB_SIZE
embedding_size = embed_dim

encoder_len_placeholder = tf.placeholder(tf.int32)

encoder_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="encoder_%d" % i) for i in range(encoder_length)]
decoder_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="decoder_%d" % i) for i in range(decoder_length)]
target_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="target_%d" % i) for i in range(decoder_length)]
target_weights_placeholders = [tf.placeholder(tf.float32, shape=[None],
                                       name="decoder_weight_%d" % i) for i in range(decoder_length)]
outputs, states = embedding_attention_seq2seq(
    encoder_placeholders, decoder_placeholders, cell,
    num_encoder_symbols, num_decoder_symbols,
    embedding_size, output_projection=None,
    feed_previous=False)

loss = sequence_loss(outputs, target_placeholders, target_weights_placeholders)
#train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
#train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

In [29]:
epoch_num = 100
batch_size = 100
sample_ratio = 100 #为节约训练时间，只用 1/ Sample_ratio的样本数据训练

train_data_size = len(train_sentences)
iteration_num = int(train_data_size / batch_size / sample_ratio)
#if train_data_size % batch_size > 0:
#    iteration_num += 1

display_step = int(iteration_num / 10)

print(train_data_size)
print(iteration_num)
print(display_step)

save_epoch = 1

pad_index = 0

20000
2
0


In [None]:
saver = tf.train.Saver()
sess = tf.Session()

sess.run(tf.global_variables_initializer())
#saved_model = './../models/bot_model_0026'
#print('Loading model from:', saved_model)        
#saver.restore(sess, saved_model)


print("%s Start training, Cell type=%s, Learning rate=%f" % 
      (time.strftime("%Y-%m-%d %H:%M:%S"), RNN_CELL_TYPE, learning_rate))

costs = []
t0 = time.time()
for epoch in range(epoch_num):
    #print("Start epoch %d" % epoch)
        
    offset = 0
    #iteration_num = 1
    for i in range(iteration_num):
        encoder_inputs, decoder_inputs = get_batch_data2(
            offset, batch_size, 
            train_sentences, 
            encoder_length, decoder_length)
        offset += batch_size

        feed_dict1 = generate_feed_dict(encoder_inputs, decoder_inputs, pad_index)

        sess.run(train_step, feed_dict1)
        
        if i % display_step == 0:
            print("%s %d, %f" % (time.strftime("%Y-%m-%d %H:%M:%S"), i, sess.run(loss, feed_dict1)))
        
    c = sess.run(loss, feed_dict1)
    costs.append(c)
    print("%s epoch %d, cost=%f" %(time.strftime("%Y-%m-%d %H:%M:%S"), epoch, c))
    
    if ((epoch+1) % save_epoch) == 0:
        model_path = './../models/bot_model_attention_%04d' % (epoch) 
        print(model_path)
        saved_model = saver.save(sess, model_path)
        print("%s Modeled saved to: %s" % (time.strftime("%Y-%m-%d %H:%M:%S"), saved_model))
            

t1 = time.time()
print("Training duration:%d" % (t1-t0))

2017-05-15 14:44:53 Start training, Cell type=LSTMCell_Attention, Learning rate=1.000000


In [21]:
saver = tf.train.Saver()
sess = tf.Session()

#sess.run(tf.global_variables_initializer())
saved_model = './model/bot_model_attention_0039'
print('Loading model from:', saved_model)   

t0 = time.time()
saver.restore(sess, saved_model)
t1 = time.time()
print(t1-t0)

('Loading model from:', './../models/bot_model_attention_0039')
1.66039395332


In [22]:
def decode_data(session, offset, size, encode_input, decode_input):
    # Decoding
    with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
        #outputs, states = embedding_rnn_seq2seq(
        outputs, states = embedding_attention_seq2seq(
             encoder_placeholders, decoder_placeholders, cell,
            num_encoder_symbols, num_decoder_symbols,
            embedding_size, output_projection=None,
            feed_previous=True)
  
        
        test_encoder_inputs, test_decoder_inputs = get_batch_data2(
                offset, size, 
                encode_input[:-1], 
                encoder_length, 
                decoder_length)

        feed_dict_test = generate_feed_dict(test_encoder_inputs, test_decoder_inputs, pad_index)        
      
        result = []
        for o in outputs:
            # 注意这里也需要提供 feed_dict
            m = np.argmax(o.eval(feed_dict_test, session=sess), axis=1)
            result.append(m[0])

        return result            


In [23]:
def index_to_words(data, dictionary):
    text = ''
    for w in data:
        if (w==END_ID):
            break
        if (w!=PAD_ID):
            text += dictionary[w] + ' '
    #text = ' '.join([dictionary[i] for i in data])
    return text

In [33]:
print(index_to_words(train_sentences[2], train_reverse_dict))


拥 有 史 上 最 强 魔 法 师 的 遗 传 因 子 


In [34]:
def decode_line(line_no, offset, dataset, rdict):
    index = line_no+offset
    while (len(dataset[index])<=1):
        index += 1
        if (index>=len(dataset)):
            index=0
            break
    encoder_input = list(dataset[index])

   
    output = decode_data(sess, index, 1, dataset, dataset)
    print("Line#: %d" % (index+1))
    print("Input: %s" % index_to_words(dataset[index], rdict)) 
    print("Result: %s" % index_to_words(output, rdict))
    print("Expect: %s" % index_to_words(dataset[index+1], rdict)) 
    print("----------------------------------")

In [35]:
decode_line(0,100,train_sentences, train_reverse_dict)

Line#: 101
Input: 快 给 我 出 去 
Result: 仪 ！ 救 他 你 跟 办 
Expect: 不 可 以 
----------------------------------


In [36]:
def decode_lines(line_count, offset, encode_input, decode_input, encode_dict, decode_dict):
    for i in range(line_count):
        index = offset+i
        output = decode_data(sess, index, 1, encode_input, decode_input)
        print("Line#: %d" % (index+1))
        print("Input: %s" % index_to_words(encode_input[index], encode_dict))
        print("Result: %s" % index_to_words(output, decode_dict))
        print("Expect: %s" % index_to_words(decode_input[index+1], decode_dict)) 
        print("----------------------------------")

In [37]:
#训练数据集翻译测试
decode_lines(10, 100, train_sentences, train_sentences, train_reverse_dict, train_reverse_dict)


Line#: 101
Input: 快 给 我 出 去 
Result: 仪 ！ 救 他 你 跟 办 
Expect: 不 可 以 
----------------------------------
Line#: 102
Input: 不 可 以 
Result: ！ 救 才 ！ 救 丹 才 ！ 救 丹 才 ！ 救 丹 去 吧 
Expect: 不 管 是 幽 灵 也 好     什 么 也 好 
----------------------------------
Line#: 103
Input: 不 管 是 幽 灵 也 好     什 么 也 好 
Result: 他 们 上 他 
Expect: 和 树 还 是 和 树 
----------------------------------
Line#: 104
Input: 和 树 还 是 和 树 
Result: 没 ！ 救 救 他 诉 他 你 跟 放 听 
Expect: 是 我 最 重 要 　 最 重 要 的 丈 夫 
----------------------------------
Line#: 105
Input: 是 我 最 重 要 　 最 重 要 的 丈 夫 
Result: 上 他 来 他 你 俊 
Expect: 怎 么 会       夕 菜 
----------------------------------
Line#: 106
Input: 怎 么 会       夕 菜 
Result: 仪 ！ 去 吧 他   ！ 救 救 去 权 仪 ！ 去 吧 他   ！ 救 救 
Expect: 如 果 有 利 用 价 值 的 话 
----------------------------------
Line#: 107
Input: 如 果 有 利 用 价 值 的 话 
Result: ！ 救 丹 才 ！ 去 吧 他 一 ！ 救 丹 才 ！ 去 体 味 上 历 仪 
Expect: 要 留 他 下 来 也 可 以 
----------------------------------
Line#: 108
Input: 要 留 他 下 来 也 可 以 
Result: 上 他 没 ！ 救 他 
Expect: 可 是 对 方 是 幽 灵 啊 
-----------------

In [None]:
decode_line(1,307,train_sentences, train_reverse_dict)

In [24]:
def generate_response(session, test_sentence, wdict, rdict, encoder_len):
    data = build_test_dataset(test_sentence, wdict, encoder_len)

    output = decode_text(session, data, PAD_ID)
    
    print("Raw:%s" % test_sentence)
    print("Input:%s" % index_to_words(data, rdict))
    print("Output:%s" % index_to_words(output, rdict))

In [25]:
def build_test_dataset(test_text, wdict, encoder_len):
    #words = pseg.cut(test_text)
    #words = [w for (w,v) in words]
    words = test_text
    
    ids = words2id(words, wdict)
    if (len(ids)>encoder_len):
        ids = ids[:encoder_len]
    return pad_sentence(ids, encoder_len, 0, START_ID, END_ID)
    

In [26]:
def words2id(words, wdict):
    id_ = []
    for w in words:
        index = wdict[UNK_WORD]
        if w in wdict.keys():
            id_.append(wdict[w])
        else:
            id_.append(wdict[UNK_WORD])
    return id_

In [27]:
def decode_text(session, encode_input, pad_index):    
      
        decode_input = encode_input + [pad_index]*(decoder_length-len(encode_input))
        feed_dict_test = generate_feed_dict([encode_input], [decode_input], pad_index)        
      
        result = []
        for o in outputs:
            # 注意这里也需要提供 feed_dict
            m = np.argmax(o.eval(feed_dict_test, session=sess), axis=1)
            result.append(m[0])

        return result 

In [28]:
input_text = u'我该怎么办'
generate_response(sess, input_text, train_dict, train_reverse_dict, encoder_length)

Raw:我该怎么办
Input:我 该 怎 么 办 
Output:藤 硫 良 救 良 良 良 ！ 案 际 他 起 ！ ！ v 
