## 1. Preprocessing Tools

In [0]:
'''
  code by Minho Ryu @bzantium
  
'''
import re
from collections import Counter
import numpy as np

def read_txt(data):
    """Read a UTF-8 text file and return its lines with newlines removed.

    Args:
        data: Path of the text file to read.

    Returns:
        A list holding one string per line of the file, in file order, with
        every newline character stripped out.
    """
    with open(data, encoding='utf-8') as handle:
        return [row.replace('\n', '') for row in handle]

def tokenizer(sentence):
    """Split a sentence into word tokens and single-symbol tokens.

    A run of word characters becomes one token; every other non-whitespace
    character becomes its own token. Whitespace is discarded.

    Args:
        sentence: The text to split.

    Returns:
        A list of token strings in order of appearance.
    """
    return re.findall(r"[\w]+|[^\s\w]", sentence)


def build_character(sentences):
    """Build a character-level vocabulary ordered by frequency.

    Ids 0, 1 and 2 are reserved for '<PAD>', '<GO>' and '<UNK>'. Every
    distinct element of the sentences is then numbered from 3 upwards, most
    frequent first.

    Args:
        sentences: Iterable of sentences; each one is split with ``list()``.

    Returns:
        A tuple ``(vocab, reverse_vocab, vocab_size)`` where ``vocab`` maps
        symbol -> id, ``reverse_vocab`` maps id -> symbol and ``vocab_size``
        is the number of entries in ``vocab``.
    """
    char_counts = Counter()
    for text in sentences:
        char_counts.update(list(text))

    vocab = {'<PAD>': 0, '<GO>': 1, '<UNK>': 2}
    first_free_id = 3

    ranked = char_counts.most_common(len(char_counts))
    for offset, (symbol, _) in enumerate(ranked):
        vocab[symbol] = first_free_id + offset

    reverse_vocab = {idx: symbol for symbol, idx in vocab.items()}

    return vocab, reverse_vocab, len(vocab)


def build_vocab(sentences):
    """Build a word-level vocabulary ordered by frequency.

    Ids 0, 1 and 2 are reserved for '<PAD>', '<GO>' and '<UNK>'. Every
    distinct token produced by ``tokenizer`` is then numbered from 3 upwards,
    most frequent first.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A tuple ``(vocab, reverse_vocab, vocab_size)`` where ``vocab`` maps
        token -> id, ``reverse_vocab`` maps id -> token and ``vocab_size`` is
        the number of entries in ``vocab``.
    """
    token_counts = Counter()
    for text in sentences:
        token_counts.update(tokenizer(text))

    vocab = {'<PAD>': 0, '<GO>': 1, '<UNK>': 2}
    first_free_id = 3

    ranked = token_counts.most_common(len(token_counts))
    for offset, (token, _) in enumerate(ranked):
        vocab[token] = first_free_id + offset

    reverse_vocab = {idx: token for token, idx in vocab.items()}

    return vocab, reverse_vocab, len(vocab)


def sentence_to_char_index(lines, vocab, is_target=False):
    """Convert sentences into character-id sequences using ``vocab``.

    Symbols missing from ``vocab`` are mapped to ``vocab['<UNK>']``.

    Args:
        lines: Sequence of sentences. When it holds exactly one sentence the
            result is a single flat list of ids (no padding, ``is_target``
            ignored); callers rely on this shape, so it is kept as-is.
        vocab: Mapping symbol -> id containing at least '<PAD>' and '<UNK>'
            entries when padding / unknown symbols occur.
        is_target: When true, every row is padded to one position more than
            the longest sentence, so each row ends with at least one '<PAD>'.

    Returns:
        For a single sentence: a flat list of ids. Otherwise: a list of
        equally long id lists, right-padded with the '<PAD>' id.
    """
    def _to_index(symbol):
        # '<UNK>' is only looked up when it is actually needed.
        return vocab[symbol] if symbol in vocab else vocab['<UNK>']

    if len(lines) == 1:
        return [_to_index(symbol) for symbol in list(lines[0])]

    sequences = [list(sentence) for sentence in lines]
    if not sequences:
        return []

    # Derive the padded width from the true maximum length. The previous
    # running update skipped the "+ 1" whenever a sentence was exactly one
    # symbol longer than the current maximum, leaving the longest target
    # without a trailing '<PAD>'.
    max_len = max(len(seq) for seq in sequences)
    if is_target:
        max_len += 1

    indexes = []
    for seq in sequences:
        padded = seq + ['<PAD>'] * (max_len - len(seq))
        indexes.append([_to_index(symbol) for symbol in padded])
    return indexes


def sentence_to_word_index(lines, vocab, is_target=False):
    """Convert sentences into word-id sequences using ``vocab``.

    Sentences are split with ``tokenizer``; tokens missing from ``vocab`` are
    mapped to ``vocab['<UNK>']``.

    Args:
        lines: Either one sentence string, or a sequence of sentence strings.
        vocab: Mapping token -> id containing at least '<PAD>' and '<UNK>'
            entries when padding / unknown tokens occur.
        is_target: For a sequence of sentences, pad every row to one position
            more than the longest sentence, so each row ends with at least
            one '<PAD>'. Ignored for a single string.

    Returns:
        For a single string: a flat list of ids (no padding). Otherwise: a
        list of equally long id lists, right-padded with the '<PAD>' id.
    """
    def _to_index(token):
        # '<UNK>' is only looked up when it is actually needed.
        return vocab[token] if token in vocab else vocab['<UNK>']

    if isinstance(lines, str):
        return [_to_index(token) for token in tokenizer(lines)]

    sequences = [tokenizer(sentence) for sentence in lines]
    if not sequences:
        return []

    # Derive the padded width from the true maximum length. The previous
    # running update skipped the "+ 1" whenever a sentence was exactly one
    # token longer than the current maximum, leaving the longest target
    # without a trailing '<PAD>'.
    max_len = max(len(seq) for seq in sequences)
    if is_target:
        max_len += 1

    indexes = []
    for seq in sequences:
        padded = seq + ['<PAD>'] * (max_len - len(seq))
        indexes.append([_to_index(token) for token in padded])
    return indexes


def make_dataset(data):
    """Pair every item with the item that follows it.

    Args:
        data: Sequence of items (e.g. dialog lines in order).

    Returns:
        A tuple ``(inputs, targets)`` of two lists where ``targets[i]`` is the
        successor of ``inputs[i]`` in ``data``. Both are empty when ``data``
        has fewer than two items.
    """
    sources = list(data[:-1])
    successors = list(data[1:])
    return sources, successors

	
def make_dataset_for_translation(data):
    """Split alternating lines into source and target lists.

    Items at even positions become inputs and the item directly after each
    one becomes its target.

    Args:
        data: Sequence whose items alternate source, target, source, ...

    Returns:
        A tuple ``(inputs, targets)`` of two equally long lists.

    Raises:
        IndexError: If ``data`` has an odd number of items, because the last
            source has no following target.
    """
    pairs = [(data[pos], data[pos + 1]) for pos in range(0, len(data), 2)]
    sources = [src for src, _ in pairs]
    translations = [tgt for _, tgt in pairs]
    return sources, translations


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive batches of ``data`` for a number of epochs.

    Args:
        data: Samples; converted to a NumPy array before batching.
        batch_size: Maximum number of samples per batch (the last batch of an
            epoch may be smaller).
        num_epochs: How many passes over the data to generate.
        shuffle: When true, draw a fresh random permutation of the samples at
            the start of every epoch.

    Yields:
        NumPy array slices holding up to ``batch_size`` samples each.
    """
    samples = np.array(data)
    total = len(samples)
    batches_per_epoch = int((total - 1) / batch_size) + 1

    for _ in range(num_epochs):
        # Re-draw the sample order once per epoch.
        epoch_samples = (samples[np.random.permutation(np.arange(total))]
                         if shuffle else samples)
        for batch_no in range(batches_per_epoch):
            first = batch_no * batch_size
            last = min(first + batch_size, total)
            yield epoch_samples[first:last]

## 2. Build Model

In [0]:
import tensorflow as tf

class seq2seq:
    """Sequence-to-sequence model built with the TensorFlow 1.x graph API.

    A bidirectional LSTM encoder produces the initial state of a single LSTM
    decoder. The constructor builds the training graph and an unrolled greedy
    inference graph, then initializes all global variables in ``sess``.
    """

    def __init__(self, sess, encoder_vocab_size, decoder_vocab_size, lr=1e-1, max_step=50,
                 embedding_size=300, encoder_hidden_size=128):
        """Store the hyper-parameters and build the graph.

        Args:
            sess: TensorFlow session used for initialization, training and inference.
            encoder_vocab_size: Number of rows of the embedding matrix.
            decoder_vocab_size: Width of the output projection (logits per step).
            lr: Initial learning rate handed to the exponential decay schedule.
            max_step: Number of decoder steps unrolled for inference.
            embedding_size: Dimension of each embedding vector.
            encoder_hidden_size: Units per encoder direction. The decoder uses
                twice this size so it can take the concatenated
                forward/backward encoder state as its initial state.
        """
        self.sess = sess
        self.encoder_vocab_size = encoder_vocab_size
        self.decoder_vocab_size = decoder_vocab_size
        self.lr = lr
        self.max_step = max_step
        self.embedding_size = embedding_size
        self.encoder_hidden_size = encoder_hidden_size
        self.decoder_hidden_size = encoder_hidden_size * 2
        self._build_net()

    def _build_net(self):
        """Build placeholders, encoder, decoder, loss, train op and inference ops.

        Also runs ``tf.global_variables_initializer()`` in ``self.sess`` once
        the graph is complete.
        """
        # placeholder for encoder_input, decoder_input, decoder_target
        with tf.variable_scope("placeholder"):
            self.encoder_inputs = tf.placeholder(dtype=tf.int32, shape=(None, None), name='encoder_inputs')
            # Per-row count of positive ids; id 0 is treated as padding.
            # assumes padding only appears at the end of a row — TODO confirm
            encoder_inputs_length = tf.reduce_sum(tf.sign(self.encoder_inputs), axis=1)

            # NOTE(review): this placeholder holds the targets but is registered
            # under the graph name 'decoder_inputs'.
            self.decoder_targets = tf.placeholder(dtype=tf.int32, shape=(None, None), name='decoder_inputs')
            # Count of positive ids plus one extra position (presumably so the
            # first padding id after the sentence is included in the loss as an
            # end marker — TODO confirm).
            decoder_targets_length = tf.reduce_sum(tf.sign(self.decoder_targets), axis=1) + 1
            batch_size, decoder_max_length = tf.unstack(tf.shape(self.decoder_targets))
            # Decoder inputs = a leading column of id 1, followed by the targets
            # without their last column (targets shifted right by one step).
            decoder_inputs = tf.concat((tf.transpose([tf.ones([batch_size], dtype=tf.int32)], perm=(1,0)),
                                        self.decoder_targets[:,:-1]), axis=1)

        # embedding for encoder, decoder inputs
        with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
            # NOTE(review): one matrix with encoder_vocab_size rows embeds both
            # encoder and decoder ids, so decoder ids must also be smaller than
            # encoder_vocab_size.
            embedding = tf.get_variable('embedding',
                                        dtype=tf.float32,
                                        initializer=tf.random_uniform((self.encoder_vocab_size,
                                                                       self.embedding_size),
                                                                       minval=-1.0, maxval=1.0))
            embedded_encoder_inputs = tf.nn.embedding_lookup(embedding, self.encoder_inputs)
            embedded_decoder_inputs = tf.nn.embedding_lookup(embedding, decoder_inputs)

        # encoder operations
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            self.encoder_fw_cell = tf.nn.rnn_cell.LSTMCell(self.encoder_hidden_size)
            self.encoder_bw_cell = tf.nn.rnn_cell.LSTMCell(self.encoder_hidden_size)

            # Only the final states are kept; the per-step outputs are discarded.
            ((_, _),
             (encoder_fw_last_state, encoder_bw_last_state)) = tf.nn.bidirectional_dynamic_rnn(self.encoder_fw_cell,
                                                                                               self.encoder_bw_cell,
                                                                                               embedded_encoder_inputs,
                                                                                               encoder_inputs_length,
                                                                                               dtype=tf.float32)

            # Concatenate forward and backward states along the feature axis,
            # giving a state of width 2 * encoder_hidden_size.
            encoder_final_state_c = tf.concat((encoder_fw_last_state.c, encoder_bw_last_state.c), 1)
            encoder_final_state_h = tf.concat((encoder_fw_last_state.h, encoder_bw_last_state.h), 1)
            self.encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(encoder_final_state_c, encoder_final_state_h)

        # decoder operations with last encoder hidden state as an initial hidden state
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            self.decoder_cell = tf.nn.rnn_cell.LSTMCell(self.decoder_hidden_size)
            decoder_output, decoder_last_state = tf.nn.dynamic_rnn(self.decoder_cell,
                                                                   embedded_decoder_inputs,
                                                                   initial_state=self.encoder_final_state)

        # output with decoder memories
        with tf.variable_scope("output"):
            # Projection from decoder hidden size to decoder vocabulary size.
            self.W = tf.get_variable('W', initializer=tf.truncated_normal(shape=(self.decoder_hidden_size, self.decoder_vocab_size)))
            self.b = tf.get_variable('b', initializer=tf.constant(0.1, shape=(self.decoder_vocab_size,)))

            batch_size, max_time_step = tf.unstack(tf.shape(self.decoder_targets))
            decoder_output = tf.reshape(decoder_output, [-1, self.decoder_hidden_size]) # [batch_size*time_step, decoder_hidden_size]
            logits = tf.add(tf.matmul(decoder_output, self.W), self.b) # [batch_size*time_step, decoder_vocab_size]
            logits = tf.reshape(logits, [batch_size, max_time_step, -1])

        # loss calculation
        with tf.variable_scope("loss"):
            # Positions at or beyond decoder_targets_length get weight 0.
            self.loss = tf.reduce_mean(tf.contrib.seq2seq.sequence_loss(logits=logits,
                                                                        targets=self.decoder_targets,
                                                                        weights=tf.sequence_mask(decoder_targets_length,
                                                                                                 decoder_max_length,
                                                                                                 dtype=tf.float32)))

        # train with clipped gradient
        with tf.variable_scope("train", reuse=tf.AUTO_REUSE):
            global_step = tf.Variable(0, trainable=False)
            # Staircase decay: multiply the rate by 0.96 every 1000 steps.
            learning_rate = tf.train.exponential_decay(self.lr, global_step,
                                                       1e+3, 0.96, staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            gvs = optimizer.compute_gradients(self.loss)
            # Clip every gradient element into [-1, 1] before applying it.
            capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
            self.train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

        # inference with user's input (feed previous output to next input)
        with tf.variable_scope("inference"):
            batch_size = tf.unstack(tf.shape(self.encoder_inputs))[0]
            # First decoder input is id 1 for every row of the batch.
            go_time_slice = tf.ones([batch_size], dtype=tf.int32, name='GO')
            self.predictions = []
            prediction = None
            state = self.encoder_final_state
            # The loop is unrolled at graph-build time, so the number of
            # decoding steps is fixed to the value of self.max_step right now.
            for i in range(self.max_step):
                if i == 0:
                    input_ = tf.nn.embedding_lookup(embedding, go_time_slice)
                else:
                    input_ = tf.nn.embedding_lookup(embedding, prediction)

                # Same cell object as the training decoder, called one step at a time.
                output, state = self.decoder_cell(input_, state)
                logits = tf.add(tf.matmul(output, self.W), self.b)
                # Greedy choice: highest-scoring id becomes the next input.
                prediction = tf.argmax(logits, 1)
                self.predictions.append(prediction)
            # Stack the per-step predictions into [batch_size, max_step].
            self.predictions = tf.stack(self.predictions, 1)
        
        self.sess.run(tf.global_variables_initializer())

    def train(self, encoder_inputs, decoder_targets):
        """Run one optimization step.

        Args:
            encoder_inputs: 2-D batch of encoder ids fed to ``self.encoder_inputs``.
            decoder_targets: 2-D batch of target ids fed to ``self.decoder_targets``.

        Returns:
            The list returned by ``sess.run`` for ``[self.loss, self.train_op]``.
        """
        return self.sess.run([self.loss, self.train_op], feed_dict={self.encoder_inputs:encoder_inputs,
                                                                    self.decoder_targets:decoder_targets})

    def inference(self, encoder_inputs):
        """Decode greedily for a batch of encoder inputs.

        Args:
            encoder_inputs: 2-D batch of encoder ids fed to ``self.encoder_inputs``.

        Returns:
            The evaluated ``self.predictions`` (one row of predicted ids per
            input row, one column per unrolled decoding step).
        """
        return self.sess.run(self.predictions, feed_dict={self.encoder_inputs: encoder_inputs})

    def setMaxStep(self, max_step):
        """Update the stored ``max_step`` attribute.

        NOTE(review): the inference graph is unrolled once in ``_build_net``
        during construction, so changing the attribute here does not change
        the number of steps produced by ``self.predictions``.
        """
        self.max_step = max_step

## 3. Train Model

In [0]:
import os, re, json

# Training cell: builds the character vocabulary from 'dialog.txt', trains the
# seq2seq model on (line, next line) pairs and checkpoints every 100 batches.
tf.reset_default_graph()
DIR = "conversation-model"

# read and build dataset
data = read_txt('dialog.txt')
vocab, reverse_vocab, vocab_size = build_character(data)

# save vocab so the inference cell can rebuild the same id mapping
with open('vocab.json', 'w') as fp:
    json.dump(vocab, fp)

# open session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:

    # make model instance
    model = seq2seq(sess, encoder_vocab_size=vocab_size, decoder_vocab_size=vocab_size)

    # make train batches
    inputs, target = make_dataset(data)
    batches = batch_iter(list(zip(inputs, target)), batch_size=64, num_epochs=500)

    # model saver
    saver = tf.train.Saver(max_to_keep=1, keep_checkpoint_every_n_hours=0.5)

    # train model
    print('모델 훈련을 시작합니다.')
    avgLoss = []
    for step, batch in enumerate(batches):
        x_train, y_train = zip(*batch)
        single_example = len(x_train) == 1
        x_train = sentence_to_char_index(x_train, vocab, is_target=False)
        y_train = sentence_to_char_index(y_train, vocab, is_target=True)
        if single_example:
            # For exactly one sentence sentence_to_char_index returns a flat
            # id list and skips the extra target padding. The model's
            # placeholders are rank 2, so restore the batch axis and append
            # the trailing <PAD> that multi-sentence targets receive.
            x_train = [x_train]
            y_train = [y_train + [vocab['<PAD>']]]
        l, _ = model.train(x_train, y_train)
        avgLoss.append(l)
        if (step + 1) % 100 == 0:
            # report the mean loss since the last report, then checkpoint
            print('batch:', '%04d' % (step + 1), 'loss:', '%.5f' % np.mean(avgLoss))
            saver.save(sess, os.path.join(DIR, "model"), global_step=step+1)
            avgLoss = []

모델 훈련을 시작합니다.
batch: 0100 loss: 0.65784
batch: 0200 loss: 0.00492
batch: 0300 loss: 0.00356
batch: 0400 loss: 0.00325
batch: 0500 loss: 0.00310


## 4. Enjoy Conversation

In [0]:
# Inference cell: rebuilds the model, restores the latest checkpoint from DIR
# and answers user input interactively until the user types "exit".
# NOTE(review): `json` is not imported in this cell; it relies on the import
# in the training cell having been executed.
tf.reset_default_graph()
DIR = "conversation-model"

# load vocab, reverse_vocab, vocab_size
with open('vocab.json', 'r') as fp:
    vocab = json.load(fp)
# invert the symbol -> id mapping so predicted ids can be turned back into text
reverse_vocab = dict()
for key, value in vocab.items():
    reverse_vocab[value] = key
vocab_size = len(vocab)

# open session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    # make model instance
    model = seq2seq(sess, encoder_vocab_size=vocab_size, decoder_vocab_size=vocab_size, max_step=50)

    # load trained model
    # NOTE(review): presumably tf.train.latest_checkpoint returns None when DIR
    # holds no checkpoint, which would make restore fail — confirm and guard.
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(DIR))

    # inference
    while True:
        test = input('User >> ')
        if test == "exit":
            break
        # A one-element list makes sentence_to_char_index return a flat id
        # list, so it is wrapped again to form a batch of size one.
        speak = sentence_to_char_index([test], vocab, False)
        result = model.inference([speak])
        for sentence in result:
            response = ''
            for index in sentence:
                # id 0 is '<PAD>' in the vocab builders; stop decoding there
                if index == 0:
                    break
                response += reverse_vocab[index]
            print("Bot >> ", response, "\n")
