## 1. Preprocessing Tools

In [0]:
'''
  code by Minho Ryu @bzantium
  
'''
import numpy as np
import re, random
from collections import Counter

def read_txt(data):
    """Read a UTF-16 encoded text file and return its lines.

    Args:
        data: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order.
        Line terminators are left on the strings.
    """
    with open(data, encoding='utf-16') as handle:
        return list(handle)

def preprocess(data):
    """Strip markup from raw lines and split them into sentence pieces.

    For every line, all occurrences of ``<head>``, ``</head>``, ``<p>``,
    ``</p>``, the newline character and the double quote are deleted, one
    token after the other in that order. What is left is split on ``'. '``.

    Args:
        data: Iterable of raw text lines.

    Returns:
        A flat list holding the pieces of every line, in input order.
    """
    # Removal is sequential on purpose: deleting one token may expose
    # another one that a later pass then removes.
    removable = ('<head>', '</head>', '<p>', '</p>', '\n', '"')
    pieces = []
    for raw in data:
        cleaned = raw
        for token in removable:
            cleaned = cleaned.replace(token, '')
        pieces.extend(cleaned.split('. '))
    return pieces

def build_vocab(data):
    """Build word/index lookup tables from whitespace-separated sentences.

    Index 0 is reserved for ``'<PAD>'`` and index 1 for ``'<UNK>'``. The
    remaining words receive consecutive indices starting at 2, ordered from
    the most frequent word to the least frequent one.

    Args:
        data: Iterable of sentences (strings).

    Returns:
        A tuple ``(vocab, reverse_vocab, vocab_size)`` where ``vocab`` maps
        word -> index, ``reverse_vocab`` maps index -> word and
        ``vocab_size`` is ``len(vocab)``.
    """
    frequencies = Counter()
    for sentence in data:
        frequencies.update(sentence.split())

    vocab = {'<PAD>': 0, '<UNK>': 1}
    for position, (word, _) in enumerate(frequencies.most_common(), start=2):
        vocab[word] = position

    reverse_vocab = {position: word for word, position in vocab.items()}
    return vocab, reverse_vocab, len(vocab)


def sentenceToIndex(lines, vocab):
    """Convert sentences into padded input/target index sequences.

    Each line is cleaned (``<head>``, ``</head>``, ``<p>``, ``</p>``,
    newline, double quote and ``.`` are deleted), split on single spaces and
    mapped through ``vocab``. Words missing from ``vocab`` map to
    ``vocab['<UNK>']``. Every sequence is padded with 0 up to the length of
    the longest line plus one, then split into an input (all but the last
    index) and a target (all but the first index), i.e. the target is the
    input shifted left by one step.

    Args:
        lines: Sequence of sentences (strings).
        vocab: Mapping word -> index; must contain ``'<UNK>'`` if any word
            of ``lines`` is not in the mapping.

    Returns:
        A tuple ``(inputSet, targetSet)`` of two lists of index lists. All
        inner lists share the same length. Both lists are empty when
        ``lines`` is empty.
    """
    # Literal tokens, removed one after the other in this order. Plain
    # str.replace is used so that '.' needs no regex escaping.
    removable = ('<head>', '</head>', '<p>', '</p>', '\n', '"', '.')
    pad_index = 0

    tokenized = []
    for line in lines:
        for token in removable:
            line = line.replace(token, '')
        tokenized.append(line.split(' '))

    longest = max((len(words) for words in tokenized), default=0)

    inputSet = []
    targetSet = []
    for words in tokenized:
        indexes = [vocab[word] if word in vocab else vocab['<UNK>']
                   for word in words]
        # One extra pad beyond the longest line, so that dropping the first
        # element for the target still leaves `longest` entries.
        indexes.extend([pad_index] * (longest + 1 - len(words)))
        inputSet.append(indexes[:-1])
        targetSet.append(indexes[1:])

    return inputSet, targetSet


def indexToSentence(lines, reverse_vocab):
    """Convert index sequences back into space-separated sentences.

    Each sequence is read up to, but not including, its first padding index
    (0). Unknown-word indices (1) are skipped. The remaining indices are
    looked up in ``reverse_vocab`` and joined with single spaces, so the
    result never carries a trailing space, whether or not the sequence
    contains a padding index.

    Args:
        lines: Sequence of index sequences.
        reverse_vocab: Mapping index -> word.

    Returns:
        A list with one sentence string per input sequence.

    Raises:
        KeyError: If an index other than 0 or 1 is missing from
            ``reverse_vocab``.
    """
    pad_index, unk_index = 0, 1

    sentences = []
    for line in lines:
        words = []
        for index in line:
            if index == pad_index:
                # Padding marks the end of the sentence.
                break
            if index == unk_index:
                continue
            words.append(reverse_vocab[index])
        sentences.append(' '.join(words))
    return sentences


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Generate mini-batches over a dataset for a number of epochs.

    Args:
        data: Sequence of samples; converted to a NumPy array.
        batch_size: Maximum number of samples per batch.
        num_epochs: Number of passes over ``data``.
        shuffle: When true, the samples are re-shuffled with
            ``np.random.permutation`` at the start of every epoch.

    Yields:
        NumPy arrays holding at most ``batch_size`` samples. The last batch
        of an epoch may be smaller. Nothing is yielded for an empty dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division: gives 0 batches for an empty dataset and
    # avoids going through float division.
    num_batches_per_epoch = -(-data_size // batch_size)
    for _ in range(num_epochs):
        if shuffle:
            # A fresh permutation per epoch.
            order = np.random.permutation(np.arange(data_size))
            epoch_data = data[order]
        else:
            epoch_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield epoch_data[start_index:end_index]


def find_vocab(character, vocab):
    """Pick a random vocabulary index whose word starts with ``character``.

    Before searching, some syllables are replaced by a substitute syllable
    (for example "리" and "니" are both looked up as "이"); see the table
    below for the full list.

    Args:
        character: The single syllable the word has to start with.
        vocab: Mapping word -> index.

    Returns:
        The index of one randomly chosen matching word, or the string
        ``"retry"`` (after printing a message) when no word matches.
    """
    substitutions = {
        "리": "이", "니": "이",
        "림": "임", "님": "임",
        "린": "인", "닌": "인",
        "랑": "앙", "낭": "앙",
        "름": "음", "늠": "음",
        "랴": "야", "냐": "야",
        "력": "역", "녁": "역",
        "류": "유",
        "로": "노",
        "려": "여",
    }
    character = substitutions.get(character, character)

    candidate = [index for word, index in vocab.items() if character == word[0]]
    if not candidate:
        print("다른 글자를 입력해 주세요.")
        return "retry"
    return random.sample(candidate, 1)[0]

## 2. Build Model

In [0]:
import tensorflow as tf

class reRNN:
    """LSTM language model assembled as a TensorFlow 1.x graph.

    The graph has two paths that share the embedding matrix, the LSTM cell
    and the output projection (``W``, ``b``):

    * a training path fed through ``full_input`` / ``target_input`` that
      produces ``loss`` and ``train_op``;
    * an inference path that starts from ``first_input`` and feeds each
      sampled token back in as the next input, producing ``predictions``.
    """

    def __init__(self, sess, vocab_size, lr=1e-1, max_step=50):
        """Store the hyper-parameters and build the graph.

        Args:
            sess: TensorFlow session used to initialise and run the graph.
            vocab_size: Number of rows of the embedding matrix and width of
                the output layer.
            lr: Initial learning rate handed to the exponential decay
                schedule.
            max_step: Number of steps the inference loop is unrolled for.
        """
        self.sess = sess
        self.vocab_size = vocab_size
        self.lr = lr
        self.max_step = max_step
        self._build_net()

    def _build_net(self):
        """Create placeholders, variables, loss, train op and inference ops,
        then run the global variable initialiser in ``self.sess``."""
        hidden_size = 128
        embedding_size = 300
        # placeholder for first_input, full_input, target_input
        with tf.variable_scope("placeholder"):
            self.first_input = tf.placeholder(tf.int32, shape=(None,))
            self.full_input = tf.placeholder(tf.int32, shape=(None, None))
            # sign() is 0 for id 0 and 1 for positive ids, so the row sum is
            # the number of non-zero ids per row (0 is the <PAD> index that
            # build_vocab assigns).
            input_length = tf.reduce_sum(tf.sign(self.full_input), axis=1)
            self.target_input = tf.placeholder(tf.int32, shape=(None, None))
            # NOTE(review): the +1 presumably makes the loss mask cover the
            # first padding position after the last word - confirm.
            target_length = tf.reduce_sum(tf.sign(self.target_input), axis=1) + 1

        # embedding for vocabs
        with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
            self.embedding = tf.Variable(
                tf.random_uniform(shape=(self.vocab_size, embedding_size), minval=-1.0, maxval=1.0))
            embedded_full_input = tf.nn.embedding_lookup(self.embedding, self.full_input)

        # recurrent operations
        with tf.variable_scope("recurrent"):
            self.cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)

            # dynamic sizes of the two axes of full_input
            batch_size, max_time_step = tf.unstack(tf.shape(self.full_input))

            outputs, _ = tf.nn.dynamic_rnn(self.cell,
                                           embedded_full_input,
                                           input_length,
                                           dtype=tf.float32)  # outputs: [batch, time, hidden]
            outputs = tf.reshape(outputs, (-1, hidden_size))  # output: [batch*time, hidden]

        # output with rnn memories
        with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
            self.W = tf.Variable(tf.truncated_normal(shape=(hidden_size, self.vocab_size)))
            self.b = tf.Variable(tf.constant(0.1, shape=(self.vocab_size,)))
            logits = tf.add(tf.matmul(outputs, self.W), self.b)  # logits: [batch*time, vocab_size]
            logits = tf.reshape(logits, (batch_size, max_time_step, -1))  # logits: [batch, time, vocab_size]

        # loss calculation
        with tf.variable_scope("loss"):
            # sequence_mask gives weight 1.0 to the first target_length
            # positions of each row and 0.0 to the rest.
            self.loss = tf.reduce_mean(tf.contrib.seq2seq.sequence_loss(logits=logits,
                                                                        targets=self.target_input,
                                                                        weights=tf.sequence_mask(target_length,
                                                                                                 max_time_step,
                                                                                                 dtype=tf.float32)))

        # train with clipped gradient
        with tf.variable_scope("train", reuse=tf.AUTO_REUSE):
            global_step = tf.Variable(0, trainable=False)
            # learning rate is multiplied by 0.96 every 1000 steps (staircase)
            learning_rate = tf.train.exponential_decay(self.lr, global_step,
                                                       1e+3, 0.96, staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            gvs = optimizer.compute_gradients(self.loss)
            # element-wise clipping of every gradient to [-1, 1]
            capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
            self.train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

        # inference with first input (feed previous output to next input)
        with tf.variable_scope("inference"):
            batch_size = tf.unstack(tf.shape(self.first_input))[0]
            state = self.cell.zero_state(batch_size, dtype=tf.float32)
            self.predictions = []
            # initial value only; it is overwritten in step 0 before the
            # else-branch below ever reads it
            prediction = 0
            # The loop runs in Python while the graph is built, so the graph
            # contains exactly self.max_step unrolled steps.
            for i in range(self.max_step):
                if i == 0:
                    input_ = tf.nn.embedding_lookup(self.embedding, self.first_input)
                else:
                    input_ = tf.nn.embedding_lookup(self.embedding, prediction)
                output, state = self.cell(input_, state)
                inf_logits = tf.add(tf.matmul(output, self.W), self.b)
                # two largest logits and the vocabulary ids they belong to
                values, indices = tf.nn.top_k(inf_logits, 2)
                # squeeze(axis=0) requires the leading axis to have size 1,
                # so inference handles a single seed token at a time
                indices = tf.squeeze(indices, axis=0)
                # draw one of the two candidates, using their logits as
                # unnormalised log-probabilities
                index = tf.squeeze(tf.multinomial(values, 1), axis=0)
                # back to a 1-D tensor so it can be embedded in the next step
                prediction = tf.reshape(indices[index[0]], shape=(-1,))
                self.predictions.append(prediction)
            # stack the per-step predictions along axis 1
            self.predictions = tf.stack(self.predictions, 1)

        self.sess.run(tf.global_variables_initializer())

    def setMaxStep(self, max_step):
        """Set ``self.max_step``.

        This only changes the attribute: the inference loop is unrolled in
        ``_build_net`` (called from ``__init__``), and nothing here rebuilds
        it, so ``self.predictions`` keeps its original number of steps.
        """
        self.max_step = max_step

    def train(self, full_input, target_input):
        """Run one optimisation step.

        Args:
            full_input: Value fed to the ``full_input`` placeholder
                (2-D int32).
            target_input: Value fed to the ``target_input`` placeholder
                (2-D int32).

        Returns:
            The result of ``sess.run`` for ``[loss, train_op]``; the first
            element is the loss value.
        """
        return self.sess.run([self.loss, self.train_op],
                             feed_dict={self.full_input: full_input, self.target_input: target_input})

    def inference(self, first_input):
        """Generate token ids starting from a seed token.

        Args:
            first_input: Value fed to the ``first_input`` placeholder
                (1-D int32).

        Returns:
            The evaluated ``predictions`` tensor.
        """
        return self.sess.run(self.predictions, feed_dict={self.first_input: first_input})

## 3. Train Model

In [0]:
import os, json

# Training cell: builds the vocabulary from novel.txt, trains the model and
# writes a checkpoint every 100 steps.
tf.reset_default_graph()
DIR = "samhangsi-model"

# Create the checkpoint directory up front so that saver.save below does
# not depend on it already being there.
os.makedirs(DIR, exist_ok=True)

# read and build dataset
data = read_txt('novel.txt')
data = preprocess(data)
vocab, reverse_vocab, vocab_size = build_vocab(data)

# save vocab (the inference cell reloads it from this file)
with open('vocab.json', 'w') as fp:
    json.dump(vocab, fp)

# open session; allow_growth makes TensorFlow allocate GPU memory on demand
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    # make model instance
    model = reRNN(sess=sess, vocab_size=vocab_size, lr=1e-1)

    # make train batches
    batches = batch_iter(data, batch_size=64, num_epochs=100)

    # model saver
    saver = tf.train.Saver(max_to_keep=1, keep_checkpoint_every_n_hours=0.5)

    # train model
    print('모델 훈련을 시작합니다.')
    avgLoss = []
    for step, batch in enumerate(batches):
        x_train, y_train = sentenceToIndex(batch, vocab)
        l, _ = model.train(x_train, y_train)
        avgLoss.append(l)
        if (step + 1) % 100 == 0:
            # report the mean loss of the last 100 steps, checkpoint, and
            # start a new averaging window
            print('batch:', '%04d' % (step + 1), 'loss:', '%.5f' % np.mean(avgLoss))
            saver.save(sess, os.path.join(DIR, 'my-model.ckpt'), global_step=(step+1))
            avgLoss = []

모델 훈련을 시작합니다.


## 4. Enjoy SamHangSi

In [0]:
# Inference cell: reloads the vocabulary and the latest checkpoint, then
# generates one sentence per syllable of a three-syllable input.
tf.reset_default_graph()
DIR = "samhangsi-model"

# load vocab, reverse_vocab, vocab_size
with open('vocab.json', 'r') as fp:
    vocab = json.load(fp)
reverse_vocab = {index: word for word, index in vocab.items()}
vocab_size = len(vocab)

# open session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    # make model instance
    model = reRNN(sess=sess, vocab_size=vocab_size, max_step=70)

    # load trained model; latest_checkpoint returns None when DIR holds no
    # checkpoint, so fail with a clear message instead of restoring None
    checkpoint_path = tf.train.latest_checkpoint(DIR)
    if checkpoint_path is None:
        raise FileNotFoundError(
            "no checkpoint found in '%s'; train the model first" % DIR)
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)

    # inference
    while True:
        chars = input('세 글자를 입력하세요: ')
        if chars == "exit":
            break
        if len(chars) != 3:
            print("세 글자를 입력해 주세요.")
            continue
        for character in chars:
            number = find_vocab(character, vocab)
            if number == "retry":
                # no word starts with this syllable; skip to the next one
                continue
            result = model.inference([number])
            print(reverse_vocab[number] + ' ' + indexToSentence(result, reverse_vocab)[0])
        print('')