In [1]:
#-*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import seq2seq

import collections
import argparse
import time
import os
import sys

from six.moves import cPickle

cwd=os.getcwd() 
sys.path.append(cwd)

from Hangulpy.Hangulpy import *

In [2]:
# Loads a character-level corpus and serves it as (input, target) batches.
class TextLoader(object):
    """Character-level corpus loader producing fixed-size training batches.

    On construction the loader looks in ``data_dir`` for ``vocab.pkl`` and
    ``data.npy``. If both exist they are loaded; otherwise ``input.txt`` is
    read, a vocabulary is built from it and both cache files are written.
    The encoded corpus is then cut into ``num_batches`` pairs of
    ``(batch_size, seq_length)`` arrays, where each target array is the input
    shifted by one symbol.

    Attributes set by the loader:
        chars       -- tuple of symbols, most frequent first
        vocab       -- dict mapping symbol -> integer id (index into chars)
        vocab_size  -- number of distinct symbols
        tensor      -- 1-D numpy array holding the encoded corpus
        num_batches -- number of batches available per pass
        x_batches, y_batches -- lists of input / target arrays
        pointer     -- index of the batch returned by the next next_batch()
    """

    def __init__(self, data_dir, batch_size, seq_length):
        """Load (or build) the corpus in ``data_dir`` and prepare batches.

        Args:
            data_dir: directory holding input.txt and/or the cached files.
            batch_size: number of sequences per batch.
            seq_length: number of symbols per sequence.

        Raises:
            ValueError: if the corpus is empty or too small for one batch.
        """
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length

        input_file = os.path.join(data_dir, "input.txt")
        vocab_file = os.path.join(data_dir, "vocab.pkl")
        tensor_file = os.path.join(data_dir, "data.npy")

        # Only reuse the cache when BOTH cached files are present.
        if not (os.path.exists(vocab_file) and os.path.exists(tensor_file)):
            print("reading text file")
            self.preprocess(input_file, vocab_file, tensor_file)
        else:
            print("loading preprocessed files")
            self.load_preprocessed(vocab_file, tensor_file)
        self.create_batches()
        self.reset_batch_pointer()

    def preprocess(self, input_file, vocab_file, tensor_file):
        """Build the vocabulary and encoded corpus from ``input_file``.

        Writes the symbol tuple to ``vocab_file`` (pickle) and the encoded
        corpus to ``tensor_file`` (numpy .npy).

        Raises:
            ValueError: if ``input_file`` contains no data.
        """
        # NOTE(review): the file is opened without an explicit encoding, so
        # how non-ASCII text is split into symbols depends on the Python
        # version / platform defaults -- confirm this matches training.
        with open(input_file, "r") as f:
            data = f.read()
        if not data:
            # Without this guard the zip(*...) unpacking below fails with an
            # opaque "not enough values to unpack" ValueError.
            raise ValueError("input file is empty: %s" % input_file)
        counter = collections.Counter(data)
        # Most frequent symbol first, so it receives id 0.
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        self.chars, _ = zip(*count_pairs)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.chars, f)
        self.tensor = np.array(list(map(self.vocab.get, data)))
        np.save(tensor_file, self.tensor)

    def load_preprocessed(self, vocab_file, tensor_file):
        """Load the vocabulary and encoded corpus written by preprocess()."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(vocab_file, 'rb') as f:
            self.chars = cPickle.load(f)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        self.tensor = np.load(tensor_file)
        self.num_batches = self.tensor.size // (self.batch_size *
                                                self.seq_length)

    def create_batches(self):
        """Split ``self.tensor`` into input and target batches.

        The corpus is truncated to a whole number of batches. Targets are the
        inputs shifted left by one symbol; the final target wraps around to
        the first input symbol.

        Raises:
            ValueError: if the corpus holds fewer than
                ``batch_size * seq_length`` symbols.
        """
        symbols_per_batch = self.batch_size * self.seq_length
        self.num_batches = self.tensor.size // symbols_per_batch
        if self.num_batches == 0:
            # Previously this surfaced as an IndexError on an empty array.
            raise ValueError(
                "Not enough data: %d symbols available but one batch needs "
                "%d. Make seq_length and/or batch_size smaller."
                % (self.tensor.size, symbols_per_batch))

        self.tensor = self.tensor[:self.num_batches * symbols_per_batch]
        xdata = self.tensor
        ydata = np.copy(self.tensor)
        ydata[:-1] = xdata[1:]
        ydata[-1] = xdata[0]
        # Reshape to (batch_size, total_steps), then cut along the time axis.
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1)

    def next_batch(self):
        """Return the next (inputs, targets) pair and advance the pointer.

        Raises IndexError once all ``num_batches`` batches have been
        consumed; call reset_batch_pointer() to start over.
        """
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        """Rewind so that next_batch() returns the first batch again."""
        self.pointer = 0

In [3]:
# Corpus location and batch geometry for the loader.
# Other corpus directories referenced here: "data/han3", "data/han4", "data/han5".
data_dir = "data/han2"
batch_size, seq_length = 50, 50

# Constructing the loader reads the corpus and prepares the batches.
data_loader = TextLoader(data_dir, batch_size, seq_length)
print("Done")


loading preprocessed files
Done


In [4]:
# Define the network: a stacked LSTM language model over embedded symbols,
# followed by a softmax projection, a per-example loss and an Adam update op.
# Everything below only builds graph nodes; nothing is executed in this cell.
rnn_size   = 1024  # units per LSTM layer; also used as the embedding width below
num_layers = 2     # number of stacked LSTM layers
grad_clip  = 5.    # global-norm threshold passed to tf.clip_by_global_norm

# The placeholders are built for one sequence of one symbol; the sampling
# cell feeds arrays of shape (1, 1) one step at a time.
_batch_size = 1
_seq_length = 1

# NOTE(review): the vocabulary size is taken from data_loader, while the
# sampling cell loads chars/vocab from save/chars_vocab.pkl -- the two must
# agree for the restored weights to fit; confirm.
vocab_size = data_loader.vocab_size

# Select the RNN cell: num_layers references to one BasicLSTMCell wrapped in
# a MultiRNNCell.
unitcell = rnn_cell.BasicLSTMCell(rnn_size)
cell = rnn_cell.MultiRNNCell([unitcell] * num_layers)

# Graph inputs: symbol ids, target ids and the zero recurrent state.
input_data = tf.placeholder(tf.int32, [_batch_size, _seq_length])
targets    = tf.placeholder(tf.int32, [_batch_size, _seq_length])
initial_state = cell.zero_state(_batch_size, tf.float32)

# Model variables live under the 'rnnlm' variable scope.
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # The embedding table and lookup are pinned to the CPU.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        # Embed the ids, split into _seq_length pieces along dimension 1 and
        # squeeze that dimension away, giving a list with one tensor per step.
        # NOTE(review): tf.split / tf.concat are called with the dimension as
        # the first argument, i.e. the argument order of the TensorFlow
        # release this notebook targets -- confirm before upgrading.
        inputs = tf.split(1, _seq_length, tf.nn.embedding_lookup(embedding, input_data))
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
# Loop function for seq2seq. It is defined but not used in this cell:
# rnn_decoder below is called with loop_function=None.
def loop(prev, _):
    """Map a previous RNN output to the embedding of its arg-max symbol.

    Projects ``prev`` with the softmax weights, takes the arg-max along
    dimension 1 (with gradients stopped) and looks that id up in
    ``embedding``. The second argument is ignored.
    """
    prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    return tf.nn.embedding_lookup(embedding, prev_symbol)
# RNN outputs (one per step) and the state after the last step.
outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell, loop_function=None, scope='rnnlm')
# Concatenate the per-step outputs and flatten to rows of width rnn_size.
output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
# Next-symbol probabilities (softmax over the logits).
probs = tf.nn.softmax(logits)
# Loss: targets are flattened and every position gets weight 1.
loss = seq2seq.sequence_loss_by_example([logits], # inputs (logits)
    [tf.reshape(targets, [-1])], # targets, flattened
    [tf.ones([_batch_size * _seq_length])], # weights, all ones
    vocab_size)
# Optimizer: summed loss divided by batch size and sequence length.
cost = tf.reduce_sum(loss) / _batch_size / _seq_length
final_state = last_state  # alias fetched by the sampling function
# Learning rate variable, initialised to 0.0 and excluded from training;
# nothing in this cell assigns it a different value.
lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
# Clip gradients by global norm before handing them to Adam.
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
_optm = tf.train.AdamOptimizer(lr)
optm = _optm.apply_gradients(zip(grads, tvars))

print ("Network Ready")

Network Ready


In [5]:
# Sampling function
def sample(sess, chars, vocab, __probs, num=200, prime=u'ㅇㅗᴥㄴㅡㄹᴥ '):
    """Generate ``num`` symbols from the network, seeded with ``prime``.

    Relies on graph objects built earlier in the notebook being available as
    globals: ``cell``, ``input_data``, ``initial_state`` and ``final_state``.

    Args:
        sess: session used to run the graph.
        chars: sequence mapping symbol id -> symbol.
        vocab: dict mapping symbol -> symbol id.
        __probs: tensor fetched for the next-symbol weights; row 0 of its
            value is used.
        num: number of symbols to generate after the prime text.
        prime: non-empty seed text; every symbol must be a key of ``vocab``.

    Returns:
        list: the symbols of ``prime`` followed by the generated symbols.

    Raises:
        KeyError: if a prime symbol is missing from ``vocab``.
        IndexError: if ``prime`` is empty.
    """
    state = sess.run(cell.zero_state(1, tf.float32))

    def encode(symbol):
        # The graph takes a (1, 1) array holding a single symbol id.
        x = np.zeros((1, 1))
        x[0, 0] = vocab[symbol]
        return x

    def weighted_pick(weights):
        # Renormalise in float64 so np.random.choice's sum-to-one check
        # passes for float32 softmax output. The previous cumsum/searchsorted
        # version compared against np.sum(), which can differ from the last
        # cumsum entry by rounding and then yield an out-of-range index.
        p = np.asarray(weights, dtype=np.float64)
        p = p / p.sum()
        return int(np.random.choice(len(p), p=p))

    prime = list(prime)

    # Warm up the recurrent state on all but the last prime symbol.
    for char in prime[:-1]:
        feed = {input_data: encode(char), initial_state: state}
        [state] = sess.run([final_state], feed)

    ret = prime
    char = prime[-1]
    for _ in range(num):
        feed = {input_data: encode(char), initial_state: state}
        [probs_val, state] = sess.run([__probs, final_state], feed)
        pred = chars[weighted_pick(probs_val[0])]
        ret.extend(pred)
        # Feed the sampled symbol back in as the next input.
        char = pred
    return ret

In [6]:
# Sample: restore the latest checkpoint from save_dir and generate text.
save_dir = "save"
# Decompose the seed text into the symbol sequence fed to the model; the
# first print shows the recomposed text next to the decomposed symbols.
prime = decompose_text(u"세상은")
print ("Prime Text : %s => %s" % (automata(prime), "".join(prime)))

n = 1000  # number of symbols to sample after the prime text
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(os.path.join(save_dir, 'config.pkl'), 'rb') as f:
    saved_args = cPickle.load(f)
with open(os.path.join(save_dir, 'chars_vocab.pkl'), 'rb') as f:
    chars, vocab = cPickle.load(f)

# Function-call form so the cell parses on Python 3 as well as Python 2.
print (chars)
print (vocab)

sess = tf.Session()
sess.run(tf.initialize_all_variables())

saver = tf.train.Saver(tf.all_variables())
ckpt = tf.train.get_checkpoint_state(save_dir)

# ckpt may be None when save_dir holds no checkpoint, so only touch its
# attributes inside the guard.
if ckpt and ckpt.model_checkpoint_path:
    print (ckpt.model_checkpoint_path)
    saver.restore(sess, ckpt.model_checkpoint_path)
    sampled_text = sample(sess, chars, vocab, probs, n, prime)
    print ("")
    # Join the sampled symbols so the text itself is printed, not a list repr.
    print (u"SAMPLED TEXT = %s" % "".join(sampled_text))

    print ("")
    print ("-- RESULT --")
    print (automata("".join(sampled_text)))
else:
    print ("No checkpoint found in '%s'; nothing was sampled." % save_dir)

Prime Text : 세상은 => ㅅㅔᴥㅅㅏㅇᴥㅇㅡㄴᴥ
(u'\u1d25', u' ', u'\u3147', u'\u314f', u'\u3134', u'\u3131', u'\u3139', u'\u3163', u'\u3161', u'\u3153', u'\u3137', u'\u3145', u'\u3157', u'\u3141', u'\u3148', u'\u314e', u'\u315c', u'\u3142', u'\n', u'\r', u'\u3154', u'\u3155', u'\u3150', u'\u3146', u'.', u'\u314a', u'\u3162', u'\u314c', u'\u3158', u'\u314d', u',', u'\u3132', u'"', u'\u315b', u'\u3138', u'\u315a', u'\u3151', u'\u314b', u'\u315f', u'\u315d', u'\u3136', u'\u3144', u'\u3160', u'\u3156', u'?', u'\u3149', u'\u3143', u"'", u'1', u')', u'(', u'\u3159', u'!', u'-', u'0', u'2', u'\u313a', u'3', u'5', u'9', u'\u313b', u'\u3135', u'\u3140', u'4', u'e', u'6', u'8', u'7', u'a', u'\u3152', u'i', u'n', u'o', u'>', u'\u313c', u'<', u'r', u':', u't', u's', u'l', u'\u315e', u']', u'[', u'h', u'm', u'`', u'c', u'S', u'd', u'u', u'A', u'p', u'C', u'T', u'g', u'I', u'B', u'y', u'M', u'D', u'P', u'f', u'E', u'^', u'*', u'L', u'N', u'R', u'\u3133', u'O', u'k', u'K', u'b', u'V', u'H', u'_', u'F', u'J', u'v', 