In [1]:
import json
import re
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from tqdm import tqdm

In [2]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [to_title(y.strip()) for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

In [3]:
with open('pos-data-v3.json','r') as fopen:
    dataset = json.load(fopen)

In [4]:
texts, labels = [], []
for i in dataset:
    try:
        texts.append(process_string(i[0])[0].lower())
        labels.append(i[-1])
    except Exception as e:
        print(e, i)

list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['*', '*', 'SYM']
list index out of range ['뭘봐', '뭘봐', 'PROPN']
list index out of range ['%', '%', 'SYM']
list index out of range ['ひ', 'ひ', 'PROPN']
list index out of range ['ヒ', 'ヒ', 'PROPN']
list index out of range ['形聲', '形聲', 'NOUN']
list index out of range ['°', '°', 'SYM']
list index out of range ['汉', '汉', 'PROPN']
list index out of range ['东', '东', 'PROPN']
list index out of range ['王', '王', 'PROPN']
list index out of range ['（', '（', 'PROPN']
list index out of range ['伊', '伊', 'PROPN']
list index out of range ['）', '）', 'PROPN']
list index out of range ['ȝ', 'ȝ', 'PROPN']
list index out of range ['%', '%', 'SYM']
list index out of range ['°', '°', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ["'", '_', 'PROPN']
list index out of range ['碁', '碁', 'NOUN']
list index out of range ['囲碁', '囲碁', 'NOUN']
list index out of range ['*', '*', 'SYM']
lis

list index out of range ['=', '=', 'SYM']
list index out of range ['서종', '서종', 'PROPN']
list index out of range ['제', '제', 'PROPN']
list index out of range ['±', '±', 'SYM']
list index out of range ['=', '=', 'SYM']
list index out of range ['仙', '仙', 'PROPN']
list index out of range ['仚', '仚', 'PROPN']
list index out of range ['僊', '僊', 'PROPN']
list index out of range ['=', '=', 'SYM']
list index out of range ['=', '=', 'SYM']
list index out of range ['腹切り', '腹切り', 'PROPN']
list index out of range ['§', '§', 'PROPN']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['=', '=', 'SYM']
list index out of range ['육식', '육식', 'PROPN']
list index out of range ['동물', '동물', 'PROPN']
list i

In [5]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
word_idx = 3
tag_idx = 1
char_idx = 1

def parse_XY(texts, labels):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    X, Y = [], []
    for no, text in enumerate(texts):
        text = to_title(text)
        tag = labels[no]
        for c in text:
            if c not in char2idx:
                char2idx[c] = char_idx
                char_idx += 1
        if tag not in tag2idx:
            tag2idx[tag] = tag_idx
            tag_idx += 1
        Y.append(tag2idx[tag])
        if text not in word2idx:
            word2idx[text] = word_idx
            word_idx += 1
        X.append(word2idx[text])
    return X, np.array(Y)

In [6]:
X, Y = parse_XY(texts, labels)
idx2word={idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

In [7]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

def generate_char_seq(batch):
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i,k]]):
                temp[i,k,-1-no] = char2idx[c]
    return temp

In [8]:
X_seq, Y_seq = to_train_seq(X, Y)
X_char_seq = generate_char_seq(X_seq)
X_seq.shape

(103367, 50)

In [9]:
import json
with open('concat-pos.json','w') as fopen:
    fopen.write(json.dumps({'idx2tag':idx2tag,'idx2word':idx2word,
           'word2idx':word2idx,'tag2idx':tag2idx,'char2idx':char2idx}))

In [10]:
from keras.utils import to_categorical
Y_seq_3d = [to_categorical(i, num_classes=len(tag2idx)) for i in Y_seq]

Using TensorFlow backend.


In [11]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y, train_char, test_char = train_test_split(X_seq, Y_seq_3d, X_char_seq, 
                                                                           test_size=0.1)



In [12]:
class Model:
    def __init__(
        self,
        dim_word,
        dim_char,
        dropout,
        learning_rate,
        hidden_size_char,
        hidden_size_word,
        num_layers,
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )

        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1)

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d' % (n),
            )
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
            word_embedded = tf.concat((out_fw, out_bw), 2)

        logits = tf.layers.dense(word_embedded, len(idx2tag))
        y_t = tf.argmax(self.labels, 2)
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


In [13]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 128
dim_char = 256
dropout = 0.8
learning_rate = 1e-3
hidden_size_char = 64
hidden_size_word = 64
num_layers = 2
batch_size = 32

model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [14]:
import time

for e in range(3):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 2908/2908 [41:50<00:00,  1.21it/s, accuracy=0.98, cost=2.51]  
test minibatch loop: 100%|██████████| 324/324 [02:43<00:00,  1.98it/s, accuracy=1, cost=0.489]    
train minibatch loop:   0%|          | 0/2908 [00:00<?, ?it/s]

time taken: 2674.309029340744
epoch: 0, training loss: 8.550916, training acc: 0.946683, valid loss: 0.844176, valid acc: 0.998069



train minibatch loop: 100%|██████████| 2908/2908 [41:21<00:00,  1.32it/s, accuracy=1, cost=0.158]    
test minibatch loop: 100%|██████████| 324/324 [02:39<00:00,  2.30it/s, accuracy=1, cost=0.103]    
train minibatch loop:   0%|          | 0/2908 [00:00<?, ?it/s]

time taken: 2640.682224750519
epoch: 1, training loss: 0.558983, training acc: 0.996883, valid loss: 0.458140, valid acc: 1.000164



train minibatch loop: 100%|██████████| 2908/2908 [25:50<00:00,  3.62it/s, accuracy=1, cost=0.201]     
test minibatch loop: 100%|██████████| 324/324 [00:46<00:00,  7.03it/s, accuracy=1, cost=0.00714]   

time taken: 1596.8093905448914
epoch: 2, training loss: 0.311219, training acc: 0.998317, valid loss: 0.302924, valid acc: 1.001093






In [15]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [16]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predicted = pred2label(sess.run(model.tags_seq,
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
            },
    ))
    real = pred2label(np.argmax(batch_y, axis = 2))
    predict_Y.extend(predicted)
    real_Y.extend(real)

validation minibatch loop: 100%|██████████| 324/324 [00:44<00:00,  7.31it/s]


In [17]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel()))

             precision    recall  f1-score   support

        ADJ       0.99      1.00      1.00     22663
        ADP       1.00      1.00      1.00     60199
        ADV       1.00      1.00      1.00     23633
        AUX       1.00      1.00      1.00      5249
      CCONJ       1.00      1.00      1.00     18485
        DET       1.00      1.00      1.00     19849
       NOUN       1.00      1.00      1.00    135031
        NUM       1.00      1.00      1.00     21842
       PART       1.00      1.00      1.00      2900
       PRON       1.00      1.00      1.00     23908
      PROPN       1.00      1.00      1.00    113206
      SCONJ       1.00      0.99      1.00      7304
        SYM       1.00      1.00      1.00      1205
       VERB       1.00      1.00      1.00     61222
          X       0.97      0.99      0.98       154

avg / total       1.00      1.00      1.00    516850

