In [1]:
import json
import numpy as np
import random

import tensorflow as tf
import tensorlayer as tl

from IPython.core.display import display, HTML

  from ._conv import register_converters as _register_converters


In [2]:
# Hyperparameters shared by the model graph and the data helpers below.
# Number of sentences per batch; first dimension of both input placeholders.
batch_size = 5
# Length of one token vector; also the length of the zero vector used for
# unknown tokens and for padding.
vec_dim = 100
# Hidden size handed to the RNN layer (n_hidden).
state_dim = 50
# Fixed number of time steps; sentences are padded or cut to this length.
sent_len = 20

# Load word embedding vectors

In [3]:
def load_vec(path='tokens.vec'):
    """Read a text embedding file into a ``{token: vector}`` dictionary.

    The first line of the file is skipped as a header. Every following
    non-blank line must hold a token followed by its space-separated float
    components. Lines are accepted with or without a trailing space before
    the newline, so the last component is never lost.

    Args:
        path: Location of the embedding file. Defaults to ``'tokens.vec'``.

    Returns:
        dict mapping each token (str) to a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be parsed as a float.
    """
    dict_vec = {}
    with open(path, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header line
        for line in f:
            # Remove the newline and any trailing separator first, so that
            # every remaining field after the token is a real component.
            fields = line.rstrip().split(' ')
            token = fields[0]
            if not token:
                # Blank line: nothing to store.
                continue
            dict_vec[token] = [float(value) for value in fields[1:] if value]
    return dict_vec

def get_vec(k):
    """Return the embedding for token ``k``.

    Known tokens yield the vector stored in the notebook-level ``dict_vec``;
    any other token yields a fresh all-zero list of length ``vec_dim``.
    """
    if k not in dict_vec:
        return [0.0] * vec_dim
    return dict_vec[k]

In [4]:
# Build the notebook-level embedding table that get_vec() looks tokens up in.
dict_vec = load_vec()

# Load data

In [5]:
# docs.txt is parsed as JSON despite its extension. Later cells slice the
# result and read the 'text' and 'labels' entries of each item.
with open('docs.txt', 'r', encoding='utf-8') as f:
    docs = json.load(f)

In [6]:
# sents.txt is parsed as JSON as well. Later cells read the 'tokens' and
# 'labels' entries of each item.
with open('sents.txt', 'r', encoding='utf-8') as f:
    sents = json.load(f)

In [7]:
# Keep only the first 550 documents.
# NOTE(review): the reason for the 550 cut-off is not stated here — confirm.
docs2 = docs[:550]

In [9]:
# Keep only the sentences that carry at least one label equal to 1.
sents2 = [entry for entry in sents if 1 in entry['labels']]

In [10]:
def get_sents(doc):
    """Split a document into sentences at Chinese punctuation marks.

    Args:
        doc: Mapping with two parallel sequences, ``doc['text']`` (tokens)
            and ``doc['labels']`` (one label per token).

    Returns:
        A list of ``{'tokens': [...], 'labels': [...]}`` dicts, one per
        segment, in document order. The delimiter tokens and their labels
        are not included. Two adjacent delimiters produce an empty segment.
        Tokens after the last delimiter form a final segment instead of
        being discarded.
    """
    delimiters = ('，', '。', '？', '！')
    segments = []
    tokens = []
    labels = []

    for t, l in zip(doc['text'], doc['labels']):
        if t in delimiters:
            segments.append({
                'tokens': tokens,
                'labels': labels
            })
            # Start fresh lists so the stored segment is not mutated later.
            tokens = []
            labels = []
        else:
            tokens.append(t)
            labels.append(l)

    # A document that does not end with a delimiter still has pending
    # tokens here; emit them as the last segment.
    if tokens:
        segments.append({
            'tokens': tokens,
            'labels': labels
        })

    return segments

In [11]:
# Flatten the sentences of every document in docs2 into one list.
sents3 = [sentence for d in docs2 for sentence in get_sents(d)]

# Build model

In [12]:
# Clear the default graph first so re-running this cell rebuilds the model
# from scratch.
tf.reset_default_graph()

# Inputs: one batch of padded sentences and the matching per-token labels.
sent = tf.placeholder(tf.float32, shape=(batch_size, sent_len, vec_dim))
label = tf.placeholder(tf.float32, shape=(batch_size, sent_len))

network = tl.layers.InputLayer(sent)
# return_last=False keeps the LSTM output of every time step, not only the
# final one, because a score is needed for each token.
network = tl.layers.RNNLayer(
    network,
    cell_fn=tf.nn.rnn_cell.BasicLSTMCell,
    n_hidden=state_dim,
    n_steps=sent_len,
    return_last=False)
# Merge batch and time into one axis so a single sigmoid unit scores every
# token, then restore the (batch, sent_len, 1) layout.
network = tl.layers.ReshapeLayer(network, [-1, state_dim])
network = tl.layers.DenseLayer(network, n_units=1, act = tf.sigmoid)
network = tl.layers.ReshapeLayer(network, [-1, sent_len, 1])

outputs = network.outputs
# Root-mean-square error between per-token scores and labels. Only the
# trailing unit axis is squeezed: a bare tf.squeeze would also drop the batch
# or time axis whenever batch_size or sent_len is 1 and misalign with `label`.
loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(tf.squeeze(outputs, axis=[2]), label))))
train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06,).minimize(loss)

[TL] InputLayer  input: (5, 20, 100)
[TL] RNNLayer rnn: n_hidden: 50 n_steps: 20 in_dim: 3 in_shape: (5, 20, 100) cell_fn: BasicLSTMCell 
[TL]        RNN batch_size (concurrent processes): 5
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
[TL]      n_params : 2
[TL] ReshapeLayer reshape: (100, 50)
[TL] DenseLayer  dense: 1 sigmoid
[TL] ReshapeLayer reshape: (5, 20, 1)


In [13]:
def get_data(sents, sent_len, vec_dim):
    """Turn sentence dicts into fixed-length feature and label arrays.

    Each sentence is cut off after ``sent_len`` tokens. Shorter sentences are
    padded with zero vectors of length ``vec_dim`` and with label ``0``.

    Args:
        sents: Iterable of dicts with parallel ``'tokens'`` and ``'labels'``.
        sent_len: Number of positions produced per sentence.
        vec_dim: Length of the zero vector used for padding.

    Returns:
        Tuple ``(x, y)`` of numpy arrays built from the per-sentence vector
        lists and label lists respectively.
    """
    features = []
    targets = []

    for entry in sents:
        words = entry['tokens']
        marks = entry['labels']
        n_real = min(len(words), sent_len)

        vectors = []
        tags = []
        # Real tokens first: look up each vector and copy its label.
        for idx in range(n_real):
            vectors.append(get_vec(words[idx]))
            tags.append(marks[idx])

        # Then fill the remaining positions with padding.
        n_pad = sent_len - n_real
        vectors.extend([0.0] * vec_dim for _ in range(n_pad))
        tags.extend([0] * n_pad)

        features.append(vectors)
        targets.append(tags)

    return np.array(features), np.array(targets)

In [28]:
# Vectorise sents2, i.e. only the sentences that contain a label equal to 1.
X, y = get_data(sents2, sent_len, vec_dim)
# No hold-out split is made here: every vectorised sentence is used for training.
X_train = X
y_train = y

In [33]:
sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)

num_epoch = 50
# Number of full batches per epoch. The placeholders have a fixed batch size,
# so trailing sentences that do not fill a batch are left out. The value is
# the same for every epoch, so it is computed once.
num_step = X_train.shape[0] // batch_size
if num_step == 0:
    # Fail early with a clear message instead of dividing by zero below.
    raise ValueError('X_train holds fewer than batch_size sentences')

for e in range(num_epoch):
    v = 0  # running sum of the batch losses in this epoch
    for i in range(num_step):
        feed_dict = {
            sent: X_train[i*batch_size:(i+1)*batch_size],
            label: y_train[i*batch_size:(i+1)*batch_size]
        }
        # Only the loss value is used here, so the network outputs are not
        # fetched during training.
        _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
        v += loss_value
    # Report the mean batch loss every tenth epoch.
    if e % 10 == 0:
        print(e, v/num_step)



0 0.24766538643934688
10 0.12938152068584668
20 0.10457521482737216
30 0.0894592348388377
40 0.07863589379849145


In [16]:
def build_sent_html(text, labels):
    """Build an HTML fragment that highlights the labelled tokens.

    Each token is wrapped in its own ``<span>``. Tokens whose label equals
    ``1`` are coloured red. Token text is HTML-escaped so that characters
    such as ``<`` or ``&`` are shown literally instead of breaking the markup.

    Args:
        text: Sequence of token strings.
        labels: Sequence of labels, positionally aligned with ``text``.

    Returns:
        The concatenated spans as one string. Only the first
        ``min(len(text), len(labels))`` positions are rendered.
    """
    import html

    spans = []
    for token, tag in zip(text, labels):
        safe = html.escape(token)
        if tag == 1:
            spans.append('<span style="color:red;">' + safe + '</span>')
        else:
            spans.append('<span>' + safe + '</span>')

    return ''.join(spans)

def print_sent(text, labels):
    """Render the highlighted sentence as rich HTML output in the notebook."""
    markup = build_sent_html(text, labels)
    display(HTML(markup))

In [17]:
def print_compare_sent(s):
    """Show one sentence twice: reference labels first, predictions second.

    Args:
        s: Sentence dict with parallel ``'tokens'`` and ``'labels'`` entries.

    The predicted labels come from ``predict_label``, so the thresholding
    logic exists in one place only.
    """
    tokens = s['tokens']
    print_sent(tokens, s['labels'])
    print_sent(tokens, predict_label(s))

In [18]:
def parse_sent(s, sent_len, vec_dim, batch_size):
    """Vectorise one sentence and repeat it to fill a whole batch.

    The sentence is cut off after ``sent_len`` tokens or padded with zero
    vectors of length ``vec_dim`` and label ``0``. The resulting row is then
    repeated ``batch_size`` times.

    Args:
        s: Sentence dict with parallel ``'tokens'`` and ``'labels'`` entries.
        sent_len: Number of positions produced for the sentence.
        vec_dim: Length of the zero vector used for padding.
        batch_size: Number of copies of the sentence in the result.

    Returns:
        Tuple ``(x, y)`` of numpy arrays holding the repeated vectors and
        the repeated labels.
    """
    words = s['tokens']
    marks = s['labels']
    n_real = min(len(words), sent_len)

    vectors = []
    tags = []
    # Real tokens first: look up each vector and copy its label.
    for idx in range(n_real):
        vectors.append(get_vec(words[idx]))
        tags.append(marks[idx])

    # Then fill the remaining positions with padding.
    n_pad = sent_len - n_real
    vectors.extend([0.0] * vec_dim for _ in range(n_pad))
    tags.extend([0] * n_pad)

    return np.array([vectors] * batch_size), np.array([tags] * batch_size)

def predict_label(s):
    """Run the model on one sentence and return its thresholded labels.

    The sentence is repeated to fill a batch and evaluated in the
    notebook-level session. The scores of the first copy are then turned
    into 1 where the score is at least 0.5 and into 0 where it is below 0.5.

    Returns:
        A list with one value per position of the padded sentence.
    """
    batch_x, batch_y = parse_sent(s, sent_len, vec_dim, batch_size)
    feed = {
        sent: batch_x,
        label: batch_y
    }
    fetched = sess.run([outputs], feed_dict=feed)
    # fetched[0] is the outputs array. Every row holds the same sentence, so
    # the first one is enough.
    scores = fetched[0][0].flatten()

    scores[scores >= .5] = 1
    scores[scores < .5] = 0

    return list(scores)

In [36]:
def compare_doc(sents):
    """Display a document twice: reference labels first, predictions second.

    Args:
        sents: List of sentence dicts with ``'tokens'`` and ``'labels'``.

    Every sentence is rendered as highlighted HTML and followed by a '，'.
    """
    reference = ''.join(
        build_sent_html(s['tokens'], s['labels']) + '，' for s in sents)
    display(HTML(reference))

    print('\n')
    predicted = ''.join(
        build_sent_html(s['tokens'], predict_label(s)) + '，' for s in sents)
    display(HTML(predicted))

# Visualization

In [34]:
# Pick one labelled sentence at random and show its reference highlighting
# followed by the model's prediction. No random seed is set in the cells
# shown, so the chosen sentence differs between runs.
p = random.choice(sents2)
print_compare_sent(p)

In [39]:
# Pick one document at random, split it into sentences and compare reference
# and predicted highlighting for the whole document.
doc = random.choice(docs2)
# NOTE(review): this rebinds sents3, which an earlier cell filled with the
# sentences of every document in docs2.
sents3 = get_sents(doc)
compare_doc(sents3)



