In [1]:
'''
  Reference : https://github.com/SeoSangwoo/Attention-Based-BiLSTM-relation-extraction
'''

import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm

## Parameters

In [2]:
class Config:
    """Central place for every tunable value used by the notebook.

    Only class attributes are defined; the class is read as ``Config.<name>``
    and is never instantiated by the code in this notebook.
    """

    # ---- Data loading ----
    max_sentence_length = 90      # padded sequence length / offset for relative positions
    dev_sample_percentage = 0.1   # fraction of the training data held out as dev set

    # ---- Embeddings ----
    embedding_path = ''
    embedding_dim = 100
    emb_dropout_keep_prob = 0.7

    # ---- AttLSTM ----
    hidden_size = 100
    rnn_dropout_keep_prob = 0.7

    # ---- Training ----
    batch_size = 20
    num_epochs = 100
    display_every = 300           # print the training loss every N steps
    evaluate_every = 600          # evaluate on the dev set every N steps
    num_checkpoints = 5           # checkpoints kept by the saver
    learning_rate = 1.0
    decay_rate = 0.9

    l2_reg_lambda = 1e-5
    dropout_keep_prob = 0.5

    # ---- Testing ----
    checkpoint_dir = ''

    # ---- Label tables ----
    labels_count = 19

    # Relation name -> integer label.
    class2label = {
        'Other': 0,
        'Message-Topic(e1,e2)': 1,
        'Message-Topic(e2,e1)': 2,
        'Product-Producer(e1,e2)': 3,
        'Product-Producer(e2,e1)': 4,
        'Instrument-Agency(e1,e2)': 5,
        'Instrument-Agency(e2,e1)': 6,
        'Entity-Destination(e1,e2)': 7,
        'Entity-Destination(e2,e1)': 8,
        'Cause-Effect(e1,e2)': 9,
        'Cause-Effect(e2,e1)': 10,
        'Component-Whole(e1,e2)': 11,
        'Component-Whole(e2,e1)': 12,
        'Entity-Origin(e1,e2)': 13,
        'Entity-Origin(e2,e1)': 14,
        'Member-Collection(e1,e2)': 15,
        'Member-Collection(e2,e1)': 16,
        'Content-Container(e1,e2)': 17,
        'Content-Container(e2,e1)': 18,
    }

    # Integer label -> relation name (exact inverse of ``class2label``).
    label2class = {label: name for name, label in class2label.items()}

## Dataset 

Load the relation extraction dataset from SemEval-2010 Task 8.

In [3]:
import nltk
import re
import os

class Dataset:
    """Loader for the SemEval-2010 Task 8 relation extraction corpus.

    Sentences are cleaned, tokenized with NLTK and returned as pandas
    DataFrames together with entity positions and relation labels.
    """

    # Ordered (pattern, replacement) rewrite rules applied by ``clean_str``.
    # The order is significant: e.g. "can't" must be handled before the
    # generic "n't" rule, and whitespace is collapsed last.
    _CLEAN_RULES = (
        # NOTE(review): inside the character class, ``+-=`` is a *range*
        # (from '+' to '='), so digits, ':', ';' and '<' are kept as well.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"that's", "that is "),
        (r"there's", "there is "),
        (r"it's", "it is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): ``\0`` is the regex escape for the NUL character, so
        # this rule looks like it never fires on cleaned text -- kept as-is
        # to preserve behaviour; confirm whether "0s" was intended.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )

    def clean_str(self, text):
        """Lower-case ``text`` and normalise punctuation and contractions.

        Args:
            text: raw sentence string.

        Returns:
            The cleaned string with surrounding whitespace stripped.
        """
        text = text.lower()
        for pattern, replacement in Dataset._CLEAN_RULES:
            text = re.sub(pattern, replacement, text)
        return text.strip()

    def load_data_and_labels(self, path):
        """Parse a SemEval-2010 Task 8 text file into a DataFrame.

        Args:
            path: path of the file to read. Records are expected to span
                four lines each (sentence, relation, comment, blank).

        Returns:
            pandas.DataFrame with the columns ``id``, ``sentence``, ``e1``,
            ``e2``, ``pos1``, ``pos2``, ``relation`` and ``label``.

        Raises:
            ValueError: if an entity end marker is missing from a sentence.
            KeyError: if a relation name is not in ``Config.class2label``.
        """
        # Data Format
        # 1\t"The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>."
        # Component-Whole(e2,e1)
        # Comment: Not a collection: there is structure here, organisation.
        #
        # 2\t"The <e1>child</e1> was carefully wrapped and bound into the <e2>cradle</e2> by means of a cord."
        # Other
        # Comment:
        #
        # Read everything up front inside a context manager so the file
        # handle is closed deterministically.
        with open(path) as corpus_file:
            lines = [line.strip() for line in corpus_file]

        data = []
        max_sentence_length = 0
        # Offset added to every relative position (see pos1/pos2 below).
        position_offset = Config.max_sentence_length - 1
        for idx in range(0, len(lines), 4):
            fields = lines[idx].split("\t")
            sentence_id = fields[0]

            # Sentence: drop the surrounding quotes and turn the entity tags
            # into marker tokens that survive cleaning as e11/e12/e21/e22.
            sentence = fields[1][1:-1]
            sentence = sentence.replace('<e1>', ' _e11_ ')
            sentence = sentence.replace('</e1>', ' _e12_ ')
            sentence = sentence.replace('<e2>', ' _e21_ ')
            sentence = sentence.replace('</e2>', ' _e22_ ')

            sentence = self.clean_str(sentence)
            tokens = nltk.word_tokenize(sentence)
            sentence = " ".join(tokens)

            # Max Sentence Length
            max_sentence_length = max(max_sentence_length, len(tokens))

            # e1, e2 position: index of the token right before each end marker
            e1 = tokens.index("e12") - 1
            e2 = tokens.index("e22") - 1

            # Relative position of every token to e1 / e2, space separated
            # (each number is followed by a space, including the last one).
            pos1 = "".join(str(position_offset + word_idx - e1) + " "
                           for word_idx in range(len(tokens)))
            pos2 = "".join(str(position_offset + word_idx - e2) + " "
                           for word_idx in range(len(tokens)))

            # Label
            relation = lines[idx + 1]
            label = Config.class2label[relation]
            data.append([sentence_id, sentence, e1, e2, pos1, pos2, relation, label])

        print(path)
        print("max sentence length = {}\n".format(max_sentence_length))

        df = pd.DataFrame(data=data, columns=["id", "sentence", "e1", "e2", 'pos1', 'pos2', 'relation', 'label'])
        return df

    def download_and_load_datasets(self):
        """Download the corpus archive (via Keras' file cache) and parse it.

        Returns:
            Tuple ``(train_df, test_df)`` of DataFrames as produced by
            ``load_data_and_labels``.
        """
        dataset = tf.keras.utils.get_file(
          fname="SemEval2010_task8_all_data.zip",
          origin="https://s3.ap-northeast-2.amazonaws.com/bowbowbow-storage/dataset/SemEval2010_task8_all_data.zip",
          extract=True)

        train_file = 'SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT'
        test_file = 'SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT'

        # The data files are looked up next to the downloaded archive.
        base_dir = os.path.dirname(dataset)
        train_df = self.load_data_and_labels(os.path.join(base_dir, train_file))
        test_df = self.load_data_and_labels(os.path.join(base_dir, test_file))
        return train_df, test_df

# Download the SemEval-2010 Task 8 archive through tf.keras.utils.get_file
# and parse the train and test files into DataFrames.
dataset = Dataset()
train_df, test_df = dataset.download_and_load_datasets()
# Show the first rows of the parsed training data as the cell output.
train_df.head()

/home/seungwon/.keras/datasets/SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT
max sentence length = 89

/home/seungwon/.keras/datasets/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT
max sentence length = 68



Unnamed: 0,id,sentence,e1,e2,pos1,pos2,relation,label
0,1,the system as described above has its greatest...,13,18,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 8...,"Component-Whole(e2,e1)",12
1,2,the e11 child e12 was carefully wrapped and bo...,2,12,87 88 89 90 91 92 93 94 95 96 97 98 99 100 101...,77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 9...,Other,0
2,3,the e11 author e12 of a keygen uses a e21 disa...,2,10,87 88 89 90 91 92 93 94 95 96 97 98 99 100 101...,79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 9...,"Instrument-Agency(e2,e1)",6
3,4,a misty e11 ridge e12 uprises from the e21 sur...,3,9,86 87 88 89 90 91 92 93 94 95 96,80 81 82 83 84 85 86 87 88 89 90,Other,0
4,5,the e11 student e12 e21 association e22 is the...,2,5,87 88 89 90 91 92 93 94 95 96 97 98 99 100 101...,84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 9...,"Member-Collection(e1,e2)",15


## Model

In [None]:
# inputs: [batch_size, sequence_length, hidden_size]
def attention(inputs):
    """Pool a sequence of RNN outputs into one vector with learned attention.

    A single trainable vector ``omega`` scores every timestep of
    ``tanh(inputs)``; the softmax of those scores weights the timesteps,
    and the weighted sum is squashed with ``tanh``.

    Args:
        inputs: tensor of shape [batch_size, sequence_length, hidden_size].

    Returns:
        Tuple ``(output, alphas)`` with ``output`` of shape
        [batch_size, hidden_size] and the attention weights ``alphas`` of
        shape [batch_size, sequence_length].
    """
    # NOTE(review): no sequence-length mask is applied here, so every
    # timestep (including padded ones, if any) takes part in the softmax.
    depth = inputs.shape[2].value

    # Trainable scoring vector: [hidden_size]
    omega = tf.get_variable('omega', [depth], initializer=tf.keras.initializers.glorot_normal())

    # Score each timestep: [batch_size, sequence_length]
    scores = tf.tensordot(tf.tanh(inputs), omega, axes=1, name='vu')
    alphas = tf.nn.softmax(scores, name='alphas')

    # Attention-weighted sum over the time axis: [batch_size, hidden_size]
    weighted = inputs * tf.expand_dims(alphas, -1)
    output = tf.tanh(tf.reduce_sum(weighted, 1))

    return output, alphas
                           

class AttLSTM:
    """Attention-based bidirectional LSTM classifier (TensorFlow 1.x graph).

    Building an instance adds placeholders, the embedding layer, the BiLSTM,
    the attention pooling, the output layer, the loss and the accuracy to
    the current default graph and exposes them as attributes.

    Args:
        sequence_length: number of token ids per input row.
        num_classes: number of output classes (width of the one-hot labels).
        vocab_size: number of rows of the embedding matrix.
        embedding_size: dimensionality of the word embeddings.
        hidden_size: number of units of each LSTM direction.
        l2_reg_lambda: weight of the L2 penalty over all trainable variables.
    """

    def __init__(self,
                 sequence_length,
                 num_classes,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 l2_reg_lambda=0.0):

        # ---- Inputs ----
        self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text')
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
        self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
        self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # ---- Embedding lookup followed by dropout (pinned to the CPU) ----
        with tf.device('/cpu:0'), tf.variable_scope('text-embedding'):
            self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25), name='W_text')
            looked_up = tf.nn.embedding_lookup(self.W_text, self.input_text)
            self.embedded_chars = tf.nn.dropout(looked_up, self.emb_dropout_keep_prob)

        # ---- Bidirectional LSTM; both directions are summed element-wise ----
        with tf.variable_scope("bi-lstm"):
            forward_cell = tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=tf.keras.initializers.glorot_normal()),
                self.rnn_dropout_keep_prob)
            backward_cell = tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=tf.keras.initializers.glorot_normal()),
                self.rnn_dropout_keep_prob)
            lengths = self._length(self.input_text)
            (forward_out, backward_out), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=forward_cell,
                cell_bw=backward_cell,
                inputs=self.embedded_chars,
                sequence_length=lengths,
                dtype=tf.float32)
            self.rnn_outputs = tf.add(forward_out, backward_out)

        # ---- Attention pooling followed by dropout ----
        with tf.variable_scope('attention'):
            self.attn, self.alphas = attention(self.rnn_outputs)
            self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

        # ---- Fully connected output layer ----
        with tf.variable_scope('output'):
            self.logits = tf.layers.dense(self.h_drop, num_classes, kernel_initializer=tf.keras.initializers.glorot_normal())
            self.predictions = tf.argmax(self.logits, 1, name='predictions')

        # ---- Mean cross-entropy loss plus L2 penalty on all trainable variables ----
        with tf.variable_scope('loss'):
            per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.l2 = tf.add_n([tf.nn.l2_loss(variable) for variable in tf.trainable_variables()])
            self.loss = tf.reduce_mean(per_example_loss) + l2_reg_lambda * self.l2

        # ---- Accuracy ----
        with tf.name_scope('accuracy'):
            is_correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32), name='accuracy')

    @staticmethod
    def _length(seq):
        """Return, for every row of ``seq``, the number of non-zero entries as int32."""
        nonzero_mask = tf.sign(tf.abs(seq))
        return tf.cast(tf.reduce_sum(nonzero_mask, reduction_indices=1), tf.int32)

## Preprocessing

In [None]:
# Build the vocabulary from the training sentences and map every sentence to
# a fixed-length row of Config.max_sentence_length word ids.
text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(Config.max_sentence_length)
x = np.array(list(text_vocab_processor.fit_transform(train_df['sentence'])))

# One-hot encode the labels: row `label` of the identity matrix is its one-hot vector.
y = np.eye(Config.labels_count)[train_df['label'].values]

print('Text Vocabulary Size {}'.format(len(text_vocab_processor.vocabulary_)))
print('X = {}'.format(x.shape))
print('Y = {}'.format(y.shape))

# Shuffle with a fixed seed so the train/dev split is reproducible
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# The last `dev_sample_percentage` of the shuffled data becomes the dev set
dev_sample_index = -int(Config.dev_sample_percentage * float(len(y)))
x_train, y_train = x_shuffled[:dev_sample_index], y_shuffled[:dev_sample_index]
x_dev, y_dev = x_shuffled[dev_sample_index:], y_shuffled[dev_sample_index:]
print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Text Vocabulary Size 19151
X = (8000, 90)
Y = (8000, 19)
Train/Dev split: 7200/800



## Helper functions for training

In [None]:
# Pre-trained glove
def load_glove(embedding_dim, vocab):
    """Build an initial embedding matrix from the pre-trained GloVe 6B vectors.

    The GloVe archive is fetched through ``tf.keras.utils.get_file`` and the
    text file matching ``embedding_dim`` is read from the directory of the
    downloaded archive.

    Args:
        embedding_dim: dimensionality of the vectors to load; selects the
            file ``glove.6B.<embedding_dim>d.txt``.
        vocab: vocabulary processor whose ``vocabulary_`` supports ``len()``
            and ``get(word)``; an index of 0 is treated as "word not in the
            vocabulary" and skipped.

    Returns:
        float array of shape [len(vocab.vocabulary_), embedding_dim]; rows of
        words found in GloVe hold the pre-trained vector, all other rows keep
        their random initialisation.
    """
    download_path = tf.keras.utils.get_file(
      fname="glove.6B.zip",
      origin="http://nlp.stanford.edu/data/glove.6B.zip",
      extract=True)

    # Pick the file that matches the requested dimensionality instead of
    # always reading the 100-d vectors.
    embedding_path = os.path.join(os.path.dirname(download_path),
                                  'glove.6B.{}d.txt'.format(embedding_dim))
    print('embedding_path :', embedding_path)

    # Initial matrix: standard-normal samples scaled by 1/sqrt(vocabulary size)
    vocab_size = len(vocab.vocabulary_)
    initW = np.random.randn(vocab_size, embedding_dim).astype(np.float32) / np.sqrt(vocab_size)

    # Overwrite the rows of all vocabulary words that have a GloVe vector
    print("Load glove file {0}".format(embedding_path))
    with open(embedding_path, 'r', encoding='utf8') as f:
        for line in f:
            splitLine = line.split(' ')
            word = splitLine[0]
            embedding = np.asarray(splitLine[1:], dtype='float32')
            idx = vocab.vocabulary_.get(word)
            if idx != 0:
                initW[idx] = embedding
    return initW

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Args:
        data: sequence of examples; it is converted to a NumPy array.
        batch_size: maximum number of examples per batch (the last batch of
            an epoch may be smaller).
        num_epochs: number of passes over ``data``.
        shuffle: if True, the examples are re-shuffled at every epoch using
            NumPy's global random state.

    Yields:
        NumPy arrays holding consecutive slices of the (shuffled) data.
        Nothing is yielded when ``data`` is empty.
    """
    try:
        data = np.array(data)
    except ValueError:
        # Recent NumPy versions refuse to build ragged arrays implicitly
        # (e.g. (x, y) pairs whose members have different lengths); an
        # explicit object array restores the previous behaviour.
        data = np.array(data, dtype=object)
    data_size = len(data)
    # Integer ceiling division: gives 0 batches for empty data instead of a
    # single empty batch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

## Train

In [None]:
import datetime
import time

from sklearn.metrics import f1_score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Build the graph, then train the model with periodic evaluation on the dev set.
# NOTE(review): neither the session nor the two summary writers are closed in
# this cell -- confirm whether a later cell does so.
sess = tf.Session()
with sess.as_default():
    # Model sized from the preprocessed arrays and the Config values
    att_lstm = AttLSTM(
        sequence_length=x_train.shape[1],
        num_classes=y_train.shape[1],
        vocab_size=len(text_vocab_processor.vocabulary_),
        embedding_size=Config.embedding_dim,
        hidden_size=Config.hidden_size,
        l2_reg_lambda=Config.l2_reg_lambda
    )
    
    # Adadelta optimizer; every gradient is clipped element-wise to [-1, 1]
    # before being applied. `global_step` is incremented by each update.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdadeltaOptimizer(Config.learning_rate, Config.decay_rate, 1e-6)
    gvs = optimizer.compute_gradients(att_lstm.loss)
    capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
    train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)
    
    # Output directory for models and summary (one sub-directory per run,
    # named after the current Unix timestamp)
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "27.runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", att_lstm.loss)
    acc_summary = tf.summary.scalar("accuracy", att_lstm.accuracy)
    
    # Train Summaries
    train_summary_op = tf.summary.merge([loss_summary, acc_summary])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Dev summaries
    dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=Config.num_checkpoints)
    
    sess.run(tf.global_variables_initializer())

    # Replace the randomly initialised embedding matrix with the GloVe-based one
    pretrain_W = load_glove(Config.embedding_dim, text_vocab_processor)
    sess.run(att_lstm.W_text.assign(pretrain_W))
    print("Success to load pre-trained glove model!\n")
    
    # Generate batches (the iterator covers all epochs)
    batches = batch_iter(list(zip(x_train, y_train)), Config.batch_size, Config.num_epochs)
    
    # Training loop. For each batch...
    best_f1 = 0.0  # For save checkpoint(model)
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        # Train (dropout keep probabilities taken from Config)
        feed_dict = {
            att_lstm.input_text: x_batch,
            att_lstm.input_y: y_batch,
            att_lstm.emb_dropout_keep_prob: Config.emb_dropout_keep_prob,
            att_lstm.rnn_dropout_keep_prob: Config.rnn_dropout_keep_prob,
            att_lstm.dropout_keep_prob: Config.dropout_keep_prob
        }
        _, step, summaries, loss, accuracy = sess.run(
            [train_op, global_step, train_summary_op, att_lstm.loss, att_lstm.accuracy], feed_dict)
        train_summary_writer.add_summary(summaries, step)

        # Training log display
        if step % Config.display_every == 0:
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

        # Evaluation: the whole dev set is fed at once with all keep
        # probabilities set to 1.0 (dropout disabled)
        if step % Config.evaluate_every == 0:
            print("\nEvaluation:")
            feed_dict = {
                att_lstm.input_text: x_dev,
                att_lstm.input_y: y_dev,
                att_lstm.emb_dropout_keep_prob: 1.0,
                att_lstm.rnn_dropout_keep_prob: 1.0,
                att_lstm.dropout_keep_prob: 1.0
            }
            summaries, loss, accuracy, predictions = sess.run(
                [dev_summary_op, att_lstm.loss, att_lstm.accuracy, att_lstm.predictions], feed_dict)
            dev_summary_writer.add_summary(summaries, step)

            time_str = datetime.datetime.now().isoformat()
            # Macro F1 over labels 1..18, i.e. label 0 ('Other') is excluded.
            # NOTE(review): the upper bound 19 is hard-coded; it matches
            # Config.labels_count -- keep the two in sync.
            f1 = f1_score(np.argmax(y_dev, axis=1), predictions, labels=np.array(range(1, 19)), average="macro")
            print("{}: step {}, loss {:g}, acc {:.4f}".format(time_str, step, loss, accuracy))
            print("[UNOFFICIAL] (2*9+1)-Way Macro-Average F1 Score (excluding Other): {:g}\n".format(f1))

            # Model checkpoint: saved only when the dev F1 improves on the best so far
            if best_f1 < f1:
                best_f1 = f1
                path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step)
                print("Saved model checkpoint to {}\n".format(path))



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use keras.layers.dense instead.
Writing to /home/seungwon/project/tf-notes/27.runs/1557219746

embedding_path : /home/seungwon/.keras/datasets/glove.6B.100d.txt
Load glove file /home/seungwon/.keras/datasets/glove.6B.100d.txt
Success to load pre-trained glove model!

2019-05-07T18:03:13.582314: step 300, loss 4.30487, acc 0.3
2019-05-07T18:03:26.272485: step 600, loss 3.64633, acc 0.5



2019-05-07T18:10:06.768263: step 9900, loss 2.34894, acc 0.95
2019-05-07T18:10:19.597303: step 10200, loss 2.37417, acc 0.9

Evaluation:
2019-05-07T18:10:19.663654: step 10200, loss 2.8803, acc 0.7788
[UNOFFICIAL] (2*9+1)-Way Macro-Average F1 Score (excluding Other): 0.752653

2019-05-07T18:10:32.514823: step 10500, loss 2.16495, acc 0.95
2019-05-07T18:10:45.278938: step 10800, loss 2.40025, acc 0.9

Evaluation:
2019-05-07T18:10:45.346468: step 10800, loss 2.84618, acc 0.7850
[UNOFFICIAL] (2*9+1)-Way Macro-Average F1 Score (excluding Other): 0.761321

Saved model checkpoint to /home/seungwon/project/tf-notes/27.runs/1557219746/checkpoints/model-0.761-10800

2019-05-07T18:10:58.425988: step 11100, loss 2.31316, acc 0.85


## Tensorboard

```
tensorboard --logdir=./27.runs --host 0.0.0.0
```