# Relationship Extraction


This is a quick "proof of concept" model, based on bi-directional LSTM with attention. The idea of the implementation of attention mechanism is based on P.Zhou et al paper <a href=http://www.aclweb.org/anthology/P16-2034>Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification</a>. I choose this simple architecture after some brief literature research as the most feasible implementation given time constraints, and also because I already had BiLSTM building blocks from my earlier project. In addition, some implementation inspirations were taken from paper re-implementation <a href=https://github.com/SeoSangwoo/Attention-Based-BiLSTM-relation-extraction>github repo</a> (data transform, attention details).<br>
Honestly, this is quite a raw try (due to time constraints), with practically out-of-the-box model parameters. The model clearly overfits (for the sake of time I've skipped extra L2 regularisation used in repo), but nevertheless it reaches F1 around 0.68 (on the offical test set) practically without parameters adjustments. Although shy of 0.84 reported in the paper for this architecture, with some tweaks and polishes I believe the score would improve.

In [176]:
#initialise the libraries
import numpy as np
import re
import nltk
import os.path
import gensim

import tensorflow as tf
tf.compat.v2.test.is_gpu_available()

True

In [177]:
tf.__version__

'1.15.0-dev20190821'

### Initialising parameters:

In [237]:
# 1. Data Files:
#input TEXT files:
train_txt_file = '../data/raw/TRAIN_FILE.TXT'
test_txt_file = '../data/raw/TEST_FILE.txt'
test_file_full = '../data/raw/TEST_FILE_FULL.TXT'

#output TXT files:
train_out_file = '../output/train_output.txt'
test_out_file = '../output/output.txt'

#word embeddings file:
emb_path='../externals/'
emb_vec='cc.en.300.vec.gz'


# 2. validation set size:
test_size=0.15

# 3. Model params:
n_hidden_rnn=200
batch_size = 32
n_epochs = 5
learning_rate = 0.005
learning_rate_decay = np.sqrt(2)
rnn_dropout = 0.6
out_dropout = 0.5

## Split training set into training-validation
use standard scikit-learn for that, regardless classes.

In [213]:
#get stuff from data_prep.py helper
import data_prep
from sklearn.model_selection import train_test_split

#load data and split to train-validation
txt, lbl = data_prep.load_data_and_labels(train_txt_file)
X_train, X_test, y_train, y_test = train_test_split(txt, lbl, test_size=test_size)

### Create Dictionaries
as simple token-to-index and back dicts

In [180]:
from collections import defaultdict

#Build custom dictionary with special tokens - PAD
special_tokens = ['<PAD>']

#build dictionary
def build_dict(text, special_tokens):
    """
        text: list of text sentences
        special_tokens: padding token
    """
    #Generate tokens from the text
    tokens = []
    for line in text:
        token = line.split()
        tokens.append(token)
    
    # Create a dictionary with default value 0
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []
    
    # Create mappings from tokens to indices and vice versa, 
    # special tokens coming first (index 0)
    
    idx2tok=special_tokens+list(set([tkn for tkns in tokens
                                     for tkn in tkns if tkn not in special_tokens]))

    i=0
    for tkn in idx2tok:
        tok2idx[tkn]=i
        i +=1
        
    return tok2idx, idx2tok

In [181]:
# Create dictionaries 
token2idx, idx2token = build_dict(X_train+X_test, special_tokens)

In [182]:
#lookup functions for the mapping between tokens and ids for a sentence
def words2idxs(tokens_list):
    return [token2idx[word] for word in tokens_list]

def idxs2words(idxs):
    return [idx2token[idx] for idx in idxs]

### Embeddings import
use *chakin* here for downloading fasttext vec file first time.<br>
TODO: find a way to get binary format (for the speed)

In [12]:
from gensim.models import KeyedVectors

emb_file=emb_path+emb_vec

#check if the file is already there (download only the first time)
if not os.path.exists(emb_file):
    import chakin
    emb_file = chakin.download(number=2, save_dir=emb_path)
    
fb_embeddings = KeyedVectors.load_word2vec_format(emb_file, binary=False)

In [183]:
words = data_prep.load_pretrained_emb(fb_embeddings, 300, idx2token)
words.shape

(19965, 300)

## Preparing Batches

In [184]:
#generating batches of batch_size from list of raw (cleaned) sentences, add one-hot labels
def batches_generator(batch_size, sentences, labels,
                      shuffle=True, allow_smaller_last_batch=True):
    """Generates padded batches of tokens and one-hot labels"""
    
    n_samples = len(sentences)
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        current_batch_size = batch_end - batch_start
        x_list = []
        y_list = []
        max_len_sentence = 0
        for idx in order[batch_start: batch_end]:
            x_list.append(words2idxs(sentences[idx].split()))
            y_list.append(labels[idx])
            max_len_sentence = max(max_len_sentence, len(sentences[idx].split()))
            
        # Fill in the data into numpy nd-arrays filled with padding indices.
        x = np.ones([current_batch_size, max_len_sentence], dtype=np.int32) * token2idx['<PAD>']
        y = y_list
        lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            x[n, :utt_len] = x_list[n]
            lengths[n] = utt_len
        yield x, y, lengths

## Build BiLSTM Model
add attention layer

In [185]:
class BiLSTMModel():
    pass

### placeholders

In [186]:
def declare_placeholders(self):
    """Specifies placeholders for the model."""

    # Placeholders for input text and labels.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') 
    self.input_labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_labels')
  
    # Placeholder for lengths of the sentences.
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') 
    
    # Placeholder for rnn layers dropout keep probability.
    self.rnn_dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    
    # Placeholder for an output layer dropout keep probability.
    self.out_dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    
    # Placeholder for a learning rate (tf.float32).
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, name='learning_rate_ph')
    
    
BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)    

### layers

In [187]:
#add attention later
def attention(inputs):
    pass

In [188]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, num_classes):
    """Specifies bi-LSTM architecture and computes logits for inputs."""
    
#TODO: create a proper way for selecting random or pre-trained words embeddings
#embedding_dim=300 - for pre-trained ones 
    # Create embedding variable (tf.Variable) with dtype tf.float32
    #initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    initial_embedding_matrix = data_prep.load_pretrained_emb(fb_embeddings, embedding_dim, idx2token)
    embedding_matrix_variable = tf.Variable(initial_embedding_matrix, dtype=tf.float32)
    
    # Look up embeddings for self.input_batch
    # Shape: [batch_size, sentence_len, embedding_dim]
    embeddings =  tf.nn.embedding_lookup(embedding_matrix_variable
                                         ,ids=self.input_batch)
    
    # LSTM cells with n_hidden_rnn units, dropout initializing all *_keep_prob with dropout placeholder.
    forward_cell =  tf.nn.rnn_cell.DropoutWrapper(
                                            tf.nn.rnn_cell.LSTMCell(num_units=n_hidden_rnn)
                                           ,input_keep_prob=self.rnn_dropout_ph
                                           ,output_keep_prob=self.rnn_dropout_ph
                                           ,state_keep_prob=self.rnn_dropout_ph
                                           )
    backward_cell =  tf.nn.rnn_cell.DropoutWrapper(
                                            tf.nn.rnn_cell.LSTMCell(num_units=n_hidden_rnn)
                                           ,input_keep_prob=self.rnn_dropout_ph
                                           ,output_keep_prob=self.rnn_dropout_ph
                                           ,state_keep_prob=self.rnn_dropout_ph
                                           )
    
    # Bidirectional Dynamic RNN
    # Shape: [batch_size, sentence_len, 2 * n_hidden_rnn]. 
    # with self.lengths
    (rnn_output_fw, rnn_output_bw), _ =  tf.nn.bidirectional_dynamic_rnn(
                                                                          cell_fw=forward_cell
                                                                         ,cell_bw=backward_cell
                                                                         ,inputs=embeddings
                                                                         ,sequence_length=self.lengths
                                                                         ,dtype=tf.float32
                                                                        )
    
    #ADDING fw and bw, as in the paper. Try to CONCAT?!
    #Shape: [batch_size, sentence_len, 1 * n_hidden_rnn].
    rnn_output = tf.add(rnn_output_fw, rnn_output_bw)

    #Add Attention
    # attn: [batch_size, 1 * n_hidden_rnn], alphas: [batch_size, sentence_len]
    attn, alphas = attention(rnn_output)
    
    #Dropout for Attention layer
    h_star = tf.nn.dropout(attn
                           ,rate=1 - self.out_dropout_ph)
    
    # Dense layer on top.
    # Shape: [batch_size, num_classes]   
    self.logits = tf.layers.dense(attn, num_classes, activation=None)
    
    
BiLSTMModel.__build_layers = classmethod(build_layers)

## Build Attention
re-creating the procedure in the paper (with inspirations from official github), then tweak it 

In [189]:
#Following formulas in the paper:
def attention(inputs):
    #get the w vector
    #Shape: [1 * n_hidden_rnn]
    hidden_size = inputs.shape[2].value
    w = tf.get_variable("w", [hidden_size])
    
    #get M
    m = tf.tanh(inputs)

    #get dot product w, m
    #Shape: [batch_size, sentence_len]
    wm = tf.tensordot(m, w, axes=1, name='wm')
    #Shape: [batch_size, sentence_len]
    alphas = tf.nn.softmax(wm, name='alphas')

    #get output r
    #Shape: [batch_size, 1 * n_hidden_rnn]
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    # Final output h-star with tanh
    output = tf.tanh(output)

    return output, alphas


### predictions

In [190]:
def compute_predictions(self):
    #just take argmax along axis 1 -num_classes
    self.predictions = tf.argmax(self.logits, axis=1)
    
    
BiLSTMModel.__compute_predictions = classmethod(compute_predictions)    

### loss

In [191]:
def compute_loss(self):
    
    loss_tensor =  tf.nn.softmax_cross_entropy_with_logits(
                                                            labels=self.input_labels
                                                            ,logits=self.logits
                                                             )
    self.loss = tf.reduce_mean(loss_tensor)
    
    
BiLSTMModel.__compute_loss = classmethod(compute_loss)   

### accuracy

In [192]:
def compute_accuracy(self):
    
    correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_labels, 1))
    
    self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")
    
    
BiLSTMModel.__compute_accuracy = classmethod(compute_accuracy) 

### training

In [193]:
def perform_optimization(self):
    """Specifies the optimizer and train_op for the model."""
    
    self.optimizer =  tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)  
    clip_norm = tf.cast(1.0, tf.float32)
    self.grads_and_vars =  [(tf.clip_by_norm(grds, clip_norm), vrs) for grds, vrs in self.grads_and_vars]
    
    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

    
BiLSTMModel.__perform_optimization = classmethod(perform_optimization)

### init

In [194]:
def init_model(self, vocabulary_size, embedding_dim, n_hidden_rnn, num_classes):
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, num_classes)
    self.__compute_predictions()
    self.__compute_loss()
    self.__compute_accuracy()
    self.__perform_optimization()

    
BiLSTMModel.__init__ = classmethod(init_model)

## Batch Training and Predicting

In [195]:
def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, rnn_dropout, out_dropout):
    feed_dict = {self.input_batch: x_batch,
                 self.input_labels: y_batch,
                 self.learning_rate_ph: learning_rate,
                 self.rnn_dropout_ph: rnn_dropout,
                 self.out_dropout_ph: out_dropout,
                 self.lengths: lengths}
    
    session.run(self.train_op, feed_dict=feed_dict)

    
BiLSTMModel.train_on_batch = classmethod(train_on_batch)

In [196]:
def predict_for_batch(self, session, x_batch, lengths):
    feed_dict = {self.input_batch: x_batch,
                 self.lengths: lengths}
    
    predictions = session.run(self.predictions, feed_dict=feed_dict)
    
    return predictions


BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)

## Evaluation

In [197]:
from sklearn.metrics import f1_score, recall_score, precision_score

def eval_model(model, session, sentences, labels):
    
    y_true, y_pred = [], []
    for x_batch, y_batch, lengths in batches_generator(1, sentences, labels):
        pred_batch = model.predict_for_batch(session, x_batch, lengths)
        
        y_true.append(np.argmax(y_batch[0]))
        y_pred.append(pred_batch[0])
        
    correct_predictions = np.equal(y_pred, y_true)
    accuracy = np.mean(correct_predictions)
    
    f1 = f1_score(y_true, y_pred, labels=np.array(range(1, 19)), average="macro")
    recall = recall_score(y_true, y_pred, labels=np.array(range(0, 19)), average="macro")
    precision = precision_score(y_true, y_pred, labels=np.array(range(0, 19)), average="macro")
    
    print("accuracy:", accuracy, "precision:", precision, "recall:", recall, "F1:", f1)
          

# RUN 
train the model and check validation set performance

In [198]:
tf.reset_default_graph()

model = BiLSTMModel(vocabulary_size=len(idx2token)
                    ,num_classes=19
                    ,embedding_dim=300
                    ,n_hidden_rnn=n_hidden_rnn
                    )

batch_size = batch_size
n_epochs = n_epochs
learning_rate = learning_rate
learning_rate_decay = learning_rate_decay
rnn_dropout = rnn_dropout
out_dropout = out_dropout

In [199]:
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

print('Start training... \n')
for epoch in range(n_epochs):
    # For each epoch evaluate the model on train and validation data
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)
    print('Train data evaluation:')
    eval_model(model, sess, X_train, y_train)
    print('Validation data evaluation:')
    eval_model(model, sess, X_test, y_test)
    
    # Train the model
    for x_batch, y_batch, lengths in batches_generator(batch_size, X_train, y_train):
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, rnn_dropout, out_dropout)
        
    # Decaying the learning rate
    learning_rate = learning_rate / learning_rate_decay
    
print('...training finished.')

Start training... 

-------------------- Epoch 1 of 5 --------------------
Train data evaluation:
accuracy: 0.010441176470588235 precision: 0.016697997302586438 recall: 0.05415233824539162 F1: 0.002677174447720129
Validation data evaluation:
accuracy: 0.01 precision: 0.0006043817678166709 recall: 0.048582995951417005 F1: 0.0012602394454946438
-------------------- Epoch 2 of 5 --------------------
Train data evaluation:
accuracy: 0.831764705882353 precision: 0.7945768159572211 recall: 0.7766289129378009 F1: 0.7767107732542663
Validation data evaluation:
accuracy: 0.6966666666666667 precision: 0.6379640058570133 recall: 0.6420945434291253 F1: 0.6457088913427536
-------------------- Epoch 3 of 5 --------------------
Train data evaluation:
accuracy: 0.9766176470588235 precision: 0.9214321372851381 recall: 0.9321062852445395 F1: 0.9250925232196567
Validation data evaluation:
accuracy: 0.7266666666666667 precision: 0.6839858864825219 recall: 0.7061958268990166 F1: 0.7034025531704797
--------

In [200]:
#Again evaluate on the validation set:
eval_model(model, sess, X_test, y_test)

accuracy: 0.7258333333333333 precision: 0.7119892918389158 recall: 0.7043446774697099 F1: 0.7163152982754161


## Predict
predict the relationships and store in the required format

In [234]:
def predict_relationship(model, session, sentences, f_id, out_file):
    
    print("Running predictions")
    y_pred = []
    dummies = np.zeros(len(sentences), dtype=np.int32)
    for x_batch, _, lengths in batches_generator(1, sentences, dummies, shuffle=False):
        pred_batch = model.predict_for_batch(session, x_batch, lengths)
        
        y_pred.append(pred_batch[0])
        
    print("Writing to file")
    
    with open(out_file, 'w') as f:
        for i in range(len(y_pred)):
            f.write("{}\t{}\n".format(i+f_id, data_prep.label2class[y_pred[i]]))
            
    print("Done!")    
            

In [225]:
test_txt, f_id = data_prep.load_test_data(test_txt_file)

2717 8001


In [236]:
predict_relationship(model, sess, test_txt, f_id, test_out_file)

Running predictions
Writing to file
Done!


In [238]:
txt_f, lbl_f = data_prep.load_data_and_labels(test_file_full)

In [239]:
eval_model(model, sess, txt_f, lbl_f)

accuracy: 0.7004048582995951 precision: 0.6643133394485279 recall: 0.6727478440415117 F1: 0.680718751050499
