# Relation Extraction (SemEval task 8) with Huggingface transformers & TF2.0

The task: predict the semantic relation between a pair of nominals in a sentence.<br>
**There are 9 (directional) types:**
- Cause-Effect
- Component-Whole
- Content-Container
- Entity-Destination
- Entity-Origin
- Instrument-Agency
- Member-Collection
- Message-Topic
- Product-Producer
- *Other* (none of above)

# Inference

In [33]:
import semeval_processor
import tensorflow as tf
from transformers import BertTokenizer

In [37]:
# Load the model from '../models/tuned_bert' (the path the tuned BERT model is saved to
# in the training section of this notebook) and the tokenizer used to encode input text.
model = tf.keras.models.load_model('../models/tuned_bert')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
def get_relation(text):
    """Predict the relation for one sentence whose entities are marked with <e1>/<e2> tags.

    The text is encoded with the module-level `tokenizer`, passed through the
    module-level `model`, and the raw prediction is decoded by the same
    `Semeval` processor instance that produced the encoding.
    """
    processor = semeval_processor.Semeval()
    model_inputs = processor.re_encode(input_text=text, tokenizer=tokenizer)
    return processor.re_decode(model.predict(model_inputs))

### Find the relation between two entities in a sentence:

In [48]:
# Adjacent string literals are joined at compile time. Unlike a backslash continuation
# inside the literal, this does not embed the next line's indentation in the sentence.
sentence = ("The <e1>dataset</e1> contains nine semantic relation types and one artificial relation type <e2>Other</e2>, "
            "which means that the relation does not belong to any of the nine relation types.")

%time get_relation(sentence)

CPU times: user 81.5 ms, sys: 13.5 ms, total: 95 ms
Wall time: 84.2 ms


'Component-Whole(e2,e1)'

# 1. Load and prepare data

Use the SemEval-2010 Task 8 dataset, <br>
which contains 8000 training and 2717 test examples.

In [1]:
from transformers import BertTokenizer
import pickle
from sklearn.model_selection import train_test_split

import semeval_processor
from utils import features_from_tsv

In [2]:
#filenames for the input and tmp data:
# Original dataset files (not read by the cells visible in this section).
train_original = '../data/TRAIN_FILE.TXT'
test_original = '../data/TEST_FILE.txt'

# TSV files passed to `features_from_tsv` below.
# NOTE(review): the train file is 'train_m.tsv' while the test file is plain 'test.tsv' — confirm the '_m' suffix is intended.
train_tsv = '../data/train_m.tsv'
test_tsv = '../data/test.tsv'

# Pickle files used to cache the extracted features between sessions.
pickle_features = '../data/train_features'
pickle_test = '../data/test_features'

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Build model inputs from the TSV files. The results are indexed below by
# 'input_ids', 'attention_mask', 'e1_mask', 'e2_mask' and (for train) 'label_id'.
train_features = features_from_tsv(train_tsv, tokenizer)
test_features = features_from_tsv(test_tsv, tokenizer, test=True)

### Pickle and unpickle features

In [54]:
#dump the train and test features to pickle:
# Context managers make sure each file is flushed and closed right after it is written,
# instead of leaving the handle open until garbage collection.
with open(pickle_features, 'wb') as fout:
    pickle.dump(train_features, fout)
with open(pickle_test, 'wb') as fout:
    pickle.dump(test_features, fout)

In [5]:
#read train and test features from pickle
# NOTE: unpickling can execute arbitrary code — only load files written by the dump cell above.
# Context managers close each file handle as soon as its contents are loaded.
with open(pickle_features, 'rb') as fin:
    train_features = pickle.load(fin)
with open(pickle_test, 'rb') as fin:
    test_features = pickle.load(fin)

### Split data

In [5]:
# Hold out 20% of the training features for validation, stratified by label.
# The `xtest_*` / `ytest` names hold this validation split, not the SemEval test set.
# A fixed `random_state` makes the split (and therefore the reported metrics) reproducible.
xtrain_ids, xtest_ids, xtrain_mask, xtest_mask, xtrain_e1, xtest_e1, xtrain_e2, xtest_e2, ytrain, ytest = train_test_split(
    train_features['input_ids'],
    train_features['attention_mask'],
    train_features['e1_mask'],
    train_features['e2_mask'],
    train_features['label_id'],
    stratify=train_features['label_id'],
    test_size=0.2,
    random_state=42,
    )

# 2. Models

In [6]:
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig

### BERT

Let's get the BERT base uncased model and try to fine-tune it on our downstream task.

In [7]:
pretrained_model = 'bert-base-uncased'
# Start from the checkpoint's own configuration rather than the library defaults, so the
# architecture keeps matching the loaded weights if `pretrained_model` is changed.
config = BertConfig.from_pretrained(pretrained_model)
config.output_hidden_states = False

# This encoder instance is shared by every model built in this notebook.
transformer_model = TFBertModel.from_pretrained(pretrained_model, config=config)

Create a custom layer to extract the embedding vectors of our marked entities:

In [8]:
from tensorflow.keras import layers

class Extract_entity(layers.Layer):
    """Pool the encoder states of one marked entity into a single vector.

    The entity mask is multiplied with the encoder output, which sums the
    hidden vectors weighted by the mask (for a 0/1 mask: the sum over the
    entity's token positions).

    NOTE(review): the sum is not divided by the number of entity tokens; if the
    referenced paper averages the entity vectors, confirm this is intended.
    """

    def __init__(self, **kwargs):
        # Accept the standard Keras layer options (``name``, ``dtype``, ...).
        # ``Layer.from_config`` passes them back as keyword arguments, so a
        # constructor without them cannot be rebuilt from its config.
        super().__init__(**kwargs)

    def call(self, sequence_output, e_mask):
        """Return the mask-weighted sum of ``sequence_output`` over the sequence axis.

        Args:
            sequence_output: encoder states, shape (batch_size, sequence_length, hidden_size).
            e_mask: entity mask, shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, hidden_size).
        """
        extended_e_mask = tf.expand_dims(e_mask, 1)  # shape (batch_size, 1, sequence_length)
        # tf.matmul requires both operands to share a dtype; follow the encoder
        # output so the layer is not tied to float32.
        extended_e_mask = tf.cast(extended_e_mask, sequence_output.dtype)
        ext_entity = tf.matmul(extended_e_mask, sequence_output)  # shape (batch_size, 1, hidden_size)

        return tf.squeeze(ext_entity, [1])  # shape (batch_size, hidden_size)

Set up TensorBoard for watching some nice pictures of the training run:

In [9]:
root_logdir = "../.logs"


def get_run_logdir():
    """Return a timestamped sub-directory path under `root_logdir`.

    The directory name is built from the local time at call time, so runs
    started at different seconds get different directories.
    """
    import os
    import time

    return os.path.join(root_logdir, time.strftime("run_%Y_%m_%d-%H_%M_%S"))

# The log directory is computed once here, so every fit() call that receives this
# callback writes into the same timestamped folder.
run_logdir = get_run_logdir()
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=run_logdir)

And add callbacks for early stopping and for reducing the learning rate if needed:

In [10]:
# Stop after 3 epochs without improvement and roll back to the best weights seen.
# No `monitor` is passed, so the Keras default applies (val_loss per the Keras docs — confirm for your version).
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, verbose=1, restore_best_weights=True)
# Lower the learning rate after 2 epochs without improvement (default factor and monitor).
lr_on_pla = tf.keras.callbacks.ReduceLROnPlateau(patience=2)

### Tuned BERT model

Let's use the embeddings (the top layer) from the pretrained Hugging Face BERT and add dense layers on top, as in the paper:

In [11]:
def create_model_tuned(dropout_rate, max_seq_len=128, num_labels=19):
    """Build and compile the relation classifier with a trainable BERT encoder.

    The hidden state at position 0 (`cls_h`) and the two pooled entity vectors
    each pass through a dense layer (the two entities share one layer), are
    concatenated, and feed a dense classification head.

    Args:
        dropout_rate: dropout rate applied after each hidden dense layer.
        max_seq_len: length of the (padded) input sequences; defaults to 128.
        num_labels: number of output classes; defaults to 19 (2*9 directed
            relations + Other).

    Returns:
        A compiled ``tf.keras.Model`` taking
        ``[input_ids, attention_mask, e1_mask, e2_mask]``.
    """
    input_ids = tf.keras.layers.Input(shape=(max_seq_len,), name='input_ids', dtype='int32')
    attention_mask = tf.keras.layers.Input(shape=(max_seq_len,), name='attention_mask', dtype='int32')
    e1_mask = tf.keras.layers.Input(shape=(max_seq_len,), name='e1_mask', dtype='int32')
    e2_mask = tf.keras.layers.Input(shape=(max_seq_len,), name='e2_mask', dtype='int32')

    # `transformer_model` is a single module-level instance shared by all models in
    # this notebook. `create_model_frozen` switches it to non-trainable, so turn it
    # back on here; otherwise the result depends on the order the cells were run in.
    transformer_model.trainable = True

    embedding_layer = transformer_model(input_ids, attention_mask=attention_mask)[0]
    cls_h = embedding_layer[:, 0, :]
    e1_h = Extract_entity()(embedding_layer, e1_mask)
    e2_h = Extract_entity()(embedding_layer, e2_mask)

    CLS = tf.keras.layers.Dense(config.hidden_size, activation="elu", kernel_initializer="he_normal", name='cls_dense')(cls_h)
    CLS = tf.keras.layers.Dropout(dropout_rate)(CLS)

    # One dense layer instance is applied to both entities, so they share weights.
    entity_layer = tf.keras.layers.Dense(config.hidden_size, activation="elu", kernel_initializer="he_normal", name='ent')
    E1 = entity_layer(e1_h)
    E1 = tf.keras.layers.Dropout(dropout_rate)(E1)
    E2 = entity_layer(e2_h)
    E2 = tf.keras.layers.Dropout(dropout_rate)(E2)

    X = tf.concat([CLS, E1, E2], -1)
    X = tf.keras.layers.Dense(config.hidden_size*3, activation="elu", kernel_initializer="he_normal")(X)
    X = tf.keras.layers.Dropout(dropout_rate)(X)
    X = tf.keras.layers.Dense(num_labels, activation='softmax')(X)
    model = tf.keras.Model(inputs=[input_ids, attention_mask, e1_mask, e2_mask], outputs=[X])

    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    # Labels are integer class ids, hence the sparse variant of the loss.
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=['accuracy'])

    return model

In [12]:
model_tuned_bert = create_model_tuned(dropout_rate=0.2)
model_tuned_bert.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 109482240   input_ids[0][0]                  
__________________________________________________________________________________________________
e1_mask (InputLayer)            [(None, 128)]        0                                            
______________________________________________________________________________________________

In [19]:
# Fine-tune the whole model. The `xtest_*` / `ytest` arrays are the split held out from
# the training data above and serve as validation data. `epochs=100` is only an upper
# bound: the run below ends early via `early_stop`.
history = model_tuned_bert.fit([xtrain_ids, xtrain_mask, xtrain_e1, xtrain_e2],
                   ytrain,
                   validation_data = ([xtest_ids, xtest_mask, xtest_e1, xtest_e2], ytest),
                   epochs=100,
                   batch_size=16,
                   callbacks=[early_stop,
                             lr_on_pla,
                             tensorboard_callback]
                   )

Train on 6400 samples, validate on 1601 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping


In [27]:
model_tuned_bert.save('../models/tuned_bert',save_format='tf')

INFO:tensorflow:Assets written to: ../models/tuned_bert/assets


### Frozen BERT model

Let's use the pre-trained BERT model as a feature extractor by freezing the BERT layer. <br>
This way we'll be training only the added dense layers.

In [12]:
def create_model_frozen(dropout_rate, max_seq_len=128, num_labels=19):
    """Build and compile the relation classifier with the BERT encoder frozen.

    Same architecture as `create_model_tuned`, but the encoder is marked
    non-trainable so that only the added dense layers are updated.

    NOTE(review): the encoder is the shared module-level `transformer_model`.
    If the tuned model was trained earlier in the same session, this model
    starts from those fine-tuned weights rather than the original pre-trained
    ones — confirm that is intended.

    Args:
        dropout_rate: dropout rate applied after each hidden dense layer.
        max_seq_len: length of the (padded) input sequences; defaults to 128.
        num_labels: number of output classes; defaults to 19 (2*9 directed
            relations + Other).

    Returns:
        A compiled ``tf.keras.Model`` taking
        ``[input_ids, attention_mask, e1_mask, e2_mask]``.
    """
    input_ids = tf.keras.layers.Input(shape=(max_seq_len,), name='input_ids', dtype='int32')
    attention_mask = tf.keras.layers.Input(shape=(max_seq_len,), name='attention_mask', dtype='int32')
    e1_mask = tf.keras.layers.Input(shape=(max_seq_len,), name='e1_mask', dtype='int32')
    e2_mask = tf.keras.layers.Input(shape=(max_seq_len,), name='e2_mask', dtype='int32')

    embedding_layer = transformer_model(input_ids, attention_mask=attention_mask)[0]
    cls_h = embedding_layer[:, 0, :]
    e1_h = Extract_entity()(embedding_layer, e1_mask)
    e2_h = Extract_entity()(embedding_layer, e2_mask)

    CLS = tf.keras.layers.Dense(config.hidden_size, activation="elu", kernel_initializer="he_normal", name='cls_dense')(cls_h)
    CLS = tf.keras.layers.Dropout(dropout_rate)(CLS)

    # One dense layer instance is applied to both entities, so they share weights.
    entity_layer = tf.keras.layers.Dense(config.hidden_size, activation="elu", kernel_initializer="he_normal", name='ent')
    E1 = entity_layer(e1_h)
    E1 = tf.keras.layers.Dropout(dropout_rate)(E1)
    E2 = entity_layer(e2_h)
    E2 = tf.keras.layers.Dropout(dropout_rate)(E2)

    X = tf.concat([CLS, E1, E2], -1)
    X = tf.keras.layers.Dense(config.hidden_size*3, activation="elu", kernel_initializer="he_normal")(X)
    X = tf.keras.layers.Dropout(dropout_rate)(X)
    X = tf.keras.layers.Dense(num_labels, activation='softmax')(X)
    model = tf.keras.Model(inputs=[input_ids, attention_mask, e1_mask, e2_mask], outputs=[X])

    #don't mess up BERT layer
    # Freeze the encoder by reference rather than through `model.layers[2]`: the
    # positional index silently targets another layer if the layer order changes.
    transformer_model.trainable = False

    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    # Labels are integer class ids, hence the sparse variant of the loss.
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=['accuracy'])

    return model

In [13]:
model_frozen_bert = create_model_frozen(dropout_rate=0.15)
model_frozen_bert.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 109482240   input_ids[0][0]                  
__________________________________________________________________________________________________
e1_mask (InputLayer)            [(None, 128)]        0                                            
____________________________________________________________________________________________

In [14]:
# Train only the dense head; same data, batch size and epoch limit as the tuned run.
# `tensorboard_callback` is not passed here, and `history` is rebound, replacing the
# object returned by the tuned run's fit() call.
history = model_frozen_bert.fit([xtrain_ids, xtrain_mask, xtrain_e1, xtrain_e2],
                   ytrain,
                   validation_data = ([xtest_ids, xtest_mask, xtest_e1, xtest_e2], ytest),
                   epochs=100,
                   batch_size=16,
                   callbacks=[early_stop,
                             lr_on_pla]
                   )

Train on 6400 samples, validate on 1601 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 00026: early stopping


In [16]:
model_frozen_bert.save('../models/frozen_bert',save_format='tf')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../models/frozen_bert/assets


# 3. Evaluate Models

Get predictions for the test set and store them in a file:

In [21]:
def write_semeval_result(result_file, prediction, start_id=8001):
    """Write predictions as tab-separated ``<id>\\t<relation label>`` lines.

    Args:
        result_file: path of the file to (over)write.
        prediction: iterable of per-class score vectors; each item must
            provide ``argmax()``.
        start_id: id written for the first prediction; later rows count up
            by one. Defaults to 8001 (presumably the first test-set id after
            the 8000 training sentences — confirm against the key file).
    """
    # Look the label table up once instead of building a new Semeval() per row.
    labels = semeval_processor.Semeval().RELATION_LABELS
    # Resolve every label before opening the file, so a bad row does not leave
    # a truncated result file behind.
    answers = [labels[scores.argmax()] for scores in prediction]

    with open(result_file, "w") as f:
        f.writelines(
            f"{row_id}\t{answer}\n"
            for row_id, answer in enumerate(answers, start=start_id)
        )

In [22]:
#Tuned BERT
# Predict on the test features (inputs in the same order as the model's Input layers)
# and write the labels in the format consumed by the scorer script used below.
pred_res = model_tuned_bert.predict([
                            test_features['input_ids'],
                            test_features['attention_mask'],
                            test_features['e1_mask'],
                            test_features['e2_mask']
                            ])

write_semeval_result('../results/model_tuned_bert.txt', pred_res)

In [18]:
#Frozen BERT
# Same procedure as for the tuned model; `pred_res` is reused and overwritten.
pred_res = model_frozen_bert.predict([
                            test_features['input_ids'],
                            test_features['attention_mask'],
                            test_features['e1_mask'],
                            test_features['e2_mask']
                            ])

write_semeval_result('../results/model_frozen_bert.txt', pred_res)

### See the outcome 

using the original evaluation script (the output is long and detailed)

#### Tuned BERT:

In [23]:
%cd ../results
!perl ../results/semeval2010_task8_scorer-v1.2.pl model_tuned_bert.txt test_keys.txt
%cd ../notebooks

/root/results
<<< (2*9+1)-WAY EVALUATION (USING DIRECTIONALITY)>>>:

Confusion matrix:
        C-E1 C-E2 C-W1 C-W2 C-C1 C-C2 E-D1 E-D2 E-O1 E-O2 I-A1 I-A2 M-C1 M-C2 M-T1 M-T2 P-P1 P-P2  _O_ <-- classified as
      +-----------------------------------------------------------------------------------------------+ -SUM- skip ACTUAL
 C-E1 | 128    1    0    0    0    0    0    0    0    0    0    0    0    0    2    0    0    2    1 |  134    0  134
 C-E2 |   0  183    0    0    0    0    0    0    3    0    0    0    0    0    0    0    0    0    8 |  194    0  194
 C-W1 |   0    0  141    7    1    0    0    0    0    0    1    0    1    2    1    0    0    1    7 |  162    0  162
 C-W2 |   0    0    3  122    0    1    0    0    1    0    0    7    0    2    4    0    0    3    7 |  150    0  150
 C-C1 |   0    0    6    1  127    0   13    0    1    0    0    0    0    0    0    0    0    0    5 |  153    0  153
 C-C2 |   0    0    0    4    0   32    0    0    0    1    0    0    0    

#### Frozen BERT:

In [19]:
%cd ../results
!perl ../results/semeval2010_task8_scorer-v1.2.pl model_frozen_bert.txt test_keys.txt
%cd ../notebooks

/root/results
<<< (2*9+1)-WAY EVALUATION (USING DIRECTIONALITY)>>>:

Confusion matrix:
        C-E1 C-E2 C-W1 C-W2 C-C1 C-C2 E-D1 E-D2 E-O1 E-O2 I-A1 I-A2 M-C1 M-C2 M-T1 M-T2 P-P1 P-P2  _O_ <-- classified as
      +-----------------------------------------------------------------------------------------------+ -SUM- skip ACTUAL
 C-E1 | 113    7    0    1    0    0    0    0    0    0    1    0    0    0    3    0    1    1    7 |  134    0  134
 C-E2 |   1  175    1    0    0    0    0    0    2    0    0    0    0    0    0    1    0    0   14 |  194    0  194
 C-W1 |   1    0  130    6    0    0    3    0    1    0    2    0    0    3    0    0    0    0   16 |  162    0  162
 C-W2 |   0    0    7  118    0    2    0    0    1    0    0    1    0    1    4    0    0    0   16 |  150    0  150
 C-C1 |   0    0    5    0  131    1    9    0    2    0    0    0    0    0    0    0    1    0    4 |  153    0  153
 C-C2 |   0    0    0    3    2   28    0    0    0    1    0    0    0    

In [18]:
# NOTE(review): `history` is rebound by each fit() cell, so this prints whichever run
# executed last. The captured output below has 5 epochs, matching the tuned-BERT run.
print(history.history)

{'loss': [1.2511667927913368, 0.4354488536622375, 0.16860813486156986, 0.06277294257859467, 0.02286836014711298], 'accuracy': [0.6173437, 0.85984373, 0.9471875, 0.9821875, 0.9948437], 'val_loss': [0.6425678587704897, 0.5604663681983948, 0.6542311323434115, 0.7178658551722765, 0.6972057731822133], 'val_accuracy': [0.7875, 0.811875, 0.829375, 0.8325, 0.83625], 'lr': [3e-05, 3e-05, 3e-05, 3e-05, 2.9999999e-06]}
