In [1]:
import os
import math
import numpy as np
import pandas as pd

In [2]:
import sys
sys.path.append("..\\parser")
import internal_parser
import model

In [3]:
import torch
from torch.nn import functional as F

In [4]:
from sklearn.metrics import precision_recall_fscore_support

In [5]:
# Constants
NUM_CLASSES = 8 # Number of relation classes
NUM_EPOCH = 100
MAX_TOKEN = 128 # Max tokens in each sentence, set to 128 for limited RAM capacity
VALIDATION_SIZE = 1000 # Number of observations evalutated in validation step

In [6]:
device = torch.device("cuda:0")

In [7]:
def data_generator(group):
    docs = internal_parser.get_docs(group)
    data = internal_parser.extract_data(docs)
    for doc in data:
        sentence_id = 0
        starting_index = 0
        input_ids = []
        # ddd a final row with dummy sentence embedding
        doc["data_frame"].loc[doc["data_frame"].index.max() + 1, "sentence_embedding"] \
            = doc["data_frame"]["sentence_embedding"].max() + 1
        for index, row in doc["data_frame"].iterrows():
            if row["sentence_embedding"] != sentence_id or index - starting_index >= MAX_TOKEN - 2:
                new_entity_position = {}
                for entity in doc["entity_position"]:
                    if starting_index <= doc["entity_position"][entity][0] < doc["entity_position"][entity][1] <= index:
                        new_entity_position[entity] = (
                            doc["entity_position"][entity][0] - starting_index + 1, # +1: space for CLS token
                            doc["entity_position"][entity][1] - starting_index + 1  # +1: space for CLS token
                        )
                        
                # If this sentence has at least two entities for a possible relation
                if len(new_entity_position) >= 2:
                    # Add CLS and SEP to the sentence
                    input_ids = [internal_parser.CLS_TOKEN] + input_ids + [internal_parser.SEP_TOKEN]
                    e1_mask, e2_mask, labels = model.generate_entity_mask(len(input_ids), new_entity_position, doc["relations"])
                    assert e1_mask.shape[0] == e2_mask.shape[0] == labels.shape[0]
                    assert len(input_ids) == e1_mask.shape[1] == e2_mask.shape[1]
                    yield {
                        "input_ids": torch.tensor([input_ids]).long().to(device), 
                        "attention_mask": torch.ones((1, len(input_ids)), dtype=torch.long).to(device),
                        "token_type_ids": torch.zeros((1, len(input_ids)), dtype=torch.long).to(device),
                        "e1_mask": e1_mask.to(device),
                        "e2_mask": e2_mask.to(device),
                        "labels": labels.to(device)
                    }
                    del e1_mask
                    del e2_mask
                    del labels
                    
                sentence_id = row["sentence_embedding"]
                input_ids = []
                starting_index = index
            
            input_ids.append(row["token_ids"])   
        
        del input_ids

In [8]:
# Test data_generator()
generator = data_generator("All")
# Test on the first document ("143f9e00-34c4-11eb-a28a-8b07c9b15060-0")
assert next(generator)["input_ids"][0, 1] == 1015
assert next(generator)["input_ids"][0, 1] == 2057
assert next(generator)["input_ids"][0, 1] == 2119
assert next(generator)["input_ids"][0, 1] == 2012
assert next(generator)["input_ids"][0, 1] == 5214
assert next(generator)["input_ids"][0, 1] == 1016
assert next(generator)["input_ids"][0, 1] == 2057
assert next(generator)["input_ids"][0, 1] == 8115
assert next(generator)["input_ids"][0, 1] == 4550
assert next(generator)["input_ids"][0, 1] == 1999
assert next(generator)["input_ids"][0, 1] == 1016
assert next(generator)["input_ids"][0, 1] == 2009
assert next(generator)["input_ids"][0, 1] == 2057
assert next(generator)["input_ids"][0, 1] == 2156

In [9]:
mre_model = model.BertForMre(NUM_CLASSES)
mre_model.to(device)

BertForMre(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [10]:
# Freeze all layers except for the last classifier layer on top
for param in mre_model.parameters():
    param.requires_grad = False
mre_model.classifier.weight.requires_grad = True
mre_model.classifier.bias.requires_grad = True

In [11]:
for param in mre_model.parameters():
    print("size:", param.shape)
    print(param.requires_grad)

size: torch.Size([30522, 768])
False
size: torch.Size([512, 768])
False
size: torch.Size([2, 768])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
size: torch.Size([3072, 768])
False
size: torch.Size([3072])
False
size: torch.Size([768, 3072])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768, 768])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
size: torch.Size([768])
False
si

In [12]:
from transformers import AdamW
optimizer = AdamW(mre_model.parameters(), lr=1e-5)

In [13]:
def validate_model(count):
    val_generator = data_generator("Test")
    true_labels = []
    predicted_labels = []
    for i in range(count):
        inputs = next(val_generator)
        # forward
        outputs = mre_model(**inputs)
        true_labels += inputs["labels"].tolist()
        pred_labels = F.softmax(outputs.logits, dim=-1).argmax(dim=1)
        predicted_labels += pred_labels.tolist()
        assert len(predicted_labels) == len(true_labels)
        del inputs
        
    print("[validation %d]" % (count))
    result = pd.DataFrame(columns=["precision", "recall", "fbeta_score", "support"])
    result.loc["macro"] = list(precision_recall_fscore_support(true_labels, predicted_labels, average="macro"))
    result.loc["micro"] = list(precision_recall_fscore_support(true_labels, predicted_labels, average="micro"))
    print(result)

In [14]:
def train_model():
    for epoch in range(NUM_EPOCH):  # loop over the dataset multiple times
        true_labels = []
        predicted_labels = []

        for i, inputs in enumerate(data_generator("Training"), 0):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = mre_model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # print statistics
            true_labels += inputs["labels"].tolist()
            pred_labels = F.softmax(outputs.logits, dim=-1).argmax(dim=1)
            predicted_labels += pred_labels.tolist()
            assert len(predicted_labels) == len(true_labels)
            if i % 1000 == 999:    # print every 1000 mini-batches
                print("[%d, %5d]" % (epoch + 1, i + 1))
                result = pd.DataFrame(columns=["precision", "recall", "fbeta_score", "support"])
                result.loc["macro"] = list(precision_recall_fscore_support(true_labels, predicted_labels, average="macro"))
                result.loc["micro"] = list(precision_recall_fscore_support(true_labels, predicted_labels, average="micro"))
                print(result)
                true_labels = []
                predicted_labels = []
                
            del inputs
            
        validate_model(VALIDATION_SIZE)
        
    print('Finished Training')

In [None]:
train_model()

[1,  1000]
       precision    recall  fbeta_score  support
macro   0.123846  0.107681     0.113918      NaN
micro   0.829662  0.829662     0.829662      NaN


  _warn_prf(average, modifier, msg_start, len(result))


[1,  2000]
       precision    recall  fbeta_score  support
macro   0.138712  0.142857     0.140754      NaN
micro   0.970986  0.970986     0.970986      NaN


  _warn_prf(average, modifier, msg_start, len(result))


[1,  3000]
       precision    recall  fbeta_score  support
macro   0.135111  0.142857     0.138876      NaN
micro   0.945777  0.945777     0.945777      NaN


  _warn_prf(average, modifier, msg_start, len(result))


[validation 1000]
       precision    recall  fbeta_score  support
macro   0.135412  0.142857     0.139035      NaN
micro   0.947881  0.947881     0.947881      NaN


  _warn_prf(average, modifier, msg_start, len(result))


[2,  1000]
       precision    recall  fbeta_score  support
macro   0.141367  0.142857     0.142108      NaN
micro   0.989570  0.989570     0.989570      NaN


  _warn_prf(average, modifier, msg_start, len(result))


[2,  2000]
       precision    recall  fbeta_score  support
macro   0.138712  0.142857     0.140754      NaN
micro   0.970986  0.970986     0.970986      NaN


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def test_model():
    test_generator = data_generator("Test")
    true_labels = []
    predicted_labels = []
    for inputs in test_generator:
        # forward
        outputs = mre_model(**inputs)
        true_labels += inputs["labels"].tolist()
        pred_labels = F.softmax(outputs.logits, dim=-1).argmax(dim=1)
        predicted_labels += pred_labels.tolist()
        assert len(predicted_labels) == len(true_labels)
        del inputs
    
    label_map = {v: k for k, v in internal_parser.relation_encode.items()}
    classes = list(label_map.keys())
    precision, recall, fbeta_score, support = precision_recall_fscore_support(true_labels, predicted_labels, average=None, labels=classes)
    result = pd.DataFrame(index=[label_map[c] for c in classes])
    result["precision"] = precision
    result["recall"] = recall
    result["fbeta_score"] = fbeta_score
    result["support"] = support
    result.loc["macro"] = list(precision_recall_fscore_support(true_labels, predicted_labels, average="macro"))
    result.loc["micro"] = list(precision_recall_fscore_support(true_labels, predicted_labels, average="micro"))
    
    print(result)
    return result

In [None]:
result = test_model()