# KLUE-RE Baseline

## Import libraries

In [1]:
import os

from functools import partial
from typing import Tuple, List, Any, Dict

import numpy as np

import torch
import torch.nn as nn

from datasets import load_dataset

import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from transformers import TrainingArguments, Trainer

## Settings

In [2]:
# Optimisation hyper-parameters; forwarded to TrainingArguments further down.
learning_rate = 3e-05
num_train_epochs = 4
train_batch_size = 32  # used as per_device_train_batch_size
eval_batch_size = 32  # used as per_device_eval_batch_size
warmup_ratio = 0.2
# patience = 10000
# Output location and experiment-tracking settings.
output_dir = "klue_dir"
wandb_project = "klue_re"  # exported as the WANDB_PROJECT environment variable
run_name = "baseline"
report_to = "wandb"

In [3]:
# Checkpoint identifier shared by the tokenizer, config and model loading cells.
model_name_or_path = "klue/roberta-large"

In [4]:
# Marker strings wrapped around the subject/object entity spans.
# The keys match the keyword parameters of `mark_entity_spans` and
# `convert_example_to_features` (bound with functools.partial), and the values
# are registered with the tokenizer as additional special tokens.
markers = dict(
    subject_start_marker="<subj>",
    subject_end_marker="</subj>",
    object_start_marker="<obj>",
    object_end_marker="</obj>",
)

In [5]:
# Relation label names. The position in this list is the class id used by
# `id2label` / `label2id` and by the evaluation metrics.
# NOTE(review): the order is expected to mirror the dataset's ClassLabel names
# (see the `features` output further down) -- confirm if the dataset changes.
relation_class = [
    "no_relation",
    "org:dissolved",
    "org:founded",
    "org:place_of_headquarters",
    "org:alternate_names",
    "org:member_of",
    "org:members",
    "org:political/religious_affiliation",
    "org:product",
    "org:founded_by",
    "org:top_members/employees",
    "org:number_of_employees/members",
    "per:date_of_birth",
    "per:date_of_death",
    "per:place_of_birth",
    "per:place_of_death",
    "per:place_of_residence",
    "per:origin",
    "per:employee_of",
    "per:schools_attended",
    "per:alternate_names",
    "per:parents",
    "per:children",
    "per:siblings",
    "per:spouse",
    "per:other_family",
    "per:colleagues",
    "per:product",
    "per:religion",
    "per:title"
]

# Number of output classes handed to the model configuration.
num_labels = len(relation_class)

In [6]:
# Two-way lookup tables between class ids and relation names.
id2label = dict(enumerate(relation_class))
label2id = {name: class_id for class_id, name in id2label.items()}

## Load and Preprocess Dataset

In [7]:
# Load the "re" configuration of the "klue" dataset (train and validation splits).
klue_re = load_dataset("klue", "re")

Downloading:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Downloading and preparing dataset klue/re (download: 5.41 MiB, generated: 13.07 MiB, post-processed: Unknown size, total: 18.48 MiB) to /opt/ml/.cache/huggingface/datasets/klue/re/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90...


Downloading:   0%|          | 0.00/5.67M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset klue downloaded and prepared to /opt/ml/.cache/huggingface/datasets/klue/re/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
klue_re

DatasetDict({
    train: Dataset({
        features: ['guid', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
        num_rows: 32470
    })
    validation: Dataset({
        features: ['guid', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
        num_rows: 7765
    })
})

In [23]:
klue_re['train'].features

{'guid': Value(dtype='string', id=None),
 'sentence': Value(dtype='string', id=None),
 'subject_entity': {'word': Value(dtype='string', id=None),
  'start_idx': Value(dtype='int32', id=None),
  'end_idx': Value(dtype='int32', id=None),
  'type': Value(dtype='string', id=None)},
 'object_entity': {'word': Value(dtype='string', id=None),
  'start_idx': Value(dtype='int32', id=None),
  'end_idx': Value(dtype='int32', id=None),
  'type': Value(dtype='string', id=None)},
 'label': ClassLabel(num_classes=30, names=['no_relation', 'org:dissolved', 'org:founded', 'org:place_of_headquarters', 'org:alternate_names', 'org:member_of', 'org:members', 'org:political/religious_affiliation', 'org:product', 'org:founded_by', 'org:top_members/employees', 'org:number_of_employees/members', 'per:date_of_birth', 'per:date_of_death', 'per:place_of_birth', 'per:place_of_death', 'per:place_of_residence', 'per:origin', 'per:employee_of', 'per:schools_attended', 'per:alternate_names', 'per:parents', 'per:childr

In [24]:
klue_re['train'][0]

{'guid': 'klue-re-v1_train_00000',
 'sentence': '〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.',
 'subject_entity': {'word': '비틀즈',
  'start_idx': 24,
  'end_idx': 26,
  'type': 'ORG'},
 'object_entity': {'word': '조지 해리슨',
  'start_idx': 13,
  'end_idx': 18,
  'type': 'PER'},
 'label': 0,
 'source': 'wikipedia'}

In [25]:
type(klue_re['train']['subject_entity'][0])

dict

### Load tokenizer

In [26]:
# Tokenizer that belongs to the chosen checkpoint; the entity markers are
# added to it as special tokens in a later cell.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)



In [27]:
# create_example
def mark_entity_spans(examples,
                      subject_start_marker: str, subject_end_marker: str,
                      object_start_marker: str, object_end_marker: str):
    """Wrap the subject and object entities of one example in marker strings.

    Args:
        examples: A single example (mapping) holding a "sentence" string plus
            "subject_entity" and "object_entity" mappings, each with
            "start_idx" and "end_idx" character offsets (end is inclusive).
        subject_start_marker: Text inserted right before the subject span.
        subject_end_marker: Text inserted right after the subject span.
        object_start_marker: Text inserted right before the object span.
        object_end_marker: Text inserted right after the object span.

    Returns:
        A dict with a single key "text" holding the marked sentence.

    Raises:
        ValueError: If the subject and object ranges are identical.
    """

    def _mark_entity_spans(
        text: str,
        subject_range: Tuple[int, int],
        object_range: Tuple[int, int],
    ) -> str:
        """ Adds entity markers to the text to identify the subject/object entities.
        Args:
            text: Original sentence
            subject_range: Pair of start and end indices of subject entity
            object_range: Pair of start and end indices of object entity
        Returns:
            A string of text with subject/object entity markers
        """
        if subject_range == object_range:
            raise ValueError("Entity boundaries overlap.")

        # NOTE(review): only identical ranges are rejected. Partially
        # overlapping or nested spans are still processed in (start, end)
        # order and yield duplicated text -- confirm whether the data can
        # contain such pairs before tightening this check.
        subject_span = (subject_range, subject_start_marker, subject_end_marker)
        object_span = (object_range, object_start_marker, object_end_marker)
        if subject_range < object_range:
            first, second = subject_span, object_span
        else:
            first, second = object_span, subject_span

        (first_start, first_end), first_open, first_close = first
        (second_start, second_end), second_open, second_close = second

        # End indices are inclusive, hence the `+ 1` on every right boundary.
        segments = [
            text[:first_start],
            first_open,
            text[first_start : first_end + 1],
            first_close,
            text[first_end + 1 : second_start],
            second_open,
            text[second_start : second_end + 1],
            second_close,
            text[second_end + 1 :],
        ]
        return "".join(segments)

    subject_entity = examples["subject_entity"]
    object_entity = examples["object_entity"]

    text = _mark_entity_spans(
        examples["sentence"],
        (subject_entity["start_idx"], subject_entity["end_idx"]),
        (object_entity["start_idx"], object_entity["end_idx"]),
    )
    return {"text": text}

# Bind the marker strings so the function only needs the example and can be
# passed directly to `.map()`.
mark_entity_spans = partial(mark_entity_spans, **markers)

In [28]:
# Add a "text" column (sentence with entity markers) to every split.
examples = klue_re.map(mark_entity_spans)

  0%|          | 0/32470 [00:00<?, ?ex/s]

  0%|          | 0/7765 [00:00<?, ?ex/s]

In [29]:
# Register the four marker strings as special tokens. This has to happen
# before the tokenization cell, which looks each end marker up as a single
# token in the tokenizer output.
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(markers.values())}
)

4

In [30]:
def convert_example_to_features(
    examples,
    tokenizer,
    subject_start_marker: str,
    subject_end_marker: str,
    object_start_marker: str,
    object_end_marker: str
) -> Dict[str, List[Any]]:
    """Tokenize one marked example and repair word-piece prefixes after markers.

    Args:
        examples: A single example (mapping) whose "text" entry is the
            sentence with entity markers already inserted.
        tokenizer: Tokenizer exposing `_tokenizer.encode(text).tokens` and
            `convert_tokens_to_ids`.
        subject_start_marker: Unused here; accepted so the whole marker dict
            can be bound in one go.
        subject_end_marker: Marker that closes the subject span.
        object_start_marker: Unused here; accepted so the whole marker dict
            can be bound in one go.
        object_end_marker: Marker that closes the object span.

    Returns:
        A dict with "input_ids" (ids of the repaired tokens) and "tokenized"
        (the repaired token strings).
    """

    def fix_tokenization_error(text: str) -> List[str]:
        """Fix the tokenization due to the `obj` and `subj` marker inserted
        in the middle of a word.

        When an end marker is directly followed by a Hangul character (no
        space), the token after the marker is a word continuation and gets
        the "##" prefix.

        Example:
            >>> text = "<obj>조지 해리슨</obj>이 쓰고 <subj>비틀즈</subj>가"
            >>> tokens = ['<obj>', '조지', '해리', '##슨', '</obj>', '이', '쓰', '##고', '<subj>', '비틀즈', '</subj>', '가']
            >>> fix_tokenization_error(text)
            ['<obj>', '조지', '해리', '##슨', '</obj>', '##이', '쓰', '##고', '<subj>', '비틀즈', '</subj>', '##가']

        Only WordPiece-style ("##" continuation) tokenizers such as
        BertTokenizerFast are supported; adapt this code for byte-level BPE.
        """
        tokens = tokenizer._tokenizer.encode(text).tokens

        for end_marker in (subject_end_marker, object_end_marker):
            next_char_idx = text.find(end_marker) + len(end_marker)
            # Nothing to repair when the marker closes the text (previously an
            # IndexError) or when a real word boundary follows it.
            if next_char_idx >= len(text) or text[next_char_idx] == " ":
                continue

            next_token_idx = tokens.index(end_marker) + 1
            if next_token_idx >= len(tokens):
                continue

            next_token = tokens[next_token_idx]
            # tokenizer_type == "bert-wp": only Hangul syllables get the prefix.
            if (
                next_token
                and not next_token.startswith("##")
                and "가" <= next_token[0] <= "힣"
            ):
                tokens[next_token_idx] = "##" + next_token

        return tokens

    tokens = fix_tokenization_error(examples["text"])

    return {
        "input_ids": tokenizer.convert_tokens_to_ids(tokens),
        "tokenized": tokens,
    }

# Bind the tokenizer and marker strings so the function only needs the
# example and can be passed directly to `.map()`.
convert_example_to_features = partial(
    convert_example_to_features,
    tokenizer=tokenizer,
    **markers,
)

In [31]:
# Add "input_ids" and "tokenized" columns to every split.
tokenized_datasets = examples.map(convert_example_to_features)

  0%|          | 0/32470 [00:00<?, ?ex/s]

  0%|          | 0/7765 [00:00<?, ?ex/s]

In [34]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['guid', 'sentence', 'subject_entity', 'object_entity', 'label', 'source', 'text', 'input_ids', 'tokenized'],
        num_rows: 32470
    })
    validation: Dataset({
        features: ['guid', 'sentence', 'subject_entity', 'object_entity', 'label', 'source', 'text', 'input_ids', 'tokenized'],
        num_rows: 7765
    })
})

In [40]:
tokenized_datasets['train']['label'][0]

0

## Load Model

In [14]:
# Model configuration taken from the checkpoint, sized for the relation label
# set and carrying the id <-> name mappings.
config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    num_labels=num_labels,
    cache_dir="cache",
    id2label=id2label,
    label2id=label2id,
)

In [34]:
# NOTE(review): `model` is first assigned in the following cell, so on a fresh
# kernel (Restart & Run All) this line raises NameError. Move this check below
# the model-loading cell or drop it.
model.config._name_or_path

'klue/roberta-large'

In [15]:
# Sequence-classification model initialised from the checkpoint with the
# configuration built above.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    config=config,
    cache_dir="cache",
)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'class

In [16]:
# The marker tokens were added to the tokenizer after the checkpoint's
# vocabulary was fixed, so grow the embedding matrix when the tokenizer holds
# more entries than the model's vocabulary.
if model.config.vocab_size < len(tokenizer):
    print("resize...")
    model.resize_token_embeddings(len(tokenizer))

resize...


## Make compute metrics

In [17]:
from sklearn.metrics import f1_score, precision_recall_curve, auc


def make_compute_metrics(label_indices):
    """Build the metric callback for a given, ordered list of label names.

    Args:
        label_indices: Sequence of relation label names ordered by class id.
            It must contain "no_relation".

    Returns:
        A function taking `(preds, labels)` and returning a dict with
        "micro_f1" (micro-averaged F1 over every class except "no_relation")
        and "auprc" (mean over classes of the area under each one-vs-rest
        precision-recall curve).

    Raises:
        ValueError: If "no_relation" is not in `label_indices`.
    """
    n_classes = len(label_indices)
    no_relation_label_idx = label_indices.index("no_relation")
    # Derive the class ids from the argument itself; the previous version read
    # the notebook-global `relation_class` here and ignored the parameter.
    label_indices = [idx for idx in range(n_classes) if idx != no_relation_label_idx]

    def compute_metrics(eval_pred, label_indices=label_indices, n_classes=n_classes):
        """Compute micro F1 (without no_relation) and mean per-class AUPRC."""
        preds, labels = eval_pred

        # Micro F1 (except no_relation)
        predictions = np.argmax(preds, axis=1).ravel()
        micro_f1 = f1_score(labels, predictions, average="micro", labels=label_indices)

        # AUPRC (Area Under the Precision-Recall Curve), one-vs-rest per class.
        # NOTE(review): `preds` is used as given; confirm whether these are
        # logits or probabilities and which of the two the metric expects.
        onehots = np.eye(n_classes)[labels]
        scores = np.zeros((n_classes,))
        for c in range(n_classes):
            targets_c = onehots.take([c], axis=1).ravel()
            preds_c = preds.take([c], axis=1).ravel()
            precision, recall, _ = precision_recall_curve(targets_c, preds_c)
            scores[c] = auc(recall, precision)
        auprc = np.average(scores)

        return {
            "micro_f1": micro_f1,
            "auprc": auprc,
        }

    return compute_metrics

# Metric callback for the full relation label set; handed to the Trainer.
compute_metrics = make_compute_metrics(relation_class)

## Make Data Collator

In [18]:
class DataCollator:
    """Pad a list of tokenized examples into one batch and attach the labels."""

    def __init__(self, tokenizer, max_length=510):
        """Store the collaborators used at call time.

        Args:
            tokenizer: Object whose `pad` method pads the `input_ids`.
            max_length: Value forwarded unchanged to `tokenizer.pad`.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Collate examples that each carry "input_ids" and "label".

        Args:
            batch: Iterable of mappings with "input_ids" and "label" entries.

        Returns:
            Whatever `tokenizer.pad` returns, extended with a "labels" entry
            holding the labels as a LongTensor.
        """
        input_ids = [example["input_ids"] for example in batch]
        labels = [example["label"] for example in batch]
        # Use the tokenizer passed to the constructor; the previous version
        # silently depended on a notebook-global `tokenizer`.
        # NOTE(review): no padding strategy or truncation is given here --
        # confirm that `max_length` has the intended effect and that no
        # example exceeds the model's maximum input length.
        batch_encoding = self.tokenizer.pad(
            {"input_ids": input_ids},
            max_length=self.max_length,
            return_tensors="pt",
        )
        batch_encoding.update({"labels": torch.LongTensor(labels)})
        return batch_encoding

In [19]:
# Collator instance handed to the Trainer (default max_length).
data_collator = DataCollator(tokenizer)

In [20]:
# Tell Weights & Biases which project the run belongs to.
os.environ["WANDB_PROJECT"] = wandb_project

# Log in only when the run actually reports to wandb. The previous
# try/except KeyError could never trigger, because the variable is assigned
# on the line above, so `call_wandb` was always True.
# `report_to` may be a single name or a collection of names.
call_wandb = "wandb" in (
    [report_to] if isinstance(report_to, str) else (report_to or [])
)

if call_wandb:
    import wandb
    wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mjinmang2[0m (use `wandb login --relogin` to force relogin)


In [22]:
# Training configuration assembled from the values in the Settings section.
args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    weight_decay=0.01,
    warmup_ratio=warmup_ratio,
#     save_total_limit=5,
    num_train_epochs=num_train_epochs,
    fp16=True,
    report_to=report_to,
    run_name=run_name,
    # Best checkpoint is selected by the "auprc" value from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="auprc",
)

In [23]:
# Drop every column except the two the collator reads: "input_ids" and "label".
features_name = list(tokenized_datasets["train"].features)
features_name.remove("input_ids")
features_name.remove("label")
tokenized_datasets = tokenized_datasets.remove_columns(features_name)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids'],
        num_rows: 32470
    })
    validation: Dataset({
        features: ['label', 'input_ids'],
        num_rows: 7765
    })
})

In [24]:
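# Optional smoke test: uncomment this cell to train and evaluate on the first
# 1,000 examples of each split only.
# from datasets import DatasetDict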

# tokenized_datasets = DatasetDict(
#     {
#         "train": tokenized_datasets["train"].select(range(1000)),
#         "validation": tokenized_datasets["validation"].select(range(1000)),
#     }
# )
# tokenized_datasets

In [25]:
# Wire the model, arguments, pruned datasets, collator and metrics together.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [26]:
# NOTE(review): the saved output below reports "Num Epochs = 10" and 10150
# optimization steps, while the Settings cell has num_train_epochs = 4. The
# recorded log appears to come from a different configuration; re-run to
# refresh it.
trainer.train()

***** Running training *****
  Num examples = 32470
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 10150
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Epoch,Training Loss,Validation Loss,Micro F1,Auprc
1,0.8368,0.893166,0.567788,0.56335
2,0.5877,0.866404,0.605545,0.689716
3,0.4517,0.721395,0.670039,0.736213
4,0.3279,0.691364,0.697971,0.743833
5,0.2386,0.895599,0.673916,0.726725
6,0.1692,0.950437,0.674746,0.725315


  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to klue_dir/checkpoint-1015
Configuration saved in klue_dir/checkpoint-1015/config.json
Model weights saved in klue_dir/checkpoint-1015/pytorch_model.bin
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to klue_dir/checkpoint-2030
Configuration saved in klue_dir/checkpoint-2030/config.json
Model weights saved in klue_dir/checkpoint-2030/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to klue_dir/checkpoint-3045
Configuration saved in klue_dir/checkpoint-3045/config.json
Model weights saved in klue_dir/checkpoint-3045/pytorch_model.bin
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to klue_dir/checkpoint-4060
Configuration saved in klue_dir/checkpoint-406