In [4]:
import pandas as pd
import matplotlib
import numpy as np
import pickle as pickle

import os
import pandas as pd
import torch
import sklearn
import numpy as np
from ast import literal_eval
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer
from load_data import *
from torch.utils.data import DataLoader
import wandb
import re
import random
import hanja
from collections import defaultdict

* sampler작성(2차원 batch indices가 return값인 함수)
	* bucketing
	* 분포 반영
* collate_fn 작성
	* 여기서 tokenizing을 수행해야함. (padding 때문에)
* custom trainer
	* get_train_dataloader
	* get_eval_dataloader
	* get_test_dataloader   override

In [2]:
# Load the raw training CSV. The two entity columns are run through `literal_eval`,
# so each cell holds a parsed Python object instead of its text form.
train = pd.read_csv('../dataset/train/train.csv', 
                    converters={'subject_entity':literal_eval, 'object_entity':literal_eval})
# Not the last expression of the cell, so this preview is not displayed.
train.head()
# Work on a copy so `train` keeps the frame exactly as it was loaded.
dataset = train.copy()

In [3]:
# Expand every parsed entity value into its own columns, prefixed with `sub_` / `obj_`.
# The resulting names presumably include sub_word, sub_start_idx, sub_end_idx, sub_type
# (and the obj_ equivalents), since tokenized_dataset reads those -- TODO confirm against the CSV.
sub_df = dataset['subject_entity'].apply(pd.Series).add_prefix('sub_')
obj_df = dataset['object_entity'].apply(pd.Series).add_prefix('obj_')
# NOTE: `dataset` is rebuilt from itself, so re-running this cell appends the
# sub_/obj_ columns a second time.
dataset = pd.concat([dataset, sub_df], axis=1)
dataset = pd.concat([dataset, obj_df], axis=1)

# sentence = dataset['sentence'].values
# subject_entity = dataset['sub_word'].values
# object_entity = dataset['obj_word'].values

# pattern_list = [re.compile(r'(\([가-힣\w\s]+\))\1'), re.compile(r'[一-龥]'), re.compile(r'\([\d]{1,2}\)')]
# replace_list = [oneParenthesis, hanjaToHangeul, '']
# target_col_list = [[sentence], [sentence, subject_entity, object_entity], [sentence]]

# for pat, repl, target_col in zip(pattern_list, replace_list, target_col_list):
#     for tgt in target_col:
#         for i in range(len(dataset)):
#             if pat.search(tgt[i]):
#                 tgt[i] = pat.sub(repl, tgt[i])

# dataset['sentence'] = sentence
# dataset['sub_word'] = subject_entity
# dataset['obj_word'] = object_entity

In [58]:
def tokenized_dataset(dataset, tokenizer):
    """Tokenize every row on its own after wrapping both entities in typed markers.

    The subject span is replaced by ``#^{type}^{word}#`` and the object span by
    ``@+{type}+{word}@``. Each marked sentence is sent to `tokenizer` separately,
    without padding or truncation.

    Args:
        dataset: DataFrame providing ``sentence`` plus, for both ``sub_`` and ``obj_``,
            the columns ``start_idx``, ``end_idx`` (inclusive), ``type`` and ``word``.
        tokenizer: callable invoked once per marked sentence.

    Returns:
        A list holding one tokenizer result per row, in row order.
    """
    def _mark_entities(row):
        chars = list(row.sentence)
        spans = [
            (row.sub_start_idx, row.sub_end_idx, f'#^{row.sub_type}^{row.sub_word}#'),
            (row.obj_start_idx, row.obj_end_idx, f'@+{row.obj_type}+{row.obj_word}@'),
        ]
        # Replace the right-most span first so the offsets of the other span,
        # which refer to the untouched sentence, are still valid.
        if not row.sub_start_idx > row.obj_start_idx:
            spans.reverse()
        for first, last, marker in spans:
            chars[first:last + 1] = [marker]
        return ''.join(chars)

    encodings = []
    for row in dataset.itertuples():
        # max_length is passed along with truncation disabled; presumably it has no
        # effect in that combination -- TODO confirm against the tokenizer docs.
        encoded = tokenizer(
            _mark_entities(row),
            return_tensors="pt",
            padding=False,
            truncation=False,
            max_length=256,
            add_special_tokens=True)
        encodings.append(encoded)

    return encodings

def split_data(dataset):
    """Split `dataset` into stratified train/dev frames (90% / 10%).

    A single `StratifiedShuffleSplit` (test_size=0.1, random_state=42) is drawn,
    using the ``label`` column as the stratification target.

    Args:
        dataset: DataFrame with a ``label`` column.

    Returns:
        Tuple ``(train_dataset, dev_dataset)`` of row subsets of `dataset`;
        the rows keep their original index labels.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, dev) pair.
    train_index, dev_index = next(split.split(dataset, dataset["label"]))
    # split() yields positional row numbers, so they are resolved with .iloc.
    # .loc would treat them as index labels and select the wrong rows (or raise)
    # whenever the frame does not carry a default 0..n-1 index.
    train_dataset = dataset.iloc[train_index]
    dev_dataset = dataset.iloc[dev_index]

    return train_dataset, dev_dataset

In [68]:
def make_sampler(data, batch_size=64, max_pad_len=20):
    """Build length-bucketed batches of sample indices.

    Samples are grouped by ``token_count // max_pad_len``, so two samples in the
    same bucket differ in length by less than `max_pad_len`. Each bucket is cut
    into consecutive chunks of at most `batch_size` indices, and the list of
    chunks is shuffled in place with the global `random` generator.

    Args:
        data: sequence of encodings; ``sample['input_ids'].shape[1]`` is read as
            the token count of each sample.
        batch_size: maximum number of indices per batch.
        max_pad_len: width of a length bucket.

    Returns:
        A list of batches, each batch being a list of indices into `data`.
    """
    buckets = defaultdict(list)
    for position, sample in enumerate(data):
        token_count = sample['input_ids'].shape[1]
        buckets[token_count // max_pad_len].append(position)

    batches = []
    for members in buckets.values():
        for offset in range(0, len(members), batch_size):
            batches.append(members[offset:offset + batch_size])

    # Only the order of the batches is randomized; their contents are fixed above.
    random.shuffle(batches)
    return batches

In [105]:
def collate_fn(batch_samples):
    """Group the fields of a batch by key (early draft).

    Every value is wrapped as ``torch.tensor([value])`` and appended to the list
    stored under its key; nothing is padded or stacked here.

    NOTE: a later cell of this notebook defines `collate_fn` again, and that later
    definition is the one that pads and stacks.

    Args:
        batch_samples: iterable of mappings from field name to value.

    Returns:
        defaultdict mapping each field name to a list of one-element-wrapped tensors.
    """
    grouped = defaultdict(list)
    for sample in batch_samples:
        for field, value in sample.items():
            wrapped = torch.tensor([value])
            grouped[field].append(wrapped)

    return grouped
    

In [61]:
def label_to_num(label):
    """Translate relation label strings into their integer ids.

    The mapping is unpickled from ``dict_label_to_num.pkl`` in the current working
    directory on every call. NOTE: unpickling runs code embedded in the file, so
    the file must come from a trusted source.

    Args:
        label: iterable of label strings.

    Returns:
        List of the mapped ids, in input order.

    Raises:
        KeyError: if a label is missing from the mapping.
    """
    with open('dict_label_to_num.pkl', 'rb') as mapping_file:
        label_ids = pickle.load(mapping_file)

    return [label_ids[name] for name in label]
# Integer ids for every row of the full (unsplit) frame; a later cell reassigns
# `train_label` from the train split only.
train_label = label_to_num(dataset['label'].values)


In [205]:
class RE_Dataset(torch.utils.data.Dataset):
    """Dataset pairing per-sample encodings with their integer labels.

    Args:
        pair_dataset: sequence with one mapping of encoded fields per sample
            (e.g. the list returned by the per-row `tokenized_dataset`).
        labels: sequence of integer labels, aligned with `pair_dataset`.
    """
    def __init__(self, pair_dataset, labels):
        self.pair_dataset = pair_dataset
        self.labels = labels

    def __getitem__(self, idx):
        """Return the fields of sample `idx` plus a ``labels`` tensor, as a new dict."""
        # Shallow-copy the stored encoding so that adding `labels` does not write
        # into `pair_dataset` itself; the field tensors are shared, not cloned.
        item = {key: val for key, val in self.pair_dataset[idx].items()}
        item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

In [206]:
# Checkpoint name used for the tokenizer here and for the config/model cells below.
MODEL_NAME = 'monologg/kobigbird-bert-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Use the first CUDA device when one is available, otherwise the CPU.
# `collate_fn` reads this module-level `device` when it moves batches.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [347]:
# Stratified 90/10 train/dev split of the expanded frame (see split_data).
train_dataset, dev_dataset = split_data(dataset)

In [348]:
# Integer label ids for each split.
train_label = label_to_num(train_dataset['label'].values)
dev_label = label_to_num(dev_dataset['label'].values)

# tokenizing dataset
# (per-row tokenization: each entry is one unpadded encoding, padded later in collate_fn)
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

In [349]:
# Precompute the batch index lists used by BucketTrainer. They are built once here,
# so the same batches (in the same order) are reused for every pass over the data.
# The dev split uses a much wider length bucket (100 vs 20 tokens).
train_sampler = make_sampler(tokenized_train, batch_size=64, max_pad_len=20)
valid_sampler = make_sampler(tokenized_dev, batch_size=64, max_pad_len=100)

In [350]:
def collate_fn(batch_samples):
    """Pad the samples of one batch to a common length and stack them.

    Every field except ``labels`` is right-padded with zeros up to the longest
    ``input_ids`` in the batch and cast to ``torch.long``. The three encoder
    fields are then stacked to ``(batch, length)`` and, like the stacked labels,
    moved to the module-level `device`.

    Args:
        batch_samples: list of mappings holding ``input_ids``, ``token_type_ids``
            and ``attention_mask`` tensors of shape ``(1, length)`` plus a
            ``labels`` tensor.

    Returns:
        defaultdict with the stacked ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``labels`` tensors.
    """
    longest = max([sample['input_ids'].shape[1] for sample in batch_samples])
    batch = defaultdict(list)
    for sample in batch_samples:
        missing = longest - sample['input_ids'].shape[1]
        for key, val in sample.items():
            if key == 'labels':
                batch[key].append(val)
                continue
            # Zero is used as the fill value for all three encoder fields.
            filler = torch.zeros(1, missing)
            padded = torch.cat((val, filler), dim=1).type(torch.long)
            batch[key].append(padded)

    # Stacking (1, length) rows gives (batch, 1, length); drop the middle axis.
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
        batch[key] = torch.stack(batch[key]).squeeze(1).to(device)
    batch['labels'] = torch.stack(batch['labels']).to(device)
    return batch
    

In [358]:
class BucketTrainer(Trainer):
    """Trainer whose data loaders use the precomputed length-bucketed batches.

    Both loaders bypass the Trainer's own sampling and batching: the batch
    composition comes from the module-level `train_sampler` / `valid_sampler`
    index lists, and padding is done by the module-level `collate_fn`.
    """

    def get_train_dataloader(self) -> DataLoader:
        """Return a DataLoader over `self.train_dataset` batched by `train_sampler`."""
        return DataLoader(self.train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn)

    def get_eval_dataloader(self, eval_dataset=None) -> DataLoader:
        """Return a DataLoader batched by `valid_sampler`.

        Args:
            eval_dataset: dataset to evaluate; falls back to `self.eval_dataset`
                when omitted or None, matching the base-class signature.
        """
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        # NOTE: `valid_sampler` holds indices computed for the dev split, so a
        # different `eval_dataset` must line up with those indices.
        return DataLoader(eval_dataset, batch_sampler=valid_sampler, collate_fn=collate_fn)

In [359]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device) 
# setting model hyperparameter
# Load the checkpoint's config and size the classification head for 30 labels
# (klue_re_micro_f1 lists 30 relation names).
model_config =  AutoConfig.from_pretrained(MODEL_NAME)
model_config.num_labels = 30

# Instantiate the sequence-classification model from the checkpoint and move it to `device`.
model =  AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
model = model.to(device) 

cuda:0


loading configuration file https://huggingface.co/monologg/kobigbird-bert-base/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/3ff1f36a44e63a0ac32fcc55ff4c268a360e07ee22869bbc20ded21da8fdd596.4449f16b91f50859dc03ca5c81261c9952b3176fd389a7e99d067b33c0a8f3a1
Model config BigBirdConfig {
  "architectures": [
    "BigBirdForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 3,
  "tok

In [360]:

def klue_re_micro_f1(preds, labels):
    """Micro-averaged F1 over every relation except ``no_relation``, as a percentage.

    Args:
        preds: predicted class indices.
        labels: gold class indices.

    Returns:
        The micro F1 score multiplied by 100.
    """
    label_list = ['no_relation', 'org:top_members/employees', 'org:members',
       'org:product', 'per:title', 'org:alternate_names',
       'per:employee_of', 'org:place_of_headquarters', 'per:product',
       'org:number_of_employees/members', 'per:children',
       'per:place_of_residence', 'per:alternate_names',
       'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
       'per:spouse', 'org:founded', 'org:political/religious_affiliation',
       'org:member_of', 'per:parents', 'org:dissolved',
       'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
       'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
       'per:religion']
    # Score every class index except the one that stands for "no_relation".
    excluded = label_list.index("no_relation")
    scored = [position for position in range(len(label_list)) if position != excluded]
    micro_f1 = sklearn.metrics.f1_score(labels, preds, average="micro", labels=scored)
    return micro_f1 * 100.0

def klue_re_auprc(probs, labels):
    """Mean per-class area under the precision-recall curve, as a percentage.

    All 30 classes are scored one-vs-rest, ``no_relation`` included.

    Args:
        probs: array of per-class scores with one row per sample and at least 30 columns.
        labels: integer class indices, one per sample.

    Returns:
        The average of the 30 per-class AUPRC values multiplied by 100.
    """
    num_classes = 30
    one_hot = np.eye(num_classes)[labels]

    per_class = np.zeros((num_classes,))
    for cls in range(num_classes):
        # Column `cls` of the one-hot targets and of the scores, flattened to 1-D.
        truth = one_hot.take([cls], axis=1).ravel()
        scores = probs.take([cls], axis=1).ravel()
        precision, recall, _ = sklearn.metrics.precision_recall_curve(truth, scores)
        per_class[cls] = sklearn.metrics.auc(recall, precision)
    return np.average(per_class) * 100.0

def compute_metrics(pred):
    """Metrics callback used for validation.

    `pred.predictions` is used twice: its argmax over the last axis gives the
    predicted classes, and the unchanged array serves as the per-class scores
    for AUPRC.

    Args:
        pred: object exposing ``label_ids`` and ``predictions``.

    Returns:
        Dict with ``'micro f1 score'``, ``'auprc'`` and ``'accuracy'``.
    """
    gold = pred.label_ids
    scores = pred.predictions
    predicted = scores.argmax(-1)

    return {
        'micro f1 score': klue_re_micro_f1(predicted, gold),
        'auprc' : klue_re_auprc(scores, gold),
        # Accuracy is not part of the leaderboard evaluation.
        'accuracy': accuracy_score(gold, predicted),
    }

In [361]:
# NOTE: BucketTrainer builds its DataLoaders from the precomputed batch samplers, so the
# per_device_*_batch_size values below do not determine the real batch size; the training
# log merely echoes them ("Instantaneous batch size per device = 100000").
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    save_total_limit=5,              # number of total save model.
    save_steps=500,                 # model saving step (the log shows checkpoints once per epoch, per save_strategy below)
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-5,               # learning_rate
    per_device_train_batch_size=100000,  # placeholder; real batches come from train_sampler
    per_device_eval_batch_size=100000,   # placeholder; real batches come from valid_sampler
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,              # log saving step.
    evaluation_strategy='epoch',     # evaluation strategy to adopt during training
                                     # `no`: No evaluation during training.
                                     # `steps`: Evaluate every `eval_steps`.
                                     # `epoch`: Evaluate every end of epoch.
    save_strategy='epoch',           # checkpoint at the end of every epoch
    load_best_model_at_end = True,   # reload the best checkpoint once training finishes
    #report_to='wandb'
)

trainer = BucketTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=RE_train_dataset,  # training dataset
    eval_dataset=RE_dev_dataset,     # dev dataset evaluated at the end of every epoch
    compute_metrics = compute_metrics      # micro F1 / AUPRC / accuracy callback
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [362]:
# Run the bucketed training loop (4 epochs, evaluation and checkpoint after each epoch).
trainer.train()

***** Running training *****
  Num examples = 29223
  Num Epochs = 4
  Instantaneous batch size per device = 100000
  Total train batch size (w. parallel, distributed & accumulation) = 100000
  Gradient Accumulation steps = 1
  Total optimization steps = 1852
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Attention type 'block_sparse' is not possible if sequence_length: 114 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,Micro f1 score,Auprc,Accuracy
1,1.1954,0.879967,75.50077,47.59135,0.744071
2,0.603,0.614102,81.333333,68.46247,0.795196
3,0.4246,0.59462,83.87508,71.475606,0.817986
4,0.2829,0.57622,83.872331,71.686052,0.817062


***** Running Evaluation *****
  Num examples = 3247
  Batch size = None
Saving model checkpoint to ./results/checkpoint-463
Configuration saved in ./results/checkpoint-463/config.json
Model weights saved in ./results/checkpoint-463/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3247
  Batch size = None
Saving model checkpoint to ./results/checkpoint-926
Configuration saved in ./results/checkpoint-926/config.json
Model weights saved in ./results/checkpoint-926/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3247
  Batch size = None
Saving model checkpoint to ./results/checkpoint-1389
Configuration saved in ./results/checkpoint-1389/config.json
Model weights saved in ./results/checkpoint-1389/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3247
  Batch size = None
Saving model checkpoint to ./results/checkpoint-1852
Configuration saved in ./results/checkpoint-1852/config.json
Model weights saved in ./results/checkpoint-1852/pytorch_

TrainOutput(global_step=1852, training_loss=0.8488048533898965, metrics={'train_runtime': 648.5142, 'train_samples_per_second': 180.246, 'train_steps_per_second': 2.856, 'total_flos': 4493775547790208.0, 'train_loss': 0.8488048533898965, 'epoch': 4.0})

In [363]:

class RE_Dataset(torch.utils.data.Dataset):
    """Dataset over one batched encoding plus a parallel list of labels.

    Args:
        pair_dataset: mapping from field name to a tensor indexed by sample
            along its first axis.
        labels: sequence of integer labels, one per sample.
    """
    def __init__(self, pair_dataset, labels):
        self.pair_dataset = pair_dataset
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return detached copies of every field of sample `idx` plus its label tensor."""
        item = {}
        for key, val in self.pair_dataset.items():
            # clone() so the caller never aliases the stored batch tensor.
            item[key] = val[idx].clone().detach()
        item['labels'] = torch.tensor(self.labels[idx])
        return item
  

def tokenized_dataset(dataset, tokenizer):
    """Tokenize all rows in one call after wrapping both entities in typed markers.

    The subject span is replaced by ``#^{type}^{word}#`` and the object span by
    ``@+{type}+{word}@``. The marked sentences are passed to `tokenizer` as a
    single list, with padding and truncation to 256 tokens requested.

    Args:
        dataset: DataFrame providing ``sentence`` plus, for both ``sub_`` and ``obj_``,
            the columns ``start_idx``, ``end_idx`` (inclusive), ``type`` and ``word``.
        tokenizer: callable invoked once with the list of marked sentences.

    Returns:
        Whatever `tokenizer` returns for the whole list.
    """
    def _mark_entities(row):
        chars = list(row.sentence)
        spans = [
            (row.sub_start_idx, row.sub_end_idx, f'#^{row.sub_type}^{row.sub_word}#'),
            (row.obj_start_idx, row.obj_end_idx, f'@+{row.obj_type}+{row.obj_word}@'),
        ]
        # Replace the right-most span first so the offsets of the other span,
        # which refer to the untouched sentence, are still valid.
        if not row.sub_start_idx > row.obj_start_idx:
            spans.reverse()
        for first, last, marker in spans:
            chars[first:last + 1] = [marker]
        return ''.join(chars)

    marked_sentences = [_mark_entities(row) for row in dataset.itertuples()]

    return tokenizer(
        marked_sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
        add_special_tokens=True,
        )

def split_data(dataset):
    """Split `dataset` into stratified train/dev frames (90% / 10%).

    A single `StratifiedShuffleSplit` (test_size=0.1, random_state=42) is drawn,
    using the ``label`` column as the stratification target.

    Args:
        dataset: DataFrame with a ``label`` column.

    Returns:
        Tuple ``(train_dataset, dev_dataset)`` of row subsets of `dataset`;
        the rows keep their original index labels.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, dev) pair.
    train_index, dev_index = next(split.split(dataset, dataset["label"]))
    # split() yields positional row numbers, so they are resolved with .iloc.
    # .loc would treat them as index labels and select the wrong rows (or raise)
    # whenever the frame does not carry a default 0..n-1 index.
    train_dataset = dataset.iloc[train_index]
    dev_dataset = dataset.iloc[dev_index]

    return train_dataset,dev_dataset
        
# Checkpoint name shared by the tokenizer here and the config/model loaded further down.
MODEL_NAME = 'monologg/kobigbird-bert-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# load dataset
# NOTE(review): load_data is not defined in this notebook; presumably it comes from
# `from load_data import *` and returns the columns tokenized_dataset reads -- confirm.
dataset = load_data("../dataset/train/train.csv")

# Stratified 90/10 train/dev split.
train_dataset, dev_dataset= split_data(dataset)

#train_label = label_to_num(dataset['label'].values)
train_label = label_to_num(train_dataset['label'].values)
dev_label = label_to_num(dev_dataset['label'].values)

# tokenizing dataset
# (whole split tokenized in one call, padded and truncated to 256 tokens)
#tokenized_train = tokenized_dataset(dataset, tokenizer)
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# setting model hyperparameter
# Load the checkpoint's config and size the classification head for 30 labels.
model_config =  AutoConfig.from_pretrained(MODEL_NAME)
model_config.num_labels = 30

model =  AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
print(model.config)
# Bare attribute access: nothing is called and, mid-cell, nothing is displayed.
model.parameters
# The return value is discarded here; presumably the module is moved in place -- TODO confirm.
model.to(device)

# Many more options exist beyond the ones used here.
# See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
  output_dir='./results',          # output directory
  save_total_limit=5,              # number of total save model.
  save_steps=500,                 # model saving step (the log shows checkpoints once per epoch, per save_strategy below)
  num_train_epochs=4,              # total number of training epochs
  learning_rate=5e-5,               # learning_rate
  per_device_train_batch_size=64,  # batch size per device during training
  per_device_eval_batch_size=64,   # batch size for evaluation
  warmup_steps=500,                # number of warmup steps for learning rate scheduler
  weight_decay=0.01,               # strength of weight decay
  logging_dir='./logs',            # directory for storing logs
  logging_steps=100,              # log saving step.
  evaluation_strategy='epoch',     # evaluation strategy to adopt during training
                                   # `no`: No evaluation during training.
                                   # `steps`: Evaluate every `eval_steps`.
                                   # `epoch`: Evaluate every end of epoch.
  save_strategy='epoch',           # checkpoint at the end of every epoch
  eval_steps = 100,            # evaluation step; presumably only relevant with the `steps` strategy -- confirm
  load_best_model_at_end = True    # reload the best checkpoint once training finishes
)
# Baseline run with the stock Trainer (default batching, inputs pre-padded by the tokenizer).
trainer = Trainer(
  model=model,                         # the instantiated 🤗 Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=RE_train_dataset,         # training dataset
  eval_dataset=RE_dev_dataset,            # dev dataset evaluated at the end of every epoch
  compute_metrics = compute_metrics       # micro F1 / AUPRC / accuracy callback
)

# train model
trainer.train()

loading file https://huggingface.co/monologg/kobigbird-bert-base/resolve/main/vocab.txt from cache at /opt/ml/.cache/huggingface/transformers/00ac7c2886f9d4555133877badce522b93b38439d90b0135d9b414cc1fafd167.34d17d2d06e0d29acc69761e3ddeced0dfdcf4cefa0aa81a1bb267a7dfdd5bcb
loading file https://huggingface.co/monologg/kobigbird-bert-base/resolve/main/tokenizer.json from cache at /opt/ml/.cache/huggingface/transformers/e2eb4ad30139b806997f999b45c0a0d9ea38b14e0d97f42db852be137e061b1e.616843352d77fff459e989408eaacf1280dc39dcd346ff746aa3b3fbe6a123d9
loading file https://huggingface.co/monologg/kobigbird-bert-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/monologg/kobigbird-bert-base/resolve/main/special_tokens_map.json from cache at /opt/ml/.cache/huggingface/transformers/9bea998b48658e35dd618115a266f6c173183a9a4261fc6e40730d74c4b67899.e3640e465e51ce85d94923a0b396029ecc2e3e4c7764031eee57ab272637652d
loading file https://huggingface.co/monologg/kobig

cuda:0


loading configuration file https://huggingface.co/monologg/kobigbird-bert-base/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/3ff1f36a44e63a0ac32fcc55ff4c268a360e07ee22869bbc20ded21da8fdd596.4449f16b91f50859dc03ca5c81261c9952b3176fd389a7e99d067b33c0a8f3a1
Model config BigBirdConfig {
  "architectures": [
    "BigBirdForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 3,
  "tok

BigBirdConfig {
  "_name_or_path": "monologg/kobigbird-bert-base",
  "architectures": [
    "BigBirdForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": 

Epoch,Training Loss,Validation Loss,Micro f1 score,Auprc,Accuracy
1,1.2354,0.92571,74.91453,45.304595,0.737912
2,0.6508,0.637842,80.605286,63.845391,0.78688
3,0.4504,0.592093,82.523659,71.156587,0.804127
4,0.3127,0.591133,83.121289,70.931718,0.810902


***** Running Evaluation *****
  Num examples = 3247
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-457
Configuration saved in ./results/checkpoint-457/config.json
Model weights saved in ./results/checkpoint-457/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-15] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 3247
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-914
Configuration saved in ./results/checkpoint-914/config.json
Model weights saved in ./results/checkpoint-914/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-463] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 3247
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1371
Configuration saved in ./results/checkpoint-1371/config.json
Model weights saved in ./results/checkpoint-1371/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-926] due to args.save_total_limit
***** Runni

TrainOutput(global_step=1828, training_loss=0.8736059936965842, metrics={'train_runtime': 1837.1936, 'train_samples_per_second': 63.625, 'train_steps_per_second': 0.995, 'total_flos': 1.482220640753496e+16, 'train_loss': 0.8736059936965842, 'epoch': 4.0})

### output확인을 위한 PY to Ipynb

In [13]:
# Reload the tokenizer for the output-inspection cells below.
MODEL_NAME = 'monologg/kobigbird-bert-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [17]:
def label_to_num(label):
    """Translate relation label strings into their integer ids.

    The mapping is unpickled from ``dict_label_to_num.pkl`` in the current working
    directory on every call. NOTE: unpickling runs code embedded in the file, so
    the file must come from a trusted source.

    Args:
        label: iterable of label strings.

    Returns:
        List of the mapped ids, in input order.

    Raises:
        KeyError: if a label is missing from the mapping.
    """
    with open('dict_label_to_num.pkl', 'rb') as mapping_file:
        label_ids = pickle.load(mapping_file)

    return [label_ids[name] for name in label]

In [18]:
# NOTE(review): load_data is not defined in this notebook; presumably it comes from
# `from load_data import *` -- confirm.
dataset = load_data("../dataset/train/train.csv")
train_dataset, dev_dataset= split_data(dataset)
# Not the last expression of the cell, so this preview is not displayed.
train_dataset.head()
train_label = label_to_num(train_dataset['label'].values)
dev_label = label_to_num(dev_dataset['label'].values)

In [77]:
# Marker-format experiment: here BOTH entities are wrapped as `@+{type}+{word}@`,
# whereas tokenized_dataset wraps the subject as `#^{type}^{word}#`.
concat_entity = []
for row in dataset.itertuples():
    temp = [i for i in row.sentence]
    # Replace the right-most span first so the offsets of the other span,
    # which refer to the untouched sentence, are still valid.
    if row.sub_start_idx > row.obj_start_idx:
        temp[row.sub_start_idx:row.sub_end_idx+1] = [f'@+{row.sub_type}+{row.sub_word}@']
        temp[row.obj_start_idx:row.obj_end_idx+1] = [f'@+{row.obj_type}+{row.obj_word}@']
    else:
        temp[row.obj_start_idx:row.obj_end_idx+1] = [f'@+{row.obj_type}+{row.obj_word}@']
        temp[row.sub_start_idx:row.sub_end_idx+1] = [f'@+{row.sub_type}+{row.sub_word}@']
    concat_entity.append(''.join(temp))

In [78]:
# Preview the first marked sentence to check the marker format.
concat_entity[0]

'〈Something〉는 @+PER+조지 해리슨@이 쓰고 @+ORG+비틀즈@가 1969년 앨범 《Abbey Road》에 담은 노래다.'

In [20]:
def tokenized_dataset(dataset, tokenizer):
    """Tokenize each row as the text pair ("<subject>[SEP]<object>", sentence).

    Args:
        dataset: DataFrame with string columns ``subject_entity``,
            ``object_entity`` and ``sentence``.
        tokenizer: callable invoked once with the list of entity-pair strings and
            the list of sentences; padding and truncation to 256 tokens are requested.

    Returns:
        Whatever `tokenizer` returns for the whole batch.
    """
    # Only the two entity strings feed the first segment. The entity-type columns are
    # deliberately not read: they are not part of the encoded text, so the function
    # should not fail on frames that lack them.
    concat_entity = [
        sub + '[SEP]' + obj
        for sub, obj in zip(dataset['subject_entity'], dataset['object_entity'])
    ]

    tokenized_sentences = tokenizer(
        concat_entity,
        list(dataset['sentence']),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
        add_special_tokens=True,
        )
    return tokenized_sentences

# Encode both splits with the entity-pair + sentence format defined above.
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

In [21]:
# Display the batched encoding of the train split (input_ids, token_type_ids, attention_mask).
tokenized_train

{'input_ids': tensor([[    2,  3420, 16597,  ...,     0,     0,     0],
        [    2, 19467,  5521,  ...,     0,     0,     0],
        [    2,   560, 23967,  ...,     0,     0,     0],
        ...,
        [    2,  2623,  4593,  ...,     0,     0,     0],
        [    2, 16941,  4893,  ...,     0,     0,     0],
        [    2, 21595,     3,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}