In [None]:
## NOTE: cfg.train.entity_embedding must be set to True for the entity-embedding cells below.

In [45]:
## in load_data.py
import pickle as pickle
import os
import pandas as pd
import torch
from tqdm import tqdm

class RE_Dataset(torch.utils.data.Dataset):
    """Torch dataset for relation extraction over entity-marked sentences.

    Each row of ``dataset`` is rewritten so that the subject and object
    entities are wrapped in (or replaced by) marker tokens chosen by
    ``cfg.train.marker_mode``, then tokenized to a fixed length of 256.

    Args:
        dataset: DataFrame-like object whose rows provide ``'sentence'``,
            ``'subject_entity'`` and ``'object_entity'``; the two entity
            columns hold the string form of a Python dict with the keys
            ``'word'``, ``'start_idx'``, ``'end_idx'`` and ``'type'``.
        labels: Sequence of integer labels; pass an empty sequence when no
            labels are available, in which case items carry no ``'labels'``.
        tokenizer: Tokenizer object; the marker tokens of the selected mode
            are added to it via ``add_tokens``.
        cfg: Config object exposing ``cfg.train.marker_mode`` and
            ``cfg.train.entity_embedding``.
    """

    # Marker tokens registered with the tokenizer for each marker mode.
    # Modes not listed here (e.g. 'TEM_punct') register no extra tokens.
    _MARKER_TOKENS = {
        'EMask': ['<subj-ORG>', '<subj-PER>', '<obj-ORG>', '<obj-PER>',
                  '<obj-DAT>', '<obj-LOC>', '<obj-POH>', '<obj-NOH>'],
        'EM': ['<subj>', '</subj>', '<obj>', '</obj>'],
        'TEM': ['<s:ORG>', '<s:PER>', '<o:ORG>', '<o:PER>', '<o:DAT>',
                '<o:LOC>', '<o:POH>', '<o:NOH>', '</s:ORG>', '</s:PER>',
                '</o:ORG>', '</o:PER>', '</o:DAT>', '</o:LOC>', '</o:POH>',
                '</o:NOH>'],
    }

    # Korean type names used by the punctuation-based 'TEM_punct' markers.
    _ENG_TYPE_TO_KOR = {"PER": "사람", "ORG": "단체", "POH": "기타",
                        "LOC": "장소", "NOH": "수량", "DAT": "날짜"}

    def __init__(self, dataset, labels, tokenizer, cfg):
        # Store the config first so every attribute exists before any
        # processing starts.
        self.cfg = cfg
        self.labels = labels
        self.tokenizer = tokenizer
        self.marker_mode = cfg.train.marker_mode
        self.new_tokens = list(self._MARKER_TOKENS.get(self.marker_mode, []))
        self.tokenizer.add_tokens(self.new_tokens)

        self.dataset = self.tokenizing(dataset)

    def __getitem__(self, idx):
        """Return the tensors of example ``idx`` as a dict.

        The entity keys are included only when
        ``cfg.train.entity_embedding`` is truthy, and ``'labels'`` only when
        labels were supplied.
        """
        encoded = self.dataset[idx]
        keys = ['input_ids', 'attention_mask', 'token_type_ids']
        if self.cfg.train.entity_embedding:
            keys += ['Entity_type_embedding', 'Entity_idxes']

        # Stored encodings carry a leading dimension of size 1; drop it.
        item = {key: torch.LongTensor(encoded[key]).squeeze(0) for key in keys}
        # len() rather than truthiness: labels may be a numpy array.
        if len(self.labels) != 0:
            item['labels'] = torch.LongTensor([self.labels[idx]]).squeeze()
        return item

    def __len__(self):
        return len(self.dataset)

    def tokenizing(self, dataframe):
        """Mark the entities of every row and tokenize the marked sentence.

        Returns:
            A list with one tokenizer output per row, in row order.
        """
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='tokenizing', total=len(dataframe)):
            text = self.add_special_enti(item, marker_mode=self.marker_mode)
            outputs = self.tokenizer(text, add_special_tokens=True,
                                     truncation=True,
                                     return_tensors="pt",
                                     padding='max_length',
                                     max_length=256)
            data.append(outputs)
        return data

    @classmethod
    def _mark_entities(cls, sub, obj, marker_mode):
        """Return the marked replacement strings for subject and object."""
        if marker_mode == 'TEM_punct':
            marked_sub = f"@*{cls._ENG_TYPE_TO_KOR[sub['type']]}*{sub['word']}@"
            marked_obj = f"#^{cls._ENG_TYPE_TO_KOR[obj['type']]}^{obj['word']}#"
        elif marker_mode == 'TEM':
            marked_sub = f"<s:{sub['type']}>{sub['word']}</s:{sub['type']}>"
            marked_obj = f"<o:{obj['type']}>{obj['word']}</o:{obj['type']}>"
        elif marker_mode == "EM":
            marked_sub = f"<subj>{sub['word']}</subj>"
            marked_obj = f"<obj>{obj['word']}</obj>"
        elif marker_mode == "EMask":
            # The entity text itself is replaced by a typed mask token.
            marked_sub = f"<subj-{sub['type']}>"
            marked_obj = f"<obj-{obj['type']}>"
        else:
            raise ValueError(f"unsupported marker_mode: {marker_mode!r}")
        return marked_sub, marked_obj

    def add_special_enti(self, df, marker_mode=None):
        """Return the sentence of one row with both entities marked.

        Args:
            df: A single row (mapping) with ``'sentence'``,
                ``'subject_entity'`` and ``'object_entity'``.
            marker_mode: One of ``'TEM_punct'``, ``'TEM'``, ``'EM'``,
                ``'EMask'``.

        Returns:
            The sentence string with the entity spans replaced by their
            marked form.

        Raises:
            ValueError: If ``marker_mode`` is not supported, or an entity
                field is not a plain Python literal.
        """
        # Local import keeps this block self-contained.
        import ast

        # literal_eval only accepts Python literals, so text coming from the
        # data file can never execute code (unlike eval()).
        sub = ast.literal_eval(df['subject_entity'])
        obj = ast.literal_eval(df['object_entity'])
        # 'end_idx' is inclusive in the data; convert to an exclusive bound.
        s_s, s_e = sub['start_idx'], sub['end_idx'] + 1
        o_s, o_e = obj['start_idx'], obj['end_idx'] + 1
        marked_sub, marked_obj = self._mark_entities(sub, obj, marker_mode)

        sentence = df['sentence']
        # Splice left to right, starting with whichever entity comes first.
        if s_s < o_s:
            pieces = [sentence[:s_s], marked_sub, sentence[s_e:o_s],
                      marked_obj, sentence[o_e:]]
        else:
            pieces = [sentence[:o_s], marked_obj, sentence[o_e:s_s],
                      marked_sub, sentence[s_e:]]
        return ''.join(pieces)

    
def load_data(dataset_dir):
    """Read the CSV file at ``dataset_dir`` and return it as a DataFrame."""
    return pd.read_csv(dataset_dir)


In [46]:
## in utils.py
def _first_index(token_list, ids, start=0):
    """Return the first position >= start whose token id is in ``ids``, or -1."""
    hits = np.flatnonzero(np.isin(token_list[start:], ids))
    return start + int(hits[0]) if hits.size else -1


def _closing_index(token_list, close_ids, start):
    """Return the first closing-marker position >= start, or the sequence length."""
    pos = _first_index(token_list, close_ids, start)
    return pos if pos >= 0 else len(token_list)


def _first_punct_marker(token_list, outer_id, inner_id, type_ids):
    """Return the first position of ``outer inner <type>``, or -1 if absent."""
    for idx in range(len(token_list) - 2):
        if (token_list[idx] == outer_id
                and token_list[idx + 1] == inner_id
                and token_list[idx + 2] in type_ids):
            return idx
    return -1


def get_entity_idxes(tokenizer, token_list, cfg):
    """Locate the subject and object entity spans in one token-id sequence.

    The search strategy depends on ``cfg.train.marker_mode``:

    * ``'EM'``: tokens between ``<subj>``/``</subj>`` and ``<obj>``/``</obj>``.
    * ``'EMask'``: the single typed mask token (``<subj-XXX>`` / ``<obj-XXX>``).
    * ``'TEM'``: tokens between ``<s:XXX>``/``</s:XXX>`` and
      ``<o:XXX>``/``</o:XXX>``.
    * ``'TEM_punct'``: ``@ * type * word @`` and ``# ^ type ^ word #``; the
      word is taken to start 4 tokens after the opening punctuation, which
      holds only when the type name is a single token.

    Args:
        tokenizer: Tokenizer providing ``convert_tokens_to_ids``.
        token_list: 1-D sequence of token ids.
        cfg: Config object exposing ``cfg.train.marker_mode``.

    Returns:
        ``(entity_embedding, subj_start_idx, subj_end_idx, obj_start_idx,
        obj_end_idx)``. ``entity_embedding`` is a float array the length of
        ``token_list`` holding 1 on subject tokens, 2 on object tokens and 0
        elsewhere; the end indices are exclusive. A span whose opening
        marker is absent (or an unknown marker mode) is reported as
        ``(0, 0)``. A span whose closing marker is missing, e.g. cut off by
        truncation, runs to the end of the sequence.
    """
    # Accept plain lists too; elementwise comparison needs an ndarray.
    token_list = np.asarray(token_list)
    seq_len = len(token_list)
    entity_embedding = np.zeros(seq_len)
    # Defaults keep every return path well-defined when a marker is missing.
    subj_start_idx = subj_end_idx = obj_start_idx = obj_end_idx = 0
    marker_mode = cfg.train.marker_mode

    if marker_mode == 'EM':
        # Resolve the marker ids by name rather than assuming they are the
        # last four entries of the vocabulary.
        subj_open, subj_close, obj_open, obj_close = tokenizer.convert_tokens_to_ids(
            ['<subj>', '</subj>', '<obj>', '</obj>'])

        opener = _first_index(token_list, subj_open)
        if opener >= 0:
            subj_start_idx = opener + 1
            subj_end_idx = _closing_index(token_list, subj_close, subj_start_idx)

        opener = _first_index(token_list, obj_open)
        if opener >= 0:
            obj_start_idx = opener + 1
            obj_end_idx = _closing_index(token_list, obj_close, obj_start_idx)

    elif marker_mode == 'EMask':
        # The entity is a single mask token, so the span has length 1.
        subj_ids = tokenizer.convert_tokens_to_ids(['<subj-ORG>', '<subj-PER>'])
        obj_ids = tokenizer.convert_tokens_to_ids(
            ['<obj-ORG>', '<obj-PER>', '<obj-DAT>', '<obj-LOC>', '<obj-POH>', '<obj-NOH>'])

        pos = _first_index(token_list, subj_ids)
        if pos >= 0:
            subj_start_idx, subj_end_idx = pos, pos + 1

        pos = _first_index(token_list, obj_ids)
        if pos >= 0:
            obj_start_idx, obj_end_idx = pos, pos + 1

    elif marker_mode == 'TEM':
        subj_open_ids = tokenizer.convert_tokens_to_ids(['<s:ORG>', '<s:PER>'])
        subj_close_ids = tokenizer.convert_tokens_to_ids(['</s:ORG>', '</s:PER>'])
        obj_open_ids = tokenizer.convert_tokens_to_ids(
            ['<o:ORG>', '<o:PER>', '<o:DAT>', '<o:LOC>', '<o:POH>', '<o:NOH>'])
        obj_close_ids = tokenizer.convert_tokens_to_ids(
            ['</o:ORG>', '</o:PER>', '</o:DAT>', '</o:LOC>', '</o:POH>', '</o:NOH>'])

        # The entity word starts right after the opening marker; the closing
        # marker is searched from the token after the first word token.
        opener = _first_index(token_list, subj_open_ids)
        if opener >= 0:
            subj_start_idx = opener + 1
            subj_end_idx = _closing_index(token_list, subj_close_ids, subj_start_idx + 1)

        opener = _first_index(token_list, obj_open_ids)
        if opener >= 0:
            obj_start_idx = opener + 1
            obj_end_idx = _closing_index(token_list, obj_close_ids, obj_start_idx + 1)

    elif marker_mode == 'TEM_punct':
        subj_outer = tokenizer.convert_tokens_to_ids('@')
        subj_inner = tokenizer.convert_tokens_to_ids('*')
        obj_outer = tokenizer.convert_tokens_to_ids('#')
        obj_inner = tokenizer.convert_tokens_to_ids('^')
        names = tokenizer.convert_tokens_to_ids(['단체', '사람', '날짜', '장소', '기타', '수량'])

        # Layout is "@ * type * word @": the word starts 4 tokens after the
        # opening punctuation.
        opener = _first_punct_marker(token_list, subj_outer, subj_inner, names)
        if opener >= 0:
            subj_start_idx = min(opener + 4, seq_len)
            subj_end_idx = _closing_index(token_list, subj_outer, subj_start_idx + 1)

        opener = _first_punct_marker(token_list, obj_outer, obj_inner, names)
        if opener >= 0:
            obj_start_idx = min(opener + 4, seq_len)
            obj_end_idx = _closing_index(token_list, obj_outer, obj_start_idx + 1)

    # Subject first, then object, so the object value wins on any overlap.
    entity_embedding[subj_start_idx:subj_end_idx] = 1
    entity_embedding[obj_start_idx:obj_end_idx] = 2
    return entity_embedding, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx

In [47]:
## in utils.py
def insert_entity_idx_tokenized_dataset(tokenizer, dataset, cfg):
    """Attach entity-position tensors to every tokenized example, in place.

    For each element of ``dataset`` the entity spans of every row of
    ``data['input_ids']`` are computed with ``get_entity_idxes`` and stored
    under two new keys:

    * ``'Entity_type_embedding'``: int64 tensor with one per-token entity
      marker row per input row.
    * ``'Entity_idxes'``: int64 tensor with one
      ``[subj_start, subj_end, obj_start, obj_end]`` row per input row.

    Args:
        tokenizer: Tokenizer forwarded to ``get_entity_idxes``.
        dataset: Iterable of mapping-like tokenizer outputs whose
            ``'input_ids'`` is a tensor; each element is mutated.
        cfg: Config object forwarded to ``get_entity_idxes``.

    Returns:
        None. ``dataset`` is modified in place.
    """
    # Local import keeps this block self-contained.
    import numpy as np

    for data in dataset:
        entity_embeddings = []
        entity_idxes = []
        for ids in data['input_ids'].numpy():
            entity_embedding, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx = get_entity_idxes(tokenizer, ids, cfg)
            entity_embeddings.append(entity_embedding)
            entity_idxes.append([subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx])
        # Stack into one ndarray first: torch.tensor() on a Python list of
        # ndarrays converts element by element and emits a slow-path warning.
        data['Entity_type_embedding'] = torch.tensor(np.array(entity_embeddings)).to(torch.int64)
        data['Entity_idxes'] = torch.tensor(entity_idxes).to(torch.int64)

In [48]:
## in train_OmegaConf.py

import pickle as pickle
import os
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from omegaconf import OmegaConf
import wandb
import argparse
import random
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer,EarlyStoppingCallback
from utils import label_to_num

# Training setup: load the config, build tokenizer and model, split the data,
# wrap it in RE_Dataset and optionally attach entity-position tensors.
cfg = OmegaConf.load('/opt/ml/baseline/code/config/config.yaml')

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
## Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(cfg.model.model_name)
model_config = AutoConfig.from_pretrained(cfg.model.model_name)
model_config.num_labels = 30

model = AutoModelForSequenceClassification.from_pretrained(cfg.model.model_name, config=model_config)
# NOTE(review): bare attribute access without a call; this statement has no effect.
model.parameters
model.to(device)

## load dataset 
train_dataset = load_data(cfg.data.train_data)
train_label = label_to_num(train_dataset['label'].values)

# Train/dev split (80/20); stratify on the label to keep the class balance in both splits.
train_data, dev_data, train_label, dev_label = train_test_split(train_dataset, train_label, test_size=0.2, random_state=cfg.train.seed, stratify=train_label)
train_data.reset_index(drop=True, inplace = True)
dev_data.reset_index(drop=True, inplace = True)

## make dataset for pytorch
RE_train_dataset = RE_Dataset(train_data, train_label, tokenizer, cfg)
RE_dev_dataset = RE_Dataset(dev_data, dev_label, tokenizer, cfg)
# RE_Dataset adds marker tokens to the tokenizer, so the embedding matrix is
# resized to the tokenizer's new length.
model.resize_token_embeddings(len(RE_train_dataset.tokenizer))

# Attach 'Entity_type_embedding' / 'Entity_idxes' to every tokenized example
# only when the entity-embedding option is enabled.
if cfg.train.entity_embedding:
    print('='*10, "Start", '='*10)
    insert_entity_idx_tokenized_dataset(tokenizer, RE_train_dataset.dataset, cfg)
    insert_entity_idx_tokenized_dataset(tokenizer, RE_dev_dataset.dataset, cfg)
    print('='*10, "END", '='*10)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi



In [49]:
# Sanity check: show the dataset object, the first item as returned by
# __getitem__, the raw stored encoding, and its entity-type embedding.
print(RE_train_dataset)
print(RE_train_dataset[0])
print(RE_train_dataset.dataset[0])
print(RE_train_dataset.dataset[0]['Entity_type_embedding'])

<__main__.RE_Dataset object at 0x7fcc0a817c10>
{'input_ids': tensor([    0,    36,    14,  3611,    14, 27048,  2132,    36,  4546,  2063,
            7,    65,  3971,    65,  9555,    39,    10,    39,     7,   793,
            6,    11, 10150,    11,  3621,  5116,  2125,  6100,  2377,  2391,
         1283, 27135, 19227,   545,  1560,  2073,  3654,  2138,  3796,  2168,
         2062,    16,  3744,  4084,  2170,  3618, 12483,  2069,  3663,  2318,
          858,  2062,     6,  1072,     6,  3919,  2073,  4084,  2138,  4860,
         2116,  2259,  4137,  2179,  3847,  5886, 21154,  3884,  2052,  1039,
         2062,     6,   594,  1432,  2348,  3669,  2069, 11067,  2062,    18,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
   

In [50]:
# Sanity check: [subj_start, subj_end, obj_start, obj_end] of the first example.
print(RE_train_dataset.dataset[0]['Entity_idxes'])

tensor([[ 5,  7, 14, 18]])
