# 2. 시퀀스에 Entity 정보 추가

<br>

## 2.1 라이브러리 import

In [31]:
from tqdm import tqdm
from load_data import *
import pandas as pd
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = 'klue/bert-base'

<br>

## 2.2 데이터 불러오기

In [2]:
df = load_new_train_data()
df.head()

100%|██████████| 32470/32470 [00:04<00:00, 6519.20it/s]


Unnamed: 0,id,sentence,sentence_length,subject_entity_word,subject_entity_start_idx,subject_entity_end_idx,subject_entity_type,object_entity_word,object_entity_start_idx,object_entity_end_idx,object_entity_type,label,source
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,59,비틀즈,24,26,ORG,조지 해리슨,13,18,PER,no_relation,wikipedia
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,54,민주평화당,19,23,ORG,대안신당,14,17,ORG,no_relation,wikitree
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,99,광주FC,21,24,ORG,한국프로축구연맹,34,41,ORG,org:member_of,wikitree
3,3,균일가 생활용품점 (주)아성다이소(대표 박정부)는 코로나19 바이러스로 어려움을 겪...,78,아성다이소,13,17,ORG,박정부,22,24,PER,org:top_members/employees,wikitree
4,4,1967년 프로 야구 드래프트 1순위로 요미우리 자이언츠에게 입단하면서 등번호는 8...,56,요미우리 자이언츠,22,30,ORG,1967,0,3,DAT,no_relation,wikipedia


<br>

## 2.4 학습 데이터 토크나이징 (baseline 방법)

### 2.4.1 토크나이저 객체 생성

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

<br>

### 2.4.2 토크나이즈된 데이터 생성

In [4]:
dataset = df.copy()

# First segment of every input pair: "<subject word>[SEP]<object word>".
concat_entity = [
    '[SEP]'.join((subject_word, object_word))
    for subject_word, object_word in zip(
        dataset['subject_entity_word'], dataset['object_entity_word']
    )
]

# Encode (entity pair, sentence) as a two-segment input, padded to the
# longest example and truncated at 256 tokens.
tokenized_sentences = tokenizer(
    concat_entity,
    dataset['sentence'].tolist(),
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt',
)

In [5]:
tokenized_sentences.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

<br>

### 2.4.3 샘플 확인

In [6]:
sample_ids = tokenized_sentences['input_ids'][0]
sample_ids

tensor([    2, 29830,     3,  8373, 14113,  2234,     3,   168, 30985, 14451,
         7088,  4586,   169,   793,  8373, 14113,  2234,  2052,  1363,  2088,
        29830,  2116, 14879,  2440,  6711,   170, 21406, 26713,  2076, 25145,
         5749,   171,  1421,   818,  2073,  4388,  2062,    18,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [7]:
# Convert the ids back to token strings and hide the padding so that only
# the real content of the sample is printed.
sample_tokens = [
    token
    for token in tokenizer.convert_ids_to_tokens(sample_ids)
    if token != '[PAD]'
]
print(sample_tokens)

['[CLS]', '비틀즈', '[SEP]', '조지', '해리', '##슨', '[SEP]', '〈', 'So', '##me', '##th', '##ing', '〉', '는', '조지', '해리', '##슨', '##이', '쓰', '##고', '비틀즈', '##가', '1969', '##년', '앨범', '《', 'Ab', '##be', '##y', 'Ro', '##ad', '》', '에', '담', '##은', '노래', '##다', '.', '[SEP]']


In [8]:
dataset.loc[0, 'sentence']

'〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.'

<br>

## 2.5 학습 데이터 토크나이징 (Entity 정보 포함시키기)

### 2.5.1 토크나이저 객체 생성

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

<br>

### 2.5.2 Entity Special Token - Typed entity marker

- Entity Special Token 종류 중 BERT base에서 가장 좋은 성능을 나타낸 **Typed entity marker** 방법을 사용한다.

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img src='https://drive.google.com/uc?id=1V_dVztR_mQDHsmwsP4P0Qith7hRU1vmz' width=800/>

<br>

### 2.5.3 스페셜 토큰 및 문장 생성 함수 정의

In [41]:
dataset = df.copy()

In [42]:
def _insert_entity_markers(sentence, entities):
    """Return `sentence` with every entity span wrapped in its marker tokens.

    `entities` must be ordered by descending start index: rewriting the
    right-most span first keeps the character offsets of the spans to its
    left valid while the string grows.
    """
    for entity in entities:
        start, end = entity['s_idx'], entity['e_idx']
        # `e_idx` is inclusive, so the untouched tail begins at `end + 1`.
        sentence = ''.join((
            sentence[:start],
            entity['s_token'],
            entity['word'],
            entity['e_token'],
            sentence[end + 1:],
        ))
    return sentence


def get_entity_special_token_dataset(dataset):
    """Wrap subject/object entities in typed entity markers.

    For each row, the subject span of ``sentence`` is surrounded by
    ``[S:<type>]`` / ``[/S:<type>]`` and the object span by
    ``[O:<type>]`` / ``[/O:<type>]``, where ``<type>`` is the row's
    ``subject_entity_type`` / ``object_entity_type``.

    Args:
        dataset: DataFrame providing the columns ``sentence``,
            ``subject_entity_word``, ``subject_entity_start_idx``,
            ``subject_entity_end_idx``, ``subject_entity_type`` and the
            matching ``object_entity_*`` columns. Start/end indices are
            character offsets into ``sentence``; the end index is inclusive.

    Returns:
        A tuple ``(dataset, entity_special_tokens)``:

        * ``dataset`` is the very object that was passed in, modified in
          place: ``sentence`` is overwritten with the marked sentence and the
          columns ``subject_entity_start_token``,
          ``subject_entity_end_token``, ``object_entity_start_token`` and
          ``object_entity_end_token`` are added. Pass a copy if the original
          frame must stay untouched.
        * ``entity_special_tokens`` is the list of distinct marker strings in
          the order they were first encountered.
    """
    # A dict gives O(1) de-duplication while remembering first-seen order,
    # which determines the ids the tokenizer later assigns to the markers.
    seen_markers = {}
    marked_sentences = []
    subject_entity_start_tokens = []
    subject_entity_end_tokens = []
    object_entity_start_tokens = []
    object_entity_end_tokens = []

    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        entities = []
        for prefix, role in (('S', 'subject'), ('O', 'object')):
            entity_type = row[f'{role}_entity_type']
            entities.append({
                'prefix': prefix,
                'word': row[f'{role}_entity_word'],
                's_idx': row[f'{role}_entity_start_idx'],
                'e_idx': row[f'{role}_entity_end_idx'],
                'type': entity_type,
                's_token': f'[{prefix}:{entity_type}]',
                'e_token': f'[/{prefix}:{entity_type}]',
            })
        subject, obj = entities

        subject_entity_start_tokens.append(subject['s_token'])
        subject_entity_end_tokens.append(subject['e_token'])
        object_entity_start_tokens.append(obj['s_token'])
        object_entity_end_tokens.append(obj['e_token'])

        # Process the entity that starts later in the sentence first
        # (stable sort: the subject stays ahead of the object on a tie).
        entities.sort(key=lambda item: item['s_idx'], reverse=True)

        for entity in entities:
            seen_markers.setdefault(entity['s_token'])
            seen_markers.setdefault(entity['e_token'])

        marked_sentences.append(
            _insert_entity_markers(row['sentence'], entities)
        )

    dataset['sentence'] = marked_sentences
    dataset['subject_entity_start_token'] = subject_entity_start_tokens
    dataset['subject_entity_end_token'] = subject_entity_end_tokens
    dataset['object_entity_start_token'] = object_entity_start_tokens
    dataset['object_entity_end_token'] = object_entity_end_tokens

    return dataset, list(seen_markers)

<br>

### 2.5.4 스페셜 토큰 획득 및 데이터프레임 확장

In [43]:
dataset, special_tokens = get_entity_special_token_dataset(dataset)

100%|██████████| 32470/32470 [00:03<00:00, 8904.05it/s]


In [44]:
special_tokens

['[S:ORG]',
 '[/S:ORG]',
 '[O:PER]',
 '[/O:PER]',
 '[O:ORG]',
 '[/O:ORG]',
 '[O:DAT]',
 '[/O:DAT]',
 '[S:PER]',
 '[/S:PER]',
 '[O:LOC]',
 '[/O:LOC]',
 '[O:POH]',
 '[/O:POH]',
 '[O:NOH]',
 '[/O:NOH]']

In [47]:
len(special_tokens) == (len(dataset['subject_entity_type'].unique()) + len(dataset['object_entity_type'].unique()))*2

True

In [48]:
dataset.head()

Unnamed: 0,id,sentence,sentence_length,subject_entity_word,subject_entity_start_idx,subject_entity_end_idx,subject_entity_type,object_entity_word,object_entity_start_idx,object_entity_end_idx,object_entity_type,label,source,subject_entity_start_token,subject_entity_end_token,object_entity_start_token,object_entity_end_token
0,0,〈Something〉는 [O:PER]조지 해리슨[/O:PER]이 쓰고 [S:ORG]...,59,비틀즈,24,26,ORG,조지 해리슨,13,18,PER,no_relation,wikipedia,[S:ORG],[/S:ORG],[O:PER],[/O:PER]
1,1,호남이 기반인 바른미래당·[O:ORG]대안신당[/O:ORG]·[S:ORG]민주평화당...,54,민주평화당,19,23,ORG,대안신당,14,17,ORG,no_relation,wikitree,[S:ORG],[/S:ORG],[O:ORG],[/O:ORG]
2,2,K리그2에서 성적 1위를 달리고 있는 [S:ORG]광주FC[/S:ORG]는 지난 2...,99,광주FC,21,24,ORG,한국프로축구연맹,34,41,ORG,org:member_of,wikitree,[S:ORG],[/S:ORG],[O:ORG],[/O:ORG]
3,3,균일가 생활용품점 (주)[S:ORG]아성다이소[/S:ORG](대표 [O:PER]박정...,78,아성다이소,13,17,ORG,박정부,22,24,PER,org:top_members/employees,wikitree,[S:ORG],[/S:ORG],[O:PER],[/O:PER]
4,4,[O:DAT]1967[/O:DAT]년 프로 야구 드래프트 1순위로 [S:ORG]요미...,56,요미우리 자이언츠,22,30,ORG,1967,0,3,DAT,no_relation,wikipedia,[S:ORG],[/S:ORG],[O:DAT],[/O:DAT]


<br>

### 2.5.5 토크나이저에 스페셜 토큰 정보 삽입

In [49]:
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/bert-base', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[S:ORG]', '[/S:ORG]', '[O:PER]', '[/O:PER]', '[O:ORG]', '[/O:ORG]', '[O:DAT]', '[/O:DAT]', '[S:PER]', '[/S:PER]', '[O:LOC]', '[/O:LOC]', '[O:POH]', '[/O:POH]', '[O:NOH]', '[/O:NOH]']})

In [21]:
sample = dataset.iloc[0,:]
sample

id                                                                          0
sentence                    〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...
sentence_length                                                            59
subject_entity_word                                                       비틀즈
subject_entity_start_idx                                                   24
subject_entity_end_idx                                                     26
subject_entity_type                                                       ORG
object_entity_word                                                     조지 해리슨
object_entity_start_idx                                                    13
object_entity_end_idx                                                      18
object_entity_type                                                        PER
label                                                             no_relation
source                                                          

In [26]:
# Before registering the markers: each one is split into several word pieces.
# The marked text lives in the 'sentence' column (get_entity_special_token_dataset
# overwrites it); there is no 'entity_sentence' column to read from.
print(tokenizer.tokenize(sample['sentence'], add_special_tokens=True))

['[CLS]', '〈', 'So', '##me', '##th', '##ing', '〉', '는', '[', 'O', ':', 'PER', ']', '조지', '해리', '##슨', '[', '/', 'O', ':', 'PER', ']', '이', '쓰', '##고', '[', 'S', ':', 'O', '##R', '##G', ']', '비틀즈', '[', '/', 'S', ':', 'O', '##R', '##G', ']', '가', '1969', '##년', '앨범', '《', 'Ab', '##be', '##y', 'Ro', '##ad', '》', '에', '담', '##은', '노래', '##다', '.', '[SEP]']


In [29]:
# Register the entity markers as additional special tokens and report how
# many of them were new to the tokenizer.
added_token_num = tokenizer.add_special_tokens(
    {'additional_special_tokens': special_tokens}
)
print(added_token_num)

16


In [30]:
# After registering the markers: each one is kept as a single token.
# The marked text lives in the 'sentence' column (get_entity_special_token_dataset
# overwrites it); there is no 'entity_sentence' column to read from.
print(tokenizer.tokenize(sample['sentence'], add_special_tokens=True))

['[CLS]', '〈', 'So', '##me', '##th', '##ing', '〉', '는', '[O:PER]', '조지', '해리', '##슨', '[/O:PER]', '이', '쓰', '##고', '[S:ORG]', '비틀즈', '[/S:ORG]', '가', '1969', '##년', '앨범', '《', 'Ab', '##be', '##y', 'Ro', '##ad', '》', '에', '담', '##은', '노래', '##다', '.', '[SEP]']


<br>

### 2.5.6 모델 embedding size 확장

In [32]:
model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
# 추가 전
print(model.get_input_embeddings())

Embedding(32000, 768, padding_idx=0)


In [34]:
# Grow the embedding matrix so every token id the tokenizer can emit has a row.
# len(tokenizer) counts the base vocabulary plus all added tokens, whereas
# tokenizer.vocab_size excludes added tokens and added_token_num is 0 when the
# add-special-tokens cell is re-run, which would leave the matrix too small.
model.resize_token_embeddings(len(tokenizer))

Embedding(32016, 768)

<br>

### 2.5.7 토크나이즈된 데이터 생성

In [53]:
# First segment of every input pair:
# "<S-start><subject><S-end>[SEP]<O-start><object><O-end>".
concat_entity = [
    ''.join((s_start, s_word, s_end, '[SEP]', o_start, o_word, o_end))
    for s_start, s_word, s_end, o_start, o_word, o_end in zip(
        dataset['subject_entity_start_token'],
        dataset['subject_entity_word'],
        dataset['subject_entity_end_token'],
        dataset['object_entity_start_token'],
        dataset['object_entity_word'],
        dataset['object_entity_end_token'],
    )
]

# Encode (marked entity pair, marked sentence) as a two-segment input, padded
# to the longest example and truncated at 256 tokens.
tokenized_sentences = tokenizer(
    concat_entity,
    dataset['sentence'].tolist(),
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt',
)

<br>

### 2.5.8 샘플 확인

In [54]:
sample_ids = tokenized_sentences['input_ids'][0]
sample_ids

tensor([    2, 32000, 29830, 32001,     3, 32002,  8373, 14113,  2234, 32003,
            3,   168, 30985, 14451,  7088,  4586,   169,   793, 32002,  8373,
        14113,  2234, 32003,  1504,  1363,  2088, 32000, 29830, 32001,   543,
        14879,  2440,  6711,   170, 21406, 26713,  2076, 25145,  5749,   171,
         1421,   818,  2073,  4388,  2062,    18,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [55]:
# Turn the ids back into token strings, skipping '[PAD]' entries, so the
# entity markers can be checked as single tokens in the printed list.
sample_tokens = [
    token
    for token in tokenizer.convert_ids_to_tokens(sample_ids)
    if token != '[PAD]'
]
print(sample_tokens)

['[CLS]', '[S:ORG]', '비틀즈', '[/S:ORG]', '[SEP]', '[O:PER]', '조지', '해리', '##슨', '[/O:PER]', '[SEP]', '〈', 'So', '##me', '##th', '##ing', '〉', '는', '[O:PER]', '조지', '해리', '##슨', '[/O:PER]', '이', '쓰', '##고', '[S:ORG]', '비틀즈', '[/S:ORG]', '가', '1969', '##년', '앨범', '《', 'Ab', '##be', '##y', 'Ro', '##ad', '》', '에', '담', '##은', '노래', '##다', '.', '[SEP]']


In [58]:
tokenizer.decode(sample_ids)

'[CLS] [S:ORG] 비틀즈 [/S:ORG] [SEP] [O:PER] 조지 해리슨 [/O:PER] [SEP] 〈 Something 〉 는 [O:PER] 조지 해리슨 [/O:PER] 이 쓰고 [S:ORG] 비틀즈 [/S:ORG] 가 1969년 앨범 《 Abbey Road 》 에 담은 노래다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [56]:
dataset.loc[0, 'sentence']

'〈Something〉는 [O:PER]조지 해리슨[/O:PER]이 쓰고 [S:ORG]비틀즈[/S:ORG]가 1969년 앨범 《Abbey Road》에 담은 노래다.'

In [57]:
print(len(sample_tokens))

47
