In [1]:
import pickle as pickle
import os
import pandas as pd
import torch
from tqdm import tqdm
from utils.utils import entity_marker, typed_entity_marker, typed_entity_marker_punc, TYPE_MARKERS, TYPE_MARKERS_PUNC

In [2]:
class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer features with labels.

    ``pair_dataset`` is a mapping from feature name to an indexable
    collection whose elements support ``.clone().detach()`` (presumably
    the tensors returned by the tokenizer -- confirm against the caller).
    ``labels`` is an indexable sequence with one entry per example; its
    length defines the dataset size.
    """

    def __init__(self, pair_dataset, labels):
        self.labels = labels
        self.pair_dataset = pair_dataset

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy every feature so callers cannot mutate the stored tensors.
        item = {}
        for key, val in self.pair_dataset.items():
            item[key] = val[idx].clone().detach()
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [3]:
def preprocessing_dataset(dataset):
    """Reshape the freshly loaded CSV DataFrame into the frame used downstream.

    Every row is passed to ``typed_entity_marker_punc`` and the returned value
    becomes the ``sentence`` column. The ``subject_word`` / ``object_word``
    columns are exposed as ``subject_entity`` / ``object_entity``; ``id`` and
    ``label`` are carried over unchanged.
    """
    typed_sentence = [typed_entity_marker_punc(row) for _, row in dataset.iterrows()]
    columns = {
        'id': dataset['id'],
        'sentence': typed_sentence,
        'subject_entity': dataset['subject_word'],
        'object_entity': dataset['object_word'],
        'label': dataset['label'],
    }
    return pd.DataFrame(columns)

In [4]:
def load_data(dataset_dir):
    """Read the CSV file at ``dataset_dir`` and return it after preprocessing.

    The file is loaded with ``pd.read_csv`` and handed straight to
    ``preprocessing_dataset``; that function's result is returned as-is.
    """
    return preprocessing_dataset(pd.read_csv(dataset_dir))

In [5]:
def get_entity_embedding(
    examples,
    tokenizer,
    start_id,
    end_id
):
    """Return a 0/1 list flagging the tokens that sit inside entity markers.

    ``examples`` is iterated token id by token id. A position is flagged 1 when
    it lies strictly between an id found in ``start_id`` and the next id found
    in ``end_id``; the marker positions themselves are flagged 0.

    ``tokenizer`` is accepted for interface compatibility but is not used.
    """
    entity_ids = []
    inside = False
    for input_id in examples:
        # An end marker closes the span before its own position is recorded.
        inside = inside and input_id not in end_id
        entity_ids.append(int(inside))
        # A start marker opens the span only for the positions that follow it.
        inside = inside or input_id in start_id
    return entity_ids

In [7]:
def tokenized_dataset(dataset, tokenizer):
    """Tokenize each sentence together with a prompt built from its entity pair.

    For every (subject_entity, object_entity) pair a prompt string is formed by
    concatenating the subject, a fixed particle, the object and a fixed
    question suffix. The sentences and prompts are passed to ``tokenizer`` as
    the first and second text sequences, and the tokenizer's return value is
    returned unchanged.
    """
    prompts = [
        subject + '와(과)' + obj + '은(는)?'
        for subject, obj in zip(dataset['subject_entity'], dataset['object_entity'])
    ]
    encode_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
        add_special_tokens=True,
    )
    return tokenizer(list(dataset['sentence']), prompts, **encode_options)

In [10]:
def entity_ids_maker(data, start_id, end_id):
    """Build a 0/1 entity mask for every row of token ids.

    Args:
        data: iterable of rows of token ids (e.g. the tokenizer's
            ``input_ids``); each row may be a tensor, array or list.
        start_id: container of ids that open an entity span.
        end_id: container of ids that close an entity span.

    Returns:
        A float32 tensor with one row per input row. Positions strictly
        between the n-th start marker and the n-th end marker of a row are 1,
        everything else (including the markers themselves) is 0.

    Unlike the previous implementation, which indexed exactly two start and
    two end markers and raised ``IndexError`` otherwise, any number of marker
    pairs is accepted; a row with no complete pair yields an all-zero mask.
    """
    entity_ids = []
    for ids in tqdm(data):
        # Plain Python values keep the membership tests cheap; indexing a
        # tensor element by element is far slower.
        row = ids.tolist() if hasattr(ids, 'tolist') else list(ids)

        startidx = []
        endidx = []
        for position, token_id in enumerate(row):
            # An id present in both containers counts as a start marker only,
            # matching the original if/elif precedence.
            if token_id in start_id:
                startidx.append(position)
            elif token_id in end_id:
                endidx.append(position)

        mask = [0] * len(row)
        for start, end in zip(startidx, endidx):
            for position in range(start + 1, end):
                mask[position] = 1
        entity_ids.append(mask)

    return torch.tensor(entity_ids, dtype=torch.float32)
        

In [109]:
from transformers import AutoTokenizer
# Checkpoint name passed to AutoTokenizer.from_pretrained below.
MODEL_NAME = "klue/roberta-large"
# '#' and '@' are registered as additional special tokens; get_entity_position_embedding
# later looks both of them up in tokenizer.all_special_tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, additional_special_tokens=['#', '@'])
# NOTE(review): relative path -- resolves against the kernel's working directory.
train_dataset = load_data('../../dataAugmentation/entity_split.csv')
# Tokenizes every sentence paired with its entity prompt (see tokenized_dataset).
tokenized_train = tokenized_dataset(train_dataset, tokenizer)

# Disabled: marker-id lookup for the "<S:...>" / "<O:...>" tag style, kept for reference.
# subj_start_id = tokenizer.convert_tokens_to_ids(["<S:PER>", "<S:ORG>"])
# subj_end_id = tokenizer.convert_tokens_to_ids(["</S:PER>", "</S:ORG>"])
# obj_start_id = tokenizer.convert_tokens_to_ids(["<O:PER>", "<O:ORG>", "<O:LOC>", "<O:DAT>", "<O:POH>", "<O:NOH>"])
# obj_end_id = tokenizer.convert_tokens_to_ids(["</O:PER>", "</O:ORG>", "</O:LOC>", "</O:DAT>", "</O:POH>", "</O:NOH>"])
# start_id = subj_start_id+obj_start_id
# end_id   = subj_end_id+obj_end_id

# tokenized_train['entity_ids'] = entity_ids_maker(tokenized_train['input_ids'], start_id, end_id)

 ## 이 밑은 테스트 코드입니다.

In [15]:
tokenized_train.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [110]:
tokenized_train['input_ids'][0]

tensor([    0,   168, 30985, 14451,  7088,  4586,   169,   793,     7,    14,
        21639,    14,  8373, 14113,  2234,     7,  1504,  1363,  2088,    36,
           14,    51,  2107,  2341,    14, 29830,    36,   543, 14879,  2440,
         6711,   170, 21406, 26713,  2076, 25145,  5749,   171,  1421,   818,
         2073,  4388,  2062,    18,     2, 29830,  2522,    12,   604,    13,
         8373, 14113,  2234,  2073,    12,   793,    13,    35,     2,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [13]:
# Scratch cell: look up the ids of the "<S:...>" / "<O:...>" marker tokens and
# try entity_ids_maker on the first five tokenized rows.
# NOTE(review): the tokenizer is created with only '#' and '@' as additional
# special tokens; whether these tag strings exist in its vocabulary is not
# shown here -- confirm the returned ids are not all the unknown-token id.
entity_list = []
subj_start_id = tokenizer.convert_tokens_to_ids(["<S:PER>", "<S:ORG>"])
subj_end_id = tokenizer.convert_tokens_to_ids(["</S:PER>", "</S:ORG>"])
obj_start_id = tokenizer.convert_tokens_to_ids(["<O:PER>", "<O:ORG>", "<O:LOC>", "<O:DAT>", "<O:POH>", "<O:NOH>"])
obj_end_id = tokenizer.convert_tokens_to_ids(["</O:PER>", "</O:ORG>", "</O:LOC>", "</O:DAT>", "</O:POH>", "</O:NOH>"])
# Subject and object markers are merged: the mask does not distinguish them.
start_id = subj_start_id+obj_start_id
end_id   = subj_end_id+obj_end_id

# Disabled alternative: per-row mask via get_entity_embedding.
# for ids in tqdm(tokenized_train['input_ids']):
#     entity_list.append(get_entity_embedding(ids, tokenizer, start_id, end_id))
# entity_list
# Only the last expression of a cell is displayed, so just the second length is shown.
len(entity_ids_maker(tokenized_train['input_ids'][:5], start_id, end_id)[0]) # 256
len(tokenized_train['input_ids'][0]) # 256

100%|██████████| 5/5 [00:00<00:00, 58.52it/s]


256

In [44]:
tokenizer.decode(tokenized_train['input_ids'][0][26 : 256])
# len(tokenized_train['input_ids'][0])

'@ 가 1969년 앨범 《 Abbey Road 》 에 담은 노래다. [SEP] 비틀즈와 ( 과 ) 조지 해리슨은 ( 는 )? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [80]:
tokenized_train['entity_ids']

KeyError: 'entity_ids'

In [16]:
# Token count of every marked sentence (prompt excluded).
leng = [len(tokenizer.tokenize(s)) for s in train_dataset['sentence']]
leng

[35,
 38,
 51,
 46,
 30,
 56,
 55,
 59,
 85,
 67,
 26,
 51,
 122,
 58,
 74,
 69,
 96,
 38,
 48,
 51,
 30,
 62,
 58,
 60,
 37,
 82,
 52,
 46,
 29,
 42,
 51,
 65,
 142,
 31,
 59,
 60,
 37,
 70,
 57,
 39,
 28,
 44,
 77,
 53,
 108,
 45,
 52,
 45,
 140,
 37,
 46,
 78,
 52,
 46,
 30,
 57,
 43,
 103,
 33,
 81,
 42,
 80,
 26,
 87,
 51,
 59,
 85,
 106,
 40,
 51,
 31,
 43,
 47,
 70,
 90,
 38,
 62,
 50,
 78,
 71,
 35,
 45,
 51,
 64,
 63,
 77,
 23,
 79,
 80,
 38,
 43,
 49,
 68,
 57,
 57,
 44,
 31,
 48,
 32,
 77,
 53,
 50,
 35,
 113,
 66,
 16,
 39,
 85,
 52,
 47,
 74,
 69,
 73,
 50,
 54,
 110,
 32,
 45,
 88,
 39,
 36,
 31,
 76,
 51,
 114,
 23,
 55,
 49,
 87,
 59,
 106,
 49,
 40,
 34,
 43,
 59,
 34,
 26,
 59,
 87,
 27,
 106,
 46,
 72,
 28,
 51,
 61,
 48,
 18,
 53,
 38,
 43,
 36,
 41,
 55,
 70,
 39,
 62,
 74,
 35,
 19,
 59,
 30,
 46,
 44,
 40,
 51,
 70,
 101,
 35,
 131,
 30,
 85,
 49,
 31,
 40,
 62,
 62,
 38,
 67,
 48,
 59,
 33,
 53,
 49,
 33,
 42,
 43,
 63,
 28,
 61,
 34,
 56,
 54,
 85,
 85,
 36,
 3

In [17]:
max(leng)

243

In [60]:
print(len(entity_ids))

8182440


In [15]:
# Scratch check: find where the opening / closing entity tags land in one
# tokenized sentence. tmpidx[0] collects opening-tag positions, tmpidx[1]
# collects closing-tag positions.
tokens = tokenizer.tokenize(train_dataset['sentence'][1])
tmp = [0] * 256
tmpidx = [[], []]
print(tokens)
for i, token in enumerate(tokens):
    if '<S:' in token or '<O:' in token:
        tmpidx[0].append(i)
    elif '</S:' in token or '</O:' in token:
        tmpidx[1].append(i)
        
def update_ranges_to_1(start_tokens, end_tokens, maxlen=251):
    """Return a 0/1 list marking the tokens strictly inside two marker pairs.

    Only the first two entries of ``start_tokens`` and ``end_tokens`` are
    read. Both sequences are printed first (debug output). When the positions
    are ordered start0 < end0 < start1 < end1 <= maxlen the result has exactly
    ``maxlen`` entries.
    """
    print(start_tokens)
    print(end_tokens)
    first_open, second_open = start_tokens[0], start_tokens[1]
    first_close, second_close = end_tokens[0], end_tokens[1]
    # (fill value, run length) for each consecutive stretch of the mask.
    segments = (
        (0, first_open + 1),
        (1, first_close - first_open - 1),
        (0, second_open - first_close + 1),
        (1, second_close - second_open - 1),
        (0, maxlen - second_close),
    )
    res = []
    for value, count in segments:
        res.extend([value] * count)
    return res

len(update_ranges_to_1(tmpidx[0], tmpidx[1]))


['호남', '##이', '기반', '##인', '바른', '##미', '##래', '##당', '·', '<O:ORG>', '대안', '##신', '##당', '</O:ORG>', '·', '<S:ORG>', '민주', '##평', '##화', '##당', '</S:ORG>', '이', '우여곡절', '끝', '##에', '합당', '##해', '민생', '##당', '(', '가칭', ')', '으로', '재', '##탄', '##생', '##한다', '.']
[9, 15]
[13, 20]


251

In [37]:
# Attach the entity mask as an extra feature next to the tokenizer outputs.
# NOTE(review): `entity_ids` is not assigned at module level by any cell visible
# in this notebook (it only exists as a local inside helper functions), so this
# cell depends on leftover kernel state and fails after Restart & Run All.
tokenized_train['entity_ids'] = entity_ids
tokenized_train.keys()
# 'input_ids', 'token_type_ids', 'attention_mask', 'entity_ids'
type(tokenized_train['input_ids'])

torch.Tensor

In [46]:
len(tokenized_train['entity_ids'][1])
len(tokenized_train['input_ids'][1])

256

In [16]:
def label_to_num(label):
  """Translate each entry of ``label`` through the pickled label lookup table.

  The table is read from 'dict_label_to_num.pkl' in the current working
  directory on every call. Returns a list with one looked-up value per input
  entry, in order; an entry missing from the table raises KeyError.
  """
  # NOTE: pickle.load can execute arbitrary code -- only use a trusted file.
  with open('dict_label_to_num.pkl', 'rb') as f:
    dict_label_to_num = pickle.load(f)
  return [dict_label_to_num[v] for v in label]
# Convert the string labels of the training frame to their numeric ids.
train_label = label_to_num(train_dataset['label'].values)

# Wrap the tokenized features and numeric labels in the torch Dataset.
# RE_Dataset indexes every key of tokenized_train by example, so any extra key
# added to it must have one row per example.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)


In [17]:
# item = {key: val[1].clone().detach() for key, val in train_dataset.items()}
item = {key: val[0] for key, val in train_dataset.items()}
item
# RE_train_dataset.__getitem__(1)

{'id': 0,
 'sentence': '〈Something〉는 <O:PER> 조지 해리슨 </O:PER>이 쓰고 <S:ORG> 비틀즈 </S:ORG>가 1969년 앨범 《Abbey Road》에 담은 노래다.',
 'subject_entity': '비틀즈',
 'object_entity': '조지 해리슨',
 'label': 'no_relation'}

In [37]:
# Scratch cell: tokenize an alternative prompt wording for the first five
# (subject, object) pairs only.
# NOTE: this rebinds the module-level name `concat_entity` to a list of
# tokenizer outputs, one per prompt.
concat_entity = []
i = 0
for e01, e02 in zip(train_dataset['subject_entity'], train_dataset['object_entity']):
    # Prompting Sentence 
    # Stop after five prompts.
    if i == 5 : break
    temp = ''
    temp = e01 + '와(과) ' + e02 + '의 관계' #+'[SEP]'
    # The prompt is tokenized on its own here, not paired with the sentence.
    temp = tokenizer(temp, add_special_tokens=True)#, add_special_tokens=True)
    concat_entity.append(temp)
    
    i += 1

In [40]:
concat_entity

[{'input_ids': [0, 29830, 2522, 12, 604, 13, 8373, 14113, 2234, 2079, 3654, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 3772, 2139, 2267, 2481, 2522, 12, 604, 13, 5605, 2250, 2481, 2079, 3654, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 4104, 10904, 2522, 12, 604, 13, 3629, 17287, 20212, 2079, 3654, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 27930, 24393, 2024, 2522, 12, 604, 13, 6580, 2144, 2079, 3654, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 20289, 20562, 2522, 12, 604, 13, 14925, 2079, 3654, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1

In [41]:
# NOTE(review): get_entity_embedding as defined in this notebook requires four
# arguments (examples, tokenizer, start_id, end_id) and returns a list, so this
# two-argument call raises TypeError against that definition; the recorded
# output presumably comes from an earlier version of the function -- confirm.
get_entity_embedding(concat_entity[0], tokenizer)

{'entity_ids': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}

In [1]:
# Look up the ids of the "<S:...>" / "<O:...>" marker tokens and display the
# object start ids.
# NOTE(review): this cell needs `tokenizer` to exist already; the recorded
# output is a NameError from running it on a kernel where it was not defined.
subj_start_id = tokenizer.convert_tokens_to_ids(["<S:PER>", "<S:ORG>"])
subj_end_id = tokenizer.convert_tokens_to_ids(["</S:PER>", "</S:ORG>"])
obj_start_id = tokenizer.convert_tokens_to_ids(["<O:PER>", "<O:ORG>", "<O:LOC>", "<O:DAT>", "<O:POH>", "<O:NOH>"])
obj_end_id = tokenizer.convert_tokens_to_ids(["</O:PER>", "</O:ORG>", "</O:LOC>", "</O:DAT>", "</O:POH>", "</O:NOH>"])
obj_start_id

NameError: name 'tokenizer' is not defined

In [18]:
import numpy as np
def update_ranges_to_1(start_tokens, end_tokens, length):
    """Return an integer array of ``length`` zeros with the marker interiors set to 1.

    ``start_tokens`` and ``end_tokens`` are paired in order; for each pair the
    positions strictly between the start and the end index are set to 1.
    Unpaired trailing entries of the longer sequence are ignored.
    """
    mask = np.zeros(length, dtype=int)
    for opening, closing in zip(start_tokens, end_tokens):
        interior = slice(opening + 1, closing)
        mask[interior] = 1
    return mask

def entity_ids_maker(data, start_id, end_id):
    """Build an int32 0/1 entity mask with exactly one row per input row.

    Args:
        data: iterable of rows of token ids (e.g. the tokenizer's
            ``input_ids``); each row may be a tensor, array or list.
        start_id: container of ids that open an entity span.
        end_id: container of ids that close an entity span.

    Returns:
        A ``torch.int`` tensor. Within a row, the positions strictly between
        the n-th start marker and the n-th end marker are 1, all others 0.
        An empty ``data`` yields an empty 1-D tensor.

    Rows without any marker used to be skipped, which left the result shorter
    than ``data`` and misaligned with it; they now produce an all-zero row.
    The masking is done inline so the function does not depend on which
    ``update_ranges_to_1`` definition is currently bound in the kernel.
    """
    masks = []
    for ids in tqdm(data):
        # Plain Python values make the membership tests far cheaper than
        # comparing tensor elements one at a time.
        row = ids.tolist() if hasattr(ids, 'tolist') else list(ids)
        startidx = [i for i, token_id in enumerate(row) if token_id in start_id]
        endidx = [i for i, token_id in enumerate(row) if token_id in end_id]

        mask = np.zeros(len(row), dtype=np.int32)
        for start, end in zip(startidx, endidx):
            mask[start + 1:end] = 1
        masks.append(mask)

    if not masks:
        return torch.tensor([], dtype=torch.int)
    # Stack once in numpy; building a tensor from a list of arrays is slow.
    return torch.from_numpy(np.stack(masks))

In [123]:
def get_entity_position_embedding(tokenizer, input_ids):
  """Collect, per row, the positions of the '@' and '#' marker tokens.

  The ids of '@' and '#' are looked up among the tokenizer's special tokens
  (KeyError if either is missing). Each row of ``input_ids`` is scanned from
  the left and scanning stops once four positions have been collected.
  Returns a list with one list of positions per row.
  """
  token_to_id = dict(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
  sub_token_id = token_to_id['@']
  obj_token_id = token_to_id['#']
  marker_ids = (sub_token_id, obj_token_id)

  pos_embeddings = []
  for row in tqdm(input_ids):
    found = []
    for position, token_id in enumerate(row):
      if len(found) == 4:
        break
      # Both markers are tested independently, subject marker first.
      for marker_id in marker_ids:
        if token_id == marker_id:
          found.append(position)
    pos_embeddings.append(found)
  return pos_embeddings

In [124]:
# '[CLS] 〈 Something 〉 는 # * PER * 조지 해리슨 # 이 쓰고 @ * ORG * 비틀즈 @ 가 1969년 앨범 《 Abbey Road 》 에 담은 노래다. [SEP] 비틀즈와 ( 과 ) 조지 해리슨은 ( 는 )? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [125]:
ent_pos_emb = get_entity_position_embedding(tokenizer, tokenized_train['input_ids'])

100%|██████████| 32470/32470 [00:13<00:00, 2359.66it/s]


In [136]:
# Sanity check: every row should have produced exactly four marker positions.
for i in ent_pos_emb:
    if len(i) != 4:
        print("error")

In [144]:
def making_entity_pos_emb(pos_emb, seq_len=256):
    """Turn per-row marker positions into a 0/1 entity mask tensor.

    Args:
        pos_emb: iterable of position lists; the first four entries of each
            are read as (first_open, first_close, second_open, second_close).
        seq_len: width of every mask row. Defaults to 256, the value that was
            previously hard-coded; pass the padded width of ``input_ids``
            when it differs.

    Returns:
        A float32 tensor with one row of ``seq_len`` values per entry of
        ``pos_emb``. Positions first_open..first_close and
        second_open..second_close (markers included) are 1, the rest 0.

    Raises:
        IndexError: if a row has fewer than four positions, or a position
            falls outside ``seq_len``.
    """
    rows = []
    for ids in tqdm(pos_emb):
        if len(ids) < 4:
            raise IndexError(
                "expected four marker positions per row, got %d: %r" % (len(ids), ids)
            )
        row = [0] * seq_len
        for first, last in ((ids[0], ids[1]), (ids[2], ids[3])):
            # Inclusive on both ends: the marker tokens belong to the span.
            for position in range(first, last + 1):
                row[position] = 1
        rows.append(row)
    return torch.tensor(rows, dtype=torch.float32)

In [145]:
asdf = making_entity_pos_emb(ent_pos_emb)
# asdf = torch.Tensor(asdf)

32470it [00:00, 130683.86it/s]


In [146]:
asdf
# len(asdf[1])

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.]])

In [140]:
# Attach the mask built by making_entity_pos_emb (`asdf`) as an extra feature.
tokenized_train['entity_ids'] = asdf
tokenized_train.keys()
tokenized_train['input_ids'].shape
# Only this last expression is displayed; it should equal the input_ids shape.
tokenized_train['entity_ids'].shape # same

torch.Size([32470, 256])