In [2]:
import pandas as pd
import pickle as pickle
import ast

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from omegaconf import OmegaConf
from dataset_utils import load_data, label_to_num, tokenized_dataset
from datasets import RE_Dataset
from metrics import compute_metrics

import numpy as np
import argparse
import random
import torch

In [201]:
class Processor:
    def __init__(self, args, tokenizer):
        super().__init__()
        self.args = args
        self.tokenizer = tokenizer
        self.new_tokens = ['[PER]', '[ORG]', '[DAT]', '[LOC]', '[POH]', '[NOH]']
        self.tokenizer.add_tokens(self.new_tokens)
        self.LABEL_TO_ID = {'no_relation': 0, 'org:top_members/employees': 1, 'org:members': 2, 'org:product': 3, 'per:title': 4, 'org:alternate_names': 5, 'per:employee_of': 6, \
                'org:place_of_headquarters': 7, 'per:product': 8, 'org:number_of_employees/members': 9, 'per:children': 10, 'per:place_of_residence': 11, 'per:alternate_names': 12, \
                'per:other_family': 13, 'per:colleagues': 14, 'per:origin': 15, 'per:siblings': 16, 'per:spouse': 17, 'org:founded': 18, 'org:political/religious_affiliation': 19, \
                'org:member_of': 20, 'per:parents': 21, 'org:dissolved': 22, 'per:schools_attended': 23, 'per:date_of_death': 24, 'per:date_of_birth': 25, 'per:place_of_birth': 26, \
                'per:place_of_death': 27, 'org:founded_by': 28, 'per:religion': 29}
        
    def word_idx_extract(self, words, ns, ne):
        
        word_indices = []
        start_index = 0

        for word in words:
            end_index = start_index + len(word) - 1
            word_indices.append((start_index, end_index))
            start_index = end_index + 2

        word_idx=[]
        for i, (start, end) in enumerate(word_indices):
            if ns in range(start, end + 1) or ne in range(start, end + 1):
                word_idx.append(i)
                
        return word_idx[0] , word_idx[-1]


    def token_location(self, list1, list2):
        for i in range(len(list1) - len(list2) + 1):
            if list1[i:i + len(list2)] == list2:
                index = i
                return i, i+len(list2)-1

    def tokenize(self, sentence, subject_word, object_word, subj_type, obj_type, ss, se, os, oe):
        print(sentence,subject_word, object_word, subj_type, obj_type, ss, se, os, oe)
        words = sentence.split()

        sws, swe = self.word_idx_extract(words, ss,se)
        ows, owe = self.word_idx_extract(words, os,oe)

        subj_tokens= self.tokenizer.tokenize(subject_word)
        obj_tokens= self.tokenizer.tokenize(object_word)

        sents =[]
        subj_type , obj_type = f"[{subj_type}]", f"[{obj_type}]"
        subj_token_collect = []
        obj_token_collect = []

        for idx, word in enumerate(words):
            tokens = self.tokenizer.tokenize(word)
            if idx not in range(sws,swe+1) and idx not in range(ows,owe+1):
                sents.extend(tokens)

            else:
                if sws <= idx and idx <= swe:
                    subj_token_collect.extend(tokens)
                    if idx == swe:
                        ts, te = self.token_location(subj_token_collect, subj_tokens)
                        tokens = subj_token_collect[:ts] + ['@'] + ['*'] + [subj_type] + ['*'] + subj_tokens + ['@'] + subj_token_collect[te+1:]
                        new_ss = len(sents) + len(subj_token_collect[:ts])
                        sents.extend(tokens)

                if ows <= idx and idx <= owe:
                    obj_token_collect.extend(tokens)
                    if idx == owe:
                        ts, te = self.token_location(obj_token_collect, obj_tokens)
                        tokens = obj_token_collect[:ts] + ["#"] + ['^'] + [obj_type] + ['^'] + obj_tokens + ["#"] + obj_token_collect[te+1:]
                        new_os = len(sents) + len(obj_token_collect[:ts])
                        sents.extend(tokens)

        sents = sents[:self.args.model.max_seq_length - 2]
        input_ids = self.tokenizer.convert_tokens_to_ids(sents)
        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
        print(sents)
        return input_ids, new_ss + 1, new_os + 1

    def read(self, file_in):
        features = []
        with open(file_in, "r") as fh:
            data = pd.read_csv(fh)

        for _, d in tqdm(data.iterrows()):
            ss, se = int(d['subject_start_idx']), int(d['subject_end_idx'])
            os, oe = int(d['object_start_idx']), int(d['object_end_idx'])
            input_ids, new_ss, new_os = self.tokenize(d['sentence'],d['subject_word'],d['object_word'], d['subject_type'], d['object_type'], ss, se, os, oe)
            rel = self.LABEL_TO_ID[d['label']]

            feature = {
                'input_ids': input_ids,
                'labels': rel,
                'ss': new_ss,
                'os': new_os,
            }
            features.append(feature)

        return features
    


In [202]:
parser = argparse.ArgumentParser()
parser.add_argument("--config", "-c", type=str, default="1.2.0_config")

args, _ = parser.parse_known_args()
conf = OmegaConf.load(f"../code/config/{args.config}.yaml")

# load model and tokenizer
MODEL_NAME = conf.model.model_name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# load dataset
train_dataset = pd.read_csv("../dataset/train/train_final.csv")
train_label = label_to_num(train_dataset['label'].values)

# tokenizing dataset
tokenized_train = Processor(conf, tokenizer).read("../dataset/train/train_final.csv")

22it [00:00, 245.59it/s]

〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다. 비틀즈 조지 해리슨 ORG PER 24 26 13 18
['〈', 'So', '##me', '##th', '##ing', '〉', '는', '#', '^', '[PER]', '^', '조지', '해리', '##슨', '#', '##이', '쓰', '##고', '@', '*', '[ORG]', '*', '비틀즈', '@', '##가', '1969', '##년', '앨범', '《', 'Ab', '##be', '##y', 'Ro', '##ad', '》', '에', '담', '##은', '노래', '##다', '.']
호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으로 재탄생한다. 민주평화당 대안신당 ORG ORG 19 23 14 17
['호남', '##이', '기반', '##인', '바른', '##미', '##래', '##당', '·', '대안', '##신', '##당', '·', '@', '*', '[ORG]', '*', '민주', '##평', '##화', '##당', '@', '##이', '바른', '##미', '##래', '##당', '·', '#', '^', '[ORG]', '^', '대안', '##신', '##당', '#', '·', '@', '*', '[ORG]', '*', '민주', '##평', '##화', '##당', '@', '##이', '우여곡절', '끝', '##에', '합당', '##해', '민생', '##당', '(', '가칭', ')', '으로', '재', '##탄', '##생', '##한다', '.']
K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터 관중 유치 성과와 마케팅 성과를 인정받아 ‘풀 스타디움상’과 ‘플러스 스타디움상’을 수상했다. 광주FC 한국프로축구연맹 ORG ORG 21 24 34 41
['K', '##리그', '##2', '##에서', '성적', 




TypeError: cannot unpack non-iterable NoneType object

In [222]:
sentence='박흥식은 첫 부인과의 사이에 장녀 박병숙을 두었고, 두 번째 부인은 경희대학교 교수를 지낸 피아니스트 한인하이며, 두 사람 사이에서 태어난 딸 박봉숙은 이화여자대학교 교수를 지냈다.'
subject_word = '박흥식' 
object_word = '한인하' 
subj_type, obj_type, ss, se, os, oe = 'PER', 'PER', 0, 2, 57, 59
words=sentence.split()

In [227]:
tokenize(sentence, subject_word, object_word, subj_type, obj_type, ss, se, os, oe)

['박', '##흥', '##식', '##은'] ['박', '##흥', '##식']
['한인', '##하이', '##며', ','] ['한인', '##하']


TypeError: cannot unpack non-iterable NoneType object

In [226]:
    def word_idx_extract( words, ns, ne):
        
        word_indices = []
        start_index = 0

        for word in words:
            end_index = start_index + len(word) - 1
            word_indices.append((start_index, end_index))
            start_index = end_index + 2
        word_idx=[]
        for i, (start, end) in enumerate(word_indices):
            if ns in range(start, end + 1) or ne in range(start, end + 1):
                word_idx.append(i)
        return word_idx[0] , word_idx[-1]

In [224]:
0 in range(0,3)

True

In [225]:
    def word_idx_extract( words, ns, ne):
        
        word_indices = []
        start_index = 0

        for word in words:
            end_index = start_index + len(word) - 1
            word_indices.append((start_index, end_index))
            start_index = end_index + 2

        word_idx=[]
        for i, (start, end) in enumerate(word_indices):
            if ns in range(start, end + 1) or ne in range(start, end + 1):
                word_idx.append(i)
                
        return word_idx[0] , word_idx[-1]


    def token_location(list1, list2):
        print(list1, list2)
        for i in range(len(list1) - len(list2) + 1):
            if list1[i:i + len(list2)] == list2:
                index = i
                return i, i+len(list2)-1


In [209]:
    def tokenize(sentence, subject_word, object_word, subj_type, obj_type, ss, se, os, oe):
        
        words = sentence.split()

        sws, swe = word_idx_extract(words, ss,se)
        ows, owe = word_idx_extract(words, os,oe)

        subj_tokens= tokenizer.tokenize(subject_word)
        obj_tokens= tokenizer.tokenize(object_word)

        sents =[]
        subj_type , obj_type = f"[{subj_type}]", f"[{obj_type}]"
        subj_token_collect = []
        obj_token_collect = []

        for idx, word in enumerate(words):
            tokens = tokenizer.tokenize(word)
            if idx not in range(sws,swe+1) and idx not in range(ows,owe+1):
                sents.extend(tokens)

            else:
                if sws <= idx and idx <= swe:
                    subj_token_collect.extend(tokens)
                    if idx == swe:
                        ts, te = token_location(subj_token_collect, subj_tokens)
                        tokens = subj_token_collect[:ts] + ['@'] + ['*'] + [subj_type] + ['*'] + subj_tokens + ['@'] + subj_token_collect[te+1:]
                        new_ss = len(sents) + len(subj_token_collect[:ts])
                        sents.extend(tokens)

                if ows <= idx and idx <= owe:
                    obj_token_collect.extend(tokens)
                    if idx == owe:
                        ts, te = token_location(obj_token_collect, obj_tokens)
                        tokens = obj_token_collect[:ts] + ["#"] + ['^'] + [obj_type] + ['^'] + obj_tokens + ["#"] + obj_token_collect[te+1:]
                        new_os = len(sents) + len(obj_token_collect[:ts])
                        sents.extend(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(sents)
        input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)
        print(sents)
        
        return input_ids, new_ss + 1, new_os + 1

In [191]:
parser = argparse.ArgumentParser()
parser.add_argument("--config", "-c", type=str, default="1.2.0_config")

args, _ = parser.parse_known_args()
conf = OmegaConf.load(f"../code/config/{args.config}.yaml")

# load model and tokenizer
MODEL_NAME = conf.model.model_name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# load dataset
train_dataset = pd.read_csv("../dataset/train/train_final.csv")
train_label = label_to_num(train_dataset['label'].values)

In [192]:
example=next(train_dataset.iterrows())[1]

In [193]:
sentence = example['sentence']
subject_word,object_word = example['subject_word'],  example['object_word']
ss, se = example['subject_start_idx'], example['subject_end_idx']
os, oe = example['object_start_idx'], example['object_end_idx']
subject_type, oject_type = example['subject_type'], example['object_type']

In [194]:
tokenize(sentence, subject_word, object_word, subj_type, obj_type, ss, se, os, oe)

['〈', 'So', '##me', '##th', '##ing', '〉', '는', '#', '^', '[PER]', '^', '조지', '해리', '##슨', '#', '##이', '쓰', '##고', '@', '*', '[ORG]', '*', '비틀즈', '@', '##가', '1969', '##년', '앨범', '《', 'Ab', '##be', '##y', 'Ro', '##ad', '》', '에', '담', '##은', '노래', '##다', '.']


([2,
  168,
  30985,
  14451,
  7088,
  4586,
  169,
  793,
  7,
  65,
  1,
  65,
  8373,
  14113,
  2234,
  7,
  2052,
  1363,
  2088,
  36,
  14,
  1,
  14,
  29830,
  36,
  2116,
  14879,
  2440,
  6711,
  170,
  21406,
  26713,
  2076,
  25145,
  5749,
  171,
  1421,
  818,
  2073,
  4388,
  2062,
  18,
  3],
 19,
 8)

In [237]:
class Processor:
    def __init__(self, args, tokenizer):
        super().__init__()
        self.args = args
        self.tokenizer = tokenizer
        self.new_tokens = ['[PER]', '[ORG]', '[DAT]', '[LOC]', '[POH]', '[NOH]']
        self.tokenizer.add_tokens(self.new_tokens)
        self.LABEL_TO_ID = {'no_relation': 0, 'org:top_members/employees': 1, 'org:members': 2, 'org:product': 3, 'per:title': 4, 'org:alternate_names': 5, 'per:employee_of': 6, \
                'org:place_of_headquarters': 7, 'per:product': 8, 'org:number_of_employees/members': 9, 'per:children': 10, 'per:place_of_residence': 11, 'per:alternate_names': 12, \
                'per:other_family': 13, 'per:colleagues': 14, 'per:origin': 15, 'per:siblings': 16, 'per:spouse': 17, 'org:founded': 18, 'org:political/religious_affiliation': 19, \
                'org:member_of': 20, 'per:parents': 21, 'org:dissolved': 22, 'per:schools_attended': 23, 'per:date_of_death': 24, 'per:date_of_birth': 25, 'per:place_of_birth': 26, \
                'per:place_of_death': 27, 'org:founded_by': 28, 'per:religion': 29}
        
    def token_location(self, list1, list2):
        for idx in range(len(list1) - len(list2) + 1):
            if list1[idx:idx + len(list2)] == list2:
                index = idx
                return idx
            
    def tokenize(self, sentence, subj_type, obj_type, ss, se, os, oe):
        
        subj_type , obj_type = f"[{subj_type}]", f"[{obj_type}]"
        new_sentence=''
        if ss < os:
            new_sentence += sentence[ :ss]
            new_sentence += f"@*{subj_type}*{sentence[ss:se+1]}@"
            new_sentence += sentence[se+1:os]a
            new_sentence += f"#^{obj_type}^{sentence[os:oe+1]}#"
            new_sentence += sentence[oe+1:]
        else:
            new_sentence += sentence[ :os]
            new_sentence += f"#^{obj_type}^{sentence[os:oe+1]}#"
            new_sentence += sentence[oe+1:ss]
            new_sentence += f"@*{subj_type}*{sentence[ss:se+1]}@"
            new_sentence += sentence[se+1:]

        sents=self.tokenizer.tokenize(new_sentence)
        new_ss= self.token_location(sents,['@',"*"])
        new_os= self.token_location(sents,['#',"^"])

        sents = sents[:self.args.model.max_seq_length - 2]
        input_ids = self.tokenizer.convert_tokens_to_ids(sents)
        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
        return input_ids, new_ss + 1, new_os + 1

    def read(self, file_in):
        features = []
        with open(file_in, "r") as fh:
            data = pd.read_csv(fh)

        for _, d in tqdm(data.iterrows()):
            ss, se = int(d['subject_start_idx']), int(d['subject_end_idx'])
            os, oe = int(d['object_start_idx']), int(d['object_end_idx'])
            input_ids, new_ss, new_os = self.tokenize(d['sentence'], d['subject_type'], d['object_type'], ss, se, os, oe)
            rel = self.LABEL_TO_ID[d['label']]

            feature = {
                'input_ids': input_ids,
                'labels': rel,
                'ss': new_ss,
                'os': new_os,
            }
            features.append(feature)

        return features
    


In [238]:
parser = argparse.ArgumentParser()
parser.add_argument("--config", "-c", type=str, default="1.2.0_config")

args, _ = parser.parse_known_args()
conf = OmegaConf.load(f"../code/config/{args.config}.yaml")

# load model and tokenizer
MODEL_NAME = conf.model.model_name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# load dataset
train_dataset = pd.read_csv("../dataset/train/train_final.csv")
train_label = label_to_num(train_dataset['label'].values)

# tokenizing dataset
tokenized_train = Processor(conf, tokenizer).read("../dataset/train/train_final.csv")

32470it [00:27, 1178.71it/s]


In [243]:
tokenizer("")

{'input_ids': [2, 36, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [240]:
tokenized_train[0]

{'input_ids': [2,
  168,
  30985,
  14451,
  7088,
  4586,
  169,
  793,
  7,
  65,
  32000,
  65,
  8373,
  14113,
  2234,
  7,
  1504,
  1363,
  2088,
  36,
  14,
  32001,
  14,
  29830,
  36,
  543,
  14879,
  2440,
  6711,
  170,
  21406,
  26713,
  2076,
  25145,
  5749,
  171,
  1421,
  818,
  2073,
  4388,
  2062,
  18,
  3],
 'labels': 0,
 'ss': 19,
 'os': 8}

In [None]:
import pandas as pd
import ast

def add_typed_entity_marker_original(file_path):
    """기존각 엔티티 토큰을 스페셜 토큰에 추가하는 방식

    Args:
        file_path (str): csv파일경로

    Returns:
        list: 스페셜 토큰에 추가할 엔티티 토큰의 list
    """
    entity_tokens = set()
    df = pd.read_csv(file_path)

    result = []

    for idx, row in df.iterrows():
        sentence = row['sentence']
        subject_entity = ast.literal_eval(row['subject_entity'])
        object_entity = ast.literal_eval(row['object_entity'])

        new_sentence = ''

        curr_entity_tokens = [f'<S:{subject_entity["type"]}>',
                              f'</S:{subject_entity["type"]}>',
                              f'<O:{object_entity["type"]}>',
                              f'</O:{object_entity["type"]}>']
        entity_tokens.update(curr_entity_tokens)

        if subject_entity['start_idx'] < object_entity['start_idx']:
            new_sentence += sentence[ :subject_entity['start_idx'] ]
            new_sentence += curr_entity_tokens[0]
            new_sentence += sentence[ subject_entity['start_idx']:subject_entity['end_idx']+1 ]
            new_sentence += curr_entity_tokens[1]
            new_sentence += sentence[ subject_entity['end_idx']+1:object_entity['start_idx'] ]
            new_sentence += curr_entity_tokens[2]
            new_sentence += sentence[ object_entity['start_idx']:object_entity['end_idx']+1 ]
            new_sentence += curr_entity_tokens[3]
            new_sentence += sentence[ object_entity['end_idx']+1: ]
        else:
            new_sentence += sentence[ :object_entity['start_idx'] ]
            new_sentence += curr_entity_tokens[2]
            new_sentence += sentence[ object_entity['start_idx']:object_entity['end_idx']+1 ]
            new_sentence += curr_entity_tokens[3]
            new_sentence += sentence[ object_entity['end_idx']+1:subject_entity['start_idx'] ]
            new_sentence += curr_entity_tokens[0]
            new_sentence += sentence[ subject_entity['start_idx']:subject_entity['end_idx']+1 ]
            new_sentence += curr_entity_tokens[1]
            new_sentence += sentence[ subject_entity['end_idx']+1: ]

        result.append(new_sentence)
    
    df['sentence'] = result
    df.to_csv('typed_entity_marker_original_train.csv')

    return list(entity_tokens) 


def add_typed_entity_marker_punct(file_path):
    """각 엔티티 토큰은 @, *, &, ^로 고정. 다만 논문과 다르게 우리가 사용할 한글 토크나이저에
       1. 엔티티 타입이 없을 수 있어 [UNK]으로 되는것을 방지하고자 엔티티 타입 자체를 스페셜 토큰으로 추가
       2. wordpiece 토크나이저를 사용할 수 있으므로 object entity의 토큰을 논문의 '#'대신 '&'로 대체사용 
   
    Args:
        file_path (str): csv파일경로

    Returns:
        list: 스페셜 토큰에 추가할 엔티티 타입의 list
    """
    entity_tokens = set()
    df = pd.read_csv(file_path)

    result = []

    for idx, row in df.iterrows():
        sentence = row['sentence']
        subject_entity = ast.literal_eval(row['subject_entity'])
        object_entity = ast.literal_eval(row['object_entity'])

        new_sentence = ''

        curr_entity_tokens = [subject_entity["type"],
                              object_entity["type"]]
        entity_tokens.update(curr_entity_tokens)

        if subject_entity['start_idx'] < object_entity['start_idx']:
            new_sentence += sentence[ :subject_entity['start_idx'] ]
            new_sentence += f"@*{curr_entity_tokens[0]}*{sentence[subject_entity['start_idx']:subject_entity['end_idx']+1]}@"
            new_sentence += sentence[subject_entity['end_idx']+1:object_entity['start_idx']]
            new_sentence += f"&^{curr_entity_tokens[1]}^{sentence[object_entity['start_idx']:object_entity['end_idx']+1]}&"
            new_sentence += sentence[object_entity['end_idx']+1:]
        else:
            new_sentence += sentence[ :object_entity['start_idx'] ]
            new_sentence += f"&^{curr_entity_tokens[1]}^{sentence[object_entity['start_idx']:object_entity['end_idx']+1]}&"
            new_sentence += sentence[object_entity['end_idx']+1:subject_entity['start_idx']]
            new_sentence += f"@*{curr_entity_tokens[0]}*{sentence[subject_entity['start_idx']:subject_entity['end_idx']+1]}@"
            new_sentence += sentence[subject_entity['end_idx']+1:]
        result.append(new_sentence)
    
    df['sentence'] = result
    df.to_csv('typed_entity_marker_punct_train.csv')

    return list(entity_tokens) 

# add_typed_entity_marker_original("../dataset/train/train.csv")
# add_typed_entity_marker_punct("../dataset/train/train.csv") 

In [None]:
from transformers import AutoTokenizer, AutoConfig, AutoModel, Trainer, TrainingArguments
from omegaconf import OmegaConf
from dataset_utils import label_to_num
from datasets import RE_Dataset
from metrics import compute_metrics
from custom_model import Custom_Model
from custom_prepros import Processor
import numpy as np
import pandas as pd
import argparse
import random
import torch
import wandb
import os


def set_seed(seed:int = 42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument("--config", "-c", type=str, default="1.2.0_config")

  args, _ = parser.parse_known_args()
  conf = OmegaConf.load(f"../code/config/{args.config}.yaml")

  set_seed(42)

  wandb.login()
  wandb.init(project=conf.wandb.project_name)

  # load model and tokenizer
  MODEL_NAME = conf.model.model_name
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

  # load dataset
  train_dataset = pd.read_csv("../dataset/train/train_final.csv")
  train_label = label_to_num(train_dataset['label'].values)

  # tokenizing dataset
  tokenized_train = Processor(conf, tokenizer).read("../dataset/train/train_final.csv")

  # make dataset for pytorch.
  RE_train_dataset = RE_Dataset(tokenized_train, train_label)