In [4]:
import pickle as pickle
import os
import pandas as pd
import torch
from functools import partial

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

In [5]:
def preprocessing_dataset(dataset):
    """Flatten the 'train' and 'valid' splits of ``dataset`` into one DataFrame.

    Args:
        dataset: Mapping with 'train' and 'valid' entries. Each split must be
            iterable over example dicts (each carrying 'subject_entity' and
            'object_entity' dicts with 'word' and 'type' keys) and must support
            column access for 'guid', 'sentence', 'source', 'label' and 'text'.

    Returns:
        A DataFrame with the columns id, sentence, subject_entity,
        object_entity, subject_type, object_type, source, label and text.
        Rows of both splits are concatenated, ``id`` is converted with
        ``int``, and the result is sorted by ``id`` with a fresh 0..n-1 index.
    """

    def _split_to_frame(split):
        # Build the frame for a single split; shared by 'train' and 'valid'.
        subject_words, object_words = [], []
        subject_types, object_types = [], []
        for example in split:
            subject = example['subject_entity']
            obj = example['object_entity']
            subject_words.append(subject['word'])
            object_words.append(obj['word'])
            subject_types.append(subject['type'])
            object_types.append(obj['type'])

        # Key order fixes the column order of the resulting frame.
        return pd.DataFrame({'id': split['guid'], 'sentence': split['sentence'],
                             'subject_entity': subject_words, 'object_entity': object_words,
                             'subject_type': subject_types, 'object_type': object_types,
                             'source': split['source'],
                             'label': split['label'], 'text': split['text']})

    # Read every field from the `dataset` argument (not from a notebook
    # global) so the function works on a fresh kernel and on any input.
    out_dataset = pd.concat([_split_to_frame(dataset['train']),
                             _split_to_frame(dataset['valid'])])
    out_dataset['id'] = out_dataset['id'].map(int)
    return out_dataset.sort_values(['id']).reset_index(drop=True)

In [7]:
from typing import Dict, Tuple, List, Any


# create_example
def _mark_entity_spans(examples,
                      subject_start_marker: str, subject_end_marker: str,
                      object_start_marker: str, object_end_marker: str):
    """Wrap the subject and object entities of one example in marker strings.

    Args:
        examples: Mapping with a "sentence" string and "subject_entity" /
            "object_entity" mappings, each holding "start_idx" and "end_idx"
            character offsets into the sentence (``end_idx`` is inclusive).
        subject_start_marker: Text inserted right before the subject span.
        subject_end_marker: Text inserted right after the subject span.
        object_start_marker: Text inserted right before the object span.
        object_end_marker: Text inserted right after the object span.

    Returns:
        ``{"text": marked_sentence}`` where both spans are surrounded by
        their markers and the rest of the sentence is unchanged.

    Raises:
        ValueError: If the two spans are identical or share any character.
    """
    sentence = examples["sentence"]
    subject_entity = examples["subject_entity"]
    object_entity = examples["object_entity"]

    # Order the two spans by position so one code path handles both
    # "subject first" and "object first".
    spans = [
        ((subject_entity["start_idx"], subject_entity["end_idx"]),
         subject_start_marker, subject_end_marker),
        ((object_entity["start_idx"], object_entity["end_idx"]),
         object_start_marker, object_end_marker),
    ]
    (first_range, first_open, first_close), (second_range, second_open, second_close) = sorted(
        spans, key=lambda span: span[0]
    )

    # End indices are inclusive, so the spans collide as soon as the later
    # span starts at or before the last character of the earlier one.
    # Comparing the tuples alone would only catch identical ranges and let
    # partial/nested overlaps through, duplicating characters in the output.
    if first_range == second_range or second_range[0] <= first_range[1]:
        raise ValueError("Entity boundaries overlap.")

    segments = [
        sentence[: first_range[0]],
        first_open,
        sentence[first_range[0] : first_range[1] + 1],
        first_close,
        sentence[first_range[1] + 1 : second_range[0]],
        second_open,
        sentence[second_range[0] : second_range[1] + 1],
        second_close,
        sentence[second_range[1] + 1 :],
    ]
    return {"text": "".join(segments)}

In [9]:
# Marker strings placed around the subject and object spans.
markers = {
    "subject_start_marker": "<subj>",
    "subject_end_marker": "</subj>",
    "object_start_marker": "<obj>",
    "object_end_marker": "</obj>",
}
# Pre-bind the markers so the resulting callable only needs the example.
mark_entity_spans = partial(_mark_entity_spans, **markers)

In [10]:
# Load a pickled object from the working directory into dict_label_to_num
# (presumably a label -> number mapping, judging by the name — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load this pickle if it comes from a trusted source.
with open('dict_label_to_num.pkl', 'rb') as f:
    dict_label_to_num = pickle.load(f)

In [11]:
from datasets import load_dataset

# Load the "jinmang2/load_klue_re" dataset, pinning its loading script to "v1.0.1b".
# NOTE(review): newer `datasets` releases take `revision=` in place of
# `script_version=` — confirm against the installed version before upgrading.
data = load_dataset("jinmang2/load_klue_re", script_version="v1.0.1b")

Reusing dataset klue_re (/opt/ml/.cache/huggingface/datasets/klue_re/re/1.0.1/72db5b4f9111ca3106d23ecec67a53f750c70b70b32c5512f925baac3b39a0de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Run mark_entity_spans over the examples of `data`; the function returns a
# {"text": ...} dict per example. Despite its name, train_examples is the
# mapped version of the whole `data` object, not only the train split.
train_examples = data.map(mark_entity_spans)

  0%|          | 0/25976 [00:00<?, ?ex/s]

  0%|          | 0/6494 [00:00<?, ?ex/s]

  0%|          | 0/7765 [00:00<?, ?ex/s]

In [13]:
# Flatten the 'train' and 'valid' splits into one DataFrame sorted by id,
# then display it as the cell output.
pd_dataset = preprocessing_dataset(train_examples)
pd_dataset

Unnamed: 0,id,sentence,subject_entity,object_entity,subject_type,object_type,source,label,text
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,비틀즈,조지 해리슨,ORG,PER,wikipedia,0,〈Something〉는 <obj>조지 해리슨</obj>이 쓰고 <subj>비틀즈</...
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,민주평화당,대안신당,ORG,ORG,wikitree,0,호남이 기반인 바른미래당·<obj>대안신당</obj>·<subj>민주평화당</sub...
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,광주FC,한국프로축구연맹,ORG,ORG,wikitree,20,K리그2에서 성적 1위를 달리고 있는 <subj>광주FC</subj>는 지난 26일...
3,3,균일가 생활용품점 (주)아성다이소(대표 박정부)는 코로나19 바이러스로 어려움을 겪...,아성다이소,박정부,ORG,PER,wikitree,1,균일가 생활용품점 (주)<subj>아성다이소</subj>(대표 <obj>박정부</o...
4,4,1967년 프로 야구 드래프트 1순위로 요미우리 자이언츠에게 입단하면서 등번호는 8...,요미우리 자이언츠,1967,ORG,DAT,wikipedia,0,<obj>1967</obj>년 프로 야구 드래프트 1순위로 <subj>요미우리 자이...
...,...,...,...,...,...,...,...,...,...
32465,32465,한국당은 7일 오전 9시부터 오후 5시까지 진행된 원내대표 및 정책위의장 후보자 등...,유기준,부산 서구·동구,PER,LOC,wikitree,6,한국당은 7일 오전 9시부터 오후 5시까지 진행된 원내대표 및 정책위의장 후보자 등...
32466,32466,"법포는 다시 최시형, 서병학, 손병희 직계인 북접과 다시 서장옥, 전봉준, 김개남을...",최시형,손병희,PER,PER,wikipedia,14,"법포는 다시 <subj>최시형</subj>, 서병학, <obj>손병희</obj> 직..."
32467,32467,완도군(군수 신우철)이 국토교통부에서 실시한 '2019 교통문화지수 실태조사'에서 ...,완도군,신우철,ORG,PER,wikitree,1,<subj>완도군</subj>(군수 <obj>신우철</obj>)이 국토교통부에서 실...
32468,32468,"중앙일보, JTBC 회장을 지낸 이후 중앙홀딩스 회장, 재단법인 한반도평화만들기 이...",JTBC,중앙홀딩스,ORG,ORG,wikipedia,0,"중앙일보, <subj>JTBC</subj> 회장을 지낸 이후 <obj>중앙홀딩스</..."
