# An Improved Baseline for Sentence-level Relation Extraction 구현


In [1]:
import ast
import pickle
import re
from collections import Counter
from itertools import combinations, permutations

import hanja
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from hanja import hangul

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three raw splits (train / dev / test) in one pass; the order of the
# paths matches the order of the names on the left-hand side.
train_data, dev_data, test_data = map(
    pd.read_csv,
    (
        '/opt/ml/dataset/train/train.csv',
        '/opt/ml/dataset/train/dev.csv',
        '/opt/ml/dataset/test/test_data.csv',
    ),
)

In [3]:
# Lookup from the entity 'type' code stored in each entity dict to the English
# word that is embedded in the typed marker / mask strings built below.
markers = dict([
    ('PER', 'person'),
    ('ORG', 'organization'),
    ('LOC', 'location'),
    ('POH', 'occupation'),
    ('NOH', 'number'),
    ('DAT', 'date'),
])

## Entity mask
ex: [SUBJ-PERSON] was born in [OBJ-CITY]\
eval:\
micro f1 - 83.26461570990591\
auprc - 76.52407859243088\
inference:\
micro f1 - 40.2905\
auprc - 38.7714

In [12]:
# Entity mask (train): replace the annotated subject / object mentions with
# [SUBJ-<TYPE>] / [OBJ-<TYPE>] placeholders.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
train_data = pd.read_csv('/opt/ml/dataset/train/train.csv')

masked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    train_data['sentence'], train_data['subject_entity'], train_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook, e.g.
    # '한빛은행' at 6..9) -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '[SUBJ-' + markers[sub_ent['type']].upper() + ']'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '[OBJ-' + markers[obj_ent['type']].upper() + ']'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is replaced.
    for start, end, tag in sorted(spans, reverse=True):
        sentence = sentence[:start] + tag + sentence[end + 1:]
    masked_sentences.append(sentence)
train_data['sentence'] = masked_sentences


In [13]:
# Preview the first five rewritten training rows.
train_data.head(n=5)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,24121,인천도시공사(사장 박인서)는 에너지절감 및 친환경 경영을 실천하고자 관용차량 일부를...,"{'word': '한국GM', 'start_idx': 47, 'end_idx': 5...","{'word': '자동차', 'start_idx': 61, 'end_idx': 63...",org:product,wikitree
1,10907,"[SUBJ-ORGANIZATION]은 7,000명의 보병부대와 [OBJ-NUMBER...","{'word': '스위스군', 'start_idx': 0, 'end_idx': 3,...","{'word': '2,000명', 'start_idx': 20, 'end_idx':...",no_relation,wikipedia
2,20809,2002년 [SUBJ-ORGANIZATION]에서 [OBJ-ORGANIZATION]...,"{'word': '한빛은행', 'start_idx': 6, 'end_idx': 9,...","{'word': '우리은행', 'start_idx': 13, 'end_idx': 1...",org:alternate_names,wikipedia
3,18935,지난 10일 국회가 확정한 내년도 [SUBJ-ORGANIZATION] 정부예산을 보...,"{'word': '여수시', 'start_idx': 66, 'end_idx': 68...","{'word': '1282억 원', 'start_idx': 52, 'end_idx'...",no_relation,wikitree
4,24880,가라스마 역은 일본 [OBJ-LOCATION] 교토시 시모교구에 있는 [SUBJ-O...,"{'word': '한큐 전철', 'start_idx': 28, 'end_idx': ...","{'word': '교토부', 'start_idx': 11, 'end_idx': 13...",org:place_of_headquarters,wikipedia


In [17]:
# Persist the entity-mask training split. The row index is written as the
# first, unnamed column because to_csv keeps its default index=True.
train_data.to_csv(path_or_buf='../dataset/train/train_entity_mask.csv')

In [14]:
# Entity mask (dev): replace the annotated subject / object mentions with
# [SUBJ-<TYPE>] / [OBJ-<TYPE>] placeholders.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
dev_data = pd.read_csv('/opt/ml/dataset/train/dev.csv')

masked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    dev_data['sentence'], dev_data['subject_entity'], dev_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '[SUBJ-' + markers[sub_ent['type']].upper() + ']'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '[OBJ-' + markers[obj_ent['type']].upper() + ']'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is replaced.
    for start, end, tag in sorted(spans, reverse=True):
        sentence = sentence[:start] + tag + sentence[end + 1:]
    masked_sentences.append(sentence)
dev_data['sentence'] = masked_sentences

In [15]:
# Preview the first five rewritten dev rows.
dev_data.head(n=5)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,17193,건설기술용역업자 사업수행능력(PQ) 세부평가기준 개정은 지난해 말 [SUBJ-ORG...,"{'word': '국토교통부', 'start_idx': 37, 'end_idx': ...","{'word': '국토부', 'start_idx': 60, 'end_idx': 62...",org:alternate_names,wikitree
1,10580,그는 “특히 [OBJ-ORGANIZATION]은 다시 영업을 재개한 [SUBJ-OR...,"{'word': '래미안', 'start_idx': 24, 'end_idx': 26...","{'word': '삼성물산', 'start_idx': 7, 'end_idx': 10...",no_relation,wikitree
2,7266,또 다른 경쟁사인 [OBJ-LOCATION] 글로벌 가전업체 [SUBJ-ORGANI...,"{'word': '일렉트로룩스', 'start_idx': 23, 'end_idx':...","{'word': '스웨덴', 'start_idx': 10, 'end_idx': 12...",org:place_of_headquarters,wikitree
3,8388,"[OBJ-DATE]년 2월, 황진성은 [SUBJ-ORGANIZATION]와 3년 재...","{'word': '포항 스틸러스', 'start_idx': 15, 'end_idx'...","{'word': '2008', 'start_idx': 0, 'end_idx': 3,...",no_relation,wikipedia
4,15116,[SUBJ-ORGANIZATION] 동구(청장 임택)가 '2020 기본복지 가이드라...,"{'word': '광주시', 'start_idx': 0, 'end_idx': 2, ...","{'word': '13개', 'start_idx': 39, 'end_idx': 41...",org:number_of_employees/members,wikitree


In [18]:
# Persist the entity-mask dev split. The row index is written as the first,
# unnamed column because to_csv keeps its default index=True.
dev_data.to_csv(path_or_buf='../dataset/train/dev_entity_mask.csv')

## Entity marker
ex: [E1] Bill [/E1] was born in [E2] Seattle [/E2]\
eval:\
micro f1 - 82.55309926311227\
auprc - 75.95485253894314\
inference:\
micro f1 - 54.9873\
auprc - 53.3891

In [30]:
# Entity marker (train): wrap the subject mention in [E1] ... [/E1] and the
# object mention in [E2] ... [/E2].
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
train_data = pd.read_csv('/opt/ml/dataset/train/train.csv')

# NOTE(review): the markdown example shows a space before the closing tag
# ('[E1] Bill [/E1]'); the marker strings here are kept exactly as originally
# written (no space before '[/E1]' / '[/E2]').
marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    train_data['sentence'], train_data['subject_entity'], train_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '[E1] ', '[/E1]'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '[E2] ', '[/E2]'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid and
    # a marker inserted for one entity can never be matched by the other.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
train_data['sentence'] = marked_sentences

In [31]:
# Persist the entity-marker training split. The row index is written as the
# first, unnamed column because to_csv keeps its default index=True.
train_data.to_csv(path_or_buf='../dataset/train/train_entity_marker.csv')

In [32]:
# Entity marker (dev): wrap the subject mention in [E1] ... [/E1] and the
# object mention in [E2] ... [/E2].
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
dev_data = pd.read_csv('/opt/ml/dataset/train/dev.csv')

# NOTE(review): the markdown example shows a space before the closing tag
# ('[E1] Bill [/E1]'); the marker strings here are kept exactly as originally
# written (no space before '[/E1]' / '[/E2]').
marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    dev_data['sentence'], dev_data['subject_entity'], dev_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '[E1] ', '[/E1]'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '[E2] ', '[/E2]'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid and
    # a marker inserted for one entity can never be matched by the other.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
dev_data['sentence'] = marked_sentences

In [33]:
# Persist the entity-marker dev split. The row index is written as the first,
# unnamed column because to_csv keeps its default index=True.
dev_data.to_csv(path_or_buf='../dataset/train/dev_entity_marker.csv')

## Entity marker (punct)
ex: @ Bill @ was born in # Seattle #.\
eval:\
micro f1 - 82.97755883962779\
auprc - 75.15339575561447\
inference:\
micro f1 - 57.8795\
auprc - 58.1863

In [35]:
# Entity marker, punctuation variant (train): wrap the subject mention in
# '@ ... @' and the object mention in '# ... #'.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
train_data = pd.read_csv('/opt/ml/dataset/train/train.csv')

marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    train_data['sentence'], train_data['subject_entity'], train_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '@ ', ' @'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '# ', ' #'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
train_data['sentence'] = marked_sentences

In [36]:
# Persist the punctuation-marker training split. The row index is written as
# the first, unnamed column because to_csv keeps its default index=True.
train_data.to_csv(path_or_buf='../dataset/train/train_entity_marker_punc.csv')

In [37]:
# Entity marker, punctuation variant (dev): wrap the subject mention in
# '@ ... @' and the object mention in '# ... #'.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
dev_data = pd.read_csv('/opt/ml/dataset/train/dev.csv')

marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    dev_data['sentence'], dev_data['subject_entity'], dev_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '@ ', ' @'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '# ', ' #'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
dev_data['sentence'] = marked_sentences

In [38]:
# Persist the punctuation-marker dev split. The row index is written as the
# first, unnamed column because to_csv keeps its default index=True.
dev_data.to_csv(path_or_buf='../dataset/train/dev_entity_marker_punc.csv')

## Typed entity marker
ex: <S:PERSON> Bill </S:PERSON> was born in <O:CITY> Seattle </O:CITY>.\
eval:\
micro f1 - 82.86281429201587\
auprc - 73.9305362530267\
inference:\
micro f1 - 56.4742\
auprc - 55.0037

In [40]:
# Typed entity marker (train): wrap the subject mention in <S:type>...</S:type>
# and the object mention in <O:type>...</O:type>, where `type` comes from the
# `markers` lookup.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
train_data = pd.read_csv('/opt/ml/dataset/train/train.csv')

marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    train_data['sentence'], train_data['subject_entity'], train_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    sub_type = markers[sub_ent['type']]
    obj_type = markers[obj_ent['type']]
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '<S:' + sub_type + '>', '</S:' + sub_type + '>'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '<O:' + obj_type + '>', '</O:' + obj_type + '>'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
train_data['sentence'] = marked_sentences

In [41]:
# Preview the first five rewritten training rows.
train_data.head(n=5)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,24121,인천도시공사(사장 박인서)는 에너지절감 및 친환경 경영을 실천하고자 관용차량 일부를...,"{'word': '한국GM', 'start_idx': 47, 'end_idx': 5...","{'word': '자동차', 'start_idx': 61, 'end_idx': 63...",org:product,wikitree
1,10907,"<S:organization>스위스군</S:organization>은 7,000명의...","{'word': '스위스군', 'start_idx': 0, 'end_idx': 3,...","{'word': '2,000명', 'start_idx': 20, 'end_idx':...",no_relation,wikipedia
2,20809,2002년 <S:organization>한빛은행</S:organization>에서 ...,"{'word': '한빛은행', 'start_idx': 6, 'end_idx': 9,...","{'word': '우리은행', 'start_idx': 13, 'end_idx': 1...",org:alternate_names,wikipedia
3,18935,지난 10일 국회가 확정한 내년도 <S:organization>여수시</S:orga...,"{'word': '여수시', 'start_idx': 66, 'end_idx': 68...","{'word': '1282억 원', 'start_idx': 52, 'end_idx'...",no_relation,wikitree
4,24880,가라스마 역은 일본 <O:location>교토부</O:location> 교토시 시모...,"{'word': '한큐 전철', 'start_idx': 28, 'end_idx': ...","{'word': '교토부', 'start_idx': 11, 'end_idx': 13...",org:place_of_headquarters,wikipedia


In [42]:
# Persist the typed-entity-marker training split. The row index is written as
# the first, unnamed column because to_csv keeps its default index=True.
train_data.to_csv(path_or_buf='../dataset/train/train_typed_entity_marker.csv')

In [43]:
# Typed entity marker (dev): wrap the subject mention in <S:type>...</S:type>
# and the object mention in <O:type>...</O:type>, where `type` comes from the
# `markers` lookup.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
dev_data = pd.read_csv('/opt/ml/dataset/train/dev.csv')

marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    dev_data['sentence'], dev_data['subject_entity'], dev_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    sub_type = markers[sub_ent['type']]
    obj_type = markers[obj_ent['type']]
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '<S:' + sub_type + '>', '</S:' + sub_type + '>'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '<O:' + obj_type + '>', '</O:' + obj_type + '>'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
dev_data['sentence'] = marked_sentences

In [44]:
# Persist the typed-entity-marker dev split. The row index is written as the
# first, unnamed column because to_csv keeps its default index=True.
dev_data.to_csv(path_or_buf='../dataset/train/dev_typed_entity_marker.csv')

## Typed entity marker (punct)
ex: @ * person * Bill @ was born in # ∧ city ∧ Seattle #\
eval:\
micro f1 - 80.77239112571898\
auprc - 67.82235926263863\
inference:\
micro f1 - 58.1295\
auprc - 51.2219

In [4]:
# Typed entity marker, punctuation variant (train): prefix the subject mention
# with '@ * type*' and close it with '@'; prefix the object mention with
# '# ^ type^' and close it with '#'.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
train_data = pd.read_csv('/opt/ml/dataset/train/train.csv')

# NOTE(review): the markdown example is spaced as '@ * person * Bill @' and
# uses '∧' for the object; the marker strings here are kept exactly as
# originally written (tighter spacing, ASCII '^').
marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    train_data['sentence'], train_data['subject_entity'], train_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '@ * ' + markers[sub_ent['type']] + '*', '@'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '# ^ ' + markers[obj_ent['type']] + '^', '#'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
train_data['sentence'] = marked_sentences


In [5]:
# Preview the first five rewritten training rows.
train_data.head(n=5)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,24121,인천도시공사(사장 박인서)는 에너지절감 및 친환경 경영을 실천하고자 관용차량 일부를...,"{'word': '한국GM', 'start_idx': 47, 'end_idx': 5...","{'word': '자동차', 'start_idx': 61, 'end_idx': 63...",org:product,wikitree
1,10907,"@ * organization*스위스군@은 7,000명의 보병부대와 # ^ numb...","{'word': '스위스군', 'start_idx': 0, 'end_idx': 3,...","{'word': '2,000명', 'start_idx': 20, 'end_idx':...",no_relation,wikipedia
2,20809,2002년 @ * organization*한빛은행@에서 # ^ organizatio...,"{'word': '한빛은행', 'start_idx': 6, 'end_idx': 9,...","{'word': '우리은행', 'start_idx': 13, 'end_idx': 1...",org:alternate_names,wikipedia
3,18935,지난 10일 국회가 확정한 내년도 @ * organization*여수시@ 정부예산을...,"{'word': '여수시', 'start_idx': 66, 'end_idx': 68...","{'word': '1282억 원', 'start_idx': 52, 'end_idx'...",no_relation,wikitree
4,24880,가라스마 역은 일본 # ^ location^교토부# 교토시 시모교구에 있는 @ * ...,"{'word': '한큐 전철', 'start_idx': 28, 'end_idx': ...","{'word': '교토부', 'start_idx': 11, 'end_idx': 13...",org:place_of_headquarters,wikipedia


In [6]:
# Persist the typed punctuation-marker training split. The row index is written
# as the first, unnamed column because to_csv keeps its default index=True.
# NOTE(review): this file is named 'train_marker.csv' while the dev split is
# written to 'dev_markers.csv' -- confirm which name downstream code expects.
train_data.to_csv(path_or_buf='../dataset/train/train_marker.csv')

In [7]:
# Typed entity marker, punctuation variant (test): prefix the subject mention
# with '@ * type*' and close it with '@'; prefix the object mention with
# '# ^ type^' and close it with '#'.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
# NOTE(review): no cell visible in this notebook writes test_data back to disk
# -- confirm where the marked test split is consumed.
test_data = pd.read_csv('/opt/ml/dataset/test/test_data.csv')

marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    test_data['sentence'], test_data['subject_entity'], test_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '@ * ' + markers[sub_ent['type']] + '*', '@'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '# ^ ' + markers[obj_ent['type']] + '^', '#'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
test_data['sentence'] = marked_sentences

In [8]:
# Typed entity marker, punctuation variant (dev): prefix the subject mention
# with '@ * type*' and close it with '@'; prefix the object mention with
# '# ^ type^' and close it with '#'.
# The raw split is re-read first so this cell is idempotent and never runs on
# sentences that were already rewritten by another marker cell.
dev_data = pd.read_csv('/opt/ml/dataset/train/dev.csv')

marked_sentences = []
for sentence, sub_raw, obj_raw in zip(
    dev_data['sentence'], dev_data['subject_entity'], dev_data['object_entity']
):
    # literal_eval parses the stored dict literal without executing code.
    sub_ent = ast.literal_eval(sub_raw)
    obj_ent = ast.literal_eval(obj_raw)
    # start_idx / end_idx are used as inclusive character offsets into the
    # sentence (consistent with the rows displayed in this notebook)
    # -- TODO confirm this holds for the whole dataset.
    spans = [
        (sub_ent['start_idx'], sub_ent['end_idx'], '@ * ' + markers[sub_ent['type']] + '*', '@'),
        (obj_ent['start_idx'], obj_ent['end_idx'], '# ^ ' + markers[obj_ent['type']] + '^', '#'),
    ]
    # Splice right-to-left so the offsets of the left-hand span stay valid, and
    # only the annotated mention (not every occurrence of the word) is wrapped.
    for start, end, opener, closer in sorted(spans, reverse=True):
        sentence = sentence[:start] + opener + sentence[start:end + 1] + closer + sentence[end + 1:]
    marked_sentences.append(sentence)
dev_data['sentence'] = marked_sentences

In [9]:
# Preview the first five rewritten test rows.
test_data.head(n=5)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,0,지난 15일 @ * organization*MBC@ '# ^ organization...,"{'word': 'MBC', 'start_idx': 7, 'end_idx': 9, ...","{'word': '탐사기획 스트레이트', 'start_idx': 12, 'end_i...",100,wikitree
1,1,사랑스러운 ‘@ * person*프린세스 프링@’의 이름은 봄의 # ^ occupa...,"{'word': '프린세스 프링', 'start_idx': 7, 'end_idx':...","{'word': '공주', 'start_idx': 84, 'end_idx': 85,...",100,wikipedia
2,2,"한편, 본인(이근안)을 모델로 한 MBC 특집드라마 가 # ^ date^1995년#...","{'word': '경찰', 'start_idx': 121, 'end_idx': 12...","{'word': '1995년', 'start_idx': 31, 'end_idx': ...",100,wikipedia
3,3,# ^ person^정창손#은 김질과 같이 대궐로 달려가 고변하며 '신은 실로 모르...,"{'word': '세조', 'start_idx': 78, 'end_idx': 79,...","{'word': '정창손', 'start_idx': 0, 'end_idx': 2, ...",100,wikipedia
4,4,당시 @ * organization*민주당@ 이진련 시의원은 # ^ person^권...,"{'word': '민주당', 'start_idx': 3, 'end_idx': 5, ...","{'word': '권영진', 'start_idx': 16, 'end_idx': 18...",100,wikitree


In [10]:
# Persist the typed punctuation-marker dev split. The row index is written as
# the first, unnamed column because to_csv keeps its default index=True.
# NOTE(review): this file is named 'dev_markers.csv' while the train split is
# written to 'train_marker.csv' -- confirm which name downstream code expects.
dev_data.to_csv(path_or_buf='../dataset/train/dev_markers.csv')