# 1. Data preprocessing

In [1]:
import pandas as pd
# Load the relation-extraction training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv('/opt/ml/dataset/train/train.csv')
# Preview the first three rows to check the columns used below
# (sentence, subject_entity, object_entity, label).
df.head(3)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,"{'word': '비틀즈', 'start_idx': 24, 'end_idx': 26...","{'word': '조지 해리슨', 'start_idx': 13, 'end_idx':...",no_relation,wikipedia
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,"{'word': '민주평화당', 'start_idx': 19, 'end_idx': ...","{'word': '대안신당', 'start_idx': 14, 'end_idx': 1...",no_relation,wikitree
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,"{'word': '광주FC', 'start_idx': 21, 'end_idx': 2...","{'word': '한국프로축구연맹', 'start_idx': 34, 'end_idx...",org:member_of,wikitree


In [2]:
import ast

# Build the (prompt -> sentence) training pairs for the generator.
#   input_texts  : "<subject> - <object> [<relation>]" prompts
#   output_texts : the original sentence the model should learn to produce
#   rels         : set of distinct relation labels seen in the data
#   rel_cnt      : relation label -> number of occurrences (insertion-ordered dict)
input_texts = []
output_texts = []
rels = set()
rel_cnt = {}
for sent, s, o, rel in zip(df.sentence, df.subject_entity, df.object_entity, df.label):
    # The entity columns hold Python dict literals stored as strings.
    # ast.literal_eval parses them without executing arbitrary code, unlike eval().
    sbj = ast.literal_eval(s)['word']
    obj = ast.literal_eval(o)['word']
    rels.add(rel)
    rel_cnt[rel] = rel_cnt.get(rel, 0) + 1
    input_texts.append(f'{sbj} - {obj} [{rel}]')
    output_texts.append(sent)

In [3]:
# Sanity check: number of prompts, number of target sentences,
# and number of distinct relation labels (one line each, in that order).
for collection in (input_texts, output_texts, rels):
    print(len(collection))

32470
32470
30


In [4]:
print(input_texts[0])
print(output_texts[0])

비틀즈 - 조지 해리슨 [no_relation]
〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.


# 2. Model setting

In [5]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

In [6]:
tokenizer = PreTrainedTokenizerFast.from_pretrained('hyunwoongko/kobart')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [7]:
tokenizer.tokenize(input_texts[0])

['▁비틀',
 '즈',
 '▁-',
 '▁조지',
 '▁해',
 '리',
 '슨',
 '▁[',
 'n',
 'o',
 '_',
 're',
 'la',
 'tion',
 ']']

In [8]:
# One bracketed special token per relation label, e.g. "[no_relation]".
# `rels` is a set of str, whose iteration order depends on the interpreter's
# hash seed. Sorting makes the token -> id assignment identical on every
# kernel restart, so a saved checkpoint stays compatible with a rebuilt tokenizer.
additional_tokens = [f'[{rel}]' for rel in sorted(rels)]

데이터의 relation들을 special token으로 추가합니다

In [9]:
added_token_num = tokenizer.add_special_tokens({'additional_special_tokens':additional_tokens})

In [10]:
tokenizer.tokenize(input_texts[0])

['▁비틀', '즈', '▁-', '▁조지', '▁해', '리', '슨', '▁', '[no_relation]']

In [11]:
print(added_token_num)

30


기존 vocab에 임의로 token들을 추가했으니, 모델사이즈도 반드시 늘려줘야겠죠?

In [12]:
model = BartForConditionalGeneration.from_pretrained('hyunwoongko/kobart')

In [13]:
# Grow the embedding matrix to cover the newly added relation tokens.
# len(tokenizer) is the full vocabulary size including added tokens, so it stays
# correct even if the tokenizer already carried added tokens beforehand
# (vocab_size + added_token_num would not).
model.resize_token_embeddings(len(tokenizer))

Embedding(30030, 768)

In [14]:
model.to('cuda')

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30030, 768)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30030, 768)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): BartEncoderLayer(
       

# 3. Dataset

In [15]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, BatchEncoding

In [16]:
# Collator that batches the per-example encodings produced by the dataset below.
# The dataset returns unpadded examples (padding=False there), so padding is
# delegated to the collator here.
# NOTE(review): passing `model` presumably lets the collator derive decoder
# inputs from the labels; confirm against the installed transformers version.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model = model,
    padding = True
)

In [17]:
from torch.utils.data import Dataset

class DummySeq2SeqDataset(Dataset):
    """Map-style dataset of (prompt, sentence) pairs encoded for seq2seq training.

    Args:
        tokenizer: tokenizer used to encode both sides of each pair.
        src_texts: source prompts. Defaults to the notebook-level ``input_texts``
            so existing ``DummySeq2SeqDataset(tokenizer)`` calls keep working.
        tgt_texts: target sentences, aligned with ``src_texts``. Defaults to the
            notebook-level ``output_texts``.
        max_length: maximum encoded length passed to the tokenizer. Raw text is
            first cut to ``max_length - 4`` tokens (presumably headroom for the
            manually added ``<s>``/``</s>`` markers; the 4 is kept from the
            original code).

    Raises:
        ValueError: if the source and target collections differ in length.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerFast, src_texts=None, tgt_texts=None, max_length: int = 512):
        self.tokenizer = tokenizer
        # Fall back to the notebook globals only when no data is passed explicitly.
        self.src_texts = input_texts if src_texts is None else src_texts
        self.tgt_texts = output_texts if tgt_texts is None else tgt_texts
        self.max_length = max_length
        if len(self.src_texts) != len(self.tgt_texts):
            raise ValueError(
                f'src_texts and tgt_texts must have the same length, '
                f'got {len(self.src_texts)} and {len(self.tgt_texts)}'
            )

    def __len__(self):
        """Return the number of (prompt, sentence) pairs."""
        return len(self.src_texts)

    def _truncate_and_wrap(self, text: str) -> str:
        """Strip ``text``, cut it to ``max_length - 4`` tokens and wrap it in <s>...</s>."""
        token_ids = self.tokenizer.encode(text.strip())[0:self.max_length - 4]
        return '<s>' + self.tokenizer.decode(token_ids) + '</s>'

    def __getitem__(self, index: int) -> BatchEncoding:
        """Return the unpadded encoding of pair ``index`` (padding is left to the collator)."""
        src_text = self._truncate_and_wrap(self.src_texts[index])
        tgt_text = self._truncate_and_wrap(self.tgt_texts[index])

        # NOTE(review): prepare_seq2seq_batch is deprecated (see the warning in the
        # training log). It is kept so behaviour stays identical on the installed
        # transformers version; migrate when upgrading.
        return self.tokenizer.prepare_seq2seq_batch(
            src_text, tgt_text, truncation=True, max_length=self.max_length, return_token_type_ids=False, padding=False
        )

In [18]:
train_dataset = DummySeq2SeqDataset(tokenizer)

# 4. Train

In [19]:
# Training configuration: 4 epochs, batch size 8 per device, a checkpoint every
# 2000 steps under ./re_generator and a loss log line every 200 steps.
# NOTE(review): Seq2SeqTrainer is normally paired with Seq2SeqTrainingArguments;
# plain TrainingArguments worked for this training-only run.
# NOTE(review): save_total_limit presumably caps the number of checkpoints kept
# on disk; confirm against the installed transformers version.
train_args = TrainingArguments(
    output_dir='./re_generator',
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    save_steps=2000,
    save_total_limit=4,
    logging_steps=200,
)

In [20]:
# Wire the resized model, training arguments, collator and dataset into a trainer.
# No eval dataset is supplied, so this run only reports training loss.
# NOTE(review): the tokenizer is not handed to the trainer, so it is presumably
# not written into the checkpoints; save it separately if they must be reloadable.
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
trainer.train()

***** Running training *****
  Num examples = 32470
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16236
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdayday[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Step,Training Loss
200,3.8812
400,3.5917
600,3.4909
800,3.4746
1000,3.4237
1200,3.4151
1400,3.399
1600,3.3655
1800,3.3643
2000,3.3308


Saving model checkpoint to ./re_generator/checkpoint-2000
Configuration saved in ./re_generator/checkpoint-2000/config.json
Model weights saved in ./re_generator/checkpoint-2000/pytorch_model.bin
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Saving model checkpoint to ./re_generator/checkpoint-4000
Configuration saved in ./re_generator/checkpoint-4000/config.json
Model weights saved in ./re_generator/ch

TrainOutput(global_step=16236, training_loss=2.6974834095348013, metrics={'train_runtime': 2125.7314, 'train_samples_per_second': 61.099, 'train_steps_per_second': 7.638, 'total_flos': 7738997824696320.0, 'train_loss': 2.6974834095348013, 'epoch': 4.0})

In [22]:
# Persist the final fine-tuned weights and config.
trainer.save_model('re_generator_final_model')
# The tokenizer was extended with the relation special tokens and the model's
# embedding matrix was resized to match. Save the tokenizer next to the weights
# so the pair can be reloaded together. The trailing ';' suppresses the
# returned file list in the cell output.
tokenizer.save_pretrained('re_generator_final_model');

Saving model checkpoint to re_generator_final_model
Configuration saved in re_generator_final_model/config.json
Model weights saved in re_generator_final_model/pytorch_model.bin


In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
! cp -r re_generator_final_model drive/MyDrive/

# 5. Test

In [23]:
for rel in rels:
    print(rel)

org:political/religious_affiliation
per:place_of_death
org:founded_by
per:siblings
org:product
org:members
per:place_of_birth
org:place_of_headquarters
org:alternate_names
per:children
per:origin
per:title
org:top_members/employees
per:spouse
per:parents
per:alternate_names
org:dissolved
no_relation
per:colleagues
org:founded
org:member_of
per:schools_attended
per:other_family
per:employee_of
org:number_of_employees/members
per:product
per:date_of_death
per:place_of_residence
per:date_of_birth
per:religion


In [37]:
import torch

In [25]:
# Encode the literal '<unk>' string. Note the result is a *list* of ids
# (printed as [5] below), not a bare int; get_sent wraps it in another list
# when passing it as bad_words_ids.
unk_token_id = tokenizer.encode('<unk>')
print(unk_token_id)

[5]


In [61]:
def get_sent(sbj, obj, rel, num_return_sequences=5):
    """Sample sentences that mention ``sbj`` and ``obj`` under relation ``rel``.

    Builds the same "<s>subject - object [relation]</s>" prompt format used for
    training and samples candidates from the fine-tuned model.

    Args:
        sbj: subject entity text.
        obj: object entity text.
        rel: relation label; must be one of the labels in the notebook-level ``rels``.
        num_return_sequences: number of sentences to sample (default 5, the
            value that was previously hard-coded).

    Returns:
        A list of ``num_return_sequences`` decoded sentences, or the string
        ``"not defined relation"`` when ``rel`` is unknown. The string return is
        kept for backward compatibility; callers that index the result should
        check for it.
    """
    if rel not in rels:
        return "not defined relation"
    text = '<s>' + sbj + ' - ' + obj + ' [' + rel + ']</s>'
    # Put the input on whatever device the model lives on instead of assuming CUDA.
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0).to(model.device)

    # trainer.train() leaves the model in training mode; switch to eval so that
    # dropout is not applied while generating.
    model.eval()
    outputs = model.generate(
        input_ids,
        do_sample=True,
        eos_token_id=1, # </s>
        max_length=130,
        top_p=0.8,
        top_k=10,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        early_stopping=True,
        # unk_token_id is already a list of ids, so this is a list of banned sequences.
        bad_words_ids=[unk_token_id]
    )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


In [28]:
# Draw 10 rows for a qualitative check. The fixed random_state makes the same
# rows come back after a kernel restart, so the printed examples are reproducible.
test_df = df.sample(10, random_state=42)

In [29]:
test_df

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
2893,2893,"문하시중 이자연의 조카이고 인예왕후, 인경현비, 인절현비와는 사촌간이다.","{'word': '인절현비', 'start_idx': 27, 'end_idx': 3...","{'word': '이자연', 'start_idx': 5, 'end_idx': 7, ...",per:other_family,wikipedia
25844,25844,"1년 후, 바이에른은 전설적인 오스트리아인 감독 에른스트 하펠이 이끄는 함부르크 S...","{'word': '에른스트 하펠', 'start_idx': 27, 'end_idx'...","{'word': '오스트리아', 'start_idx': 17, 'end_idx': ...",per:origin,wikipedia
17759,17759,그러다가 1949년 6월 6일 이승만 대통령과 신성모 내무부 장관의 사주를 받은 친...,"{'word': '신성모', 'start_idx': 26, 'end_idx': 28...","{'word': '이승만', 'start_idx': 17, 'end_idx': 19...",per:colleagues,wikipedia
10515,10515,무라트 1세의 아들 바예지드 1세는 1396년에 불가리아 북부에서 니코폴리스 전투를...,"{'word': '바예지드 1세', 'start_idx': 11, 'end_idx'...","{'word': '무라트 1세', 'start_idx': 0, 'end_idx': ...",per:parents,wikipedia
12680,12680,실제로 남한 쪽에서는 남조선로동당·근로인민당 등 좌익계열 정당뿐 아니라 한국독립당·...,"{'word': '박헌영', 'start_idx': 79, 'end_idx': 81...","{'word': '조선로동당', 'start_idx': 13, 'end_idx': ...",no_relation,wikipedia
16401,16401,1983년 한국 프로야구(KBO)의 삼미에 입단하면서 장명부라는 등록명을 사용했다.,"{'word': '한국 프로야구', 'start_idx': 6, 'end_idx':...","{'word': 'KBO', 'start_idx': 14, 'end_idx': 16...",org:alternate_names,wikipedia
13741,13741,최남곤 유안타증권 연구원은 “지난달 30일 자로 과학기술정보통신부가 SK브로드밴드의...,"{'word': 'SK브로드밴드', 'start_idx': 38, 'end_idx'...","{'word': 'SK텔레콤', 'start_idx': 84, 'end_idx': ...",org:member_of,wikitree
19987,19987,화순군(군수 구충곤)이 신종 코로나바이러스 감염증(코로나19) 장기화로 어려움을 겪...,"{'word': '화순군', 'start_idx': 0, 'end_idx': 2, ...","{'word': '구충곤', 'start_idx': 7, 'end_idx': 9, ...",org:top_members/employees,wikitree
5601,5601,시애틀 레인의 감독을 역임하고 있던 로라 하비는 킴 리틀의 시애틀 레인 이적을 주선...,"{'word': '킴 리틀', 'start_idx': 27, 'end_idx': 3...","{'word': '아스널', 'start_idx': 82, 'end_idx': 84...",per:employee_of,wikipedia
2678,2678,아일랜드 공화국은 1919년 1월 그레이트브리튼으로부터의 독립을 선언한 혁명정체다.,"{'word': '아일랜드', 'start_idx': 0, 'end_idx': 3,...","{'word': '공화국', 'start_idx': 5, 'end_idx': 7, ...",org:political/religious_affiliation,wikipedia


In [34]:
from tqdm import tqdm

In [35]:
import ast

# Generate candidate sentences for every sampled row.
# generated[k] is the get_sent result for the k-th row of test_df.
generated = []
for sbj, obj, label in tqdm(
    zip(test_df.subject_entity, test_df.object_entity, test_df.label),
    total=len(test_df),  # zip has no length; pass it so tqdm can show real progress
):
    # Entity cells are dict literals stored as strings; parse them without eval().
    generated.append(get_sent(ast.literal_eval(sbj)['word'], ast.literal_eval(obj)['word'], label))

10it [00:09,  1.07it/s]


In [57]:
import ast

# Show the first sampled row next to the sentences generated for it.
origin = test_df.iloc[0]
# Entity cells are dict literals stored as strings; parse them without eval().
print('original sbj   : ', ast.literal_eval(origin['subject_entity'])['word'])
print('original obj   : ',ast.literal_eval(origin['object_entity'])['word'])
print('original label : ',origin['label'])
print('original sentence: ', origin['sentence'])
print('-'*100)
print('generated sentence')
# Iterate over what was actually generated instead of assuming exactly 5 items.
for i, sentence in enumerate(generated[0]):
    print(f'[{i+1}] : {sentence}')

original sbj   :  인절현비
original obj   :  이자연
original label :  per:other_family
original sentence:  문하시중 이자연의 조카이고 인예왕후, 인경현비, 인절현비와는 사촌간이다.
----------------------------------------------------------------------------------------------------
generated sentence
[1] : 인승군 이세공 연경의 부인 이자연과 혼인하여 승문원, 예문관, 예서사화를 비롯하여 의경태후의 부인 인절현비, 영조계의 시조인 명현승, 영조의 손자 이자연, 예덕후의 며느리이며 후궁이다.
[2] : 이자 인조의 아들 이자연은 인절현비를 시해하고 사형이 언도된 공(公)이었는데 이것은 요절(堯節)이라 한다.
[3] : 신빈 이씨의 아내 이자연은 인절현비의 조카이자 효종이다.
[4] : 인 후일 경연왕후가 한학을 할 때 이자연의 후궁이 되어서, 영조와 인촌현비, 인효현비는 그의 아들이고 인절현비의 동생이고, 이자연이 그의 누이이다.
[5] : 이자왕(王)을 폐하고 인숙현비의 측근인 이자연을 왕으로 추대하여 문신을 차출하고, 관직과 직위를 나누어 인현왕후를 폐위시키고, 문신 이이를 성종의 종으로 삼았다.


In [60]:
import ast

# Print every sampled row next to its generated sentences.
# Bounds come from `generated` itself rather than hard-coded 10 rows x 5 sentences.
for idx, candidates in enumerate(generated):
    origin = test_df.iloc[idx]
    print('-'*100)
    print('-'*100)
    # Entity cells are dict literals stored as strings; parse them without eval().
    print('original sbj   : ', ast.literal_eval(origin['subject_entity'])['word'])
    print('original obj   : ',ast.literal_eval(origin['object_entity'])['word'])
    print('original label : ',origin['label'])
    print('original sentence: ', origin['sentence'])
    print('-'*100)
    print('generated sentence')
    for i, sentence in enumerate(candidates):
        print(f'[{i+1}] : {sentence}')

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
original sbj   :  인절현비
original obj   :  이자연
original label :  per:other_family
original sentence:  문하시중 이자연의 조카이고 인예왕후, 인경현비, 인절현비와는 사촌간이다.
----------------------------------------------------------------------------------------------------
generated sentence
[1] : 인승군 이세공 연경의 부인 이자연과 혼인하여 승문원, 예문관, 예서사화를 비롯하여 의경태후의 부인 인절현비, 영조계의 시조인 명현승, 영조의 손자 이자연, 예덕후의 며느리이며 후궁이다.
[2] : 이자 인조의 아들 이자연은 인절현비를 시해하고 사형이 언도된 공(公)이었는데 이것은 요절(堯節)이라 한다.
[3] : 신빈 이씨의 아내 이자연은 인절현비의 조카이자 효종이다.
[4] : 인 후일 경연왕후가 한학을 할 때 이자연의 후궁이 되어서, 영조와 인촌현비, 인효현비는 그의 아들이고 인절현비의 동생이고, 이자연이 그의 누이이다.
[5] : 이자왕(王)을 폐하고 인숙현비의 측근인 이자연을 왕으로 추대하여 문신을 차출하고, 관직과 직위를 나누어 인현왕후를 폐위시키고, 문신 이이를 성종의 종으로 삼았다.
----------------------------------------------------------------------------------------------------
------------------------------------

In [62]:
import ast

# Regenerate candidates for the sampled rows, replacing the previous `generated`
# list (presumably re-run after get_sent was redefined; the execution counts
# suggest so).
generated = []
for sbj, obj, label in tqdm(
    zip(test_df.subject_entity, test_df.object_entity, test_df.label),
    total=len(test_df),  # zip has no length; pass it so tqdm can show real progress
):
    # Entity cells are dict literals stored as strings; parse them without eval().
    generated.append(get_sent(ast.literal_eval(sbj)['word'], ast.literal_eval(obj)['word'], label))

10it [00:06,  1.48it/s]


In [63]:
import ast

# Print every sampled row next to its regenerated sentences.
# Bounds come from `generated` itself rather than hard-coded 10 rows x 5 sentences.
for idx, candidates in enumerate(generated):
    origin = test_df.iloc[idx]
    print('-'*100)
    print('-'*100)
    # Entity cells are dict literals stored as strings; parse them without eval().
    print('original sbj   : ', ast.literal_eval(origin['subject_entity'])['word'])
    print('original obj   : ',ast.literal_eval(origin['object_entity'])['word'])
    print('original label : ',origin['label'])
    print('original sentence: ', origin['sentence'])
    print('-'*100)
    print('generated sentence')
    for i, sentence in enumerate(candidates):
        print(f'[{i+1}] : {sentence}')

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
original sbj   :  인절현비
original obj   :  이자연
original label :  per:other_family
original sentence:  문하시중 이자연의 조카이고 인예왕후, 인경현비, 인절현비와는 사촌간이다.
----------------------------------------------------------------------------------------------------
generated sentence
[1] : 또한연과 인현왕후, 인절현비(仁顯顯vet)의 아들이며 인조의 동생이며 태조의 친정아버지이다.
[2] : 한편연(李容淵, ~)은 조선 세종 때 인조(세종)의 장남이자 인현의 직계비로, 인조의 차남이며, 인절현비의 동생인 이자연의 차녀이다.
[3] : 14시 27분, 이자연은 인절현비의 둘째 아들로 태어났으며, 인조는 그의 형 이자연의 딸이자 사도세자의 외숙부였다.
[4] : 인연왕후는 인절현비(李慈賢妃), 의종후(義宗后) 등과 혼인하였는데, 이 때 이자연은 인연이 없었다.
[5] : 14연왕비 이자연의 사돈이며 인현왕후, 인혜왕후의 사촌동생이다.
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
original sbj   :  에른스

In [27]:
get_sent('홍길동', '1010년', 'per:date_of_birth')

['홍길동(洪吉東, 1010년 ~ 1060년 11월 3일)은 1094년에서 고려말의 장군이자 문신으로 서기 1070년까지 종1품 대장부 겸 의병장인 겸 장군이었다.',
 '홍길동(洪吉東, 1010년 ~ 1048년)은 고려의 정치가이다.',
 '홍길동(洪吉東(柳吉李, 1010년 ~ 1043년 음력 12월)은 고려의 제4대 황제이며 호는 길동정(吉重大)이다.',
 '홍길동(洪吉東, 1010년 ~ 1073년)은 고려 말 조선 초기의 대표적인 장수로, 고려의 문신이자 정치가이다.',
 '홍길동(洪吉東, 1010년 - 1098년)은 고려 말기의 문신이다.']

In [None]:
get_sent('홍길동', '1853년', 'per:date_of_birth')

['홍길동(鴻吉東, 1853년 ~ 1903년)은 대한제국 군주인 윤치호, 순종의 서자였다.',
 '홍길동(洪吉東, 1853년 ~ 1912년 12월 18일)은 일제 강점기의 조선인, 독립운동가 겸 정치가이자 정치가이며, 대한제국 성립의 시초인 대한독립운동과 대한불교조계승상도 겸하고 있다.',
 '홍길동(洪吉東, 1881년 ~ 1853년)은 조선 말기의 관료 및 정치인이다.',
 '홍길동(洪吉東, 1853년 ~ 1926년 9월 29일)은 조선 말기의 무신이다.',
 '홍길동(洪吉東, 1853년 ~ 1938년 12월 14일)은 한국의 독립운동가다.']

In [None]:
get_sent('홍길동', '2010년', 'per:date_of_birth')

['2010년 홍길동이 2010년 이후 무려 32년 만에 대표팀 감독으로 재영입 되었다.',
 "한편, 2010년에 신인 발굴을 위해 발탁된 '홍길동'은 신인 지명된 첫 대상 선수가 됐다.",
 '2010년에는 홍길동이 다시 MC로 돌아왔다.',
 '2010년 홍길동이 데뷔했는데 당시엔 홍길이동의 동생이었기 때문에 홍길을동이라는 가명을 사용했는데 이 가명은 그가 데뷔했을 당시 이미 쓰던 가명이 있었다고 한다.',
 '2010년부터 다시 황정민이 홍길동상을 수상했다.']

In [None]:
get_sent('이명박', '4대강', 'per:product')

['이에 이명박 정부 출범 이후 첫 사업으로 추진된 4대강 사업은 그동안 이명박 정부에서 거의 전무한 사업으로 국토부 예산 305조원 투입과 각종 특혜, 특례 등 대규모 예산이 투입되는 등 사실상 무용론이 대두되고 있었다.',
 '이명박 정부는 특히 정부의 무리한 4대강사업 예산 편성·편성에 비판하며 4대강 사업의 국고투척으로 인한 국민의 혈세를 전액 국가금으로 환수하겠다는 대국민 약속을 지키겠다고 밝혔다.',
 '4대강 “4대강”과 관련해 노무현 전 대통령은 “노무현의 삶과 철학을 이어받은 사람이고 이명박의 정신을 이어받아 왔다”며 이명박 대통령의 사과를 촉구했다.',
 '이명박과 여당은 이를 묵살하고 4대강 사업, 부산지역 주요 관광지 및 문화 유산 등과 같은 굵직한 굵기 굵기의 굵기를 새단장해 새로운 관광 명소로 다시 탈환하려 했다.',
 '한편 이후 이명박의 측근이었던 김성주가 2007년 6월 14일에 친이명박 계열이 아닌 4대강추진본부 수석부지사를 지내면서 4대강 사업의 본질을 파악하고 4대강 사업을 직접 지시하는 등 정치적인 행보를 이어나갔다.']

In [None]:
get_sent('업스테이지', 'KLUE', 'org:product')

['업스테이지(KLUE)는 영국 런던의 중심가 Barnestone Center for Live Arts와 필라델피아에서 출발하였으며 영국과 독일의 합작으로 설립되는 첫 번째 영국 록 밴드이다.',
 '업퍼티즈(KLUE, KOREA Chair, 이하 KNN)는 그룹 업스테이지의 여섯 번째 싱글 음반이다.',
 '업스테이지(KLUE)는 미국 미식축구 리그(MeFeat Calls)와 미국 아이스 댄스 그룹이다.',
 '《스테이지 KLUE》의 음악이론가 클라이언트 츠미르 클라인(The Claimi Clements)이 작업한 노래로 《Rock in the University》에서 사용된 업스테이지는 《Lucked Perfector》에 수록된 〈Technology〉의 OST로 "크레멘"을 리메이크하고 〈Lamenet〉로 인기를 모은 바 있다.',
 '스테이지(KLUE) 연습생은 현재 미국에 거주하고 있으며, 영국에서는 2017년 가을부터 활동을 시작했다.']

In [41]:
print(rel_cnt)

{'no_relation': 9534, 'org:member_of': 1866, 'org:top_members/employees': 4284, 'org:alternate_names': 1320, 'per:date_of_birth': 1130, 'org:place_of_headquarters': 1195, 'per:employee_of': 3573, 'per:origin': 1234, 'per:title': 2103, 'org:members': 420, 'per:schools_attended': 82, 'per:colleagues': 534, 'per:alternate_names': 1001, 'per:spouse': 795, 'org:founded_by': 155, 'org:political/religious_affiliation': 98, 'per:children': 304, 'org:founded': 450, 'org:number_of_employees/members': 48, 'per:place_of_birth': 166, 'org:dissolved': 66, 'per:parents': 520, 'per:religion': 96, 'per:date_of_death': 418, 'per:place_of_residence': 193, 'per:other_family': 190, 'org:product': 380, 'per:siblings': 136, 'per:product': 139, 'per:place_of_death': 40}


In [42]:
# (relation, count) pairs ordered from most to least frequent.
# Negating the count sorts descending; ties keep their original dict order.
sorted_rel_cnt = sorted(rel_cnt.items(), key=lambda pair: -pair[1])

In [43]:
sorted_rel_cnt

[('no_relation', 9534),
 ('org:top_members/employees', 4284),
 ('per:employee_of', 3573),
 ('per:title', 2103),
 ('org:member_of', 1866),
 ('org:alternate_names', 1320),
 ('per:origin', 1234),
 ('org:place_of_headquarters', 1195),
 ('per:date_of_birth', 1130),
 ('per:alternate_names', 1001),
 ('per:spouse', 795),
 ('per:colleagues', 534),
 ('per:parents', 520),
 ('org:founded', 450),
 ('org:members', 420),
 ('per:date_of_death', 418),
 ('org:product', 380),
 ('per:children', 304),
 ('per:place_of_residence', 193),
 ('per:other_family', 190),
 ('per:place_of_birth', 166),
 ('org:founded_by', 155),
 ('per:product', 139),
 ('per:siblings', 136),
 ('org:political/religious_affiliation', 98),
 ('per:religion', 96),
 ('per:schools_attended', 82),
 ('org:dissolved', 66),
 ('org:number_of_employees/members', 48),
 ('per:place_of_death', 40)]

In [None]:
get_sent('홍길동', '활빈당', 'per:place_of_death')

['그러나종친당은 홍길동 뿐만 아니라 활빈당 계열 인사들을 공천했는데, 홍건동 일파를 꺾지 못한다면 종친들에게 독이 될 것을 우려하여 홍익인 세력에게 염증을 느껴 참패를 한 사람들을 추방시키겠다는 것이었다.',
 '활길동이 이 당에 참여하였고, 그들과 함께 당과 함께 활빈당의 당수 홍길동을 두었고, 그는 당주 홍익표에 따라 홍의락에게 홍을표를 줄 것을 지시하였다.',
 '그러나 뒤이어 청나라에서 활빈당과 홍길동의 반탁활동을 적극 반대하였으며, 청 또한 이에 격파하고 나섰다.',
 '홍9년(1874년 1월 1일 ~ 1878년 1월 26일)은 홍길동의 부인으로, 활빈당(활빈당의 전신) 대표였다.',
 '김 활빈당은 홍길동이 재건사업(재건공사, 재건설사업)과 연계하여 홍씨 종의 활력 넘치는 사업들을 추진하였을 뿐만 아니라, 조선에 귀순한 독립운동가 겸 작가였던 송병서의 딸 홍건주의 아내 홍문교의 친정 오빠 홍경례와 숙모 홍재명사의 친형 홍정주 이씨의 친오빠 홍주상홍 홍판서를 두고, 홍사익의 조카인 홍용준 홍완중 등이 홍익당의 후신 홍찬주 이사장으로서 홍의주의 후신이 되어 조선시대를 관장하였다.']