In [1]:
import os
import sys
import re
import json
from datasets import (load_dataset, 
    load_from_disk,
    Dataset,
    DatasetDict,
    Value,
    Features
)

In [2]:
# Put the parent directory on the import path so that project modules kept there can be
# imported (presumably where `preprocessor.py`, imported further down, lives — confirm).
sys.path.append('../')

In [3]:
import torch
import random
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt

In [4]:
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

## Tokenizer

In [5]:
# Load the fast tokenizer that ships with the 'gogamza/kobart-base-v2' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('gogamza/kobart-base-v2', use_fast=True)



In [6]:
# Report the size of the loaded tokenizer as given by len().
print('Size of tokenizer : %d' %len(tokenizer))

Size of tokenizer : 30000


## Dataset

In [7]:
# Credentials must never be committed with a notebook: the Hugging Face access token is
# read from the HF_TOKEN environment variable instead of being written inline.
# Any token that was previously hardcoded in this cell should be treated as leaked and revoked.
# When HF_TOKEN is unset the value is None, which is load_dataset's default for this argument.
hf_auth_token = os.environ.get('HF_TOKEN')

paper_dataset = load_dataset('metamong1/summarization_paper',
    use_auth_token=hf_auth_token)

law_dataset = load_dataset('metamong1/summarization_law',
    use_auth_token=hf_auth_token)

magazine_dataset = load_dataset('metamong1/summarization_magazine',
    use_auth_token=hf_auth_token)

news_dataset = load_dataset('metamong1/summarization_news',
    use_auth_token=hf_auth_token)

Reusing dataset paper_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___paper_summarization/Paper Summarization/1.4.0/24bb09528ebb04fdc6aafb6e110202e52fbb818c0f204839bc833d8ce1e86a5f)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset law_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset magazine_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___magazine_summarization/Magizine Summarization/1.0.0/506cb41eb0b96b084eafa5dd5fe3b51ff0d1061256700adf1aa92d3b19762c36)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset news_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___news_summarization/News Summarization/1.0.0/ae25c3215dc878e979d01f1157dbfb014c0a6985fc959ae45eaf10847db75600)


  0%|          | 0/2 [00:00<?, ?it/s]

## Documents

In [8]:
# Pull the 'text' field of every training example, one list per corpus.
news_docs, law_docs, magazine_docs, paper_docs = (
    [example['text'] for example in corpus['train']]
    for corpus in (news_dataset, law_dataset, magazine_dataset, paper_dataset)
)

In [9]:
total_docs = news_docs + law_docs + magazine_docs + paper_docs
# Shuffle with a dedicated, seeded generator so that a fresh "Restart & Run All" yields the
# same document order. The order matters downstream: document indices are stored, and ties
# between equally frequent characters are broken by first appearance before truncation.
# A private Random instance leaves the global `random` state untouched.
random.Random(42).shuffle(total_docs)

## Preprocessing

In [10]:
from preprocessor import DocsPreprocessor

In [11]:
# Matches a parenthesised span such as "(abc)": an opening parenthesis, one or more
# characters that are not ')', and the closing parenthesis.
bracket_comp = re.compile(r"\([^)]+\)")
data_preprocessor = DocsPreprocessor()

def preprocess(doc) :
    """Clean a single document before it is scanned for unknown tokens.

    Steps: replace parenthesised spans with a space, run the project's
    `DocsPreprocessor.doc_preprocess`, then collapse every whitespace run
    into a single space.

    Args:
        doc: raw document text (str).

    Returns:
        The cleaned document text (str).
    """
    doc = bracket_comp.sub(' ', doc)

    # Strings are immutable, so calling doc_preprocess without using its return value
    # (as this cell originally did) leaves `doc` unchanged.
    # Assumes doc_preprocess returns the cleaned string — TODO confirm against preprocessor.py.
    # The isinstance guard keeps the previous behaviour if it returns anything else.
    cleaned = data_preprocessor.doc_preprocess(doc)
    if isinstance(cleaned, str) :
        doc = cleaned

    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', doc)

In [12]:
# Apply preprocess() to every document, with a progress bar.
# Note: this rebinds `total_docs`, so re-running the cell preprocesses the texts again.
total_docs = list(map(preprocess, tqdm(total_docs)))

  0%|          | 0/390689 [00:00<?, ?it/s]

In [13]:
# Spot-check one preprocessed document (index 11 of the shuffled list).
total_docs[11]

'본 연구에서는 붉은장목수수 MeOH 추출물과 분획물들의 다양한 생리활성을 비교하기 위하여항산화 , 항당뇨 , 항암 활성을 관찰 하였다. 항산화활성의 경우 EtOAc fraction이 높은 총 페놀과 플라보노이드 함량을 나타내었으며, DPPH assay와 reducing power 결과 역시 EtOAc fraction과 MeOH fraction이 positive control로 사용한 α-tocopherol보다 우수한 활성을 나타냈다. α-Glucosidase, α-amlyase 저해능 평가 결과 D.W. fraction이 가장 높은 활성을 보였으며 n-BuOH fraction 과 MeOH extract도 활성을 나타내었다. 암세포인 AGS,HT29, HCT116세포주에 대한 세포독성 연구인 MTT assay 에서는 EtOAc, n-BuOH, D.W. fraction을 처리한 처리구에서 독성을 보였으며 이중 n-BuOH fraction이 모든세포주에서 농도 의존적으로 세포독성을 가지는 것을 확인할 수 있었다. 연구결과 붉은장목수수의 추출물 EtOAc fraction의 항산화활성, D.W. fraction의 항당뇨활성, n-BuOH fraction의 암세포에 대한 독성과 관련된 화합물의분리 및 구조규명에 관한 연구가 필요 할 것으로 사료되어지며, normal cell에 대한 MTT assay를 진행하여 항암활성에 관한 연구를 진행하여 붉은장목수수 추출물을 이용한 다양한 건강기능식품과 의약품 개발을 통해 건강 증진과질병으로부터 보호 받을 수 있을 것으로 기대한다.'

## Extract Unk Tokens

In [14]:
# Map document index -> text for every document whose encoding contains the UNK token id.
target_docs = {
    doc_idx: text
    for doc_idx, text in enumerate(tqdm(total_docs))
    if tokenizer.unk_token_id in tokenizer.encode(text)
}

  0%|          | 0/390689 [00:00<?, ?it/s]

In [15]:
# Number of documents whose encoding contained at least one UNK token id.
print('Size of Documents which has UNK Tokens : %d' %len(target_docs))

Size of Documents which has UNK Tokens : 231


In [16]:
# Indices (positions in total_docs) of the documents that contain UNK tokens.
unk_ids = [*target_docs]

In [18]:
unk_chars = []
# Looked up once; it does not change while scanning.
unk_token_id = tokenizer.unk_token_id

# Only the document text is needed here, so iterate over the dict values.
for doc in tqdm(target_docs.values()) :
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    doc = re.sub(r'\s+', '', doc)
    # Keep every single character that, looked up as a token on its own, maps to UNK.
    # Duplicates are kept on purpose: occurrences are counted in a later cell.
    unk_chars.extend(
        ch for ch in doc
        if tokenizer.convert_tokens_to_ids(ch) == unk_token_id
    )

  0%|          | 0/231 [00:00<?, ?it/s]

In [19]:
# Occurrence count per unknown character, stored as a plain dict.
unk_char_counter = dict(collections.Counter(unk_chars))

In [39]:
# Turn the counts into a list of (char, count) pairs, most frequent first.
# dict(...) accepts both the original mapping and an already-sorted list of pairs, so
# re-running this cell no longer fails with "'list' object has no attribute 'items'".
# sorted() is stable, so characters with equal counts keep their first-seen order.
unk_char_counter = sorted(dict(unk_char_counter).items(), key=lambda item : item[1], reverse=True)

In [56]:
# Despite its name, this pattern matches one character that is OUTSIDE all of:
#   U+0000-U+007F (ASCII), U+E000-U+F8FF (private use area), U+AC00-U+D7AF (Hangul syllables).
# Characters in those ranges are therefore excluded from the candidates.
private_range = re.compile('[^\u0000-\u007f\ue000-\uf8ff\uac00-\ud7af]')

# `unk_char_counter` is iterated as (char, count) pairs, so the result keeps its ordering.
# `count >= 1` is the minimum-frequency threshold; at 1 it keeps every counted character.
target_chars = [
    char
    for char, count in unk_char_counter
    if private_range.match(char) is not None and count >= 1
]

In [62]:
# Keep at most the first 100 candidates; optimize() below fills a block of 100 token slots
# and applies the same cut-off itself.
target_chars = target_chars[:100]

## Optimize Tokenizer

In [64]:
# Write the base tokenizer files to './tmp'.
# NOTE(review): optimize() is later called on './kobart', not on './tmp' — confirm which
# directory is the intended working copy.
tokenizer.save_pretrained('./tmp')

('./tmp/tokenizer_config.json',
 './tmp/special_tokens_map.json',
 './tmp/tokenizer.json')

In [77]:
# Show the tokens at ids 7 and 106, the first and last slot of the block that
# optimize() overwrites (unused_start=7, unused_size=100).
print('Index : 7 \t Token : %s' %tokenizer.convert_ids_to_tokens(7))
print('Index : 106 \t Token : %s' %tokenizer.convert_ids_to_tokens(106))

Index : 7 	 Token : <unused0>
Index : 106 	 Token : <unused99>


In [84]:
def load_tokenizer(path) :
    """Read a JSON file (e.g. a saved `tokenizer.json`) and return the parsed object.

    Args:
        path: path of the JSON file to read.

    Returns:
        The decoded JSON content (a dict for `tokenizer.json`).
    """
    # Decode explicitly as UTF-8: the vocabulary holds non-ASCII tokens, and the platform's
    # default encoding is not guaranteed to be UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def write_json(tokenizer_data, path) :
    """Serialize `tokenizer_data` as JSON to `path`, overwriting any existing file.

    Args:
        tokenizer_data: JSON-serializable object (the edited tokenizer definition).
        path: destination file path.
    """
    # Write UTF-8 explicitly and keep non-ASCII tokens as real characters rather than
    # \uXXXX escapes, so the file stays readable and does not depend on the locale.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer_data, f, ensure_ascii=False)

In [87]:
# optimizing tokenizer
def optimize(dir_path, vocab_list, unused_start=7, unused_size=100) :
    """Overwrite a block of token slots in a saved tokenizer with new tokens.

    The `tokenizer.json` inside `dir_path` is edited in place: for each token in
    `vocab_list`, the slot `unused_start + k` is replaced both in the
    `added_tokens` list and in the model vocabulary. The tokenizer is then
    reloaded from `dir_path`.

    Args:
        dir_path: directory containing `tokenizer.json`.
        vocab_list: new tokens, in slot order. Only the first `unused_size`
            entries are used; shorter lists fill only the leading slots.
        unused_start: id of the first slot to overwrite (default 7).
        unused_size: number of available slots (default 100).

    Returns:
        The tokenizer loaded from `dir_path` after the rewrite.

    Raises:
        AssertionError: if `dir_path` does not contain `tokenizer.json`.
    """
    file_path = os.path.join(dir_path, 'tokenizer.json')
    assert os.path.isfile(file_path), 'tokenizer.json not found in %s' % dir_path

    # There are only `unused_size` slots, so surplus candidates are dropped.
    vocab_list = list(vocab_list[:unused_size])

    tokenizer_data = load_tokenizer(file_path)

    # part1 added_tokens
    # Indexed by list position, as before — assumes position == token id in this file
    # (TODO confirm for tokenizers other than the one used in this notebook).
    added_tokens = tokenizer_data['added_tokens']

    # part2 model
    # Invert the vocabulary so slots can be addressed by id; dict order is preserved.
    idx2vocab = {idx:vocab for vocab, idx in tokenizer_data['model']['vocab'].items()}

    # Iterating over vocab_list itself bounds the loop by the tokens available. Previously
    # the list was indexed before the bounds check, raising IndexError for short lists.
    for offset, new_token in enumerate(vocab_list) :
        token_id = unused_start + offset
        added_tokens[token_id]['content'] = new_token
        idx2vocab[token_id] = new_token

    tokenizer_data['model']['vocab'] = {vocab:idx for idx, vocab in idx2vocab.items()}

    write_json(tokenizer_data, file_path)
    return AutoTokenizer.from_pretrained(dir_path, use_fast=True)

In [101]:
# Rewrite the token slots of the tokenizer stored in './kobart' with the selected characters
# and load the result.
# NOTE(review): this notebook saves the base tokenizer to './tmp', not './kobart'. optimize()
# asserts that './kobart/tokenizer.json' exists, so that directory must be prepared
# beforehand — confirm the intended path.
optimized_tokenizer = optimize('./kobart', target_chars)

In [90]:
# Spot-check the first, a middle and the last rewritten slot.
# The label is built from the id itself; the middle line used to print "Index : 7" for id 50.
for token_id in (7, 50, 106) :
    print('Index : %d \t Token : %s' %(token_id, optimized_tokenizer.convert_ids_to_tokens(token_id)))

Index : 7 	 Token : ㄹ
Index : 7 	 Token : 匿
Index : 106 	 Token : ㅠ


In [103]:
# Load the tokenizer stored under 'metamong1/kobart' so it can be compared with the
# locally optimized one.
# NOTE(review): presumably the uploaded copy of the optimized tokenizer — confirm.
metamong1_tokenizer = AutoTokenizer.from_pretrained('metamong1/kobart')

In [104]:
# Same spot-check as for the locally optimized tokenizer: first, middle and last slot.
# The label is built from the id itself; the middle line used to print "Index : 7" for id 50.
for token_id in (7, 50, 106) :
    print('Index : %d \t Token : %s' %(token_id, metamong1_tokenizer.convert_ids_to_tokens(token_id)))

Index : 7 	 Token : ㄹ
Index : 7 	 Token : 匿
Index : 106 	 Token : ㅠ
