In [2]:
import os
import sys
import re
import json
from datasets import (load_dataset, 
    load_from_disk,
    Dataset,
    DatasetDict,
    Value,
    Features
)

In [3]:
# Put the parent directory on the import path — presumably where the local
# `preprocessor` module imported later lives (verify against the repo layout).
sys.path.append('../')

In [4]:
import torch
import random
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt

In [5]:
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

2021-12-02 01:55:54.830098: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Tokenizer

In [6]:
# Load the fast tokenizer published with the KoBART summarization checkpoint;
# the rest of the notebook looks for text that it maps to the unknown token.
tokenizer = AutoTokenizer.from_pretrained('gogamza/kobart-summarization', use_fast=True)



## Dataset

In [7]:
# Load the paper summarization corpus (train/validation splits).
# The access token is read from the environment rather than embedded in the
# notebook: a token committed to source control must be treated as leaked and
# revoked. If the variable is unset, `True` falls back to the token cached by
# `huggingface-cli login`.
paper_dataset = load_dataset('metamong1/summarization_paper',
    use_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN', True))

Reusing dataset paper_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___paper_summarization/Paper Summarization/2.2.0/46d835d4e22daa3a5a46d13de39e3d75f6c2eaef5ead153d48cbe8d7cd3bec9c)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Load the law summarization corpus (train/validation splits).
# The access token is read from the environment rather than embedded in the
# notebook: a token committed to source control must be treated as leaked and
# revoked. If the variable is unset, `True` falls back to the token cached by
# `huggingface-cli login`.
law_dataset = load_dataset('metamong1/summarization_law',
    use_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN', True))

Reusing dataset law_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___law_summarization/Paper Summarization/1.2.0/b422baca30e481895dd2b572a7ff9f6c6428725e575fdafb73c0aa1d62356973)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Load the magazine summarization corpus (train/validation splits).
# The access token is read from the environment rather than embedded in the
# notebook: a token committed to source control must be treated as leaked and
# revoked. If the variable is unset, `True` falls back to the token cached by
# `huggingface-cli login`.
magazine_dataset = load_dataset('metamong1/summarization_magazine',
    use_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN', True))

Reusing dataset magazine_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___magazine_summarization/Magizine Summarization/1.0.0/506cb41eb0b96b084eafa5dd5fe3b51ff0d1061256700adf1aa92d3b19762c36)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Load the news summarization corpus (train/validation splits).
# The access token is read from the environment rather than embedded in the
# notebook: a token committed to source control must be treated as leaked and
# revoked. If the variable is unset, `True` falls back to the token cached by
# `huggingface-cli login`.
news_dataset = load_dataset('metamong1/summarization_news',
    use_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN', True))

Reusing dataset news_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___news_summarization/News Summarization/1.0.0/ae25c3215dc878e979d01f1157dbfb014c0a6985fc959ae45eaf10847db75600)


  0%|          | 0/2 [00:00<?, ?it/s]

## Preprocessor

In [11]:
from preprocessor import PaperPreprocessor, DocsPreprocessor, Filter

In [12]:
# Two preprocessors (their `for_train` methods are mapped over the datasets
# below) plus one shared row filter.
# NOTE(review): the meaning of `title_size=5` is defined in preprocessor.Filter,
# which is not visible here — presumably a minimum title length; confirm.
paper_preprocessor = PaperPreprocessor()
docs_preprocessor = DocsPreprocessor()
data_filter = Filter(title_size=5)

### Paper Dataset

In [13]:
# Clear this dataset's cache files, then preprocess every example and keep
# only the rows accepted by `data_filter`.
paper_dataset.cleanup_cache_files()
paper_dataset = (
    paper_dataset
    .map(paper_preprocessor.for_train)
    .filter(data_filter)
)

  0%|          | 0/73640 [00:00<?, ?ex/s]

  0%|          | 0/18411 [00:00<?, ?ex/s]

  0%|          | 0/74 [00:00<?, ?ba/s]

  0%|          | 0/19 [00:00<?, ?ba/s]

In [14]:
# Show the splits, columns and row counts left after filtering.
paper_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 69950
    })
    validation: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 17493
    })
})

### Law

In [15]:
# Clear this dataset's cache files, then preprocess every example and keep
# only the rows accepted by `data_filter`.
law_dataset.cleanup_cache_files()
law_dataset = (
    law_dataset
    .map(docs_preprocessor.for_train)
    .filter(data_filter)
)

  0%|          | 0/23730 [00:00<?, ?ex/s]

  0%|          | 0/5933 [00:00<?, ?ex/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [16]:
# Show the splits, columns and row counts left after filtering.
law_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 20921
    })
    validation: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 5223
    })
})

### Magazine

In [17]:
# Clear this dataset's cache files, then preprocess every example and keep
# only the rows accepted by `data_filter`.
# NOTE(review): this uses `paper_preprocessor`, whereas the law and news
# datasets use `docs_preprocessor` — confirm this is intentional.
magazine_dataset.cleanup_cache_files()
magazine_dataset = (
    magazine_dataset
    .map(paper_preprocessor.for_train)
    .filter(data_filter)
)

  0%|          | 0/52691 [00:00<?, ?ex/s]

  0%|          | 0/13173 [00:00<?, ?ex/s]

  0%|          | 0/53 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

In [18]:
# Show the splits, columns and row counts left after filtering.
magazine_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 51211
    })
    validation: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 12823
    })
})

### News

In [19]:
# Clear this dataset's cache files, then preprocess every example and keep
# only the rows accepted by `data_filter`.
news_dataset.cleanup_cache_files()
news_dataset = (
    news_dataset
    .map(docs_preprocessor.for_train)
    .filter(data_filter)
)

  0%|          | 0/240628 [00:00<?, ?ex/s]

  0%|          | 0/60157 [00:00<?, ?ex/s]

  0%|          | 0/241 [00:00<?, ?ba/s]

  0%|          | 0/61 [00:00<?, ?ba/s]

In [20]:
# Show the splits, columns and row counts left after filtering.
news_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 214237
    })
    validation: Dataset({
        features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
        num_rows: 53572
    })
})

## Split Tokens

In [21]:
# Take the training split of each corpus ...
paper_train = paper_dataset['train']
magazine_train = magazine_dataset['train']
law_train = law_dataset['train']
news_train = news_dataset['train']

# ... and pull the 'text' field of every example into a plain Python list.
paper_docs, magazine_docs, law_docs, news_docs = (
    [example['text'] for example in split]
    for split in (paper_train, magazine_train, law_train, news_train)
)

In [22]:
# Pool the training texts of all four corpora into a single list.
total_docs = paper_docs + magazine_docs + law_docs + news_docs

In [23]:
# Shuffle the pooled documents in place with a fixed seed. Later cells look up
# example sentences by position (e.g. sen_unk_ids[121]), so the order must be
# reproducible across kernel restarts. A private Random instance is used so
# the global `random` state is left untouched.
random.Random(42).shuffle(total_docs)

In [24]:
# Report how many documents were pooled.
print(f'Data Size : {len(total_docs)}')

Data Size : 356319


In [25]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [26]:
# Accumulator for the sentences of all documents; filled by the loop in the
# next cell. Re-run this cell before re-running that loop, otherwise the
# sentences are appended a second time.
total_sens = []

In [27]:
# Split every document into sentences and append them to `total_sens`.
# NOTE(review): sent_tokenize is called with its default language setting
# although the corpus is Korean — confirm the sentence boundaries are acceptable.
for document in tqdm(total_docs) :
    total_sens.extend(sent_tokenize(document))

  0%|          | 0/356319 [00:00<?, ?it/s]

In [28]:
# Report how many sentences were extracted in total.
print(f'Sentence Size : {len(total_sens)}')

Sentence Size : 4573278


In [29]:
# Spot-check a single extracted sentence.
total_sens[13]

"하지만 대법원은 '모욕적 인신공격 발언'이 아니라며 2심 재판을 다시하라고 결정했다."

In [30]:
# Tokenize every sentence into subword token strings (not ids), so sentences
# containing the unknown token can be found by a simple membership test.
sen_tokenized = [tokenizer.tokenize(sen) for sen in tqdm(total_sens)]

  0%|          | 0/4573278 [00:00<?, ?it/s]

In [31]:
# Map sentence position -> token list for every sentence whose tokenization
# contains the tokenizer's unknown token.
sen_unk = {
    position: tokens
    for position, tokens in enumerate(tqdm(sen_tokenized))
    if tokenizer.unk_token in tokens
}
# Positions of the affected sentences, in insertion (ascending) order.
sen_unk_ids = list(sen_unk)
print(f'Size : {len(sen_unk)}')

  0%|          | 0/4573278 [00:00<?, ?it/s]

Size : 229


## Case1

In [68]:
# Case 1: compare a sentence with its detokenized form; in the recorded output
# a single Hanja character (迭) comes back as <unk>.
case_id = sen_unk_ids[121]
print(
    'Original Sentence',
    total_sens[case_id],
    '',
    'Tokenized Sentence',
    tokenizer.convert_tokens_to_string(sen_unk[case_id]),
    sep='\n',
)

Original Sentence
둘째, 유흠은 이처럼 왕제 의 ‘천자칠묘’를 혈연적 친소에 따라 입묘되는 迭毁廟의 常數로 해석한 후, 功德에 따라 세우는 ‘宗’은 變數라고 하여 그 설치에 제한을 두지 않는다는 ‘宗變’論을 주장하였다.

Tokenized Sentence
둘째, 유흠은 이처럼 왕제 의 ‘천자칠묘’를 혈연적 친소에 따라 입묘되는 <unk>毁廟의 常數로 해석한 후, 功德에 따라 세우는 ‘宗’은 變數라고 하여 그 설치에 제한을 두지 않는다는 ‘宗變’論을 주장하였다.


## Case2

In [32]:
# Case 2: compare a sentence with its detokenized form; in the recorded output
# <unk> appears where the original shows no visible character — presumably
# invisible control characters in the source text.
case_id = sen_unk_ids[13]
print(
    'Original Sentence',
    total_sens[case_id],
    '',
    'Tokenized Sentence',
    tokenizer.convert_tokens_to_string(sen_unk[case_id]),
    sep='\n',
)

Original Sentence
장에서는 全唐詩에 수록된 詩歌 중 西王母의 이미지를‘美’ ?

Tokenized Sentence
장에서는 <unk>全唐詩<unk>에 수록된 詩歌 중 西王母의 이미지를‘美’ ?


### Tokenizer Add Tokens

In [40]:
# Compare a sentence with its detokenized form; in the recorded output the
# rare Hangul syllable '곸' comes back as <unk>.
case_id = sen_unk_ids[125]
print(
    'Original Sentence',
    total_sens[case_id],
    '',
    'Tokenized Sentence',
    tokenizer.convert_tokens_to_string(sen_unk[case_id]),
    sep='\n',
)

Original Sentence
당시 정준영은 “동영상 찍어서 보내준 거 걸려가지곸 ”라는 말을 했고, 용준형은 “그 여자애한테 걸렸다고?”라며 반문했다.

Tokenized Sentence
당시 정준영은 “동영상 찍어서 보내준 거 걸려가지<unk> ”라는 말을 했고, 용준형은 “그 여자애한테 걸렸다고?”라며 반문했다.


In [42]:
# Experiment: register the bare character '곸' as an extra token on a freshly
# loaded tokenizer, then tokenize a word ending in that character.
# Per the recorded output the character is then kept as its own token.
# NOTE: this rebinds the notebook-wide `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained('gogamza/kobart-summarization', use_fast=True)
tokenizer.add_tokens(['곸'])
tokenizer.tokenize('걸려가지곸')

['걸', '려', '가지', '곸']

In [43]:
# Experiment: register the '▁'-prefixed form '▁곸' as an extra token on a
# freshly loaded tokenizer, then tokenize a word ending in '곸'.
# Per the recorded output the word still ends in <unk>, i.e. the prefixed form
# does not match the character in word-final position.
# NOTE: this rebinds the notebook-wide `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained('gogamza/kobart-summarization', use_fast=True)
tokenizer.add_tokens(['▁곸'])
tokenizer.tokenize('걸려가지곸')

['걸', '려', '가지', '<unk>']

In [47]:
# Reload a pristine tokenizer (without any tokens added via add_tokens) and
# keep its vocabulary for membership tests.
# NOTE(review): despite the name, `tokenizer.vocab` is presumably a
# token -> id mapping rather than a set; only `in` checks are used on it below.
tokenizer = AutoTokenizer.from_pretrained('gogamza/kobart-summarization', use_fast=True)
vocab_set = tokenizer.vocab

In [46]:
# Code points of the first ('가', U+AC00) and last ('힣', U+D7A3) precomposed
# Hangul syllables; the cells below iterate over this inclusive range.
start_id = ord('가')
end_id = ord('힣')

In [54]:
# Count Hangul syllables that are in the vocabulary as a bare character but
# not in their '▁'-prefixed form (e.g. '굣' present, '▁굣' absent).
size = sum(
    1
    for code_point in range(start_id, end_id + 1)
    if chr(code_point) in vocab_set and '▁' + chr(code_point) not in vocab_set
)

print('원래 글자는 있고 _붙인 글자는 없는 경우')
print(f'Size : {size}')

원래 글자는 있고 _붙인 글자는 없는 경우
Size : 3989


In [49]:
# Count Hangul syllables that are missing from the vocabulary as a bare
# character but present in their '▁'-prefixed form (e.g. '굣' absent, '▁굣' present).
size = sum(
    1
    for code_point in range(start_id, end_id + 1)
    if chr(code_point) not in vocab_set and '▁' + chr(code_point) in vocab_set
)

print('원래 글자는 없고 _붙인 글자는 있는 경우')
print(f'Size : {size}')

원래 글자는 없고 _붙인 글자는 있는 경우
Size : 0


In [52]:
# Count Hangul syllables that are in the vocabulary both as a bare character
# and in their '▁'-prefixed form (e.g. '굣' present, '▁굣' present).
size = sum(
    1
    for code_point in range(start_id, end_id + 1)
    if chr(code_point) in vocab_set and '▁' + chr(code_point) in vocab_set
)

print('원래 글자는 있고 _붙인 글자는 있는 경우')
print(f'Size : {size}')

원래 글자는 있고 _붙인 글자는 있는 경우
Size : 888


## Select Characters

In [55]:
# Morphological analyzer used below to break the <unk>-containing sentences
# into morphemes before checking each piece against the tokenizer.
from konlpy.tag import Mecab
mecab = Mecab()

In [56]:
# Accumulator for morphemes whose encoding contains the unknown-token id;
# filled by the loop in the next cell (duplicates are kept).
unk_words = []

In [57]:
# For every sentence known to contain <unk>, split it into morphemes and keep
# each morpheme whose encoded ids include the unknown-token id.
unk_id = tokenizer.unk_token_id
unk_words.extend(
    morph
    for sen_idx in tqdm(sen_unk_ids)
    for morph in mecab.morphs(total_sens[sen_idx])
    if unk_id in tokenizer.encode(morph)
)

  0%|          | 0/229 [00:00<?, ?it/s]

In [60]:
# Peek at the first collected morphemes.
unk_words[:10]

['\x01',
 '\x02',
 '\x01',
 '\x02',
 '\x01',
 '\x02',
 '\x01',
 '\x01',
 '\x02',
 '\x01']

In [61]:
# Accumulator for the individual characters that map to the unknown-token id;
# filled by the loop in the next cell (duplicates are kept for counting).
unk_chars = []

In [62]:
# Walk through every collected morpheme character by character and keep the
# characters that the tokenizer converts to the unknown-token id.
unk_id = tokenizer.unk_token_id
unk_chars.extend(
    ch
    for word in tqdm(unk_words)
    for ch in word
    if tokenizer.convert_tokens_to_ids(ch) == unk_id
)

  0%|          | 0/331 [00:00<?, ?it/s]

In [63]:
# Peek at the first collected characters.
unk_chars[:10]

['\x01',
 '\x02',
 '\x01',
 '\x02',
 '\x01',
 '\x02',
 '\x01',
 '\x01',
 '\x02',
 '\x01']

In [64]:
# Tally how often each unknown character occurs.
unk_ch_counter = collections.Counter(unk_chars)

In [65]:
# Rank the characters by frequency (most frequent first), then drop control
# characters and the space (code points 0x00-0x20).
unk_char_items = [
    (ch, count)
    for ch, count in unk_ch_counter.most_common()
    if re.search('[\x00-\x20]', ch) is None
]

In [91]:
# Peek at the most frequent unknown characters and their counts.
unk_char_items[:10]

[('룖', 54),
 ('윾', 34),
 ('뢾', 11),
 ('뢿', 11),
 ('拗', 7),
 ('剡', 7),
 ('嘔', 5),
 ('‒', 5),
 ('줼', 4),
 ('귱', 3)]

In [92]:
# Split the (character, count) pairs into two parallel lists.
unk_ch_list = [ch for ch, _ in unk_char_items]
unk_ch_count = [count for _, count in unk_char_items]

In [93]:
# One row per unknown character, with its occurrence count, in ranked order.
extra_unk_chars = pd.DataFrame({'Character' : unk_ch_list, 'Count' : unk_ch_count})

In [94]:
def check_korean(ch):
    """Tell whether a single character is a precomposed Hangul syllable.

    Args:
        ch: A one-character string.

    Returns:
        True if the code point of ``ch`` lies between '가' and '힣'
        inclusive, False otherwise.

    Raises:
        TypeError: If ``ch`` is not a single character (raised by ``ord``).
    """
    return ord('가') <= ord(ch) <= ord('힣')

In [95]:
# Flag each character as Hangul syllable or not (True/False per row).
extra_unk_chars['KoreanFlag'] = extra_unk_chars['Character'].map(check_korean)

In [96]:
# Preview the first rows of the table.
extra_unk_chars.head()

Unnamed: 0,Character,Count,KoreanFlag
0,룖,54,True
1,윾,34,True
2,뢾,11,True
3,뢿,11,True
4,拗,7,False


In [98]:
# Persist the table of unknown characters. The output directory is created
# first because to_csv does not create missing parent directories and would
# otherwise fail on a fresh checkout. The row index is still written as the
# first column, exactly as before.
os.makedirs('./Tokenizer', exist_ok=True)
extra_unk_chars.to_csv('./Tokenizer/extra_for_bart.csv')