In [2]:
import pandas as pd

import re
from itertools import chain
from collections import Counter
from typing import  Dict, List, Tuple

from tqdm import tqdm
from transformers import AutoTokenizer

2021-10-02 23:08:58.688222: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
Train_csv_file = '/opt/ml/dataset/train/train.csv'
Test_csv_file = '/opt/ml/dataset/test/test_data.csv'

df = pd.read_csv(Train_csv_file)
df = df.drop_duplicates(['sentence','subject_entity','object_entity','label']).reset_index()

In [4]:
MODEL_NAME = "klue/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

vocab = list(tokenizer.get_vocab().keys())
unused_list = [word for word in vocab if word.startswith('[unused')]        
print(f'unused token count: {len(unused_list)}')



unused token count: 500


# 🛰️ 전처리

In [5]:
def remove_special_char(sentence):
    """ 특수문자 및 독일어 제거, 수정"""
    sentence = re.sub(r'[À-ÿ]+','', sentence) # 독일어
    sentence = re.sub(r'[\u0600-\u06FF]+','', sentence)  # 사우디어
    sentence = re.sub(r'[\u00C0-\u02B0]+','', sentence)  # 라틴어
    sentence = re.sub(r'[ß↔Ⓐب€☎☏±∞]+','', sentence)
    sentence = re.sub('–','─', sentence)
    sentence = re.sub('⟪','《', sentence)
    sentence = re.sub('⟫','》', sentence)
    sentence = re.sub('･','・', sentence)
    sentence = re.sub('µ','ℓ', sentence)
    sentence = re.sub('®','㈜', sentence)
    sentence = re.sub('～','㈜', sentence)
    return sentence

test_sen = 'Hermann Müller'
remove_special_char(test_sen)

'Hermann Mller'

In [6]:
def add_space_char(sentence) :
    def add_space(match) :
        res_str = ', '.join(match.group().split(',')).rstrip()
        return res_str
    p = re.compile(r'([기-힣\w\-]+,)+[기-힣\w\-]+')
    sentence = p.sub(add_space, sentence)
    return sentence
test_sen = '앨범에는 에미넴,G-Unit,닥터드레,제이미 폭스 등이 참여하고,영국차트에서도 1위를 한다.'
add_space_char(test_sen)

'앨범에는 에미넴, G-Unit, 닥터드레, 제이미 폭스 등이 참여하고, 영국차트에서도 1위를 한다.'

In [7]:
def substitution_date(sentence):
    """
    기간 표시 '-' => '~'
    1223년 – => 1223년 ~ 
    """
    def sub_tibble(match) :
        res_str = re.sub('[–\-]','~',match.group())
        return res_str
    re_patterns = [
        r'(\d{2,4}년\s*)(\d{1,2}[월|일]\s*)(\d{1,2}[월|일])\s*[–\-]',
        r'(\d{2,4}년\s*)(\d{1,2}[월|일]\s*)\s*[–\-]',
        r'(\d{2,4}년\s*)\s*[–\-]',
        r'\((\d{4}[–\-]\d{2,4})\)'
    ]
    for re_pattern in re_patterns :
        p = re.compile(re_pattern)
        sentence = p.sub(sub_tibble, sentence)   
    return sentence

test_sen = '후 시니어 대회로 올라가서 (1934–1942) 시즌 ISU 쇼트트랙 월드컵에 4차례 출전하여 금메달 4개, 은메달 5개를 차지하였으며, 2013-14 시즌에서 김아랑은 쇼트트랙 스피드'
substitution_date(test_sen)

'후 시니어 대회로 올라가서 (1934~1942) 시즌 ISU 쇼트트랙 월드컵에 4차례 출전하여 금메달 4개, 은메달 5개를 차지하였으며, 2013-14 시즌에서 김아랑은 쇼트트랙 스피드'

In [8]:
def add_space_year(sentence):
    """
    숫자와 년 사이에 공백
    1223년 => 1223 년 => ⑦ 년
    """
    def add_space(match) :
        # res_str = '⑦ ' + match.group()[4:]
        res_str =  match.group()[:4] +' ' + match.group()[4:]
        return res_str
    p = re.compile(r'\d{4}년')
    sentence = p.sub(add_space, sentence)
    return sentence
test_sen = '2010년에는 아시아 가수 최초로 마이클 잭슨의 곡을 리메이크하였는데'
add_space_year(test_sen)

'2010 년에는 아시아 가수 최초로 마이클 잭슨의 곡을 리메이크하였는데'

In [9]:
def preprocessing(sentence) :
    sent = remove_special_char(sentence)
    sent = substitution_date(sent)
    # sent = add_space_year(sent)
    sent = add_space_char(sent)
    return sent

In [10]:
df = pd.read_csv(Train_csv_file)

ess = ['sentence','subject_entity','object_entity']
preprocessed_df = df.copy()
for col in ess :
    preprocessed_df[col] = preprocessed_df[col].apply(preprocessing)

# 🛰️UNK으로 변하는 word 및 char 확인

In [11]:
from IPython.core.display import HTML
def word_highligt_html(txt, word, color='black', highlight=None, attr=None):
    if isinstance(word, str):
        txt = txt.replace(word, f'<span style="color: {color}; background-color:{highlight}">{word}</span>')
    else:
        if not isinstance(color, list):
            color = [color] * len(word)
        if not isinstance(highlight, list):
            highlight = [highlight] * len(word)
        for w, c, h in zip(word, color, highlight):
            txt = txt.replace(w, f'<span style="color: {c}; background-color:{h}">{w}</span>')
    return txt

In [12]:
def subword_parsing(wordpiece:List) -> List[str]: ## subword # 제거용
    Known_char = []
    for subword in wordpiece :
        if subword == tokenizer.unk_token :
            Known_char.append(tokenizer.unk_token)
        else :
            string = subword.replace('#','')
            Known_char.extend(string)
    return Known_char


def UNK_word_and_chr(text:str) -> Tuple[List[str], List[str]]:
    sub_word_UNK_list = []
    
    def add_space(match) :
        bracket = match.group()
        added = ' ' + bracket + ' '
        return added
    p = re.compile(r'[\([)\]|,|-|~|-|‘|’|"|\']')
    words_list = p.sub(add_space, text).split()
    for word in words_list :
        subwordpieces_ID_encoded = tokenizer.tokenize(word)
        Known_subword = subword_parsing(subwordpieces_ID_encoded)
        for sub_char, NK_char in zip(word, Known_subword) :
            if sub_char != NK_char and len(word) == len(Known_subword) :
                sub_word_UNK_list.append(sub_char)
            elif sub_char != NK_char and len(word) != len(Known_subword) :
                sub_word_UNK_list.append(word)
                break
    return sub_word_UNK_list
    
text ='박용오(朴容旿, 1937년 4월 29일(음력 3월 19일)(음력 3월 19일) ~ 2009년 11월 4일)는 서울에서 태어난 대한민국의 기업인으로 두산그룹 회장, KBO 총재 등을 역임했다.'
print(UNK_word_and_chr(text))
if tokenizer.unk_token in tokenizer.tokenize(text) :
    print(tokenizer.tokenize(text))

['容', '旿']
['박용', '##오', '(', '朴', '[UNK]', '[UNK]', ',', '1937', '##년', '4', '##월', '29', '##일', '(', '음력', '3', '##월', '19', '##일', ')', '(', '음력', '3', '##월', '19', '##일', ')', '~', '2009', '##년', '11', '##월', '4', '##일', ')', '는', '서울', '##에서', '태어난', '대한민국', '##의', '기업인', '##으로', '두산', '##그룹', '회장', ',', 'KBO', '총재', '등', '##을', '역임', '##했', '##다', '.']


# 🛰️ UNK 분포

In [13]:
txt = ''
count = 1
unk_len = 0
for sen in tqdm(preprocessed_df['sentence']) :
    if tokenizer.unk_token in tokenizer.tokenize(sen) :
        UNK_subword = UNK_word_and_chr(sen)
        txt += word_highligt_html(sen, UNK_subword, ['white']*len(UNK_subword),  ['#96C4ED']*len(UNK_subword)) + '<br/><br/>'
        if count > 5:
            break
        count += 1
HTML(txt)

  1%|          | 191/32470 [00:00<00:10, 2944.90it/s]


## 🛰️ Sentece UNK 확인

In [14]:
cnt = 1
UNK_sentence_list = []
for sen in tqdm(preprocessed_df['sentence']) :
    if tokenizer.unk_token in tokenizer.tokenize(sen) :
        UNK_sentence_list.extend(UNK_word_and_chr(sen))
        cnt+=1
print(cnt)

100%|██████████| 32470/32470 [00:14<00:00, 2193.51it/s]

2924





In [15]:
for idx, cont in enumerate(Counter(UNK_sentence_list).most_common(100)) :
    if idx % 10 == 9 :
        print()
    else :
        print(cont, end="\t")

('李', 225)	('崔', 60)	('皇', 60)	('后', 54)	('永', 41)	('尹', 38)	('昌', 33)	('慶', 30)	('俊', 29)	
('趙', 25)	('興', 24)	('홋스퍼', 24)	('孝', 23)	('盧', 22)	('承', 22)	('梁', 22)	('容', 21)	('徐', 21)	
('熙', 21)	('貞', 20)	('沈', 20)	('陵', 19)	('鍾', 19)	('錫', 18)	('放', 18)	('池', 18)	('團', 18)	
('賢', 18)	('洪', 18)	('申', 17)	('進', 17)	('洙', 17)	('泰', 17)	('植', 16)	('夏', 16)	('秀', 16)	
('校', 16)	('勳', 16)	('吳', 16)	('康', 15)	('景', 15)	('홋카이도', 15)	('炳', 15)	('恩', 15)	('哲', 14)	
('羅', 14)	('源', 14)	('惠', 14)	('範', 14)	('榮', 14)	('煥', 14)	('宇', 14)	('崇', 14)	('少', 13)	
('忠', 13)	('姬', 13)	('숀', 13)	('浩', 12)	('嬪', 12)	('根', 12)	('唐', 12)	('翁', 12)	('鉉', 12)	
('勇', 12)	('建', 12)	('桓', 12)	('玉', 11)	('敬', 11)	('淑', 11)	('恭', 11)	('智', 11)	('宣', 11)	
('펭수', 10)	('秋', 10)	('樂', 10)	('延', 10)	('昭', 10)	('順', 10)	('奎', 10)	('斗', 10)	('應', 10)	
('베렝가리오', 10)	('清', 10)	('奉', 10)	('藤', 10)	('澤', 10)	('閔', 10)	('織', 10)	('弐', 9)	('泳', 9)	


In [16]:
for_add = [token for token, cnt in Counter(UNK_sentence_list).items() if cnt >= 10]

added_token_num = tokenizer.add_tokens(for_add)
print(added_token_num)

97


## 🛰️ entity UNK 확인

In [17]:
subject_entity = []
object_entity = []

for i, j in zip(preprocessed_df['subject_entity'], preprocessed_df['object_entity']):
    i = eval(i)['word']
    j = eval(j)['word']

    subject_entity.append(i)
    object_entity.append(j)

In [18]:
from konlpy.tag import Mecab

mecab = Mecab()

In [221]:
cnt = 1
UNK_entity_list = []
for token in tqdm(subject_entity+object_entity+t_subject_entity+t_object_entity) :
    if tokenizer.unk_token in tokenizer.tokenize(token) :
        char_unk = [UNK_word_and_chr(mor) for mor in mecab.morphs(token)]
        UNK_entity_list.extend(chain(*char_unk))
        cnt += 1
print(cnt)

100%|██████████| 80470/80470 [00:06<00:00, 12697.74it/s]

572





In [222]:
Counter(UNK_entity_list).most_common(100)
for idx, cont in enumerate(Counter(UNK_entity_list).most_common(120)) :
    if idx % 10 == 9 :
        print()
    else :
        print(cont, end="\t")

('홋스퍼', 28)	('李', 12)	('숀', 11)	('홋카이도', 10)	('에스파뇰', 10)	('쥘', 9)	('放', 9)	('送', 9)	('렝가리오', 8)	
('陵', 8)	('織', 8)	('쾰른', 7)	('슝', 7)	('리콴유', 6)	('셴', 6)	('묀헨글라트바흐', 6)	('리셴녠', 5)	('비욘세', 5)	
('弐', 5)	('局', 5)	('昌', 5)	('梁', 5)	('슌', 4)	('흄', 4)	('욘', 4)	('푀', 4)	('다롄', 4)	
('超', 4)	('衛', 4)	('弓', 4)	('샨', 3)	('쟝', 3)	('뮐러', 3)	('훙윈', 3)	('푸르트벵글러', 3)	('로퀜스', 3)	
('헴스워스', 3)	('젬', 3)	('슌지', 3)	('꼰', 3)	('에미넴', 3)	('아녜스', 3)	('훙', 3)	('쓰촨', 3)	('뎀', 3)	
('晋', 3)	('宋', 3)	('葛', 3)	('后', 3)	('聯', 3)	('香', 3)	('慶', 3)	('윰', 3)	('바라캇', 2)	
('녜', 2)	('쳄부르스키', 2)	('꽈드로스', 2)	('래이쾨넨', 2)	('뱌체슬라프', 2)	('쥰', 2)	('카뮈', 2)	('귈', 2)	('쳉', 2)	
('뮈르달', 2)	('츨러', 2)	('킵', 2)	('젭', 2)	('로바쳅스키', 2)	('호엔촐레른', 2)	('昶', 2)	('쾨', 2)	('퓌르트', 2)	
('맬', 2)	('아르툠', 2)	('촐라', 2)	('채드윅', 2)	('겅', 2)	('똔텃투옛', 2)	('앳킨슨', 2)	('브뢴뷔', 2)	('핼리팩스', 2)	
('켐니츠', 2)	('伊', 2)	('達', 2)	('孝', 2)	('廳', 2)	('会', 2)	('樹', 2)	('郭', 2)	('驥', 2)	
('干', 2)	('永', 2)	('乃', 2)	('朗', 2)	('麗', 2)	('降', 2)	('内', 2)	('웡', 2)	('洋', 2)	
('術

In [267]:
for_add = [token for token, cnt in Counter(UNK_entity_list).items() if cnt > 2]

added_token_num = tokenizer.add_tokens(for_add)
print(added_token_num)

# 🛰️시험

In [21]:
Test_csv_file = '/opt/ml/dataset/test/test_data.csv'
test_df = pd.read_csv(Test_csv_file)

In [26]:
t_subject_entity = []
t_object_entity = []

for i, j in zip(test_df['subject_entity'], test_df['object_entity']):
    i = eval(i)['word']
    j = eval(j)['word']

    t_subject_entity.append(i)
    t_object_entity.append(j)

In [199]:
cnt = 1
t_UNK_entity_list = []
for sen in tqdm(t_subject_entity+t_object_entity) :
    if tokenizer.unk_token in tokenizer.tokenize(sen) :
        t_UNK_entity_list.extend(UNK_word_and_chr(sen))
        cnt += 1
print(cnt)

100%|██████████| 15530/15530 [00:01<00:00, 12986.41it/s]

99





In [200]:
print(Counter(t_UNK_entity_list).most_common())

[('李', 5), ('윰댕', 3), ('숀', 3), ('묀헨글라트바흐', 2), ('葛', 2), ('衛', 2), ('秀', 2), ('雄', 2), ('에스파뇰', 2), ('宋', 2), ('맬컴', 2), ('姜', 2), ('쥘', 2), ('펭수', 2), ('守', 1), ('벵어', 1), ('帶', 1), ('慶', 1), ('應', 1), ('도스토옙스키', 1), ('엔지켐생명과학', 1), ('丸', 1), ('彫', 1), ('헴스워스', 1), ('滿', 1), ('빕스', 1), ('오뎀윙기', 1), ('크뢸루프', 1), ('沙', 1), ('梁', 1), ('웡', 1), ('바르뎀', 1), ('閔', 1), ('綾', 1), ('皓', 1), ('황페이훙', 1), ('沈', 1), ('御', 1), ('陵', 1), ('珥', 1), ('포로셴코', 1), ('簾', 1), ('에미넴', 1), ('溪', 1), ('브륀힐드', 1), ('寧', 1), ('로케푀이', 1), ('우젠슝', 1), ('朔', 1), ('趙', 1), ('妃', 1), ('尹', 1), ('潽', 1), ('쑨시엔위', 1), ('昌', 1), ('仇', 1), ('牙', 1), ('Perišić', 1), ('必', 1), ('琪', 1), ('바츨라프', 1), ('쿠샨', 1), ('段', 1), ('龕', 1), ('址', 1), ('鎔', 1), ('쑨원', 1), ('응우옌반냑', 1), ('쑨양', 1), ('尙', 1), ('桓', 1), ('흄', 1), ('諸', 1), ('亮', 1), ('臨', 1), ('홋스퍼', 1), ('일리리쿰', 1), ('菅', 1), ('偉', 1), ('斌', 1), ('朗', 1), ('顯', 1), ('后', 1), ('袁', 1), ('譚', 1), ('惠', 1), ('局', 1), ('쟌느', 1), ('亨', 1), ('로퀜스', 1), ('뿡뿡이', 1), ('됭케르크',

# 🛸 한 단어로 이루어지지 않은 entity
- 여러 단어
- 특정 패턴이 존재하는
- 맞추기 어려울 것 같은
+ 위 내용을 만족하는 entity는 entity token wrap 해주는게 어떠려나?

In [9]:
for obj in df['object_entity'] :
    d_obj = eval(obj)
    if d_obj['type'] == 'DAT' :
        if len(d_obj['word'].split()) > 1 :
            print(obj)
            test_encoced = tokenizer.encode(d_obj['word'])
            print(test_encoced)
            Id_to_tokens = tokenizer.convert_ids_to_tokens(test_encoced[1:-1])
            print(Id_to_tokens)
            break

{'word': '1937년 4월 29일', 'start_idx': 9, 'end_idx': 20, 'type': 'DAT'}
[0, 20533, 2440, 24, 2429, 4346, 2210, 2]
['1937', '##년', '4', '##월', '29', '##일']
