# preprocessing data

### train, val, test set 각각 cleaning data를 만듭니다
1. `data_cleaning.yaml` 파일을 다운로드해 주세요.
2. `pip install hanja` 명령으로 hanja 패키지를 설치해 주세요.

In [164]:
import pandas as pd
import hanja
import re
import yaml
from transformers import AutoTokenizer
from tqdm.autonotebook import tqdm
# Load the character-replacement table used by the cleaning cell below.
# NOTE: the path resolves to ./data/data_cleaning.yaml relative to the working
# directory, so the YAML file has to be placed there before running this cell.
# The file is opened as UTF-8 explicitly: the table is keyed on individual
# (mostly non-ASCII) characters, and the platform default encoding is not
# UTF-8 everywhere.
with open("./data/data_cleaning.yaml", encoding="utf-8") as f:
    # NOTE(review): FullLoader can build more than plain scalars/mappings.
    # Only load a data_cleaning.yaml you trust; consider yaml.safe_load if the
    # file is a plain character -> replacement mapping.
    cleaning = yaml.load(f, Loader=yaml.FullLoader)
# Tokenizer used further down to detect characters that end up as [UNK].
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

# 데이터 로딩

In [165]:
import ast

# Read the training split and keep a single row per distinct sentence.
df = pd.read_csv('../data/remove_dup_train_split.csv')
df = df.drop_duplicates(subset='sentence').reset_index(drop=True)
# The entity columns are stored in the CSV as stringified Python dict literals
# (later cells read their 'word', 'start_idx' and 'end_idx' keys).
# ast.literal_eval parses literals only, so -- unlike the eval() used here
# before -- a crafted CSV cell cannot execute arbitrary code.
for column in ('subject_entity', 'object_entity'):
    df[column] = df[column].map(ast.literal_eval)

# data_cleaning

In [166]:
def _clean_text(text, mapping):
    """Return *text* with each character substituted through *mapping*.

    A character is replaced only when *mapping* holds a string for it;
    characters that are missing from the table, or whose entry is not a
    string, are kept unchanged.
    """
    pieces = []
    for char in text:
        replacement = mapping.get(char)
        pieces.append(replacement if isinstance(replacement, str) else char)
    return ''.join(pieces)


# Apply the replacement table from data_cleaning.yaml to the sentence and to
# both entity words of every row.
# If the YAML did not load as a mapping (e.g. an empty file loads as None),
# fall back to an empty table so every character is left as it is.
char_map = cleaning if isinstance(cleaning, dict) else {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    df.loc[i, 'sentence'] = _clean_text(row['sentence'], char_map)
    # The entity cells hold dicts, so the 'word' entry is updated in place.
    df.loc[i, 'subject_entity']['word'] = _clean_text(row['subject_entity']['word'], char_map)
    df.loc[i, 'object_entity']['word'] = _clean_text(row['object_entity']['word'], char_map)
            
            


0it [00:00, ?it/s]

# 한자

In [167]:
# Rows whose sentence contains at least one character in the range 一-龥 get
# their sentence and both entity words passed through hanja.translate in
# 'substitution' mode; every other row is left untouched.
hanja_run = re.compile(r'[一-龥]+')
for idx, record in tqdm(df.iterrows()):
    if hanja_run.search(record['sentence']) is None:
        continue
    df.loc[idx, 'sentence'] = hanja.translate(record['sentence'], 'substitution')
    for column in ('subject_entity', 'object_entity'):
        converted = hanja.translate(record[column]['word'], 'substitution')
        # The cell holds a dict, so writing the key updates it in place.
        df.loc[idx, column]['word'] = converted

0it [00:00, ?it/s]

# 검사

In [168]:
def _locate(sentence, word, start, end):
    """Return ``(start, end, moved)`` for *word* inside *sentence*.

    When ``sentence[start:end + 1]`` already equals *word* the span is
    returned unchanged with ``moved=False``.  Otherwise the span of the first
    occurrence found by ``str.find`` is returned with ``moved=True``; in that
    case ``start`` is -1 if *word* does not occur in *sentence* at all.
    """
    if sentence[start:end + 1] == word:
        return start, end, False
    found = sentence.find(word)
    return found, found + len(word) - 1, True


# Check every row for entity spans that no longer match the sentence and
# repair them.  Only the entity whose span is actually stale is relocated, so
# a span that is still correct is never moved to another occurrence of the
# same word.
for i, row in tqdm(df.iterrows(), total=len(df)):
    sent = row['sentence']
    sub = row['subject_entity']['word']
    obj = row['object_entity']['word']
    sub_start, sub_end, sub_moved = _locate(
        sent, sub, row['subject_entity']['start_idx'], row['subject_entity']['end_idx'])
    obj_start, obj_end, obj_moved = _locate(
        sent, obj, row['object_entity']['start_idx'], row['object_entity']['end_idx'])
    if not (sub_moved or obj_moved):
        continue
    # A relocated entity that cannot be found in the sentence is unrecoverable
    # here: show the offending row and stop.
    if (sub_moved and sub_start < 0) or (obj_moved and obj_start < 0):
        print(sent)
        print(sub, obj)
        raise ValueError(f"{sub_start}, {obj_start}")
    if sub_moved:
        print('fixed:',sub, sent[sub_start:sub_end+1])
        df.loc[i,'subject_entity']['start_idx'] = sub_start
        df.loc[i,'subject_entity']['end_idx'] = sub_end
    if obj_moved:
        print('fixed:',obj, sent[obj_start:obj_end+1])
        df.loc[i,'object_entity']['start_idx'] = obj_start
        df.loc[i,'object_entity']['end_idx'] = obj_end
    

0it [00:00, ?it/s]

In [169]:
# Remaining characters: collect every character that is outside the allowed
# set below, and report the ones the tokenizer turns into [UNK].
bad_word_unk = []
bad_word_all = []
pattern = r'[^ 0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣\.\,\?\!\:\;\'\"\(\)\[\]\~\-\+\_\%\<\>\《\》@#$&\*\`\{\}\=\|]'
# Compiled once, outside the row loop.
disallowed_re = re.compile(pattern)
# Four independent alternatives: the ranges 一-龥, ぁ-ゔ, ァ-ヴー and the marks
# 々〆〤.  The previous pattern had no '|' before the last group, so the third
# range only matched when it was immediately followed by one of those marks.
cjk_re = re.compile(r'[一-龥]+|[ぁ-ゔ]+|[ァ-ヴー]+|[々〆〤]+')
for i, row in tqdm(df.iterrows(), total=len(df)):
    sentence = row['sentence']
    if cjk_re.search(sentence):
        print(sentence)
    bad = disallowed_re.findall(sentence)
    bad_word_all.extend(bad)
    if bad:
        # NOTE(review): all leftover characters of the sentence are tokenized
        # as one joined string, so a single [UNK] marks every one of them.
        tokenized = tokenizer.tokenize(''.join(bad), add_special_tokens=False)
        if '[UNK]' in tokenized:
            print('original:', bad, end=' ')
            print('tokenized: ',tokenized)
            bad_word_unk.extend(bad)
# De-duplicate; sorted so the report is the same on every run.
bad_word_unk = sorted(set(bad_word_unk))
bad_word_all = sorted(set(bad_word_all))
print('number of [UNK]:', len(bad_word_unk))
print('remain:',bad_word_all)

0it [00:00, ?it/s]

original: ['ь'] tokenized:  ['[UNK]']
number of [UNK]: 1
remain: ['/', 'ь', '·']


In [170]:
from unidecode import unidecode
# Print a candidate "character: replacement" line for every character that was
# tokenized to [UNK], so it can be added to data_cleaning.yaml by hand.
if bad_word_unk: print('data_cleaning.yaml에 추가하고 재시작해주세요.')
for char in bad_word_unk:
    ascii_form = unidecode(char)
    # unidecode can return an empty string; indexing it would raise
    # IndexError, and there is nothing to suggest for such a character.
    if not ascii_form:
        continue
    tmp = ascii_form[0]
    # Only a word character (\w) is proposed as a replacement.
    # NOTE(review): the earlier code also mapped '@', '#', '^' and '*' to '-'
    # inside this branch, but none of them matches \w, so that mapping could
    # never run and is omitted.  If it was meant to apply to non-word results,
    # it has to be added outside this check.
    if re.findall(r'\w', tmp):
        print(f'{char}: {tmp}')

data_cleaning.yaml에 추가하고 재시작해주세요.


# 저장

In [171]:
# Order the rows by their 'id' column, renumber the index from zero, write the
# result to ./clean_train_data.csv without the index column, and print how
# many rows were saved.
df = df.sort_values('id')
df = df.reset_index(drop=True)
df.to_csv('./clean_train_data.csv', index=False)
print(len(df))

6347
