SR (Synonym Replacement): 특정 단어를 비슷한 의미의 유의어로 교체
RI (Random Insertion): 임의의 단어를 삽입
RS (Random Swap): 텍스트 내의 두 단어를 임의로 선정하여 서로 위치를 바꿔줌
RD (Random Deletion): 임의의 단어를 삭제

안전하게 데이터 증강을 하고 싶다면 RD, RS만을 사용하고, 데이터가 많이 필요하다 싶으면 SR과 RI까지 사용하되 사람이 직접 데이터를 걸러내는 작업이 필요할 것이다.

References: https://github.com/catSirup/KorEDA

In [1]:
pip install koeda

In [4]:
from koeda import EDA
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

In [6]:
# Seed both NumPy's global RNG and the stdlib `random` module with the same
# fixed value so that later random draws in this notebook are repeatable.
np.random.seed(1004)
random.seed(1004)

In [7]:
# Re-seed both RNGs with the same value used in the previous cell, so this
# cell starts from a known RNG state even when it is run on its own.
np.random.seed(1004)
random.seed(1004)
########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
    """Randomly drop tokens from ``words``, each with probability ``p``.

    Args:
        words: List of tokens.
        p: Deletion probability. A token is kept when a uniform draw from
            [0, 1] is strictly greater than ``p``.

    Returns:
        The surviving tokens in their original order. If every token was
        dropped, a single randomly chosen token is returned instead so the
        result is never empty for a non-empty input. Inputs with zero or one
        token are returned unchanged (the same list object).
    """
    # Nothing can be deleted from zero or one token. The empty case used to
    # fall through to random.randint(0, -1) below and raise ValueError.
    if len(words) <= 1:
        return words

    # One uniform draw per token, in order, so seeded runs stay reproducible.
    kept = [word for word in words if random.uniform(0, 1) > p]

    if not kept:
        # Everything was deleted: fall back to one random token.
        rand_int = random.randint(0, len(words) - 1)
        return [words[rand_int]]

    return kept

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    The swaps are applied to a copy, so the list passed in is not modified.
    """
    shuffled = words.copy()
    for _attempt in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    Args:
        new_words: List of tokens; it is mutated and also returned.

    Returns:
        The same list object. It is returned unchanged when the list is
        empty, or when no distinct second index was found within the retry
        budget (always the case for a single-element list).
    """
    # An empty list has no valid index; random.randint(0, -1) would raise
    # ValueError, so return early.
    if len(new_words) == 0:
        return new_words

    last = len(new_words) - 1
    random_idx_1 = random.randint(0, last)
    random_idx_2 = random_idx_1
    counter = 0

    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last)
        counter += 1
        # The budget check runs right after the draw, so the fourth draw
        # always gives up without swapping, even if it was distinct.
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = (
        new_words[random_idx_2],
        new_words[random_idx_1],
    )
    return new_words

In [8]:
# np.random.seed(1004)
# random.seed(1004)
# random.uniform(0, 1)

SR, RI는 적용 시 문장의 의미가 변형되는 경우가 생겨 제외

In [10]:
def rd_EDA(sentence, p_rd=0.1, num_aug=9):
    """Generate ``num_aug`` random-deletion variants of ``sentence``.

    Args:
        sentence: Input text. It is split on single spaces and empty tokens
            (produced by consecutive spaces) are discarded.
        p_rd: Per-token deletion probability passed to ``random_deletion``.
        num_aug: Number of augmented sentences to produce.

    Returns:
        A list of ``num_aug`` space-joined sentences in shuffled order.
    """
    words = [word for word in sentence.split(' ') if word != ""]

    # One random_deletion call per variant, in order, then a single shuffle:
    # the same sequence of random draws as before, minus the dead code
    # (unused word count and a no-op list copy).
    augmented_sentences = [
        " ".join(random_deletion(words, p_rd)) for _ in range(num_aug)
    ]
    random.shuffle(augmented_sentences)

    return augmented_sentences

In [11]:
def rs_EDA(sentence, alpha_rs=0.1, num_aug=9):
    """Generate ``num_aug`` random-swap variants of ``sentence``.

    Args:
        sentence: Input text. It is split on single spaces and empty tokens
            (produced by consecutive spaces) are discarded.
        alpha_rs: Fraction of the token count used as the number of swaps
            per variant; at least one swap is always requested.
        num_aug: Number of augmented sentences to produce.

    Returns:
        A list of ``num_aug`` space-joined sentences in shuffled order.
    """
    words = [word for word in sentence.split(' ') if word != ""]

    # max(1, ...) means a swap is attempted even for very short sentences.
    n_rs = max(1, int(alpha_rs * len(words)))

    # One random_swap call per variant, in order, then a single shuffle:
    # the same sequence of random draws as before, minus the no-op list copy.
    augmented_sentences = [
        " ".join(random_swap(words, n_rs)) for _ in range(num_aug)
    ]
    random.shuffle(augmented_sentences)

    return augmented_sentences

In [12]:
# Load the training CSV and split it on whether the label is 'no_relation'.
train = pd.read_csv('C:/Users/Admin/Desktop/Boostcamp-AI-Tech/dataset/train/train.csv')
is_no_relation = train['label'] == 'no_relation'
no_relation_df = train[is_no_relation]
other_df = train[~is_no_relation]

In [13]:
def extract_word(x):
    """Return the ``word`` value from a stringified entity dict.

    ``x`` is expected to look like
    ``"{'word': 'Beatles', 'start_idx': 24, 'end_idx': 26, 'type': 'ORG'}"``.

    The string is first parsed as a Python literal, so a word that itself
    contains ``,`` or ``:`` (for example ``'1,000'`` or ``'10:30'``) is
    returned intact. Plain splitting on those characters truncates it.
    If the string is not a valid literal dict with a string ``word`` entry,
    the original split-and-slice parsing is used as a fallback.

    Args:
        x: Entity column value as text.

    Returns:
        The entity surface form.
    """
    import ast  # local import keeps this block self-contained

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, dict) and isinstance(parsed.get('word'), str):
        return parsed['word']

    # Fallback: text after the last ':' of the first comma-separated field,
    # with the surrounding quote characters sliced off.
    return x.split(',')[0].split(':')[-1].strip()[1:-1]

def extract_type(x):
    """Return the text after the last ':' of the last comma-separated field.

    The field is stripped of surrounding whitespace and then its first and
    last characters are sliced off. For an input ending in ``'type': 'ORG'}``
    this yields ``ORG'``: the closing brace is removed but the closing quote
    is kept. ``ratio_aug`` relies on that trailing quote when it rebuilds the
    entity string, so the slicing must not be "corrected" here in isolation.
    """
    last_field = x.rpartition(',')[2]
    raw_value = last_field.rpartition(':')[2].strip()
    return raw_value[1:-1]

def find_index(sentence, word):
    """Locate the first occurrence of ``word`` in ``sentence``.

    Returns:
        ``(start, end)`` where ``start`` is the result of ``str.find`` and
        ``end`` is the inclusive index of the last matched character. When
        ``word`` is absent, ``start`` is -1 and ``end`` is ``len(word) - 2``.
    """
    first = sentence.find(word)
    last = first + len(word) - 1
    return first, last

In [15]:
# Entity surface forms for every training row, as plain Python lists.
original_subject = [extract_word(entity) for entity in train['subject_entity']]
original_object = [extract_word(entity) for entity in train['object_entity']]

In [16]:
def _entity_string(word, start_idx, end_idx, type_tail):
    """Serialise one entity in the dict-like text format written for augmented rows.

    No closing quote is added after ``type_tail``: the value produced by
    ``extract_type`` already ends with the quote character, so the result
    closes as ``'type': 'ORG'}``.
    """
    return (
        "{'word': '" + word + "',  'start_idx': " + str(start_idx)
        + ",  'end_idx': " + str(end_idx) + ", 'type': '" + type_tail + "}"
    )


def ratio_aug(df, eda_list, index_list, subject_word_list, object_word_list):
    """Build a DataFrame of augmented rows that still contain both entities.

    Args:
        df: Source DataFrame, read positionally (``.iloc``) for the
            ``subject_entity``, ``object_entity``, ``label`` and ``source``
            columns.
        eda_list: One list of augmented sentences per source row. The lists
            are flattened through a NumPy array, and the length of the first
            one is used as the group size for all of them.
        index_list: ``index_list[k]`` is the positional row of ``df`` that
            ``eda_list[k]`` was generated from.
        subject_word_list: Subject surface forms, indexed by positional row.
        object_word_list: Object surface forms, indexed by positional row.

    Returns:
        A DataFrame with columns ``id``, ``sentence``, ``subject_entity``,
        ``object_entity``, ``label`` and ``source``: one row per augmented
        sentence in which both entity strings still occur. Every row keeps
        index label 0 (single-row frames concatenated without re-indexing).
        An empty DataFrame is returned when there is nothing to add.
    """
    # A label bucket may have produced no sentences at all; indexing
    # eda_list[0] unconditionally raised IndexError in that case.
    if len(eda_list) == 0:
        return pd.DataFrame()

    cnt = len(eda_list[0])
    flat_sentences = np.array(eda_list).reshape(-1)
    frames = []

    for i in tqdm(range(len(flat_sentences))):
        aug_sentence = flat_sentences[i]
        # Sentences are grouped in runs of `cnt` per source row.
        eda_idx = index_list[i // cnt]
        subject_word = subject_word_list[eda_idx]
        object_word = object_word_list[eda_idx]

        # Augmentation may have removed an entity; skip such sentences.
        if subject_word not in aug_sentence or object_word not in aug_sentence:
            continue

        # Offsets refer to the first occurrence in the augmented sentence.
        subj_start, subj_end = find_index(aug_sentence, subject_word)
        subj_type = extract_type(df['subject_entity'].iloc[eda_idx])
        obj_start, obj_end = find_index(aug_sentence, object_word)
        obj_type = extract_type(df['object_entity'].iloc[eda_idx])

        frames.append(pd.DataFrame([{
            'id': i,
            'sentence': aug_sentence,
            'subject_entity': _entity_string(subject_word, subj_start, subj_end, subj_type),
            'object_entity': _entity_string(object_word, obj_start, obj_end, obj_type),
            'label': df['label'].iloc[eda_idx],
            'source': df['source'].iloc[eda_idx],
        }]))

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the accumulated
    # frame on every iteration.
    return pd.concat(frames)

def EDA(df):
    """Augment the rows of ``df`` by label bucket and return the new rows.

    Each row is assigned to a bucket by its ``label`` value, augmented
    sentences are generated with ``rd_EDA`` / ``rs_EDA``, and ``ratio_aug``
    converts the buckets that are currently enabled (2 and 4) into
    DataFrames. The result is the concatenation of those frames with rows
    shuffled via ``DataFrame.sample(frac=1)`` and the index reset.

    NOTE(review): this function shares its name with ``EDA`` imported from
    ``koeda`` at the top of the notebook and therefore rebinds that name.

    Args:
        df: DataFrame with at least the ``sentence``, ``label``,
            ``subject_entity`` and ``object_entity`` columns (``ratio_aug``
            additionally reads ``source``).

    Returns:
        DataFrame containing only the augmented rows.
    """
    original_subject = df['subject_entity'].apply(extract_word).tolist()
    original_object = df['object_entity'].apply(extract_word).tolist()

    # Lists holding the augmented sentences, one list per label bucket
    eda_list1 = []
    eda_list2 = []
    eda_list4 = []
    eda_list9 = []

    # Positional DataFrame row indices of the sentences in the lists above
    index1 = [] 
    index2 = []
    index4 = []
    index9 = []

    for i in range(len(df)):
        # NOTE: the bare string below is an expression statement that is
        # evaluated and discarded on every iteration. It reads: "build the
        # lists according to the augmentation ratio, then branch as below".
        """
        증강비율에 따라 리스트 생성후 아래와같은 형식으로 분기
        """
        # Bucket "1": augmentation is currently disabled for these labels
        if df['label'].iloc[i] in ['org:top_members/employees', 'per:employee_of']:
            # eda_list1.append(rs_EDA(df['sentence'].iloc[i], num_aug=1))
            # eda_list1.append(rd_EDA(df['sentence'].iloc[i], num_aug=1))
            # index1.append(i)
            pass

        # Bucket "2": only the random-deletion call (num_aug=1) is active
        elif df['label'].iloc[i] in ['per:title', 'org:member_of', 'org:alternate_names', 'per:origin', 'org:place_of_headquarters', 'per:date_of_birth', 'per:alternate_names']:
            # eda_list2.append(rs_EDA(df['sentence'].iloc[i], num_aug=1))
            eda_list2.append(rd_EDA(df['sentence'].iloc[i], num_aug=1))
            index2.append(i)

        # Bucket "4": only the random-deletion call (num_aug=2) is active
        elif df['label'].iloc[i] in ['per:spouse', 'per:colleagues', 'per:parents', 'org:founded', 'org:members', 'per:date_of_death', 'org:product', 'per:children']:
            # eda_list4.append(rs_EDA(df['sentence'].iloc[i], num_aug=2))
            eda_list4.append(rd_EDA(df['sentence'].iloc[i], num_aug=2))
            index4.append(i)

        # Bucket "9": every remaining label
        # NOTE(review): two entries are appended to eda_list9 per row but only
        # one to index9, and the df9 line below is commented out, so these
        # sentences are generated (consuming random draws) but only used for
        # the shape print.
        else:
            eda_list9.append(rs_EDA(df['sentence'].iloc[i], num_aug=9))
            eda_list9.append(rd_EDA(df['sentence'].iloc[i], num_aug=9))
            index9.append(i)

    # Debug output: shapes of the per-bucket sentence lists
    print(np.array(eda_list1).shape, np.array(eda_list2).shape, np.array(eda_list4).shape, np.array(eda_list9).shape)
    result_df = pd.DataFrame()
    
    # Bucket 1 is disabled, so an empty frame stands in for it
    df1 = pd.DataFrame()
    # df1 = ratio_aug(df, eda_list1, index1, original_subject, original_object)
    df2 = ratio_aug(df, eda_list2, index2, original_subject, original_object)
    df4 = ratio_aug(df, eda_list4, index4, original_subject, original_object)
    # df9 = ratio_aug(df, eda_list9, index9, original_subject, original_object)

    df_list = [df1, df2, df4]  # [df1, df2, df4, df9]
    result_df = pd.concat(df_list, ignore_index=True)

    result_df = result_df.sample(frac=1)  # shuffle the rows randomly
    result_df = result_df.reset_index(drop=True)

    return result_df

In [17]:
# Re-seed both RNGs immediately before augmenting, then augment only the rows
# whose label is not 'no_relation' (other_df).
np.random.seed(1004)
random.seed(1004)
eda_df = EDA(other_df)

In [18]:
eda_df

In [19]:
# Append the augmented rows to the original training data with a fresh
# 0..n-1 index, then renumber the `id` column to match.
merge_df = pd.concat([train, eda_df], ignore_index=True)
merge_df['id'] = list(range(len(merge_df)))

In [20]:
merge_df

In [21]:
# merge_df.to_csv("eda_train.csv", index=False, encoding="utf-8-sig")

# Check EDA_Data

In [22]:
merge_df

In [23]:
import os
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from collections import Counter

In [24]:
# Horizontal bar chart of label frequencies in bs_merge_cnen.csv, with each
# bar annotated by its count.
bs = pd.read_csv('bs_merge_cnen.csv')
# Count once and reuse. The loop previously recomputed value_counts() on each
# pass and read it with an integer key on a label-indexed Series, which is
# deprecated positional fallback in recent pandas.
bs_label_counts = bs['label'].value_counts()
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
ax = sns.countplot(y='label', data=bs, order=bs_label_counts.index)
for i, count in enumerate(bs_label_counts):
    # Place the count at the end of bar i, nudged down by 0.3.
    ax.text(x=count, y=i + 0.3, s=count)
plt.show()

In [25]:
len(bs)

In [26]:
# Horizontal bar chart of label frequencies in merge_df, with each bar
# annotated by its count.
# Count once and reuse. The loop previously recomputed value_counts() on each
# pass and read it with an integer key on a label-indexed Series, which is
# deprecated positional fallback in recent pandas.
merge_label_counts = merge_df['label'].value_counts()
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
ax = sns.countplot(y='label', data=merge_df, order=merge_label_counts.index)
for i, count in enumerate(merge_label_counts):
    # Place the count at the end of bar i, nudged down by 0.3.
    ax.text(x=count, y=i + 0.3, s=count)
plt.show()

In [27]:
merge_df['label'].value_counts()