In [99]:
from datasets import load_from_disk
from typing import List, Tuple
import numpy as np
from transformers import AutoTokenizer

In [128]:
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')
ids = [0, 3845, 2125, 4004, 2446, 2084, 7604, 2079, 25087, 2052, 860, 1644, 2073, 35, 2]
tokenizer.convert_ids_to_tokens(ids)

['[CLS]',
 '현대',
 '##적',
 '인사',
 '##조',
 '##직',
 '##관리',
 '##의',
 '시발점',
 '##이',
 '된',
 '책',
 '##은',
 '?',
 '[SEP]']

In [162]:
def add_ner_tag(examples):
    """
    데이터 증강 시 map에서 적용할 함수.
    batch 데이터를 받으면 "question" 문장들을 변형시켜 batch 데이터를 반환함.
    """ 
    cls_token = 0
    sep_token = 2
    mask_token = 4
    ratio = 0.3
    MAX_MASK_NUM = 2

    
    
    tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')
    tokenized_examples = tokenizer(
            examples['question'],
            examples['context'],
            truncation="only_second",
            max_length=384,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=False, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
            padding="max_length",
        )

    
    # print(tokenized_examples['input_ids'])
    # question_include_context_ids = tokenized_examples['input_ids'][0]
    # 
    new_input_ids = []
    past_question = []
    past_masked_question = []
    for question_include_context_ids in tokenized_examples['input_ids']:
        
        question = []
        for input_id in question_include_context_ids:
            if input_id == cls_token :
                continue
            if input_id == sep_token :
                break
            question.append(input_id)
        
        new_sentence = past_question != question

        past_question = question

        if new_sentence:
            mask = np.random.rand(len(question)) < ratio
            if sum(mask) > MAX_MASK_NUM:
                mask_idx = np.where(mask)
                set_false_pos = np.random.choice(mask_idx[0], sum(mask) - MAX_MASK_NUM, replace=False)
                mask[set_false_pos] = False
            masked_question = [mask_token if m else word for word, m in zip(question, mask)]
        else :
            masked_question = past_masked_question
        
        question_masked_ids = [cls_token] + masked_question + [sep_token] + question_include_context_ids[len(question)+2:]
        past_masked_question = masked_question

        print('-----here1------')
        print(question_masked_ids)
        new_input_ids.append(question_masked_ids)
        print('-----here2------')
        # print(new_input_ids)
        print('-----here3------')
    # print(len(question_include_context_ids))
    # print(question_include_context_ids)
    

    
    # print(len(question_masked_ids))
    # print(question_masked_ids)
    
    
    
    
    
    tokenized_examples['input_ids'] = new_input_ids
    # print('token_examples---------------------')
    # print(tokenized_examples['input_ids'])
    # print('token_examples---------------------')

    return {"question": question_masked_ids}

## 데이터 증강에 쓰일 데이터셋을 load합니다.

In [163]:
datasets = load_from_disk("/opt/ml/data/train_dataset")
train_dataset = datasets["train"]

## `datasets.Dataset.map()`을 이용하여 batch 단위로 데이터 증강을 진행합니다.

시간이 꽤 많이 소요됩니다. (-30분) 따라서 우선 sample 30개만으로 합니다.

In [164]:
train_dataset = train_dataset.select(range(30))  # sample

In [165]:
train_dataset_aug = train_dataset.map(
                        add_ner_tag,
                        batched=True,
                        batch_size=8
                    )

0, 3698, 2069, 3954, 2470, 3666, 2079, 7895, 8586, 2207, 2069, 554, 2259, 3728, 3860, 2073, 35, 2
0,    4,    4, 3954, 2470, 3666,    4, 7895, 8586,    4, 2069, 554, 2259,    4, 3860, 2073, 35, 2


-----here1------
[0, 3698, 2069, 3954, 2470, 3666, 2079, 7895, 8586, 2207, 2069, 4, 4, 3728, 3860, 2073, 35, 2, 3666, 10346, 2252, 4013, 3666, 10450, 12, 29963, 30605, 2041, 19148, 2012, 9230, 13, 1497, 1402, 2252, 2021, 2179, 3666, 4570, 2079, 10450, 28674, 18, 3, 81, 3, 81, 2044, 2226, 17352, 2052, 10450, 2079, 27345, 3622, 18, 544, 12881, 22, 2211, 2079, 10450, 5069, 2052, 6940, 2496, 2051, 3911, 2211, 2079, 10450, 5069, 6233, 3896, 2496, 2051, 1513, 2062, 18, 6724, 2259, 26, 2440, 2052, 2307, 16, 22, 2440, 10598, 3956, 2019, 2223, 1570, 21, 19, 23, 3292, 10450, 5069, 2069, 3755, 6940, 7488, 7145, 2170, 14352, 18, 3, 81, 3, 81, 2044, 2226, 10450, 2073, 3666, 11119, 2145, 2259, 4405, 2318, 3666, 3698, 2069, 12104, 6233, 1889, 2259, 3666, 7145, 7895, 2170, 4424, 5187, 2138, 1889, 2259, 3860, 28674, 18, 11119, 2052, 5387, 2145, 3674, 2170, 3618, 5851, 16, 3698, 2069, 3954, 2470, 8199, 2079, 4668, 2069, 25154, 2085, 5851, 2069, 554, 2088, 1513, 2259, 3735, 2069, 3661, 2205, 2259, 3860, 

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

-----here1------
[0, 3698, 2069, 3954, 2470, 3666, 2079, 7895, 4, 2207, 2069, 554, 4, 3728, 3860, 2073, 35, 2, 3666, 10346, 2252, 4013, 3666, 10450, 12, 29963, 30605, 2041, 19148, 2012, 9230, 13, 1497, 1402, 2252, 2021, 2179, 3666, 4570, 2079, 10450, 28674, 18, 3, 81, 3, 81, 2044, 2226, 17352, 2052, 10450, 2079, 27345, 3622, 18, 544, 12881, 22, 2211, 2079, 10450, 5069, 2052, 6940, 2496, 2051, 3911, 2211, 2079, 10450, 5069, 6233, 3896, 2496, 2051, 1513, 2062, 18, 6724, 2259, 26, 2440, 2052, 2307, 16, 22, 2440, 10598, 3956, 2019, 2223, 1570, 21, 19, 23, 3292, 10450, 5069, 2069, 3755, 6940, 7488, 7145, 2170, 14352, 18, 3, 81, 3, 81, 2044, 2226, 10450, 2073, 3666, 11119, 2145, 2259, 4405, 2318, 3666, 3698, 2069, 12104, 6233, 1889, 2259, 3666, 7145, 7895, 2170, 4424, 5187, 2138, 1889, 2259, 3860, 28674, 18, 11119, 2052, 5387, 2145, 3674, 2170, 3618, 5851, 16, 3698, 2069, 3954, 2470, 8199, 2079, 4668, 2069, 25154, 2085, 5851, 2069, 554, 2088, 1513, 2259, 3735, 2069, 3661, 2205, 2259, 3860, 2

ArrowInvalid: Column 5 named question expected length 8 but got length 384

## `datasets.Dataset.save_to_disk()`로 저장합니다.
`save_to_disk()`로 저장하면 `load_from_disk()`로 쉽게 불러올 수 있습니다.

In [None]:
datasets["train"] = train_dataset_aug
datasets.save_to_disk("/opt/ml/data/ner_only_train_dataset_3")

## `datasets.Dataset.load_from_disk()`로 load까지 성공했는지 확인합니다.

In [None]:
datasets_to_aug = load_from_disk("/opt/ml/data/ner_only_train_dataset_3/")
train_dataset_to_aug = datasets_to_aug["train"]
# type(train_dataset_to_aug)  # datasets.arrow_dataset.Dataset
len(train_dataset_to_aug)  # 30

FileNotFoundError: Directory /opt/ml/data/ner_only_train_dataset_3/ not found

In [76]:
a = {'a':20}

In [77]:
a.hello(10)

AttributeError: 'dict' object has no attribute 'hello'