In [3]:
import os
import random
import math
import sys
from dotenv import load_dotenv

import numpy as np
import torch
import wandb

from functools import partial
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    HfArgumentParser,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)

In [4]:
sys.path.append('../')

In [5]:
from dataloader import SumDataset
from processor import preprocess_function
from rouge import compute_metrics
from data_collator import DataCollatorForSeq2SeqWithDocType

## Dataset

In [6]:
types = ['magazine']
dataset_name = ['metamong1/summarization_' + dt for dt in types]

In [7]:
train_dataset = SumDataset(
    dataset_name,
    'train',
    shuffle_seed=1234,
    ratio=0.01,
    USE_AUTH_TOKEN=True
).load_data()

Reusing dataset magazine_summarization (/opt/ml/.cache/huggingface/datasets/metamong1___magazine_summarization/Magizine Summarization/1.0.0/506cb41eb0b96b084eafa5dd5fe3b51ff0d1061256700adf1aa92d3b19762c36)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /opt/ml/.cache/huggingface/datasets/metamong1___magazine_summarization/Magizine Summarization/1.0.0/506cb41eb0b96b084eafa5dd5fe3b51ff0d1061256700adf1aa92d3b19762c36/cache-ec04b3f04ce8063a.arrow


In [8]:
train_dataset

Dataset({
    features: ['doc_id', 'title', 'text', 'doc_type', 'file'],
    num_rows: 527
})

## Tokenizer

In [9]:
model_checkpoint = 'gogamza/kobart-base-v1'

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)



In [11]:
tokenizer.all_special_ids

[0, 1, 5, 3, 6]

In [12]:
for sep_id in tokenizer.all_special_ids :
    sep_token = tokenizer.convert_ids_to_tokens(sep_id)
    print(f'Special Id : {sep_id} \t Special Token : {sep_token}')

Special Id : 0 	 Special Token : <s>
Special Id : 1 	 Special Token : </s>
Special Id : 5 	 Special Token : <unk>
Special Id : 3 	 Special Token : <pad>
Special Id : 6 	 Special Token : <mask>


## Arguments

In [13]:
from args import DataTrainingArguments

In [14]:
data_args = DataTrainingArguments()
data_args

DataTrainingArguments(dataset_name='paper,news', text_column='text', summary_column='title', overwrite_cache=False, preprocessing_num_workers=4, max_source_length=1024, max_target_length=128, val_max_target_length=None, pad_to_max_length=False, max_train_samples=None, max_eval_samples=None, max_predict_samples=None, num_beams=None, ignore_pad_token_for_loss=True, use_auth_token_path='./use_auth_token.env', relative_sample_ratio=1.0, relative_eval_steps=10)

## Preprocess Data

In [15]:
def preprocess_function(examples, tokenizer, data_args):
    bos_token = tokenizer.bos_token
    eos_token = tokenizer.eos_token
    padding = "max_length" if data_args.pad_to_max_length else False

    inputs = examples['text']
    inputs = [bos_token + inp + eos_token for inp in inputs]

    model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
    return model_inputs

In [16]:
prep_fn  = partial(preprocess_function, tokenizer=tokenizer, data_args=data_args)
tokenized_dataset = train_dataset.map(
    prep_fn,
    batched=True,
    num_proc=1,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on train dataset",
)

Running tokenizer on train dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
tokenized_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'token_type_ids'],
    num_rows: 527
})

## Pipeline

In [18]:
from torch.utils.data import DataLoader
from transformers.data.data_collator import DataCollatorForLanguageModeling

In [19]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
    mlm=0.15)

In [22]:
data_loader = DataLoader(tokenized_dataset,
    batch_size=16,
    collate_fn=data_collator)

## Results

In [23]:
for data in data_loader :
    break

In [24]:
data.keys()

dict_keys(['attention_mask', 'input_ids', 'token_type_ids', 'labels'])

In [25]:
for key in data.keys() :
    print(f'Key : {key} \t Tensor Shape : {data[key].shape}')

Key : attention_mask 	 Tensor Shape : torch.Size([16, 785])
Key : input_ids 	 Tensor Shape : torch.Size([16, 785])
Key : token_type_ids 	 Tensor Shape : torch.Size([16, 785])
Key : labels 	 Tensor Shape : torch.Size([16, 785])


In [26]:
data['input_ids'][0][:20]

tensor([    0, 14287, 15832, 27607, 17881,     6, 13417, 14501,  9866,  9229,
        17908,     6, 17510,   299, 25873, 19632, 10213, 19553, 15533,     6])

In [27]:
data['labels'][0][:20]

tensor([ -100,  -100,  -100, 27607,  -100, 14063,  -100,  -100,  -100,  -100,
         -100, 21738,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 14501])

# Summary
  1. _torch_collate_batch 를 활용해서 Batch 안에서 Padding을 하고 Tensor로 만든다.
  2. tokenizer.get_special_tokens_mask를 활용해서 각 데이터 마다 special token의 위치에 Mask를 취한다.
  3. 그 외에 자리에서 torch.bernoulli를 활용해서 Mask이 될 토큰을 선정한다.
      * 80% Masking 처리
      * 10% Random으로 위치 변경
      * 10% 그대로
  4. tokenizer에 sepcial token의 위치 이외에서 Masking을 진행한다.

# Infilling
  방향
  1. 푸아송 분포에 따라서 값을 추출한다.
  2. 1번 값의 길이에 해당하는 text span을 선정해서 MASK Token으로 변형한다. 
      * 값이 0인 경우에는 MASK Token을 추가한다. 
  3. 그렇게 변형된 입력이 들어오면 모델을 통해서 원래의 입력을 맞춰야 한다.

## Data Collator 구현

### Libarary

In [28]:
import random
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

from transformers.file_utils import PaddingStrategy
from transformers.models.bert import BertTokenizer, BertTokenizerFast
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers.data.data_collator import (
        DataCollatorForLanguageModeling,
        _torch_collate_batch
)

In [29]:
poisson_nums = np.random.poisson(3, 10)
print(poisson_nums)

[4 5 4 1 4 3 3 5 3 4]


In [46]:

@dataclass
class DataCollatorForBartPretraining(DataCollatorForLanguageModeling):
    tokenizer: PreTrainedTokenizerBase
    poisson: int = 3
    label_pad_token_id: int = -100
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            ## input_ids를 배치안에서 최대 길이로 padding을 진행한다.
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        batch["input_ids"], batch["labels"] = self.torch_infilling(
            batch["input_ids"], poisson_num=self.poisson
        )

        # label에서 pad_token_id로 된 부분은 loss 계산에서 제외하기 위해서 label_pad_token_id로 변경한다.
        batch["labels"] = torch.where(batch["labels"] == tokenizer.pad_token_id, 
            self.label_pad_token_id, batch["labels"])

        return batch

    def torch_infilling(self, inputs: Any, poisson_num: Optional[int] = None) -> Tuple[Any, Any]:
        import torch
        labels = inputs.clone() # label은 기존 문장과 동일
        pad_token_id = tokenizer.pad_token_id
        mask_token_id = tokenizer.mask_token_id
        batch_size, seq_size = inputs.shape

        poisson_value_list = np.random.poisson(poisson_num, batch_size) # batch size에 맞게 possion 값 생성
        input_list = []
        max_size = 0
        for i, poisson in enumerate(poisson_value_list) :
            input_arr = list(inputs[i])
            poisson = poisson_value_list[i] # 해당 문장에 대한 poisson 값
            pad_ids = np.where(np.array(input_arr) == pad_token_id)[0] # pad의 첫번째 부분 파악
            pad_size = pad_ids[0] if len(pad_ids) > 0 else 0 

            sen_size = seq_size - pad_size # sequence size - padding size : 기존의 input size
            infilling_start = np.random.randint(sen_size-poisson) # poisson value의 길이에 맞는 span의 시작점 선정
            infilling_end = infilling_start + poisson

            input_arr = input_arr[:infilling_start] + [mask_token_id] + input_arr[infilling_end:] # poisson value에 해당되는 길이의 span 자체를 mask 처리
            max_size = max(max_size, len(input_arr)) # 문장의 길이 파악
            input_list.append(input_arr)

        input_infilling = []
        # 각각의 문장의 길이가 다르기 때문에 Batch 중에서 가장 긴 길이에 맞춰서 padding을 진행한다.
        for input_ids in input_list :
            if self.tokenizer.padding_side == 'right' :
                input_ids = input_ids + [pad_token_id] * (max_size - len(input_ids))
            else :
                input_ids = [pad_token_id] * (max_size - len(input_ids)) + input_ids
            input_infilling.append(torch.tensor(input_ids))
            
        input_infilling = torch.stack(input_infilling, dim=0)
        return input_infilling, labels


## Pipeline

In [47]:
data_collator = DataCollatorForBartPretraining(tokenizer=tokenizer,
    poisson=3)

In [48]:
data_loader = DataLoader(tokenized_dataset,
    batch_size=16,
    collate_fn=data_collator)

In [49]:
for data in data_loader :
    break

data.keys()

dict_keys(['attention_mask', 'input_ids', 'token_type_ids', 'labels'])

In [50]:
for key in data.keys() :
    print(f'Key : {key} \t Tensor Shape : {data[key].shape}')

Key : attention_mask 	 Tensor Shape : torch.Size([16, 785])
Key : input_ids 	 Tensor Shape : torch.Size([16, 786])
Key : token_type_ids 	 Tensor Shape : torch.Size([16, 785])
Key : labels 	 Tensor Shape : torch.Size([16, 785])


### Case 1

In [192]:
list(data['input_ids'][0]).index(6)

176

In [199]:
print('After Infilling')
print(tokenizer.decode(data['input_ids'][0][160:180]))
print('\nBefore Infilling')
print(tokenizer.decode(data['labels'][0][160:180]))

After Infilling
직면하자 애완견을 방패막이로 삼았다. "개인적으로 받은 것은<mask>반려견

Before Infilling
직면하자 애완견을 방패막이로 삼았다. "개인적으로 받은 것은 체커스(반


### Case2

In [194]:
list(data['input_ids'][11]).index(6)

158

In [198]:
print('After Infilling')
print(tokenizer.decode(data['input_ids'][11][150:180]))
print('\nBefore Infilling')
print(tokenizer.decode(data['labels'][11][150:180]))

After Infilling
제는 각 지방자치단체가 도시공원 등 도시<mask> 녹지를 20 년 이상 개발하지 않으면 공원지정 효력을 자동 해제해야 하는 제도다. 지금까지 지자

Before Infilling
제는 각 지방자치단체가 도시공원 등 도시계획시설로 정한 녹지를 20 년 이상 개발하지 않으면 공원지정 효력을 자동 해제해야 하는 제도


### Case 3

In [200]:
list(data['input_ids'][5]).index(6)

161

In [201]:
print('After Infilling')
print(tokenizer.decode(data['input_ids'][5][150:180]))
print('\nBefore Infilling')
print(tokenizer.decode(data['labels'][5][150:180]))

After Infilling
주장에 대해 7월 9일 페이스북을 통해 "G20이<mask> 새벽 1시 반이 되어서야 숙소로 돌아올 수 있었습니다. 함께 동행한 청와대

Before Infilling
주장에 대해 7월 9일 페이스북을 통해 "G20이 있던 첫째 날 대통령은 새벽 1시 반이 되어서야 숙소로 돌아올 수 있었습니다. 함께
