# GPT-2 학습해보기

> 작성자      
```
* 김성현 (bananaband657@gmail.com)  
김바다 (qkek983@gmail.com)
박상희 (parksanghee0103@gmail.com)  
이정우 (jungwoo.l2.rs@gmail.com)
```
[CC BY-NC-ND](https://creativecommons.org/licenses/by-nc-nd/2.0/kr/)

이번 시간엔 한국어 코퍼스를 활용해, 직접 한국어 GPT-2를 학습해보겠습니다.

In [1]:
!pip install transformers



In [2]:
import torch
torch.cuda.is_available()

True

역시 위키 데이터를 가져와볼까요?

In [3]:
!mkdir my_data
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_20190620_small.txt

mkdir: cannot create directory ‘my_data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   408    0   408    0     0    413      0 --:--:-- --:--:-- --:--:--   413
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 1323k  100 1323k    0     0  1063k      0  0:00:01  0:00:01 --:--:-- 1063k


In [4]:
path = "/content/my_data/wiki_20190620_small.txt"

지금까지는 BertWordPieceTokenizer를 사용해왔다면,   
이번에는 SentencePiceBPETokenizer를 사용해 모델을 학습해보겠습니다.

각 tokenizer의 차이는 허훈님의 블로그 [여기](https://huffon.github.io/2020/07/05/tokenizers/) 에서 확인하실 수 있습니다.



In [5]:
from tokenizers import SentencePieceBPETokenizer
from tokenizers.normalizers import BertNormalizer

tokenizer = SentencePieceBPETokenizer()

tokenizer._tokenizer.normalizer = BertNormalizer(clean_text=True,
handle_chinese_chars=False,
lowercase=False)

tokenizer.train(
    path,
    vocab_size=10000,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
    ],
)



In [6]:
print(tokenizer.encode("이순신은 조선 중기의 무신이다."))
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").ids)
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").tokens)
print(tokenizer.decode(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids, skip_special_tokens=True))
# SentencePiece를 사용하면, 나중에 decoding 과정에서 '_' 만 ' '로 replace해주면 띄어쓰기 복원이 가능해집니다.

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[1005, 579, 6613, 1303, 1041, 2071, 1136, 596, 1033]
['▁이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.']
이순신은 조선 중기의 무신이다.


In [7]:
tokenizer.save_model(".")

['./vocab.json', './merges.txt']

In [8]:
tokenizer = SentencePieceBPETokenizer.from_file(vocab_filename="vocab.json", merges_filename="merges.txt")

In [9]:
print(tokenizer.encode("이순신은 조선 중기의 무신이다."))
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").ids)
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").tokens)
print(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").tokens)
print(tokenizer.decode(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids, skip_special_tokens=True))

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[1005, 579, 6613, 1303, 1041, 2071, 1136, 596, 1033]
['▁이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.']
['▁<', 's', '>', '이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.', '<', '/s', '>']
<s>이순신은 조선 중기의 무신이다.</s>


In [10]:
tokenizer.add_special_tokens(["<s>", "</s>", "<unk>", "<pad>", "<shkim>"])
tokenizer.pad_token_id = tokenizer.token_to_id("<pad>")
tokenizer.unk_token_id = tokenizer.token_to_id("<unk>")
tokenizer.bos_token_id = tokenizer.token_to_id("<bos>")
tokenizer.eos_token_id = tokenizer.token_to_id("<eos>")

print(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids)
print(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").tokens)
print(tokenizer.decode(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids, skip_special_tokens=True))

[0, 1005, 579, 6613, 1303, 1041, 2071, 1136, 596, 1033, 2]
['<s>', '▁이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.', '</s>']
이순신은 조선 중기의 무신이다.


In [11]:
from transformers import GPT2Config, GPT2LMHeadModel
# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.get_vocab_size(),
  bos_token_id=tokenizer.token_to_id("<s>"),
  eos_token_id=tokenizer.token_to_id("</s>"),
)
# creating the model
model = GPT2LMHeadModel(config)

In [12]:
model.num_parameters()

93523200

In [13]:
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional

import torch
from torch.utils.data.dataset import Dataset

from filelock import FileLock

from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)

class TextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        cache_dir: Optional[str] = None,
    ):
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        block_size = block_size - tokenizer.num_special_tokens_to_add(is_pair=False)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )

            else:
                logger.info(f"Creating features from dataset file at {directory}")
                # 여기서부터 본격적으로 데이터셋을 만들기 시작합니다.
                self.examples = []
                text = ""
                with open(file_path, encoding="utf-8") as f:
                    lines = f.readlines()
                    for line in lines:
                        line = line.strip()
                        line = "<s>"+line+"</s>" # 학습 데이터 앞 뒤에 문장 구분 기호를 추가해줍니다.
                        text += line    # 'text' 객체에 모든 학습 데이터를 다 합쳐버립니다 :-)
                tokenized_text = tokenizer.encode(text).ids

                # 모델의 최대 sequence length만큼 데이터를 잘라서 저장합니다.
                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenized_text[i : i + block_size]
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)

In [14]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=path,
    block_size=128,
)
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(    # GPT는 생성모델이기 때문에 [MASK] 가 필요 없습니다 :-)
    tokenizer=tokenizer, mlm=False,
)

In [15]:
print(dataset[0])

tensor([   0, 3997, 3546, 8406,  463,    4, 5481, 9528, 1798, 1890, 2297, 1262,
        9626, 2679, 1188, 2174,    2,    0, 5709, 5481,  254, 6466,  751, 3426,
         873, 1556,  681,  895, 1627, 9223,  588, 3621, 1010, 3303,    2,    0,
        6466, 7418, 2305,  403, 2217, 1074,    2,    0, 1013, 1107, 3716,  647,
        8576, 1024,  940,   92, 7323,  371,   92,  722, 9295,  706, 1651,  453,
        3166, 1032, 1074,    2,    0, 6343, 1262, 3716, 1009, 2932, 1176,  913,
        2037, 1171, 3227,  845,   92,  439,  974, 1486, 1017,    3, 1323, 3914,
        2094, 1042,    2,    0, 1382, 2068, 2225, 1095,  325,  845, 1823,  507,
           4, 1242, 7698,    2,    0, 3897, 6466, 1053, 1077,  687, 2318, 4649,
        5204, 5672, 1013, 1759,  117, 2742, 3004,  106,  656, 2283, 9765, 1192,
        1796, 2449, 2546, 9938, 6466, 1053, 1037,  535])


In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=32, # 512:32  # 128:64
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100

)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)


In [17]:
trainer.train()

Step,Training Loss
100,7.9962
200,7.5347
300,7.2872
400,7.0269
500,6.7977
600,6.5901
700,6.3854
800,6.2146
900,6.0618
1000,5.9002


TrainOutput(global_step=3000, training_loss=5.556810953776042, metrics={'train_runtime': 2641.4214, 'train_samples_per_second': 1.136, 'total_flos': 6812678799360000.0, 'epoch': 50.0, 'init_mem_cpu_alloc_delta': 2800807936, 'init_mem_gpu_alloc_delta': 387416064, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 15409152, 'train_mem_gpu_alloc_delta': 1150346752, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6587866112})

In [18]:
trainer.save_model()

In [19]:
USE_GPU = 1
# Device configuration
device = torch.device('cuda' if (torch.cuda.is_available() and USE_GPU) else 'cpu')

In [20]:
import torch
torch.manual_seed(42)

input_ids = torch.tensor(tokenizer.encode("<s>이순신", add_special_tokens=True).ids).unsqueeze(0).to('cuda')

output_sequences = model.generate(input_ids=input_ids, do_sample=True, max_length=100, num_return_sequences=3)
for generated_sequence in output_sequences:
    generated_sequence = generated_sequence.tolist()
    print("GENERATED SEQUENCE : {0}".format(tokenizer.decode(generated_sequence, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


GENERATED SEQUENCE : 이순신.
GENERATED SEQUENCE : 이순신공 비읽 수도 있다.
GENERATED SEQUENCE : 이순신 정선죽우작젖을 여러 마생리가 수 있었다.
