In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!mkdir my_data

In [2]:
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_20190620_small.txt

지정된 경로를 찾을 수 없습니다.
curl: (3) URL using bad/illegal format or missing URL


In [25]:
!pip install transformers



In [26]:
import transformers

In [27]:
transformers.__version__

'4.15.0'

In [28]:
!mkdir wordPieceTokenizer

mkdir: cannot create directory ‘wordPieceTokenizer’: File exists


In [29]:
from tokenizers import BertWordPieceTokenizer

# Initialize an empty tokenizer
wp_tokenizer = BertWordPieceTokenizer(
    clean_text=True,   # ["이순신", "##은", " ", "조선"] ->  ["이순신", "##은", "조선"]
    # if char == " " or char == "\t" or char == "\n" or char == "\r":
    handle_chinese_chars=True,  # 한자는 모두 char 단위로 쪼개버립니다.
    strip_accents=False,    # True: [YehHamza] -> [Yep, Hamza]
    lowercase=False,    # Hello -> hello
)

# And then train
wp_tokenizer.train(
    files="my_data/wiki_20190620_small.txt",
    vocab_size=20000,   # vocab size 를 지정해줄 수 있습니다.
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    wordpieces_prefix="##"
)

# Save the files
wp_tokenizer.save_model("wordPieceTokenizer", "my_tokenizer")

['wordPieceTokenizer/my_tokenizer-vocab.txt']

In [30]:
print(wp_tokenizer.get_vocab_size())

20000


In [31]:
text = "이순신은 [MASK] 중기의 무신이다."
tokenized_text = wp_tokenizer.encode(text)
print(tokenized_text.tokens)
print(tokenized_text.ids)

['이', '##순', '##신은', '[MASK]', '중', '##기의', '무신', '##이다', '.']
[705, 1304, 7573, 4, 754, 2602, 13158, 1895, 16]


In [32]:
import torch
torch.cuda.is_available()

True

In [33]:
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast #bertformaskedLM

In [34]:
tokenizer = BertTokenizerFast(
    vocab_file='/content/wordPieceTokenizer/my_tokenizer-vocab.txt',
    max_len=128,
    do_lower_case=False,
    )

In [35]:
print(tokenizer.tokenize("은 [MASK] 조선 중기의 무신이다."))

['은', '[', 'M', '##AS', '##K', ']', '조선', '중', '##기의', '무신', '##이다', '.']


In [36]:
tokenizer.add_special_tokens({'mask_token':'[MASK]'})
print(tokenizer.tokenize("이순신은 [MASK] 중기의 무신이다."))

['이', '##순', '##신은', '[MASK]', '중', '##기의', '무신', '##이다', '.']


In [37]:
config = BertConfig(    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig
    vocab_size=20000, # default는 영어 기준이므로 내가 만든 vocab size에 맞게 수정해줘야 함
    # hidden_size=512,
    # num_hidden_layers=12,    # layer num
    # num_attention_heads=8,    # transformer attention head number
    # intermediate_size=3072,   # transformer 내에 있는 feed-forward network의 dimension size
    # hidden_act="gelu",
    # hidden_dropout_prob=0.1,
    # attention_probs_dropout_prob=0.1,
    max_position_embeddings=128,    # embedding size 최대 몇 token까지 input으로 사용할 것인지 지정
    # type_vocab_size=2,    # token type ids의 범위 (BERT는 segmentA, segmentB로 2종류)
    # pad_token_id=0,
    # position_embedding_type="absolute"
)

model = BertForPreTraining(config=config)
model.num_parameters()

101720098

In [38]:
from transformers import DataCollatorForLanguageModeling

In [39]:
import torch
from torch.utils.data.dataset import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer
from typing import Dict, List, Optional
import os
import json
import pickle
import random
import time
import warnings

from filelock import FileLock

from transformers.utils import logging

logger = logging.get_logger(__name__)


In [40]:
class TextDatasetForNextSentencePrediction(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    ):
        # 여기 부분은 학습 데이터를 caching하는 부분입니다 :-)
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
        self.short_seq_probability = short_seq_probability
        self.nsp_probability = nsp_probability

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_nsp_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        self.tokenizer = tokenizer

        lock_path = cached_features_file + ".lock"

        # Input file format:
        # (1) One sentence per line. These should ideally be actual sentences, not
        # entire paragraphs or arbitrary spans of text. (Because we use the
        # sentence boundaries for the "next sentence prediction" task).
        # (2) Blank lines between documents. Document boundaries are needed so
        # that the "next sentence prediction" task doesn't span between documents.
        #
        # Example:
        # I am very happy.
        # Here is the second sentence.
        #
        # A new document.

        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else: # 캐시가 없는 경우
                logger.info(f"Creating features from dataset file at {directory}")
                # 여기서부터 본격적으로 dataset을 만듭니다.
                self.documents = [[]] # document 단위로 학습이 이뤄짐
                with open(file_path, encoding="utf-8") as f:
                    while True: # 일단 문장을 읽고
                        line = f.readline()
                        if not line:
                            break
                        line = line.strip() # 필수!!!!

                        # 이중 띄어쓰기가 발견된다면, 나왔던 문장들을 모아 하나의 문서로 묶어버립니다.
                        # 즉, 문단 단위로 데이터를 저장합니다.
                        if not line and len(self.documents[-1]) != 0:
                            self.documents.append([])
                        tokens = tokenizer.tokenize(line)
                        tokens = tokenizer.convert_tokens_to_ids(tokens)
                        if tokens:
                            self.documents[-1].append(tokens)
                # 이제 코퍼스 전체를 읽고, 문서 데이터를 생성했습니다! :-)
                logger.info(f"Creating examples from {len(self.documents)} documents.")
                self.examples = []
                # 본격적으로 학습을 위한 데이터로 변형시켜볼까요?
                for doc_index, document in enumerate(self.documents):
                    self.create_examples_from_document(document, doc_index) # 함수로 가봅시다.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
        """Creates examples for a single document."""
        # 문장의 앞, 뒤에 [CLS], [SEP] token이 부착되기 때문에, 내가 지정한 size에서 2 만큼 빼줍니다.
        # 예를 들어 128 token 만큼만 학습 가능한 model을 선언했다면, 학습 데이터로부터는 최대 126 token만 가져오게 됩니다.
        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)

        # We *usually* want to fill up the entire sequence since we are padding
        # to `block_size` anyways, so short sequences are generally wasted
        # computation. However, we *sometimes*
        # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
        # sequences to minimize the mismatch between pretraining and fine-tuning.
        # The `target_seq_length` is just a rough target however, whereas
        # `block_size` is a hard limit.

        # 여기가 재밌는 부분인데요!
        # 위에서 설명했듯이, 학습 데이터는 126 token(128-2)을 채워서 만들어지는게 목적입니다.
        # 하지만 나중에 BERT를 사용할 때, 126 token 이내의 짧은 문장을 테스트하는 경우도 분명 많을 것입니다 :-)
        # 그래서 short_seq_probability 만큼의 데이터에서는 2-126 사이의 random 값으로 학습 데이터를 만들게 됩니다.
        target_seq_length = max_num_tokens
        if random.random() < self.short_seq_probability:
            target_seq_length = random.randint(2, max_num_tokens)

        current_chunk = []  # a buffer stored current working segments
        current_length = 0
        i = 0

        # 데이터 구축의 단위는 document 입니다
        # 이 때, 무조건 문장_1[SEP]문장_2 이렇게 만들어지는 것이 아니라,
        # 126 token을 꽉 채울 수 있게 문장_1+문장_2[SEP]문장_3+문장_4 형태로 만들어질 수 있습니다.
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1
                    # 여기서 문장_1+문장_2 가 이루어졌을 때, 길이를 random하게 짤라버립니다 :-)
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)
                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])
                    # 이제 [SEP] 뒷 부분인 segmentB를 살펴볼까요?
                    tokens_b = []
                    # 50%의 확률로 랜덤하게 다른 문장을 선택하거나, 다음 문장을 학습데이터로 만듭니다.
                    if len(current_chunk) == 1 or random.random() < self.nsp_probability:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # This should rarely go for more than one iteration for large
                        # corpora. However, just to be careful, we try to make sure that
                        # the random document is not the same as the document
                        # we're processing.
                        for _ in range(10):
                            random_document_index = random.randint(0, len(self.documents) - 1)
                            if random_document_index != doc_index:
                                break
                        # 여기서 랜덤하게 선택합니다 :-)
                        random_document = self.documents[random_document_index]
                        random_start = random.randint(0, len(random_document) - 1)
                        for j in range(random_start, len(random_document)):
                            tokens_b.extend(random_document[j])
                            if len(tokens_b) >= target_b_length:
                                break
                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    # Actual next
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    # 이제 126 token을 넘는다면 truncation을 해야합니다.
                    # 이 때, 126 token 이내로 들어온다면 행위를 멈추고,
                    # 만약 126 token을 넘는다면, segmentA와 segmentB에서 랜덤하게 하나씩 제거합니다.
                    def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
                        """Truncates a pair of sequences to a maximum sequence length."""
                        while True:
                            total_length = len(tokens_a) + len(tokens_b)
                            if total_length <= max_num_tokens:
                                break
                            trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
                            assert len(trunc_tokens) >= 1
                            # We want to sometimes truncate from the front and sometimes from the
                            # back to add more randomness and avoid biases.
                            if random.random() < 0.5:
                                del trunc_tokens[0]
                            else:
                                trunc_tokens.pop()

                    truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    # add special tokens
                    input_ids = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
                    # add token type ids, 0 for sentence a, 1 for sentence b
                    token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
                    
                    # 드디어 아래 항목에 대한 데이터셋이 만들어졌습니다! :-)
                    # 즉, segmentA[SEP]segmentB, [0, 0, .., 0, 1, 1, ..., 1], NSP 데이터가 만들어진 것입니다 :-)
                    # 그럼 다음은.. 이 데이터에 [MASK] 를 씌워야겠죠?
                    example = {
                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
                        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                        "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
                    }

                    self.examples.append(example)

                current_chunk = []
                current_length = 0

            i += 1

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

In [41]:
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path='/content/my_data/wiki_20190620_small.txt',
    block_size=128,
    overwrite_cache=True,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)


Token indices sequence length is longer than the specified maximum sequence length for this model (137 > 128). Running this sequence through the model will result in indexing errors


In [42]:
for example in dataset.examples[0:1]:
    print(example)

{'input_ids': tensor([    2,  4354,   639,     5,  5497,     5,  5504, 11105,  2492,  2428,
         2780,  1968,  5379,  3120,  1940,  2407,    16,  5497, 10307, 16248,
          552,  1266,   822,  1024,  1207,   931, 16491, 12283,  1095,  3667,
           16,  6532,  8934,  1077,  2677,  1906,    16,     3,   727,  1052,
           93,  7742,    93, 10411,  1007, 18365,  3483, 18887,    16,  6439,
         1968,  4021,   279,  3361,   657,  1401,  2105,  1933, 17664,    93,
          439,  1114,  2137,     1,  2023,  4086, 17983,    16,  2062,   496,
         2734,     5, 17664, 12973,     5,   379,  7718,    16,  4186,  6532,
          750,   541,  1018,  2795,  4860,  5152,  4776,   176, 11840, 15561,
          654,  2786,  9394,  1946,  2370,  2895,  2054,    14,  9869,  6532,
          750,   762,  1116,  6463,  5152,  2823,  3951,  6532,   750,   762,
         2092,  6208,  1899,    16,  2829,  4530,   727, 16248, 18217,  2661,
        17631, 13570,  2489,    14,     3]), 'toke

In [43]:
data_collator = DataCollatorForLanguageModeling(    # [MASK] 를 씌우는 것은 저희가 구현하지 않아도 됩니다! :-)
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [44]:
for example in dataset.examples[-2:-1]:
    print(example)

{'input_ids': tensor([    2,   757,  3609,  1907,  3751,  4410,  2049,  2944,  2178,  1896,
         1988,  3610,  2967,  1885,  7858,  1118,  2061,    16,  3954,  4326,
         1227, 11705,  2189,  1015,  7815, 13051,  1987,  9863,    16,   757,
           14, 17896, 15385, 14780, 12216, 11234,  6428,  1988,  2412, 10610,
         1022, 12379,  1991,  5982,  9863,    16,     3, 17259,   705, 17869,
         1903,  7274, 14064,  3013,  9005,  7578,  2025, 15151,  3296,  1261,
         2758,  1897,    16,  9005,  9642, 17252,  1027,  5614,    14,  2418,
         2886,  4867, 19579,  1903,  2713,  3123,  4846,    16,  9005,  1094,
         8604,  3184,   705,  3662,  4060,  1976,  2115,  2454,  2301,  8025,
        10911,  9601,  8604, 10848,  5920,    16,     3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [45]:
mlm_data=data_collator(dataset.examples)

In [46]:
mlm_data

{'input_ids': tensor([[    2,     4,   639,  ...,  2489,    14,     3],
        [    2,  2761,     4,  ...,  5057,    16,     3],
        [    2,  5961, 16532,  ...,     4,  2286,     3],
        ...,
        [    2, 16634,  2036,  ...,    22,  2651,     3],
        [    2,   757,  3609,  ...,     0,     0,     0],
        [    2,  7001,     4,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'next_sentence_label': tensor([0, 1, 1,  ..., 1, 1, 0]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ -100,  4354,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100, 1

In [47]:
print(data_collator(dataset.examples)['input_ids'][0])

tensor([    2,  4354,   639,     5,  5497,     5,  5504, 11105,  2492,  2428,
         2780,  1968,  5379,  3120,  1940,  2407,    16,  5497,     4, 16248,
          552,  1266,   822,  1024,  1207,   931, 16491, 12283,  1095,  3667,
           16,  6532,  8934,  1077,  2677,  1906,    16,     3,   727,  1052,
           93,  7742,    93, 10411,  1007, 18365,  3483, 18887,    16,  6439,
         1968,  4021,   279,  3361,     4,  1401,  2105,  1933, 17664,    93,
          439,  1114, 19303,     1,  2023,  4086, 17983,    16,  2062,   496,
         2734,     5, 17664, 12973,     5,   379,  7718,    16,  4186,     4,
          750,   541,   711,  2795,  4860,  5152,  4776,   176,     4, 15561,
          654,  2786,     4,  1946,     4,  2895,  2054,    14,  9869,  6532,
          750,   762,  1116,  6463,  5152,  2823,  3951,  6532,   750,   762,
         2092,     4,  1899,    16,  2829,  4530,   727,  4783, 18217,  2661,
        17631, 13570,  2489,    14,     3])


In [48]:
print(data_collator(dataset.examples)['labels'][0])

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  1968,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  1077,  2677,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,    16,  -100,
         -100,  -100,  -100,  3361,  -100,  1401,  2105,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,    16,  -100,   496,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  6532,
         -100,  -100,  1018,  -100,  4860,  -100,  -100,  -100, 11840,  -100,
         -100,  -100,  -100,  -100,  -100,  2895,  2054,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         2092,  -100,  1899,  -100,  2829,  -100,   727,  -100, 18217,  -100,
         -100,  -100,  2489,  -100,  -100])


In [49]:
tokenizer.decode(data_collator(dataset.examples)['input_ids'][0].tolist())

'[CLS] [MASK] 얼 " 지미 " 카터 상계 [MASK] 민주당 출신 미국 39번째 대통령 이다. 지미 [MASK] 조지아주 섬터 카운티 플레인스 마을에서 [MASK]. 조지아 [MASK]를 [MASK] [MASK] [MASK] [SEP] 전함 [MASK] 원자력 · 잠수함의 승무원으로 일하였다. 1953년 미국 해군 대위로 예편하였고 이후 땅콩 · 면화 등을 [UNK] 많은 돈을 벌었다. 그의 별 [MASK] " 땅콩 [MASK] " [MASK] 알려졌다. 1962년 섭 [MASK] 선형의원 의원 선거에서 낙선 [MASK] 그 [MASK] 부정선거 전쟁이음을 입증하게 되어 당선되고, 1966년 조지아 주 지사 선거에 낙선하지만 1970년 조지아 [MASK] 지사를 역임했다. 대통령이 되기 전 조지아주 상원의원을 두번 연임했으며, [SEP]'

In [50]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_gpu_train_batch_size=16,
    save_steps=1000, # step 수마다 모델을 저장
    save_total_limit=2, # 마지막 두 모델 빼고 과거 모델은 삭제
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator, # 밥을 어떻게 떠먹여줄지
    train_dataset=dataset # 밥이 뭔지
)

In [51]:
trainer.train() # wiki 전체 데이터로 학습 시, 1 epoch에 9시간 정도 소요됩니다!! 

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 2726
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 342
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
100,9.7332
200,9.3155
300,9.1853




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=342, training_loss=9.384090222810444, metrics={'train_runtime': 83.2172, 'train_samples_per_second': 65.515, 'train_steps_per_second': 4.11, 'total_flos': 352718194962000.0, 'train_loss': 9.384090222810444, 'epoch': 2.0})

In [None]:
a= 1+2

In [None]:
a