# 대회 데이터로 KoBERT를 fine-tuning 한 뒤 유사도 구하기

## 데이터 준비

In [1]:
from datasets import load_from_disk
# Load the competition dataset that was saved to disk; the displayed
# DatasetDict has a `train` and a `validation` split.
raw_dataset = load_from_disk("../data/train_dataset/")
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
        num_rows: 3952
    })
    validation: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
        num_rows: 240
    })
})

In [2]:
# Comment this out if the package is already installed.
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [68]:
import torch
from transformers import (
    BertModel, BertForMaskedLM, DataCollatorForLanguageModeling, AutoConfig,
    TrainingArguments, Trainer
)
from kobert_tokenizer import KoBERTTokenizer

# Load the config, tokenizer and masked-LM model from the same
# 'skt/kobert-base-v1' checkpoint.
# NOTE: the load warning recorded for this cell lists the `cls.predictions.*`
# weights as newly initialized, i.e. the MLM head is not taken from the
# checkpoint and only becomes meaningful after training.
config = AutoConfig.from_pretrained('skt/kobert-base-v1')
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = BertForMaskedLM.from_pretrained('skt/kobert-base-v1', config=config)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# Name of the dataset column whose text is fed to the tokenizer.
text_column_name = 'question'


def tokenize_func(examples):
    """Tokenize the `text_column_name` column of a batch of examples.

    Args:
        examples: Mapping from column name to that column's values for the
            batch (this function is passed to `map(..., batched=True)`).

    Returns:
        The tokenizer output for the selected column, truncated to at most
        512 tokens and including `special_tokens_mask`.
    """
    texts = examples[text_column_name]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )
    return encoded

In [70]:
# Tokenize every split in batches and drop all original columns so that only
# the tokenizer outputs remain in the resulting dataset.
column_names = raw_dataset['train'].column_names
tokenized_datasets = raw_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=column_names,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [71]:
# Only the tokenized `train` split is used from here on.
train_dataset = tokenized_datasets['train']

In [72]:
# Inspect one tokenized example.
# NOTE(review): in the recorded output `special_tokens_mask` flags the last
# two positions, while `input_ids` starts with id 2 and ends with id 3 —
# confirm the mask really marks the positions of the special tokens.
train_dataset[1]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [2,
  5051,
  7202,
  3769,
  7260,
  5478,
  7095,
  2959,
  6295,
  7224,
  1770,
  4457,
  7086,
  258,
  3],
 'special_tokens_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [73]:
# Number of training examples (matches the raw `train` split size).
train_dataset.num_rows

3952

In [74]:
# Disabled one-off check: scan the training set for the longest tokenized
# sequence. The result noted on the last line was 45.
# max_len = 0
# for i in range(train_dataset.num_rows):
#     seq_len = len(train_dataset['input_ids'][i])
#     if seq_len > max_len:
#         max_len = seq_len
        
# print(max_len) # 45

In [75]:
from torch.utils.data import DataLoader

# The language-modeling collator batches the tokenized examples; the collated
# batches carry a `labels` tensor in addition to the model inputs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

In [76]:
# Pull a single collated batch to confirm that `labels` is generated
# automatically, without building it by hand.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'attention_mask': torch.Size([8, 21]),
 'input_ids': torch.Size([8, 21]),
 'token_type_ids': torch.Size([8, 21]),
 'labels': torch.Size([8, 21])}

In [77]:
# Already checked.
# print(batch.input_ids)
# print(batch.labels)

In [101]:
# Why token_type_ids has the value 3?
# NOTE(review): the recorded model repr shows `token_type_embeddings` with
# only 2 rows, so id 3 is out of range for it. The 3s sit at the start of each
# row, which looks like left-side padding filled with the tokenizer's pad
# token-type id — TODO confirm against the tokenizer's padding settings.
for i in range(8):
    print(batch.token_type_ids[i])

tensor([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [108]:
# No error (Error when given token_type_ids)
# Forward pass with only `input_ids` and `attention_mask`; `token_type_ids`
# is deliberately left out of the call.
outputs = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask)

In [89]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW.
from torch.optim import AdamW

# weight_decay is pinned to 0.0 because torch's AdamW defaults to 0.01,
# whereas the transformers implementation (and the Trainer) default to 0.0.
# eps is left at torch's default of 1e-8.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0) # Trainer default

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [None]:
from transformers import get_scheduler

num_epochs = 3  # Trainer default

# Total number of optimizer updates: batches per epoch times epochs.
num_training_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule over the whole run, with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model there. The cell displays the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer update.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Drop `token_type_ids` before the forward pass: the collated batches
        # contain the value 3 in that tensor, but the model's token-type
        # embedding only has 2 rows, so forwarding it raises an index error
        # (the manual forward pass earlier in this notebook only works without
        # it). All remaining tensors are moved to the training device.
        batch = {
            k: v.to(device)
            for k, v in batch.items()
            if k != "token_type_ids"
        }
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Show the tokenizer's vocabulary size.
tokenizer.vocab_size

In [None]:
# Display the loaded model configuration.
config

In [None]:
# Show the maximum length the tokenizer reports.
# NOTE(review): the truncation warning recorded during tokenization says the
# model has no predefined maximum length — confirm this value is meaningful.
tokenizer.model_max_length

In [63]:
from transformers import AutoConfig

# Re-load the checkpoint's configuration (rebinds `config`).
config = AutoConfig.from_pretrained('skt/kobert-base-v1')

In [67]:
# Number of token types in the config; the recorded value is 2.
config.type_vocab_size

2