# 대회 데이터로 KoBERT를 fine-tuning 한 뒤 유사도 구하기

## 데이터 준비하기

In [1]:
from datasets import load_from_disk
raw_dataset = load_from_disk("../data/train_dataset/")
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
        num_rows: 3952
    })
    validation: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
        num_rows: 240
    })
})

In [2]:
# 이미 설치 했으면 주석 처리
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [3]:
import torch
from transformers import (
    BertModel, BertForMaskedLM, DataCollatorForLanguageModeling, AutoConfig,
    TrainingArguments, Trainer
)
from kobert_tokenizer import KoBERTTokenizer

config = AutoConfig.from_pretrained('skt/kobert-base-v1')
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = BertForMaskedLM.from_pretrained('skt/kobert-base-v1', config=config)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
tokenizer.padding_side = 'right'

In [5]:
text_column_name = 'question'
def tokenize_func(examples):
    return tokenizer(examples[text_column_name],
                     truncation=True,
                     max_length=500,
                     return_special_tokens_mask=False,
                     return_token_type_ids=False) # token_type_ids 사용하면 에러 발생

In [6]:
column_names = raw_dataset['train'].column_names
tokenized_datasets = raw_dataset.map(
                tokenize_func,
                batched=True,
                remove_columns=column_names
                )

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [7]:
train_dataset = tokenized_datasets['train']

In [8]:
train_dataset[1]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [2,
  5051,
  7202,
  3769,
  7260,
  5478,
  7095,
  2959,
  6295,
  7224,
  1770,
  4457,
  7086,
  258,
  3]}

In [9]:
train_dataset.num_rows

3952

## KoBERT 학습

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./models/bertmlm",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
    report_to = None,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [11]:
torch.cuda.is_available()

True

In [13]:
trainer.train()
trainer.save_model("./models/bertmlm")

Step,Training Loss
500,2.6621
1000,2.3702


## Papago 데이터와 유사도 구하기

### Papago RTT 데이터 불러오기

In [14]:
import pandas as pd
ppg = pd.read_csv('trainset_rtt_papago.csv')

In [15]:
good = [0, 6, 7, 8, 9, 14, 18, 19, 22, 23, 26, 28, 39, 46, 47]
bad = [2, 11, 13, 21, 24, 27, 30, 31, 36, 37, 41, 43, 45, 48]

In [16]:
good_ppg = pd.DataFrame(columns=ppg.columns) # 빈 데이터 프레임 생성
bad_ppg = pd.DataFrame(columns=ppg.columns)

good = good[:10] # 10개만 시도
bad = bad[:10]

for k in good:
    good_ppg = good_ppg.append(ppg.iloc[k])
    
for j in bad:
    bad_ppg = bad_ppg.append(ppg.iloc[j])

In [17]:
good_ppg = good_ppg.reset_index(drop=True)
bad_ppg = bad_ppg.reset_index(drop=True)

### 추가학습 시킨 KoBERT로 임베딩을 만들어보자

In [18]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
tokenizer.padding_side = 'right'
model = BertForMaskedLM.from_pretrained('./models/bertmlm/')

In [19]:
# 파라미터를 freeze
for param in model.parameters():
    param.requires_grad_(False)

In [None]:
# 모델의 중간 레이어의 출력값을 얻는 함수를 정의합니다.
# 출처: https://deep-learning-study.tistory.com/678
'''
def get_features(x, model, layers):
    features = {}
    for name, layer in enumerate(model.children()): # 0, conv
        print(name, layer)
        x = layer(x)
        if str(name) in layers:
            features[layers[str(name)]] = x
    return features
'''

In [24]:
for name, layer in enumerate(model.modules()):
    print(name, layer)

0 BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [17]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )

In [26]:
import torch.nn as nn
def get_BertModel_features(x, model):
    module_name = [1, 217] # 1: BertModel, 217: Linear(768,768)
    tanh = nn.Tanh()
    features = {}
    for name, layer in enumerate(model.modules()): # 0, conv
        # print(type(name), name)
        if name == 1: # BertModel
            print('name: ', name)
            x = layer(input_ids=torch.tensor(x['input_ids']),
                      attention_mask = torch.tensor(x['attention_mask']))
            # pooler input
            first_token_tensor = x.last_hidden_state[:, 0]
        if name == 217: # dense layer
            print('name: ', name)
            features = layer(first_token_tensor)
    return tanh(features)

In [27]:
# question을 string으로 구성된 리스트로 넣기
good_ppg_q = good_ppg['question'].to_list()

tokenized_good_ppg_q = tokenizer.batch_encode_plus(good_ppg_q, padding=True)

In [28]:
good_ppg_out = get_BertModel_features(tokenized_good_ppg_q, model)

name:  1
name:  217


In [29]:
good_ppg_out.size()

torch.Size([10, 768])

In [30]:
# raw data (reference) 가져오기
raw_dataset = raw_dataset['train']

good_ref = []
bad_ref = []

for i in good:
    good_ref.append(raw_dataset[i]['question'])

for k in bad:
    bad_ref.append(raw_dataset[k]['question'])

KeyError: "Column train not in the dataset. Current columns in the dataset: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__']"

In [31]:
tokenized_good_ref = tokenizer.batch_encode_plus(good_ref, padding=True)
good_ref_out = get_BertModel_features(tokenized_good_ref, model)

name:  1
name:  217


In [32]:
# from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
    return dot(A, B)/(norm(A)*norm(B))

In [33]:
for i in range(10):
    print(f'{i}th sentence cosine similarity: ', cos_sim(good_ppg_out[i].detach().numpy(), good_ref_out[i].detach().numpy()))

0th sentence cosine similarity:  0.95662755
1th sentence cosine similarity:  0.9589746
2th sentence cosine similarity:  0.92807776
3th sentence cosine similarity:  0.8765848
4th sentence cosine similarity:  0.9608211
5th sentence cosine similarity:  0.9542782
6th sentence cosine similarity:  0.9714728
7th sentence cosine similarity:  0.97925645
8th sentence cosine similarity:  0.95834714
9th sentence cosine similarity:  0.93096423


In [34]:
bad_ppg_q = bad_ppg['question'].to_list()
tokenized_bad_ppg_q = tokenizer.batch_encode_plus(bad_ppg_q, padding=True)
bad_ppg_out = get_BertModel_features(tokenized_bad_ppg_q, model)

name:  1
name:  217


In [35]:
tokenized_bad_ref = tokenizer.batch_encode_plus(bad_ref, padding=True)
bad_ref_out = get_BertModel_features(tokenized_bad_ref, model)

name:  1
name:  217


In [36]:
for i in range(10):
    print(f'{i}th sentence cosine similarity: ', cos_sim(bad_ppg_out[i].detach().numpy(), bad_ref_out[i].detach().numpy()))

0th sentence cosine similarity:  0.9828185
1th sentence cosine similarity:  0.96994036
2th sentence cosine similarity:  0.85488194
3th sentence cosine similarity:  0.96770674
4th sentence cosine similarity:  0.904391
5th sentence cosine similarity:  0.9749282
6th sentence cosine similarity:  0.9095956
7th sentence cosine similarity:  0.9493344
8th sentence cosine similarity:  0.94170237
9th sentence cosine similarity:  0.9745601
