# KoBERT

### 패키지 설치

In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.96
!pip install transformers
!pip install torch



In [None]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-m4e87tlj/kobert-tokenizer_e6b71064ca9d4a8f90bf08833b3632a6
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-m4e87tlj/kobert-tokenizer_e6b71064ca9d4a8f90bf08833b3632a6
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import os
import os.path as p
import pandas as pd
import numpy as np
from tqdm import tqdm
import urllib.request
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AdamW, AutoTokenizer, AutoModel
import gluonnlp as nlp

from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm, tqdm_notebook

# import warnings
# warnings.filterwarnings("ignore")
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

### 데이터 전처리

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the '|'-separated dataset from the mounted Drive folder and preview its last rows
data = pd.read_csv("./drive/MyDrive/Colab Notebooks/hackathon/data.csv", sep='|')
data.tail()

Unnamed: 0,text,score
469,선생님 밥은 챙겨드셨나요?,0
470,안녕하세요. 선생님 우리 애의 학업에 대해서 걱정이 좀 되어서 이렇게 연락드렸습니다.,0
471,그건 변명이죠. 우리 애가 원래는 이렇게 후달리지 않았는데 선생님이 차별하시는 거 ...,2
472,다른 애들은 잘만 하는데 우리 애만 왜 이래요,1
473,선생님 우리의 성적이 왜 이래요 선생님 능력 부족 아닌가요,2


In [None]:
data[data.isna( ).any(axis=1)]

Unnamed: 0,text,score


In [None]:
# Build [text, label-as-string] pairs for BERTDataset.
# A comprehension is used so the `data` DataFrame is not rebound to a temporary
# list (which would break re-running this cell or any later use of `data`).
data_list = [[text, str(score)] for text, score in zip(data['text'], data['score'])]

print(data_list[200])

['그리고 선생님~ 2학기 때도 잘 부탁드릴게요~', '0']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 23% of the samples as the test set; random_state pins the shuffle so the split is repeatable
dataset_train, dataset_test = train_test_split(data_list, test_size=0.23, random_state=42)

# Report the resulting split sizes
print(len(dataset_train))
print(len(dataset_test))

364
110


### 학습 세팅

In [None]:
# Training hyperparameters
max_len = 64  # max_seq_length handed to the BERT sentence transform
batch_size = 64  # samples per DataLoader batch
warmup_ratio = 0.1  # fraction of total training steps used for LR warmup
num_epochs = 12  # number of passes over the training set
max_grad_norm = 1  # threshold passed to clip_grad_norm_
log_interval = 200  # log training progress every this many batches
learning_rate =  5e-5  # learning rate given to the AdamW optimizer

In [None]:
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer

# Pretrained KoBERT encoder; return_dict=False so the forward pass yields a tuple
# (the classifier below unpacks it as `_, pooler`)
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# Matching KoBERT tokenizer
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# GluonNLP vocabulary built from the tokenizer's SentencePiece file, with "[PAD]" as the padding token
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token="[PAD]")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [None]:
# Dataset format
class BERTDataset(Dataset):
    """Dataset that pairs BERT-transformed sentences with numeric labels.

    Each row of ``dataset`` is indexed with ``sent_idx`` to get the sentence and
    with ``label_idx`` to get the label. Sentences are transformed once, up
    front, with ``nlp.data.BERTSentenceTransform``; labels are stored as
    ``np.float64``.

    Args:
        dataset: Indexable collection of rows.
        sent_idx: Position of the sentence inside a row.
        label_idx: Position of the label inside a row.
        bert_tokenizer: Tokenizer callable forwarded to the transform.
        vocab: Vocabulary forwarded to the transform.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass: transform every sentence.
        encoded = []
        for row in dataset:
            encoded.append(to_features([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: parse every label.
        targets = []
        for row in dataset:
            targets.append(np.float64(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = (self.labels[i],)
        return features + target

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [None]:
# Use the KoBERT tokenizer's `tokenize` method as the tokenizer callable
tok = tokenizer.tokenize

# Positional arguments: sentence at row index 0, label at row index 1, pad=True, pair=False
data_train = BERTDataset(dataset_train, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(dataset_test,0, 1, tok, vocab, max_len, True, False)

In [None]:
data_train[0]

(array([   2, 3833,  990, 3270, 7074, 6705, 7275,  258,    3,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(9, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 2.0)

In [None]:
# Reshuffle the training samples every epoch so batches differ between epochs;
# the test loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [None]:
# Model definition
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    The encoder's pooled output is optionally passed through dropout and then
    through one linear layer that produces ``num_classes`` logits.

    Args:
        bert: Encoder invoked as ``bert(input_ids=..., token_type_ids=...,
            attention_mask=...)``; it must return a 2-tuple whose second
            element is the pooled output fed to the classifier.
        hidden_size: Input feature size of the linear classifier.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output. ``None`` or
            ``0`` disables dropout.
        params: Unused; kept so existing callers that pass it keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,   # adjust to the number of classes
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Compute class logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of leading positions to attend to.
            segment_ids: Tensor passed (as long) as ``token_type_ids``.

        Returns:
            The output of the linear classifier applied to the pooled output.
        """
        # The mask is created with zeros_like(token_ids), so it already lives on
        # the same device as token_ids.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
from transformers.optimization import get_cosine_schedule_with_warmup

# Build the classifier and move it to the GPU when one is available;
# fall back to the CPU instead of crashing on a runtime without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains one of the `no_decay` markers get no weight decay;
# every other parameter uses weight_decay=0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in model.named_parameters()
                   if not any(nd in name for nd in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in model.named_parameters()
                   if any(nd in name for nd in no_decay)],
        'weight_decay': 0.0,
    },
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step is taken per training batch, so the total step count is
# batches-per-epoch times epochs; the first `warmup_ratio` of them are warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose max-scoring column equals ``Y``.

    ``X`` holds one row of scores per sample and ``Y`` the expected column
    index for each row. The result is a NumPy floating-point scalar.
    """
    _, predicted = X.max(dim=1)
    hits = (predicted == Y).sum().detach().cpu().numpy()
    n_rows = predicted.shape[0]
    return hits / n_rows

train_dataloader



<torch.utils.data.dataloader.DataLoader at 0x7d8c93ead000>

### 학습

In [None]:
# Training: one optimisation pass and one evaluation pass per epoch,
# followed by a checkpoint save.
from tqdm.notebook import tqdm as notebook_tqdm  # replaces the deprecated tqdm.tqdm_notebook

# Make sure the checkpoint directory exists before the first save.
model_dir = "./drive/MyDrive/Colab Notebooks/hackathon/KoBERT-notebook/model"
os.makedirs(model_dir, exist_ok=True)

for e in range(num_epochs):
    # Accuracy is tracked as (correct predictions, samples seen) so the reported
    # value is the true per-sample accuracy even when the last batch is smaller.
    train_correct = 0.0
    train_seen = 0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_correct += calc_accuracy(out, label) * label.size(0)
        train_seen += label.size(0)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_correct / train_seen))
    train_acc = train_correct / max(train_seen, 1)
    print("epoch {} train acc {}".format(e+1, train_acc))

    model.eval()
    test_correct = 0.0
    test_seen = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_correct += calc_accuracy(out, label) * label.size(0)
            test_seen += label.size(0)
    test_acc = test_correct / max(test_seen, 1)
    print("epoch {} test acc {}".format(e+1, test_acc))

    # Save a checkpoint for this epoch
    torch.save(model.state_dict(), f"{model_dir}/model_{e+1}epochs.pt")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.1687934398651123 train acc 0.390625
epoch 1 train acc 0.3972537878787879


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 1 test acc 0.53125


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.027596354484558 train acc 0.53125
epoch 2 train acc 0.5423768939393939


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 2 test acc 0.6199048913043479


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.910099446773529 train acc 0.65625
epoch 3 train acc 0.6266571969696969


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 3 test acc 0.6603260869565217


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.8083211183547974 train acc 0.671875
epoch 4 train acc 0.6669034090909092


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 4 test acc 0.6307744565217391


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.7238174080848694 train acc 0.703125
epoch 5 train acc 0.6889204545454546


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 5 test acc 0.6307744565217391


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.6493707299232483 train acc 0.75
epoch 6 train acc 0.6875


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 6 test acc 0.6807065217391304


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.6634775400161743 train acc 0.765625
epoch 7 train acc 0.7163825757575758


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 7 test acc 0.6559103260869565


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.6493077874183655 train acc 0.75
epoch 8 train acc 0.7575757575757575


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 8 test acc 0.6885190217391304


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.5125104188919067 train acc 0.765625
epoch 9 train acc 0.7523674242424242


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 9 test acc 0.6433423913043479


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.5081787109375 train acc 0.796875
epoch 10 train acc 0.8030303030303031


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 10 test acc 0.7041440217391304


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 11 batch id 1 loss 0.46256041526794434 train acc 0.828125
epoch 11 train acc 0.8196022727272728


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 11 test acc 0.6807065217391304


  0%|          | 0/6 [00:00<?, ?it/s]

epoch 12 batch id 1 loss 0.46826112270355225 train acc 0.8125
epoch 12 train acc 0.8233901515151515


  0%|          | 0/2 [00:00<?, ?it/s]

epoch 12 test acc 0.6698369565217391


### 사용

In [None]:
# Load the checkpoint saved after epoch 8.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU still loads on a CPU-only runtime.
model.load_state_dict(torch.load("./drive/MyDrive/Colab Notebooks/hackathon/KoBERT-notebook/model/model_8epochs.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Inference section

# NOTE(review): `return_dict` looks like a model option rather than a tokenizer one;
# presumably it has no effect here — confirm and consider dropping it.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1', return_dict=False)
tok = tokenizer.tokenize

def predict(predict_sentence):
    """Classify one sentence with the global ``model`` and print the result.

    The sentence is wrapped in a one-row dataset with a placeholder label
    ``'0'`` (ignored), run through the model, and the arg-max class index is
    mapped to a name: 0 -> IMMORAL_NONE, 1 -> IMMORAL, 2 -> IMMORAL_MAX.

    Args:
        predict_sentence: Text to classify.
    """
    class_names = ("IMMORAL_NONE", "IMMORAL", "IMMORAL_MAX")

    dataset_another = [[predict_sentence, '0']]
    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    # A single sample does not justify spawning worker processes on every call.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            logits = out.detach().cpu().numpy()
            test_eval = [class_names[idx] for idx in np.argmax(logits, axis=1)]

            print(">> 입력하신 내용에서 " + test_eval[0] + " 이/가 느껴집니다.")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [None]:
# Interactive loop: classify each entered sentence until the user types "0".
while True:
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0":
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 안녕하세요 선생님
>> 입력하신 내용에서 IMMORAL_NONE 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 저희 아이가 학교에서 잘 배우고 있는지 궁금해서 연락드렸어요
>> 입력하신 내용에서 IMMORAL_NONE 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 장난하세요 선생님?
>> 입력하신 내용에서 IMMORAL_MAX 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 방금 말을 취소할게요 죄송해요
>> 입력하신 내용에서 IMMORAL_NONE 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 장난이에요 선생님
>> 입력하신 내용에서 IMMORAL_MAX 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 선생님 밥은 챙겨드셨나요?
>> 입력하신 내용에서 IMMORAL 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 우리 애가 우스워요?
>> 입력하신 내용에서 IMMORAL_MAX 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 저희 아이 입시 관련해 궁금한게 있어요. 유찬이 성적으로 어느 정도 대학까지 갈 수 있나요?
>> 입력하신 내용에서 IMMORAL_NONE 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 재림이는 못생겼나요?
>> 입력하신 내용에서 IMMORAL_MAX 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 아오 선생시치
>> 입력하신 내용에서 IMMORAL_MAX 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 씨발련아 감사합니다
>> 입력하신 내용에서 IMMORAL_NONE 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 잘 먹고 잘 사세요
>> 입력하신 내용에서 IMMORAL_NONE 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 저희 애 잘 부탁드리고요 남들보다 조금 더 챙겨주세요 저희 애는 귀하니까요
>> 입력하신 내용에서 IMMORAL 이/가 느껴집니다.


하고싶은 말을 입력해주세요 : 변명하지 마세요
>> 입력하신 내용에서 IMM