# 네이버 영화리뷰 감정분석 with SKT KoBERT
참고 소스 출처(링크) : https://github.com/SKTBrain/KoBERT

https://github.com/SKTBrain/KoBERT#using-with-pytorch

https://colab.research.google.com/github/SKTBrain/KoBERT/blob/master/scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb

# 준비
라이브러리, 파라미터 세팅

In [1]:
# !pip install mxnet
# !pip install gluonnlp pandas tqdm
# !pip install sentencepiece
# !pip install transformers
# !pip install torch

In [2]:
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import csv
import os

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

### Parameter Setting

In [4]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

TEST_SIZE = 0.2
RANDOM_SEED = 42
MAXLEN = 128  # 64
BATCHSIZE = 32  # 64
EPOCHS = 5  # 5

### GPU Setting

In [5]:
# 디바이스 설정 확인
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [6]:
# GPU 할당 변경하기
GPU_NUM = 2 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print ('Current cuda device ', torch.cuda.current_device()) # check

Current cuda device  2


### KoBERT Setting

In [7]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [8]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


# 데이터 로드

In [9]:
# 학습 데이터 로드
train = pd.read_csv(DATA_IN_PATH + 'ratings_train.txt', sep='\t')
train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [10]:
# 검증(Original Test Dataset) 데이터 로드
dev = pd.read_csv(DATA_IN_PATH + 'ratings_test.txt', sep='\t')
dev.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [11]:
# 테스트(캐글) 데이터 로드
test = pd.read_csv(DATA_IN_PATH + 'ko_data.csv', encoding = 'cp949')
test.columns = ['id','document']
test.head()

Unnamed: 0,id,document
0,0,정말 많이 울었던 영화입니다.
1,1,시간 낭비예요.
2,2,포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3,3,지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4,4,이걸 영화로 만드는 거야?얼마나 가는지 보자.


In [12]:
# 학습/검증 데이터를 txt 파일로 저장
train.to_csv(DATA_IN_PATH+'train.txt', sep = '\t', index = True)
dev.to_csv(DATA_IN_PATH+'dev.txt', sep = '\t', index = True)

In [13]:
# 테스트(캐글) 데이터를 txt 파일로 저장(label 없음)
csv_file = './data_in/ko_data.csv'
txt_file = './data_in/ko_data.txt'
with open(txt_file, "w") as my_output_file:
    with open(csv_file, "r", encoding='cp949') as my_input_file: # , encoding='cp949'
        [ my_output_file.write("\t".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

In [14]:
dataset_train= nlp.data.TSVDataset(DATA_IN_PATH+"train.txt", field_indices=[2,3], num_discard_samples=1)
dataset_dev = nlp.data.TSVDataset(DATA_IN_PATH+"dev.txt", field_indices=[2,3], num_discard_samples=1)

In [15]:
dataset_test = nlp.data.TSVDataset(DATA_IN_PATH+"ko_data.txt", field_indices=[1], num_discard_samples=1)

In [16]:
dataset_test

<gluonnlp.data.dataset.TSVDataset at 0x7f62d895fdd8>

# 모델 구현

In [17]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [18]:
class BERTDataset_Test(Dataset):
    def __init__(self, dataset, sent_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        #self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i])

    def __len__(self):
        return (len(self.sentences))

In [19]:
## Setting parameters
max_len = MAXLEN  # 64
batch_size = BATCHSIZE  # 64
warmup_ratio = 0.1
num_epochs = EPOCHS  # 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [20]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_dev = BERTDataset(dataset_dev, 0, 1, tok, max_len, True, False)
data_test = BERTDataset_Test(dataset_test, 0, tok, max_len, True, False)

In [21]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
dev_dataloader = torch.utils.data.DataLoader(data_dev, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [22]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [23]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [24]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [25]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [26]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [27]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 모델 학습 및 평가

In [28]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [29]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    
    print()
    print('======== Epoch {:} / {:} ========'.format(e+1, num_epochs))
    
    # 모델 학습
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
#     # 모델 평가 => GPU 메모리 최적화 위해 주석 처리
#     model.eval()
#     for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(dev_dataloader)):
#         token_ids = token_ids.long().to(device)
#         segment_ids = segment_ids.long().to(device)
#         valid_length= valid_length
#         label = label.long().to(device)
#         out = model(token_ids, valid_length, segment_ids)
#         test_acc += calc_accuracy(out, label)
#     print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    print()




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4688.0), HTML(value='')))

epoch 1 batch id 1 loss 0.8223128914833069 train acc 0.34375
epoch 1 batch id 201 loss 0.6484859585762024 train acc 0.5212997512437811
epoch 1 batch id 401 loss 0.5482144355773926 train acc 0.6188435162094763
epoch 1 batch id 601 loss 0.3797118365764618 train acc 0.6797004991680532
epoch 1 batch id 801 loss 0.48848333954811096 train acc 0.7158239700374532
epoch 1 batch id 1001 loss 0.3856777846813202 train acc 0.7411963036963037
epoch 1 batch id 1201 loss 0.4960686266422272 train acc 0.7584825145711906
epoch 1 batch id 1401 loss 0.3125966787338257 train acc 0.7716809421841542
epoch 1 batch id 1601 loss 0.5354685187339783 train acc 0.781816052467208
epoch 1 batch id 1801 loss 0.4149617552757263 train acc 0.7892837312604108
epoch 1 batch id 2001 loss 0.3206002116203308 train acc 0.7955553473263368
epoch 1 batch id 2201 loss 0.8603451251983643 train acc 0.8009427532939573
epoch 1 batch id 2401 loss 0.2964036762714386 train acc 0.8054066014160767
epoch 1 batch id 2601 loss 0.26456096768379

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4688.0), HTML(value='')))

epoch 2 batch id 1 loss 0.40682393312454224 train acc 0.78125
epoch 2 batch id 201 loss 0.27893537282943726 train acc 0.8826181592039801
epoch 2 batch id 401 loss 0.28871193528175354 train acc 0.8817019950124688
epoch 2 batch id 601 loss 0.2230505496263504 train acc 0.8811356073211315
epoch 2 batch id 801 loss 0.21518996357917786 train acc 0.882334581772784
epoch 2 batch id 1001 loss 0.3399496376514435 train acc 0.8837100399600399
epoch 2 batch id 1201 loss 0.33655238151550293 train acc 0.8852258534554538
epoch 2 batch id 1401 loss 0.3510580360889435 train acc 0.8860635260528195
epoch 2 batch id 1601 loss 0.42435014247894287 train acc 0.8868480637101811
epoch 2 batch id 1801 loss 0.2732967138290405 train acc 0.8877533314825097
epoch 2 batch id 2001 loss 0.2490934580564499 train acc 0.8883214642678661
epoch 2 batch id 2201 loss 0.3309955298900604 train acc 0.8891271013175829
epoch 2 batch id 2401 loss 0.3931578993797302 train acc 0.8902800916284881
epoch 2 batch id 2601 loss 0.036052014

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4688.0), HTML(value='')))

epoch 3 batch id 1 loss 0.33729299902915955 train acc 0.84375
epoch 3 batch id 201 loss 0.15499551594257355 train acc 0.919931592039801
epoch 3 batch id 401 loss 0.124224953353405 train acc 0.9205891521197007
epoch 3 batch id 601 loss 0.2853637933731079 train acc 0.9211730449251248
epoch 3 batch id 801 loss 0.3366405665874481 train acc 0.9230259051186017
epoch 3 batch id 1001 loss 0.28993335366249084 train acc 0.9242007992007992
epoch 3 batch id 1201 loss 0.40696951746940613 train acc 0.9252966278101582
epoch 3 batch id 1401 loss 0.3583436906337738 train acc 0.9261018915060671
epoch 3 batch id 1601 loss 0.41202470660209656 train acc 0.9272134603372892
epoch 3 batch id 1801 loss 0.15729005634784698 train acc 0.9286160466407551
epoch 3 batch id 2001 loss 0.09297055751085281 train acc 0.9295508495752124
epoch 3 batch id 2201 loss 0.41372087597846985 train acc 0.9304435483870968
epoch 3 batch id 2401 loss 0.1043100506067276 train acc 0.9318122657226156
epoch 3 batch id 2601 loss 0.07822263

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4688.0), HTML(value='')))

epoch 4 batch id 1 loss 0.32589760422706604 train acc 0.90625
epoch 4 batch id 201 loss 0.08926602452993393 train acc 0.9508706467661692
epoch 4 batch id 401 loss 0.04859985411167145 train acc 0.95160536159601
epoch 4 batch id 601 loss 0.14075715839862823 train acc 0.9534109816971714
epoch 4 batch id 801 loss 0.18673458695411682 train acc 0.9547440699126092
epoch 4 batch id 1001 loss 0.04200735688209534 train acc 0.9551698301698301
epoch 4 batch id 1201 loss 0.43951284885406494 train acc 0.955661948376353
epoch 4 batch id 1401 loss 0.17713716626167297 train acc 0.9560581727337616
epoch 4 batch id 1601 loss 0.4491548538208008 train acc 0.956648188632105
epoch 4 batch id 1801 loss 0.12827111780643463 train acc 0.9571418656302054
epoch 4 batch id 2001 loss 0.06795501708984375 train acc 0.9580990754622689
epoch 4 batch id 2201 loss 0.3580518662929535 train acc 0.9588681281235802
epoch 4 batch id 2401 loss 0.013877208344638348 train acc 0.9598604748021657
epoch 4 batch id 2601 loss 0.005834

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4688.0), HTML(value='')))

epoch 5 batch id 1 loss 0.28779685497283936 train acc 0.90625
epoch 5 batch id 201 loss 0.04054595157504082 train acc 0.9715485074626866
epoch 5 batch id 401 loss 0.03454947844147682 train acc 0.9732699501246883
epoch 5 batch id 601 loss 0.019890131428837776 train acc 0.9740536605657238
epoch 5 batch id 801 loss 0.008477913215756416 train acc 0.9744460049937578
epoch 5 batch id 1001 loss 0.005957565736025572 train acc 0.9748688811188811
epoch 5 batch id 1201 loss 0.26583659648895264 train acc 0.9744743963363863
epoch 5 batch id 1401 loss 0.09290117770433426 train acc 0.9750624553890078
epoch 5 batch id 1601 loss 0.22208064794540405 train acc 0.9754645534041224
epoch 5 batch id 1801 loss 0.11809958517551422 train acc 0.9757599944475291
epoch 5 batch id 2001 loss 0.004006384406238794 train acc 0.9760588455772113
epoch 5 batch id 2201 loss 0.24160245060920715 train acc 0.9764311676510677
epoch 5 batch id 2401 loss 0.0084375673905015 train acc 0.9768586005830904
epoch 5 batch id 2601 loss 

### F1-Score 확인

In [30]:
dev_preds = []
y_dev = []

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(dev_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length = valid_length
    label = label.to('cpu').numpy()  # label을 torch tensor에서 numpy 타입으로 변경
    out = model(token_ids, valid_length, segment_ids)
    
    for i in label:
        y_dev.append(i)
    
    for i in range(len(out)):
        dev_preds.append(np.argmax(out[i].detach().cpu().numpy()))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1563.0), HTML(value='')))




In [31]:
dev_preds[:5], y_dev[:5]

([1, 1, 0, 0, 0], [1, 0, 0, 0, 0])

In [32]:
from sklearn.metrics import f1_score
f1 = f1_score(y_dev, dev_preds, average='weighted')
print('F1 Score : {:.5f}'.format(f1))

F1 Score : 0.89658


# (참고) 제출 파일 생성

In [33]:
test_preds = []

for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length = valid_length
    out = model(token_ids, valid_length, segment_ids)
    
    for i in range(len(out)):
        test_preds.append(np.argmax(out[i].detach().cpu().numpy()))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=350.0), HTML(value='')))




In [34]:
len(test_preds)

11187

In [35]:
test_preds[:5]

[1, 0, 1, 1, 0]

In [36]:
# 테스트 데이터의 리뷰 부분을 리스트 처리
test_id = list(test['id'])

# 데이터프레임 통해 데이터 구성하여 output에 투입
output = pd.DataFrame( data={"Id": test_id, "Predicted": test_preds} )
output.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [37]:
# 해당 경로가 없으면 생성
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

SAVE_NM = "NSMC_KoBERT_MAXLN"+str(max_len)+"_BATSZ"+str(batch_size)+"_EPOCH"+str(num_epochs)+".csv"

# csv 파일 생성
output.to_csv(DATA_OUT_PATH + SAVE_NM, index = False)  # 앙상블 조합 및 캐글 제출 용도

### 캐글 제출 결과
**[2020.12.14]**<br>
max_len = 64, batch_size = 64, epoch 1 => 0.87162<br>
max_len = 64, batch_size = 64, epoch 3 => 0.89719<br>
**max_len = 64, batch_size = 64, epoch 5 => 0.90076** => 0.90452<br>

max_len = 64, batch_size = 32, epoch 1 => 0.87681<br>
max_len = 64, batch_size = 32, epoch 3 => 0.88896<br>
**max_len = 64, batch_size = 32, epoch 5 => 0.90005** => 0.90059<br>

max_len = 128, batch_size = 32, epoch 1 => 0.87806<br>
max_len = 128, batch_size = 32, epoch 3 => 0.89504<br>
**max_len = 128, batch_size = 32, epoch 5 => 0.90112** => 0.90541<br>

(실패) max_len = 128, batch_size = 64, epoch 1 => Out of Memory (GPU)<br>
(실패) max_len = 128, batch_size = 64, epoch 3 => Out of Memory (GPU)<br>
(실패) max_len = 128, batch_size = 64, epoch 5 => Out of Memory (GPU)<br>

**[2020.12.15]**<br>
**max_len = 64, batch_size = 32, epoch 5 => 0.90059<br>
max_len = 64, batch_size = 64, epoch 5 => 0.90452<br>
max_len = 128, batch_size = 32, epoch 5 => 0.90541**<br>