# 네이버 영화리뷰 감정분석 with SKT KoBERT and Data Augmentaion(KNU감성사전)
참고 소스 출처(링크) : https://github.com/SKTBrain/KoBERT

https://github.com/SKTBrain/KoBERT#using-with-pytorch

https://colab.research.google.com/github/SKTBrain/KoBERT/blob/master/scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb

In [1]:
# !pip install mxnet
# !pip install gluonnlp pandas tqdm
# !pip install sentencepiece
# !pip install transformers
# !pip install torch

In [2]:
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import csv
import os

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

### Parameter Setting

In [4]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

TEST_SIZE = 0.2
RANDOM_SEED = 42
MAXLEN = 128  # 64
BATCHSIZE = 32  # 64
EPOCHS = 5  # 5

### GPU Setting

In [5]:
# 디바이스 설정 확인
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [6]:
# GPU 할당 변경하기
GPU_NUM = 0 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print ('Current cuda device ', torch.cuda.current_device()) # check

Current cuda device  0


### KoBERT Setting

In [7]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [8]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


# 데이터 로드

In [9]:
# 학습 데이터 로드
train = pd.read_csv(DATA_IN_PATH + 'ratings_train.txt', sep='\t')
train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [10]:
# 검증(Original Test Dataset) 데이터 로드
dev = pd.read_csv(DATA_IN_PATH + 'ratings_test.txt', sep='\t')
dev.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [11]:
# 테스트(캐글) 데이터 로드
test = pd.read_csv(DATA_IN_PATH + 'ko_data.csv', encoding = 'cp949')
test.columns = ['id','document']
test.head()

Unnamed: 0,id,document
0,0,정말 많이 울었던 영화입니다.
1,1,시간 낭비예요.
2,2,포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3,3,지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4,4,이걸 영화로 만드는 거야?얼마나 가는지 보자.


In [12]:
# 학습/검증 데이터를 txt 파일로 저장
train.to_csv(DATA_IN_PATH+'train.txt', sep = '\t', index = True)
dev.to_csv(DATA_IN_PATH+'dev.txt', sep = '\t', index = True)

### KNU감성사전

In [13]:
dic_data_value = pd.read_table(DATA_IN_PATH + 'SentiWordDict/SentiWord_Dict.txt', header=None)
dic_data_value.rename(columns={dic_data_value.columns[0] : 'document', dic_data_value.columns[1] : 'label'}, inplace=True)
dic_data_value

Unnamed: 0,document,label
0,(-;,1.0
1,(;_;),-1.0
2,(^^),1.0
3,(^-^),1.0
4,(^^*,1.0
...,...,...
14850,갈등 -1,
14851,의혹,-1.0
14852,내팽개치다,-2.0
14853,횡령,-2.0


In [14]:
dic_data_value['document'] = dic_data_value['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
dic_data_value['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
dic_data_value = dic_data_value.dropna(how='any') # Null 값 제거
dic_data_value = dic_data_value.dropna(how = 'any') # Null 값이 존재하는 행 제거
print(dic_data_value.isnull().values.any()) # Null 값이 존재하는지 확인
#감성이 없는 단어 삭제
idx_ = dic_data_value[dic_data_value['label']==0.0].index
dic_data_value = dic_data_value.drop(idx_)
print(dic_data_value.groupby(['label']).size().reset_index(name = 'count'))

False
   label  count
0   -2.0   4799
1   -1.0   5016
2    1.0   2246
3    2.0   2603


In [15]:
# 감정을 숫자로 변환
def emotion_labeling_dic(emotion):
    return{-2.0 : 0, -1.0: 0, 1.0: 1, 2.0: 1}[emotion]

emotion_labels = []

for e in dic_data_value['label']:
    emotion_labels.append(emotion_labeling_dic(e))

dic_data_value['label'] = emotion_labels
dic_data_value

Unnamed: 0,document,label
10,ㅡㅡ,0
28,ㅅ,0
41,ㄱㅅ,1
42,ㄱㅇㄷ,1
43,가격이 싸다,1
...,...,...
14849,오류,0
14851,의혹,0
14852,내팽개치다,0
14853,횡령,0


In [16]:
dic_data_value.reset_index(drop=True)

Unnamed: 0,document,label
0,ㅡㅡ,0
1,ㅅ,0
2,ㄱㅅ,1
3,ㄱㅇㄷ,1
4,가격이 싸다,1
...,...,...
14659,오류,0
14660,의혹,0
14661,내팽개치다,0
14662,횡령,0


In [17]:
print('KNU감성사전 데이터 개수 :',len(dic_data_value))

KNU감성사전 데이터 개수 : 14664


### 기본 데이터 + KNU감성사전 통합

In [18]:
#합치기 위해 포멧 통일
train.drop(labels='id', axis="columns", inplace=True)

In [19]:
#frame = [dic_data_value, train_data]
train = pd.concat([dic_data_value, train])
train

Unnamed: 0,document,label
10,ㅡㅡ,0
28,ㅅ,0
41,ㄱㅅ,1
42,ㄱㅇㄷ,1
43,가격이 싸다,1
...,...,...
149995,인간이 문제지.. 소는 뭔죄인가..,0
149996,평점이 너무 낮아서...,1
149997,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [20]:
#인덱스 재정렬
train.reset_index(drop=True, inplace=True)

In [21]:
# Null 값 제거
train = train.fillna(' ')

In [22]:
# txt 파일 저장
train.to_csv(DATA_IN_PATH + 'kobert_train.txt', sep = '\t', index = True)
dev.to_csv(DATA_IN_PATH + 'kobert_dev.txt', sep = '\t', index = True)

In [23]:
dataset_train= nlp.data.TSVDataset(DATA_IN_PATH + "kobert_train.txt", field_indices=[1,2], num_discard_samples=1)
dataset_dev = nlp.data.TSVDataset(DATA_IN_PATH + "kobert_dev.txt", field_indices=[2,3], num_discard_samples=1)

### Original 테스트 데이터

In [24]:
# 테스트(캐글) 데이터 로드
test = pd.read_csv(DATA_IN_PATH + 'ko_data.csv', encoding = 'cp949')
test.columns = ['id','document']
test.head()

Unnamed: 0,id,document
0,0,정말 많이 울었던 영화입니다.
1,1,시간 낭비예요.
2,2,포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3,3,지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4,4,이걸 영화로 만드는 거야?얼마나 가는지 보자.


In [25]:
# test['label'] = 0
# test.head(10)

In [26]:
csv_file = DATA_IN_PATH + 'ko_data.csv'
txt_file = DATA_IN_PATH + 'kobert_test.txt'
with open(txt_file, "w") as my_output_file:
    with open(csv_file, "r", encoding='cp949') as my_input_file: # , encoding='cp949'
        [ my_output_file.write("\t".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

In [27]:
dataset_test = nlp.data.TSVDataset(DATA_IN_PATH + "kobert_test.txt", field_indices=[1], num_discard_samples=1)

# 모델 구현

In [28]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [29]:
class BERTDataset_Test(Dataset):
    def __init__(self, dataset, sent_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        #self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i])

    def __len__(self):
        return (len(self.sentences))

In [30]:
## Setting parameters
max_len = MAXLEN
batch_size = BATCHSIZE
warmup_ratio = 0.1
num_epochs = EPOCHS
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [31]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_dev = BERTDataset(dataset_dev, 0, 1, tok, max_len, True, False)
data_test = BERTDataset_Test(dataset_test, 0, tok, max_len, True, False)

In [32]:
print(len(data_train))
print(len(data_test))
print(len(data_test))

164664
11187
11187


In [33]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
dev_dataloader = torch.utils.data.DataLoader(data_dev, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [34]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [35]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [36]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [37]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [38]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [39]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 모델 학습

In [40]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [41]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    
    print()
    print('======== Epoch {:} / {:} ========'.format(e+1, num_epochs))
    
    # 모델 학습
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    # 모델 평가
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(dev_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    print()




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5146.0), HTML(value='')))

epoch 1 batch id 1 loss 0.6974334120750427 train acc 0.4375
epoch 1 batch id 201 loss 0.6441563963890076 train acc 0.6435012437810945
epoch 1 batch id 401 loss 0.5635945796966553 train acc 0.6763559850374065
epoch 1 batch id 601 loss 0.5871680974960327 train acc 0.71386231281198
epoch 1 batch id 801 loss 0.38045650720596313 train acc 0.7391151685393258
epoch 1 batch id 1001 loss 0.32231274247169495 train acc 0.7579920079920079
epoch 1 batch id 1201 loss 0.28496745228767395 train acc 0.7721690258118235
epoch 1 batch id 1401 loss 0.38086313009262085 train acc 0.7844842969307637
epoch 1 batch id 1601 loss 0.3370823860168457 train acc 0.7927467207995003
epoch 1 batch id 1801 loss 0.21143558621406555 train acc 0.7998160744031094
epoch 1 batch id 2001 loss 0.3006352186203003 train acc 0.8055347326336831
epoch 1 batch id 2201 loss 0.3831128478050232 train acc 0.8099159472966834
epoch 1 batch id 2401 loss 0.22931703925132751 train acc 0.8136453561016244
epoch 1 batch id 2601 loss 0.31987342238

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1563.0), HTML(value='')))


epoch 1 test acc 0.8841770633397313




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5146.0), HTML(value='')))

epoch 2 batch id 1 loss 0.39492377638816833 train acc 0.8125
epoch 2 batch id 201 loss 0.7162758708000183 train acc 0.8745335820895522
epoch 2 batch id 401 loss 0.5467223525047302 train acc 0.8890274314214464
epoch 2 batch id 601 loss 0.3913254141807556 train acc 0.8883631447587355
epoch 2 batch id 801 loss 0.44554826617240906 train acc 0.8853776529338327
epoch 2 batch id 1001 loss 0.2692955732345581 train acc 0.8846778221778222
epoch 2 batch id 1201 loss 0.2935902774333954 train acc 0.8859804329725229
epoch 2 batch id 1401 loss 0.17989122867584229 train acc 0.887178800856531
epoch 2 batch id 1601 loss 0.21719947457313538 train acc 0.888487663960025
epoch 2 batch id 1801 loss 0.256223201751709 train acc 0.8896793448084398
epoch 2 batch id 2001 loss 0.21433773636817932 train acc 0.8907421289355323
epoch 2 batch id 2201 loss 0.41797009110450745 train acc 0.8915123807360291
epoch 2 batch id 2401 loss 0.06167921796441078 train acc 0.8920111411911703
epoch 2 batch id 2601 loss 0.21661512553

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1563.0), HTML(value='')))


epoch 2 test acc 0.8887156110044786




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5146.0), HTML(value='')))

epoch 3 batch id 1 loss 0.1409730464220047 train acc 0.96875
epoch 3 batch id 201 loss 0.7729413509368896 train acc 0.957089552238806
epoch 3 batch id 401 loss 0.09616588801145554 train acc 0.9521508728179551
epoch 3 batch id 601 loss 0.28002721071243286 train acc 0.9426476705490848
epoch 3 batch id 801 loss 0.24394437670707703 train acc 0.9362125468164794
epoch 3 batch id 1001 loss 0.11244723200798035 train acc 0.9340034965034965
epoch 3 batch id 1201 loss 0.22075262665748596 train acc 0.9338311823480433
epoch 3 batch id 1401 loss 0.11068390309810638 train acc 0.9336411491791577
epoch 3 batch id 1601 loss 0.06881464272737503 train acc 0.933381480324797
epoch 3 batch id 1801 loss 0.2210633009672165 train acc 0.9333182953914492
epoch 3 batch id 2001 loss 0.1775878369808197 train acc 0.9339705147426287
epoch 3 batch id 2201 loss 0.3700553774833679 train acc 0.9340356656065425
epoch 3 batch id 2401 loss 0.01925620809197426 train acc 0.934454394002499
epoch 3 batch id 2601 loss 0.044937912

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1563.0), HTML(value='')))


epoch 3 test acc 0.8932141714651312




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5146.0), HTML(value='')))

epoch 4 batch id 1 loss 0.09691984206438065 train acc 0.96875
epoch 4 batch id 201 loss 0.23639018833637238 train acc 0.9709266169154229
epoch 4 batch id 401 loss 0.005552453920245171 train acc 0.9745168329177057
epoch 4 batch id 601 loss 0.18029442429542542 train acc 0.9670861064891847
epoch 4 batch id 801 loss 0.12911619246006012 train acc 0.964107365792759
epoch 4 batch id 1001 loss 0.24402816593647003 train acc 0.962475024975025
epoch 4 batch id 1201 loss 0.2612472176551819 train acc 0.9620888842631141
epoch 4 batch id 1401 loss 0.12883147597312927 train acc 0.9618129907209136
epoch 4 batch id 1601 loss 0.10112447291612625 train acc 0.9616060274828232
epoch 4 batch id 1801 loss 0.022475605830550194 train acc 0.9613929761243754
epoch 4 batch id 2001 loss 0.036809246987104416 train acc 0.961878435782109
epoch 4 batch id 2201 loss 0.23610541224479675 train acc 0.9622046796910495
epoch 4 batch id 2401 loss 0.0605836883187294 train acc 0.962411495210329
epoch 4 batch id 2601 loss 0.1210

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1563.0), HTML(value='')))


epoch 4 test acc 0.8950135956493922




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5146.0), HTML(value='')))

epoch 5 batch id 1 loss 0.05562307685613632 train acc 0.9375
epoch 5 batch id 201 loss 0.053011033684015274 train acc 0.9794776119402985
epoch 5 batch id 401 loss 0.0037660435773432255 train acc 0.982465710723192
epoch 5 batch id 601 loss 0.16985149681568146 train acc 0.9785253743760399
epoch 5 batch id 801 loss 0.2279297411441803 train acc 0.9766307740324595
epoch 5 batch id 1001 loss 0.004054858814924955 train acc 0.9763361638361638
epoch 5 batch id 1201 loss 0.09751266241073608 train acc 0.9765039550374688
epoch 5 batch id 1401 loss 0.1140034943819046 train acc 0.9761777301927195
epoch 5 batch id 1601 loss 0.008265100419521332 train acc 0.9761086820737039
epoch 5 batch id 1801 loss 0.005953286774456501 train acc 0.9761070238756246
epoch 5 batch id 2001 loss 0.02020999602973461 train acc 0.976667916041979
epoch 5 batch id 2201 loss 0.28904321789741516 train acc 0.9768145161290323
epoch 5 batch id 2401 loss 0.005413970910012722 train acc 0.976780508121616
epoch 5 batch id 2601 loss 0.

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1563.0), HTML(value='')))


epoch 5 test acc 0.8976527511196417



In [42]:
test_preds = []

for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length = valid_length
    out = model(token_ids, valid_length, segment_ids)
    
    for i in range(len(out)):
        test_preds.append(np.argmax(out[i].detach().cpu().numpy()))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=350.0), HTML(value='')))




In [43]:
len(test_preds)

11187

In [44]:
# 테스트 데이터의 리뷰 부분을 리스트 처리
test_id = list(test['id'])

# 판다스 데이터프레임 통해 데이터 구성하여 output에 투입
output = pd.DataFrame( data={"Id": test_id, "Predicted": test_preds} )
output.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [45]:
# 해당 경로가 없으면 생성
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

SAVE_NM = "NSMC_KoBERT_SWD_MAXLN"+str(max_len)+"_BATSZ"+str(batch_size)+"_EPOCH"+str(num_epochs)+".csv"

# csv파일 생성
output.to_csv(DATA_OUT_PATH + SAVE_NM, index = False)  # 앙상블 조합 및 캐글 제출 용도

### 캐글 제출 결과
**[2020.12.16]**<br>
max_len = 64, batch_size = 32, epoch 5 => 0.86053<br>
max_len = 64, batch_size = 64, epoch 5 => 0.86053<br>
max_len = 128, batch_size = 32, epoch 5 => 0.88396<br>