# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

# 불러오기

In [1]:
from collections import defaultdict
import numpy as np


def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls


In [2]:
# 데이터 불러오기
x_train, y_train = read_txt('../ratings_train.txt')
x_test, y_test = read_txt('../ratings_test.txt')

In [3]:
x_train[0]

'아 더빙.. 진짜 짜증나네요 목소리'

## 비어있는 리뷰 제거

In [4]:
def remove_empty_review(X, Y):
    empty_idx_ls = []
    
    for idx, review in enumerate(X):
        if len(review) == 0:
            empty_idx_ls.append(idx)
    
    # idx 값이 큰 것부터 제거 (앞으로 밀리는 것을 방지)
    empty_idx_ls = sorted(empty_idx_ls, reverse = True)
    
    for empty_idx in empty_idx_ls:
        del X[empty_idx], Y[empty_idx]
    
    return X, Y

In [5]:
x_train, y_train = remove_empty_review(x_train, y_train)
x_test, y_test = remove_empty_review(x_test, y_test)

In [6]:
x_train[0]

'아 더빙.. 진짜 짜증나네요 목소리'

In [7]:
len(x_train), len(x_test)

(149995, 49997)

## 토큰 인덱싱 (token2idx)

In [8]:
# 단어에 대한 idx 부여
def convert_token_to_idx(token_ls):
    for tokens in token_ls:
        yield [token2idx[token] for token in tokens.split(' ')]
    return

In [9]:
token2idx = defaultdict(lambda : len(token2idx)) # token과 index를 매칭시켜주는 딕셔너리
pad = token2idx['<PAD>']  # pytorch Variable로 변환하기 위해, 문장의 길이를 맞춰주기 위한 padding 

x_train = list(convert_token_to_idx(x_train))
x_test = list(convert_token_to_idx(x_test))

idx2token = {val : key for key,val in token2idx.items()}

#### 인덱싱 결과 확인 

In [10]:
x_train[0]

[1, 2, 3, 4, 5]

#### 원본 텍스트로 변환 확인 

In [11]:
[idx2token[x] for x in x_train[0]]

['아', '더빙..', '진짜', '짜증나네요', '목소리']

## Add Padding

In [12]:
# Pytorch Variable로 변환하기 위해서는 모든 data의 길이(length)가 동일하여야 한다.
# 영화 리뷰는 길이가 제각각이므로, 길이를 맞춰주는 작업을 수행
# 짧은 문장에는 padding(공간을 채우기 위해 사용하는 더미)을 추가하고,
# 긴 문장은 짤라서 줄인다.

In [13]:
# Sequence Length를 맞추기 위한 padding
def add_padding(token_ls, max_len):
    for i, tokens in enumerate(token_ls):
        n_token = len(tokens)
        
        # 길이가 짧으면 padding을 추가
        if n_token < max_len:
            token_ls[i] += [pad] * (max_len - n_token) # 부족한 만큼 padding을 추가함
        
        # 길이가 길면, max_len에서 짜름
        elif n_token > max_len:
            token_ls[i] = tokens[:max_len]
    return token_ls

In [14]:
max_len = 30
x_train = add_padding(x_train, max_len)
x_test = add_padding(x_test, max_len)

#### Padding 결과 확인

In [15]:
' '.join([idx2token[x] for x in x_train[0]])

'아 더빙.. 진짜 짜증나네요 목소리 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

## Pytorch 모델 학습을 위해 Data의 type을 Variable 로 변환

In [16]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F

In [17]:
# torch Variable로 변환
def convert_to_long_variable(w2i_ls):
    return Variable(torch.LongTensor(w2i_ls))

In [18]:
x_train = convert_to_long_variable(x_train)
x_test = convert_to_long_variable(x_test)

y_train = convert_to_long_variable(y_train)
y_test = convert_to_long_variable(y_test)

In [19]:
x_train[0]

tensor([1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

# CBOW with Pytorch

In [20]:
class CBOW(nn.Module):
    def __init__(self, n_words, embed_size, pad_index, hid_size, dropout, n_class):
        super(CBOW, self).__init__()
        
        self.n_words = n_words   # 고유한 토큰의 갯수
        self.embed_size = embed_size   # 임베딩 차원의 크기
        self.pad_index = pad_index     # 문장에 포함된 padding_token, embedding 과정에서 제외시킴
        self.embed = nn.Embedding(n_words, embed_size, padding_idx=pad_index) # non-static embedding with Pytorch
        
        self.hid_size = hid_size    # Fully-Connet layer의 히든 레이어의 갯수
        self.dropout = dropout   # 드롭아웃 비율
        self.n_class = n_class   # 카테고리의 갯수

        # pre-train된 embedding을 사용하고 싶다면,
        # self.embed.weight = pre_trained_weight_matrix
        # self.embed.weight.requires_grad = False  # embedding weight 고정 : static
        
        self.lin = nn.Sequential(
            nn.Linear(embed_size, hid_size), nn.ReLU(), nn.Dropout(),
            nn.Linear(hid_size, n_class)
        )

    def forward(self, x):
        x_embeded = self.embed(x)  # batch_size x sequence_length x embed_size

        # 모든 단어의 embedding vector를 모두 더하여 sentence를 모델링한다.
        x_cbow = x_embeded.sum(dim=1) # batch_size x 1 x embeded_size
        x_cbow = x_cbow.squeeze(1)    # fully-connet를 위해, 1번째 차원을 축소

        logit = self.lin(x_cbow)
        return logit

In [21]:
params = {
    'n_words' : len(token2idx),     # 고유한 토큰의 갯수
    'embed_size' : 32,                 # embedding 차원의 크기
    'pad_index' : token2idx['<PAD>'],  # embedding 과정에서 제외시킬, padding token
    'hid_size' : 32,                   # 히든 레이어 갯수
    'dropout' : 0.5,                   # 드롭아웃 비율
    'n_class' : 2,                     # 카테고리 갯수 (긍/부정)
}

In [22]:
model = CBOW(**params)

In [23]:
model

CBOW(
  (embed): Embedding(450542, 32, padding_idx=0)
  (lin): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=32, out_features=2, bias=True)
  )
)

# Train

In [24]:
import random

In [25]:
def adjust_learning_rate(optimizer, epoch, init_lr=0.001, lr_decay_epoch=10):
    """Decay learning rate by a factor of 0.1 every lr_decay_epoch epochs."""
    lr = init_lr * (0.1**(epoch // lr_decay_epoch))

    if epoch % lr_decay_epoch == 0:
        print('LR is set to %s'%(lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return optimizer

In [26]:
epochs = 20
lr = 0.003
batch_size = 10000

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)  # Adam Optimizer 사용
criterion = nn.CrossEntropyLoss(reduction='sum')     # model이 logit을 반환하므로, 크로스-엔트로피-Loss를 사용,
                                                     # 크로스-엔트로피-Loss는 Log_softmax + NLL_loss 
loss_ls = []

for epoch in range(1, epochs+1):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()
        
        scores = model(x_batch)
        predict = F.softmax(scores, dim=1).argmax(dim=1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(scores, y_batch)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch, train_loss / batch_size, acc))
    print('=================================================================================================')
    
    loss_ls.append(train_loss)
    optimizer = adjust_learning_rate(optimizer, epoch, lr, 10) # adjust learning_rate while training
    
    if (epoch+1) % 10 == 0:
        model.eval()
        scores = model(x_test)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_test).sum().item() / y_test.size(0)
        loss = criterion(scores, y_test.long())
        
        print('*************************************************************************************************')
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch, loss.item()/y_test.size(0), acc))
        print('*************************************************************************************************')
        print('*************************************************************************************************')


Train epoch : 1,  loss : 10.597307177734375,  accuracy :0.519
Train epoch : 2,  loss : 9.757400048828124,  accuracy :0.544
Train epoch : 3,  loss : 9.49492783203125,  accuracy :0.583
Train epoch : 4,  loss : 9.2565888671875,  accuracy :0.615
Train epoch : 5,  loss : 8.845070458984376,  accuracy :0.660
Train epoch : 6,  loss : 8.246880810546875,  accuracy :0.701
Train epoch : 7,  loss : 7.528774365234375,  accuracy :0.739
Train epoch : 8,  loss : 6.74151396484375,  accuracy :0.773
Train epoch : 9,  loss : 5.959237329101563,  accuracy :0.821
*************************************************************************************************
*************************************************************************************************
Test Epoch : 9, Test Loss : 0.604 , Test Accuracy : 0.714
*************************************************************************************************
*************************************************************************************************
Trai