# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

In [1]:
from collections import defaultdict

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np

# data를 읽어옴
def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls

## Data loading

In [43]:
from collections import Counter

In [44]:
x_train, y_train = read_txt('../ratings_train.txt')
x_test, y_test = read_txt('../ratings_test.txt')

In [45]:
x_train = [x.split() for x in x_train]
x_test = [x.split() for x in x_test]

In [46]:
x_train[0]

['아', '더빙..', '진짜', '짜증나네요', '목소리']

In [47]:
Counter(y_train)

Counter({0: 75173, 1: 74827})

## SOS EOS 토큰 추가

In [48]:
def add_start_end_token(token_ls):
    sos = '<SOS>'
    eos = '<EOS>'
    
    for tokens in token_ls:
        tokens = [sos] + tokens + [eos] # text
        yield tokens
    

In [49]:
token2idx = defaultdict(lambda : len(token2idx))
pad = token2idx['<PAD>'] #0
sos = token2idx['<SOS>'] #1
eos = token2idx['<EOS>'] #2

In [50]:
x_train = list(add_start_end_token(x_train))
x_test = list(add_start_end_token(x_test))

In [51]:
x_train[0]

['<SOS>', '아', '더빙..', '진짜', '짜증나네요', '목소리', '<EOS>']

## Add Padding

In [52]:
# Sequence Length를 맞추기 위한 padding
def add_padding(token_ls, max_len):
    pad = '<PAD>'
    seq_length_ls = []
    
    for i, tokens in enumerate(token_ls):
        seq_length = len(tokens)
        
        # 짧으면 padding을 추가
        if seq_length < max_len:
            seq_length_ls.append(seq_length)
            token_ls[i] += [pad] * (max_len - seq_length)
        
        # 길이가 길면, max_len까지의 token만 사용
        elif seq_length >= max_len:
            seq_length_ls.append(max_len)
            token_ls[i] = tokens[:max_len]
            
    return token_ls, seq_length_ls


In [53]:
max_sequence_length = 30
x_train, x_train_seq_length = add_padding(x_train, max_sequence_length)
x_test, x_test_seq_length = add_padding(x_test, max_sequence_length)

## Converting token to index 

In [54]:
# 단어에 대한 idx 부여
def convert_token_to_idx(token_ls):
     
    for tokens in token_ls:
        yield [token2idx[token] for token in tokens]
    return

In [55]:
x_train = list(convert_token_to_idx(x_train))
x_test = list(convert_token_to_idx(x_test))

idx2token = {val : key for key,val in token2idx.items()}

In [56]:
' '.join([idx2token[x] for x in x_train[0]])

'<SOS> 아 더빙.. 진짜 짜증나네요 목소리 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

# Sorting by sequence_length 

In [57]:
def sort_by_sequence_length(x, y, seq_len):
    sorted_idx = np.argsort(seq_len)[::-1]
    
    x = np.array(x)[sorted_idx]
    y = np.array(y)[sorted_idx]
    seq_len = np.array(seq_len)[sorted_idx]
    
    return x, y, seq_len

In [58]:
x_train, y_train, x_train_seq_length = sort_by_sequence_length(x_train, y_train, x_train_seq_length)
x_test, y_test, x_test_seq_length = sort_by_sequence_length(x_test, y_test, x_test_seq_length)

In [59]:
x_train[0]

array([     1,    540,    298, 297657,  13552,   2639, 297658, 297659,
       297660, 297661,  21755, 297662,  80473,    715,    141, 297663,
        31873, 276536,   8747, 196553,   9286, 297664, 151635,     84,
          141, 297665,    707,    581, 297666, 297667])

In [60]:
x_train_seq_length[0]

30

# Converting to Variable

In [61]:
# torch Variable로 변환
def convert_to_variable(x):
    return Variable(torch.LongTensor(x))

In [62]:
x_train = convert_to_variable(x_train)
x_test = convert_to_variable(x_test)

y_train = convert_to_variable(y_train)
y_test = convert_to_variable(y_test)

In [63]:
[idx2token[x.item()] for x in x_train[7]]

['<SOS>',
 '솔직히',
 '초반에는',
 '좀',
 '아쉽지만',
 '갈수록',
 '갑툭튀',
 '개쩔고',
 '반전도',
 '있었고',
 '갠적으로',
 '꽤',
 '괜찮게',
 '봤는데',
 '평점이',
 '이상하다.',
 '딱7점',
 '중후반은',
 '되어야',
 '되는데,',
 '7.81이',
 '적당한',
 '평점이다.',
 '그리고',
 '은주',
 '첨',
 '봤을때',
 '한효주닮아서',
 '좀',
 '놀랐다.']

In [64]:
y_train[7]

tensor(1)

# RNN

In [70]:
class RNN(nn.Module):
    def __init__(self, token2idx, vocab_size, embed_size, hid_size, n_layers, dropout, n_category):
        super(RNN, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.padding_index = token2idx['<PAD>']
        
        self.embed = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim=embed_size, 
            padding_idx=self.padding_index
        )
        
        self.hid_size = hid_size
        self.n_layers = n_layers
        self.drouput = dropout
        self.n_category = n_category
        
        self.rnn = nn.RNN(embed_size, hid_size, n_layers, batch_first=True)
        self.lin = nn.Sequential(
            nn.Linear(hid_size, n_category), nn.Tanh(), nn.Dropout(dropout)
        )
        
        self.outputs = []
        
    def init_hidden(self, batch_size):
        # the weights are of the form (nb_layers, batch_size, hid_size(n_neuron))
        hidden = Variable(torch.randn(self.n_layers, batch_size, self.hid_size))
        return hidden    
    
    def forward(self, x, x_sequence_length):
        # init h randomly
        batch_size = x.size(0)
        self.h = self.init_hidden(batch_size)
        
        # embedding
        x = self.embed(x) # sequence_length(max_len), batch_size, embed_size
        
        # packing for rnn
        x = torch.nn.utils.rnn.pack_padded_sequence(x, x_sequence_length, batch_first=True)
        
        # RNN
        output, self.h = self.rnn(x, self.h)
        
        # unpack
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        
        # cbow
        x = x.sum(dim = 1)
        
        # fully-connect
        logit = self.lin(x)
        return logit
        

In [71]:
params = {
    'token2idx' : token2idx,
    'vocab_size' : len(token2idx),
    'embed_size' : 64,
    'hid_size' : 64,
    'n_layers' : 2,
    'dropout' : 0.5,
    'n_category' : 2,
}

In [72]:
model = RNN(**params)

In [73]:
model

RNN(
  (embed): Embedding(448188, 64, padding_idx=0)
  (rnn): RNN(64, 64, num_layers=2, batch_first=True)
  (lin): Sequential(
    (0): Linear(in_features=64, out_features=2, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.5)
  )
)

# Train

In [75]:
def adjust_learning_rate(optimizer, epoch, init_lr=0.001, lr_decay_epoch=10):
    """Decay learning rate by a factor of 0.1 every lr_decay_epoch epochs."""
    lr = init_lr * (0.1**(epoch // lr_decay_epoch))

    if epoch % lr_decay_epoch == 0:
        print('LR is set to %s'%(lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return optimizer

In [76]:
epochs = 50
lr = 0.01
batch_size = 10000

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)
criterion = nn.CrossEntropyLoss(reduction='sum')

loss_ls = []

for epoch in range(1, epochs+1):
    model.train()
    
    # input 데이터 순서 섞기
    '''
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    x_train_seq_length = x_train_seq_length[train_idx]
    '''
    
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()
        x_batch_seq_length = x_train_seq_length[start_idx: end_idx]
        
        scores = model(x_batch, x_batch_seq_length)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(scores, y_batch)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch, train_loss / batch_size, acc))
    print('=================================================================================================')
    
    loss_ls.append(train_loss)
    optimizer = adjust_learning_rate(optimizer, epoch, lr, 10) # adjust learning_rate while training
    
    if (epoch) % 10 == 0:
        model.eval()
        scores = model(x_test, x_test_seq_length)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_test.long()).sum().item() / y_test.size(0)
        loss = criterion(scores, y_test.long())
        
        print('*************************************************************************************************')
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch, loss.item()/y_test.size(0), acc))
        print('*************************************************************************************************')
        print('*************************************************************************************************')


Train epoch : 1,  loss : 3.963375,  accuracy :0.866
Train epoch : 2,  loss : 3.9973619384765624,  accuracy :0.860
Train epoch : 3,  loss : 4.0389154296875,  accuracy :0.867
Train epoch : 4,  loss : 3.9888525146484377,  accuracy :0.864
Train epoch : 5,  loss : 3.96855693359375,  accuracy :0.866
Train epoch : 6,  loss : 3.9455397216796877,  accuracy :0.862
Train epoch : 7,  loss : 3.94292861328125,  accuracy :0.865
Train epoch : 8,  loss : 3.9366978515625,  accuracy :0.863
Train epoch : 9,  loss : 3.9222626220703125,  accuracy :0.868
Train epoch : 10,  loss : 3.9387919677734375,  accuracy :0.864
LR is set to 0.001
*************************************************************************************************
*************************************************************************************************
Test Epoch : 10, Test Loss : 0.580 , Test Accuracy : 0.756
*************************************************************************************************
***************************

KeyboardInterrupt: 