# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

In [1]:
from collections import defaultdict

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np

# data를 읽어옴
def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls

# 단어에 대한 idx 부여
def convert_word_to_idx(sents):
    for sent in sents:
        yield [w2i_dict[word] for word in sent.split(' ')]
    return


# Sequence Length를 맞추기 위한 padding
def add_padding(sents, max_len):
    for i, sent in enumerate(sents):
        if len(sent)< max_len:
            sents[i] += [pad] * (max_len - len(sent))
    
        elif len(sent) > max_len:
            sents[i] = sent[:max_len]
    
    return sents

# torch Variable로 변환
def convert_to_variable(w2i_ls):
    
    var = Variable(torch.LongTensor(w2i_ls))
    return var

# Data Loading

In [2]:
w2i_dict = defaultdict(lambda : len(w2i_dict))
pad = w2i_dict['<PAD>']

# 데이터 불러오기
train_txt_ls, train_label_ls = read_txt('ratings_train.txt')
test_txt_ls, test_label_ls = read_txt('ratings_test.txt')

In [3]:
len(train_txt_ls), len(test_txt_ls)

(150000, 50000)

In [4]:
train_w2i_ls = list(convert_word_to_idx(train_txt_ls))
test_w2i_ls = list(convert_word_to_idx(test_txt_ls))

i2w_dict = {val : key for key, val in w2i_dict.items()}

In [5]:
for w2i in train_w2i_ls[0]:
    print(i2w_dict[w2i])

아
더빙..
진짜
짜증나네요
목소리


In [6]:
max_sequence_length = 30

x_train = convert_to_variable(add_padding(train_w2i_ls, max_sequence_length))
x_test = convert_to_variable(add_padding(test_w2i_ls, max_sequence_length))

y_train = convert_to_variable(train_label_ls).float()
y_test = convert_to_variable(test_label_ls).float()

# CNN 모델 준비

Pre-train된 Embedding은 사용하지 않았습니다.

모든 embedding은 랜덤으로 초기화된 상태로 학습을 진행하였습니다. (non-static)

In [8]:
class CNN_text(nn.Module):
    
    def __init__(self,w2i_dict, n_words, embed_size, hid_size, drop_rate, kernel_size_ls, num_filter, n_category):
        super(CNN_text, self).__init__()
        self.w2i_dict = w2i_dict
        self.padding_index = w2i_dict['<PAD>']
        self.embed_size = embed_size
        self.hid_size = hid_size
        self.drop_rate = drop_rate
        self.num_filter = num_filter
        self.kernel_size_ls = kernel_size_ls
        self.num_kernel = len(kernel_size_ls)
        self.n_category = n_category
        
        self.embedding = nn.Embedding(
            num_embeddings=n_words, 
            embedding_dim=embed_size,
            padding_idx=self.padding_index
        )
        
        # kernel size는 (n-gram, embed_size)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filter, (kernel_size, embed_size)) for kernel_size in kernel_size_ls])
        
        self.lin = nn.Sequential(
            nn.Linear(self.num_kernel*num_filter, hid_size), nn.ReLU(), 
            nn.Dropout(drop_rate),
            nn.Linear(hid_size, n_category),
        )
        
    def forward(self, x):
        embed = self.embedding(x) # [batch_size, max_length, embed_size]
        embed.unsqueeze_(1)  # [batch_size, 1, max_length, embed_size]
        
        # convolution
        conved = [conv(embed).squeeze(3) for conv in self.convs] # [batch_size, num_filter, max_length -kernel_size +1]
        
        # max_pool
        pooled = [F.max_pool1d(conv, (conv.size(2))).squeeze(2) for conv in conved] # [batch_size, num_kernel, num_filter]
            
        # dropout
        dropouted = [F.dropout(pool, self.drop_rate) for pool in pooled]
        
        # concatenate
        concated = torch.cat(pooled, dim = 1) # [batch_size, num_kernel * num_filter]
        logit = self.lin(concated)
        
        return logit
        

In [11]:
params = {
    'w2i_dict' : w2i_dict,
    'n_words' : len(w2i_dict),
    'embed_size' : 128,
    'hid_size' : 128,
    'drop_rate' : 0.5,
    'kernel_size_ls' : [2,3,4,5],
    'num_filter' : 32,
    'n_category' : 2,
}

In [12]:
model = CNN_text(**params)

In [13]:
model

CNN_text(
  (embedding): Embedding(450543, 128, padding_idx=0)
  (convs): ModuleList(
    (0): Conv2d(1, 32, kernel_size=(2, 128), stride=(1, 1))
    (1): Conv2d(1, 32, kernel_size=(3, 128), stride=(1, 1))
    (2): Conv2d(1, 32, kernel_size=(4, 128), stride=(1, 1))
    (3): Conv2d(1, 32, kernel_size=(5, 128), stride=(1, 1))
  )
  (lin): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [14]:
len(list(model.parameters()))

13

In [None]:
epochs = 50
lr = 0.001
batch_size = 10000

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)
criterion = nn.CrossEntropyLoss(reduction='sum')

loss_ls = []

for epoch in range(epochs):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()
        
        scores = model(x_batch)
        predict = F.softmax(scores).argmax(dim = 1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(scores, y_batch)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss / batch_size, acc))
    print('=================================================================================================')
    
    loss_ls.append(train_loss)
    
    if (epoch+1) % 10 == 0:
        model.eval()
        scores = model(x_test)
        predict = F.softmax(scores).argmax(dim = 1)
        
        acc = (predict == y_test.long()).sum().item() / y_test.size(0)
        loss = criterion(scores, y_test.long())
        
        print('*************************************************************************************************')
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, loss.item()/y_test.size(0), acc))
        print('*************************************************************************************************')
        print('*************************************************************************************************')




Train epoch : 1,  loss : 10.4093115234375,  accuracy :0.554
Train epoch : 2,  loss : 9.951091943359375,  accuracy :0.622
Train epoch : 3,  loss : 9.330300244140625,  accuracy :0.664
Train epoch : 4,  loss : 8.5607712890625,  accuracy :0.716
Train epoch : 5,  loss : 7.69082138671875,  accuracy :0.753
Train epoch : 6,  loss : 6.7921904296875,  accuracy :0.795
Train epoch : 7,  loss : 5.874925,  accuracy :0.828
Train epoch : 8,  loss : 4.9870984375,  accuracy :0.857
