# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

In [1]:
from collections import defaultdict

In [2]:
w2i_dict = defaultdict(lambda : len(w2i_dict))
pad = w2i_dict['<PAD>']

def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls


def convert_word_to_idx(sents):
    for sent in sents:
        yield [w2i_dict[word] for word in sent.split(' ')]
    return


def add_padding(sents, max_len):
    for i, sent in enumerate(sents):
        if len(sent)< max_len:
            sents[i] += [pad] * (max_len - len(sent))
    
        elif len(sent) > max_len:
            sents[i] = sent[:max_len]
    
    return sents

In [3]:
# 데이터 불러오기
train_txt_ls, train_label_ls = read_txt('ratings_train.txt')
test_txt_ls, test_label_ls = read_txt('ratings_test.txt')

In [4]:
len(train_txt_ls), len(test_txt_ls)

(150000, 50000)

In [5]:
w2i_dict = defaultdict(lambda : len(w2i_dict))

train_w2i_ls = list(convert_word_to_idx(train_txt_ls))
test_w2i_ls = list(convert_word_to_idx(test_txt_ls))

i2w_dict = {val : key for key, val in w2i_dict.items()}

In [6]:
for w2i in train_w2i_ls[0]:
    print(i2w_dict[w2i])

아
더빙..
진짜
짜증나네요
목소리


# CNN 모델 준비

In [17]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np

In [9]:
def convert_to_variable(w2i_ls):
    
    var = Variable(torch.LongTensor(w2i_ls))
    return var

In [10]:
x_train = convert_to_variable(add_padding(train_w2i_ls, 15))
x_val = convert_to_variable(add_padding(test_w2i_ls[:10000],15))
x_test = convert_to_variable(add_padding(test_w2i_ls[10000:],15))

y_train = convert_to_variable(train_label_ls).float()
y_val = convert_to_variable(test_label_ls[:10000]).float()
y_test = convert_to_variable(test_label_ls[10000:]).float()

In [11]:
class CNN_text(nn.Module):
    
    def __init__(self, n_words, embed_size, hid_size, drop_rate, kernel_size_ls, num_filter):
        super(CNN_text, self).__init__()
        
        self.embed_size = embed_size
        self.hid_size = hid_size
        self.drop_rate = drop_rate
        self.num_filter = num_filter
        self.kernel_size_ls = kernel_size_ls
        self.num_kernel = len(kernel_size_ls)
    
        self.embedding = nn.Embedding(n_words, embed_size)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filter, (kernel_size, embed_size)) for kernel_size in kernel_size_ls])
        
        self.lin = nn.Sequential(
            nn.Linear(self.num_kernel*num_filter, hid_size), nn.ReLU(), 
            nn.Dropout(drop_rate),
            nn.Linear(hid_size, 1),
            )
        
    def forward(self, x):
        embed = self.embedding(x) # [batch_size, max_length, embed_size]
        embed.unsqueeze_(1)  # [batch_size, 1, max_length, embed_size]
        conved = [conv(embed).squeeze(3) for conv in self.convs] # [batch_size, num_filter, max_length -kernel_size +1]
        pooled = [F.max_pool1d(conv, (conv.size(2))).squeeze(2) for conv in conved] # [batch_size, num_kernel, num_filter]
        concated = torch.cat(pooled, dim = 1) # [batch_size, num_kernel * num_filter]
        logit = self.lin(concated)
        return torch.sigmoid(logit)
        

In [12]:
n_words = len(w2i_dict)
EMBED_SIZE = 64
HID_SIZE = 64
DROP_RATE = 0.5
KERNEL_SIZE_LS = [2,3,4,5]
NUM_FILTER = 16

In [13]:
model = CNN_text(n_words = n_words, embed_size =EMBED_SIZE, drop_rate= DROP_RATE,
                 hid_size=HID_SIZE, kernel_size_ls= KERNEL_SIZE_LS, num_filter=NUM_FILTER)

In [14]:
model

CNN_text(
  (embedding): Embedding(450542, 64)
  (convs): ModuleList(
    (0): Conv2d(1, 16, kernel_size=(2, 64), stride=(1, 1))
    (1): Conv2d(1, 16, kernel_size=(3, 64), stride=(1, 1))
    (2): Conv2d(1, 16, kernel_size=(4, 64), stride=(1, 1))
    (3): Conv2d(1, 16, kernel_size=(5, 64), stride=(1, 1))
  )
  (lin): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [16]:
epochs = 100
lr = 0.0003
batch_size = 10000

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)
criterion = nn.BCELoss(reduction='sum')

for epoch in range(epochs):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx]
        
        logit = model(x_batch)
        predict = logit.ge(0.5).float()
        y_batch = y_batch.unsqueeze(1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        loss = criterion(logit, y_batch)
        train_loss += loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss.item(), acc))
    print('=================================================================================================')
    if (epoch+1) % 10 == 0:
        model.eval()
        logit = model(x_val).squeeze(1)
        predict = logit.ge(0.5).float()
        
        acc = (predict == y_val).sum().item() / 10000
        loss = criterion(logit, y_val)
        
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, loss.item(), acc))


Train epoch :1,  loss : 105025.5703125,  accuracy :0.502
Test Loss : 6926.19580078125
Test Accuracy : 0.515
Train epoch :2,  loss : 104215.4375,  accuracy :0.513
Test Loss : 6911.78466796875
Test Accuracy : 0.526
Train epoch :3,  loss : 103785.40625,  accuracy :0.527
Test Loss : 6900.783203125
Test Accuracy : 0.540
Train epoch :4,  loss : 103407.75,  accuracy :0.545
Test Loss : 6881.7939453125
Test Accuracy : 0.547
Train epoch :5,  loss : 102946.765625,  accuracy :0.560
Test Loss : 6855.4140625
Test Accuracy : 0.552
Train epoch :6,  loss : 102368.8125,  accuracy :0.571
Test Loss : 6819.2724609375
Test Accuracy : 0.563
Train epoch :7,  loss : 101599.40625,  accuracy :0.584
Test Loss : 6773.35498046875
Test Accuracy : 0.573
Train epoch :8,  loss : 100493.96875,  accuracy :0.605
Test Loss : 6715.81103515625
Test Accuracy : 0.579
Train epoch :9,  loss : 99160.6328125,  accuracy :0.605
Test Loss : 6648.8291015625
Test Accuracy : 0.589
Train epoch :10,  loss : 97579.2890625,  accuracy :0.621

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/donghyungko/anaconda3/envs/fininsight_python3.5/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-ed6d9065a215>", line 33, in <module>
    loss.backward()
  File "/home/donghyungko/anaconda3/envs/fininsight_python3.5/lib/python3.5/site-packages/torch/tensor.py", line 102, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/donghyungko/anaconda3/envs/fininsight_python3.5/lib/python3.5/site-packages/torch/autograd/__init__.py", line 90, in backward
    allow_unreachable=True)  # allow_unreachable flag
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/donghyungko/anaconda3/envs/fininsight_python3.5/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 1863, in showtraceback
    stb 

KeyboardInterrupt: 