# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

In [2]:
from collections import defaultdict

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np


def read_txt(path_to_file):
    """Read a tab-separated ratings file into parallel text and label lists.

    The first line of the file is treated as a header and skipped. Every
    following non-empty line must have exactly three tab-separated fields:
    ``id``, ``text`` and an integer ``label``.

    Args:
        path_to_file: Path of the TSV file to read.

    Returns:
        A ``(txt_ls, label_ls)`` tuple: the list of text fields and the list
        of labels converted to ``int``, in file order.

    Raises:
        ValueError: If a non-empty line does not have exactly three fields or
            its label is not an integer.
    """
    txt_ls = []
    label_ls = []

    # The encoding is given explicitly so decoding does not depend on the
    # platform's default locale (the corpus contains Korean text).
    with open(path_to_file, encoding='utf-8') as f:
        next(f, None)  # skip the header row; harmless on an empty file
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            _, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label))
    return txt_ls, label_ls


def convert_word_to_idx(sents):
    """Lazily translate sentences into lists of vocabulary indices.

    Each sentence is split on single spaces and every resulting token is
    looked up in the module-level ``w2i_dict`` mapping.

    Args:
        sents: Iterable of sentence strings.

    Yields:
        One list of indices per sentence, in input order.
    """
    for sentence in sents:
        indices = []
        for token in sentence.split(' '):
            indices.append(w2i_dict[token])
        yield indices


def add_padding(sents, max_len):
    """Force every index list in ``sents`` to have exactly ``max_len`` items.

    Lists that are too short are extended in place with the module-level
    ``pad`` index; lists that are too long are replaced by their first
    ``max_len`` items. Lists that already have the right length are untouched.

    Args:
        sents: Mutable sequence of index lists; it is modified in place.
        max_len: Target length of every inner list.

    Returns:
        The same ``sents`` object that was passed in.
    """
    for pos, tokens in enumerate(sents):
        shortfall = max_len - len(tokens)
        if shortfall > 0:
            sents[pos] += [pad] * shortfall
        elif shortfall < 0:
            sents[pos] = tokens[:max_len]

    return sents


def convert_to_variable(w2i_ls):
    """Convert a (nested) list of integers into a 64-bit integer tensor.

    The tensor is returned directly instead of being wrapped in
    ``torch.autograd.Variable``: since PyTorch 0.4 ``Variable`` and ``Tensor``
    are the same type and the wrapper is a deprecated no-op.

    Args:
        w2i_ls: List (or equally sized list of lists) of integer values.

    Returns:
        A ``torch.LongTensor`` holding the given values.
    """
    return torch.LongTensor(w2i_ls)

In [3]:
# Vocabulary: the default factory returns the current size of the dict, so
# every key that is looked up for the first time gets the next free index.
w2i_dict = defaultdict(lambda : len(w2i_dict))
# '<PAD>' is the first key looked up, while the dict is still empty, so the
# padding token receives index 0.
pad = w2i_dict['<PAD>']

# Load the data
x_train, y_train = read_txt('ratings_train.txt')
x_test, y_test = read_txt('ratings_test.txt')

# Materialise the lazy generators so the index lists can be padded later.
# The training split is converted first, then the test split.
# NOTE(review): because lookups insert missing keys, words that only occur in
# the test file are added to the vocabulary as well - confirm this is intended.
x_train = list(convert_word_to_idx(x_train))
x_test = list(convert_word_to_idx(x_test))

# Reverse mapping (index -> word), built after both splits were indexed.
i2w_dict = {val : key for key, val in w2i_dict.items()}

In [4]:
# Pad / truncate every sentence to 15 indices and convert to integer tensors.
# The first 10000 rows of the loaded test file become the validation split and
# the remaining rows the test split. x_val must be built before x_test is
# rebound on the following line, because both slice the original x_test list.
x_train = convert_to_variable(add_padding(x_train, 15))
x_val = convert_to_variable(add_padding(x_test[:10000],15))
x_test = convert_to_variable(add_padding(x_test[10000:],15))

# Labels are split the same way (first 10000 -> validation, rest -> test) and
# stored as float tensors.
y_train = convert_to_variable(y_train).float()
y_val = convert_to_variable(y_test[:10000]).float()
y_test = convert_to_variable(y_test[10000:]).float()

In [5]:
# Sanity check: number of rows in the training tensor and in the test tensor.
len(x_train), len(x_test)

(150000, 40000)

# CNN 모델 준비

In [6]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np

In [7]:
def convert_to_variable(w2i_ls):
    """Convert a (nested) list of integers into a 64-bit integer tensor.

    The tensor is returned directly instead of being wrapped in
    ``torch.autograd.Variable``: since PyTorch 0.4 ``Variable`` and ``Tensor``
    are the same type and the wrapper is a deprecated no-op.

    Args:
        w2i_ls: List (or equally sized list of lists) of integer values.

    Returns:
        A ``torch.LongTensor`` holding the given values.
    """
    return torch.LongTensor(w2i_ls)

In [8]:
class CNN_DynamicKMaxPooling(nn.Module):
    """Two-layer convolutional sentence classifier with dynamic k-max pooling.

    ``forward`` runs: embedding -> wide convolution -> folding -> dynamic
    k-max pooling -> tanh -> wide convolution -> folding -> k-max pooling ->
    tanh -> fully connected classifier.

    Keyword Args:
        batch_size: Stored as ``self.batch_size`` for backward compatibility;
            ``forward`` takes the batch size from its input instead.
        n_words: Number of rows of the embedding table (vocabulary size).
        embed_size: Embedding dimension. It is halved by each of the two
            folding steps (an unpaired last column is dropped).
        n_feature_map_1: Number of output channels of the first convolution.
        n_feature_map_2: Number of output channels of the second convolution.
        k_top: Number of values kept by the last pooling layer and lower
            bound for the intermediate pooling size.
        L: Total number of convolution layers used in the pooling-size
            formula. Exactly two convolutions are built regardless of it.
        hid_size: Width of the hidden fully connected layer.
        n_category: Number of output classes.
        dropout: Dropout probability in the classifier head.
    """

    def __init__(self, **kwargs):
        super(CNN_DynamicKMaxPooling, self).__init__()

        self.batch_size   = kwargs['batch_size']
        self.n_words      = kwargs['n_words']
        self.embed_size   = kwargs['embed_size']
        self.n_map_1      = kwargs['n_feature_map_1']
        self.n_map_2      = kwargs['n_feature_map_2']
        self.k_top        = kwargs['k_top'] # final pooling k
        self.L            = kwargs['L']  # number of convolution-layers
        self.hid_size     = kwargs['hid_size']
        self.n_category   = kwargs['n_category']
        self.dropout      = kwargs['dropout']
        self.tanh = nn.Tanh()

        KS_1 = 7 # first convolution filter size (row)
        KS_2 = 5 # second filter size (row)

        self.embedding = nn.Embedding(self.n_words, self.embed_size)

        # Convolution layers. The kernels span only the sequence axis and the
        # padding of (kernel_size - 1) makes them "wide" convolutions, so the
        # sequence axis grows by kernel_size - 1.
        self.conv_dict = {}
        self.conv_dict['conv1'] = nn.Conv2d(1, self.n_map_1, (KS_1, 1), padding=(KS_1-1, 0))
        self.conv_dict['conv2'] = nn.Conv2d(self.n_map_1, self.n_map_2, (KS_2, 1), padding=(KS_2-1, 0))

        # Registers the convolutions as sub-modules so their parameters are
        # seen by parameters() / state_dict().
        # NOTE: the attribute name shadows nn.Module.modules(); reading
        # ``self.modules`` still yields the inherited method, the ModuleDict
        # is only reachable through ``self._modules['modules']``. The name is
        # kept so existing state_dict keys ("modules.conv1.weight", ...) stay
        # valid.
        self.modules = nn.ModuleDict(self.conv_dict)

        # Width of the embedding axis after the two folding steps. Computed
        # with two integer halvings so it equals what fold() produces even
        # when embed_size is not divisible by 4.
        folded_size = (self.embed_size // 2) // 2

        # Fully connected classifier head
        self.fully_connect = nn.Sequential(
            nn.Linear(self.n_map_2 * self.k_top * folded_size, self.hid_size),
            nn.ReLU(),
            nn.Dropout(self.dropout),

            nn.Linear(self.hid_size, self.n_category),
        )

    def dynamic_k_max_pooling(self, x, l):
        """Keep the k largest values along the sequence axis, in original order.

        ``k = max(k_top, ceil((L - l) / L * s))`` where ``s`` is the current
        sequence length (``x.size(2)``). For the last layer (``l == L``) this
        is always ``k_top``.

        Args:
            x: Tensor of shape [batch, channel, sequence, embedding].
            l: 1-based index of the convolution layer being pooled.

        Returns:
            Tensor of shape [batch, channel, k, embedding].
        """
        s = x.size(2)
        # Integer ceiling division: avoids float rounding and the
        # round-half-to-even behaviour of round() (e.g. round(10.5) == 10).
        dynamic_k = ((self.L - l) * s + self.L - 1) // self.L
        k_l = max(self.k_top, dynamic_k)
        # topk returns the positions of the largest values; sorting those
        # positions restores the original sequence order before gathering.
        index = x.topk(k_l, dim=2)[1].sort(dim=2)[0]
        return x.gather(2, index)

    def fold(self, x):
        """Sum every two adjacent embedding columns (folding).

        Works on all samples of the batch at once. With an odd embedding
        size the last, unpaired column is dropped.

        Args:
            x: Tensor of shape [batch, channel, sequence, embedding].

        Returns:
            Tensor of shape [batch, channel, sequence, embedding // 2] with
            the same dtype and device as ``x``.
        """
        n_pairs = x.size(3) // 2
        paired = x[..., :2 * n_pairs]
        # Even-indexed columns plus their odd-indexed right neighbours.
        return paired[..., 0::2] + paired[..., 1::2]

    def forward(self, x):
        """Compute class logits for a batch of padded index sequences.

        Args:
            x: Integer tensor of shape [batch, sequence_length] holding
                vocabulary indices.

        Returns:
            Tensor of shape [batch, n_category] with unnormalised scores.
        """
        embeded = self.embedding(x) # [batch, sequence_length, embed_size]
        embeded = embeded.unsqueeze(1)  # [batch, 1, sequence_length, embed_size]

        # First convolution, folding and dynamic k-max pooling.
        conv1 = self.conv_dict['conv1']
        conved1 = conv1(embeded) # [batch, n_filter_1, sequence_length + KS_1 - 1, embed_size]

        folded1 = self.fold(conved1) # [batch, n_filter_1, sequence_length + KS_1 - 1, embed_size/2]

        pooled1 = self.dynamic_k_max_pooling(folded1, l=1) # [batch, n_filter_1, k_max1, embed_size/2]

        first_output = self.tanh(pooled1) # [batch, n_filter_1, k_max1, embed_size/2]

        # Second convolution: every output feature map is the sum of the
        # filter responses over all input feature maps; Conv2d performs that
        # summation over input channels itself.
        conv2 = self.conv_dict['conv2']
        conved2 = conv2(first_output)  # [batch, n_filter_2, k_max1 + KS_2 - 1, embed_size/2]

        folded2 = self.fold(conved2) # [batch, n_filter_2, k_max1 + KS_2 - 1, embed_size/4]

        pooled2 = self.dynamic_k_max_pooling(folded2, l=2) # [batch, n_filter_2, k_top, embed_size/4]

        second_output = self.tanh(pooled2) # [batch, n_filter_2, k_top, embed_size/4]

        # Flatten per sample. The batch size is read from the tensor so that
        # inputs of any batch size (not only self.batch_size) are accepted.
        flat = second_output.contiguous().view(second_output.size(0), -1) # [batch, n_filter_2 * k_top * embed_size/4]

        logit = self.fully_connect(flat)

        return logit


In [9]:
# Vocabulary size: number of distinct keys (including '<PAD>') in w2i_dict.
n_words = len(w2i_dict)

In [10]:
# Hyper-parameters handed to CNN_DynamicKMaxPooling as keyword arguments.
kwargs = dict(
    batch_size=100,
    n_words=n_words,
    embed_size=48,        # as written in the paper
    n_feature_map_1=6,    # as written in the paper
    n_feature_map_2=14,   # as written in the paper
    k_top=4,              # as written in the paper
    L=2,                  # as written in the paper
    hid_size=128,
    n_category=2,         # as written in the paper (binary classification)
    dropout=0.5,
)

In [11]:
# Build the network from the hyper-parameter dict; the bare expression below
# makes the notebook display the module structure.
model = CNN_DynamicKMaxPooling(**kwargs)
model

CNN_DynamicKMaxPooling(
  (tanh): Tanh()
  (embedding): Embedding(450543, 48)
  (modules): ModuleDict(
    (conv1): Conv2d(1, 6, kernel_size=(7, 1), stride=(1, 1), padding=(6, 0))
    (conv2): Conv2d(6, 14, kernel_size=(5, 1), stride=(1, 1), padding=(4, 0))
  )
  (fully_connect): Sequential(
    (0): Linear(in_features=672, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [None]:
# Training configuration
epochs = 50
lr = 0.001
batch_size = 100

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)
# reduction='sum': per-batch losses are summed, so accumulating them over an
# epoch gives the total loss over all samples seen.
criterion = nn.CrossEntropyLoss(reduction='sum')

# Summed training loss of every epoch, in order.
loss_ls = []

for epoch in range(epochs):
    model.train()

    # Shuffle the sample order; inputs and labels are permuted with the same
    # index array so they stay aligned.
    np.random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0
    n_correct = 0
    n_seen = 0

    # Only full mini-batches are used; a shorter remainder is skipped.
    for start_idx in range(0, x_train.size(0) - batch_size + 1, batch_size):
        end_idx = start_idx + batch_size
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()

        scores = model(x_batch)
        # softmax is monotonic, so argmax over the raw scores gives the same
        # predicted class.
        predict = scores.argmax(dim = 1)

        # Accumulate over the whole epoch so the printed accuracy covers all
        # batches rather than only the last one.
        n_correct += (predict == y_batch).sum().item()
        n_seen += y_batch.size(0)

        loss = criterion(scores, y_batch)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    acc = n_correct / max(n_seen, 1)

    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss, acc))
    print('=================================================================================================')

    loss_ls.append(train_loss)

    # Every 10th epoch: evaluate on the validation split.
    if (epoch+1) % 10 == 0:
        model.eval()
        val_loss = 0.0
        val_correct = 0
        n_val = 0

        # No gradients are needed for evaluation. The split is processed in
        # mini-batches of the training batch size, so validation never needs
        # a single forward pass over the complete split.
        with torch.no_grad():
            for start_idx in range(0, x_val.size(0) - batch_size + 1, batch_size):
                end_idx = start_idx + batch_size
                x_batch = x_val[start_idx : end_idx]
                y_batch = y_val[start_idx : end_idx].long()

                scores = model(x_batch)
                predict = scores.argmax(dim = 1)

                val_correct += (predict == y_batch).sum().item()
                n_val += y_batch.size(0)
                val_loss += criterion(scores, y_batch).item()

        acc = val_correct / max(n_val, 1)

        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, val_loss, acc))

