In [1]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random

#모델 


class NewsClassification(nn.Module):
    """Embedding -> (bi)LSTM -> dot-product attention -> linear classifier.

    Args:
        opts: dict providing 'embed_size', 'hidden_size', 'n_layers',
            'dropout', 'n_category' and 'bidirectional'.
        vocab_size: number of rows of the embedding table.
        padding_index: token id handed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, opts, vocab_size, padding_index=0) :
        super(NewsClassification, self).__init__()
        embed_size = opts['embed_size']
        hidden_size = opts['hidden_size']
        lstm_layers = opts['n_layers']
        n_category = opts['n_category']
        bidirectional = opts['bidirectional']  # True: bidirectional, False: unidirectional

        # 1) embedding
        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=padding_index
        )

        # 2) LSTM
        # The dropout rate is only stored here; no dropout layer is created
        # and nothing in forward() applies it.
        self.dropout = opts['dropout']
        self.lstm = nn.LSTM(embed_size, hidden_size, lstm_layers, batch_first=True, bidirectional=bidirectional)
        # A bidirectional LSTM concatenates both directions, doubling the
        # feature size of its output.
        input_size = 2 * hidden_size if bidirectional else hidden_size

        # 3) Linear (fully connected layer producing one logit per category)
        self.linear = nn.Linear(input_size, n_category)

    def attention_net(self, lstm_output, final_state):
        """Pool the LSTM outputs with dot-product attention.

        Args:
            lstm_output: tensor of shape (batch, seq_len, dim).
            final_state: query tensor of shape (batch, dim); a leading
                singleton axis, i.e. (1, batch, dim), is also accepted.

        Returns:
            Tensor of shape (batch, dim): the attention-weighted sum of
            ``lstm_output`` over the sequence axis.
        """
        # Only strip a leading axis from a 3-D input. Squeezing a 2-D
        # (batch, dim) query unconditionally would drop the batch axis when
        # batch == 1 and make the unsqueeze(2) below fail.
        if final_state.dim() == 3:
            hidden = final_state.squeeze(0)
        else:
            hidden = final_state
        # (batch, seq_len, dim) x (batch, dim, 1) -> (batch, seq_len)
        attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        # (batch, dim, seq_len) x (batch, seq_len, 1) -> (batch, dim)
        new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)

        return new_hidden_state

    def forward(self, sent_tensor, sent_mask):
        """Return per-category logits of shape (batch, n_category).

        Args:
            sent_tensor: LongTensor of token ids, shape (batch, seq_len).
            sent_mask: accepted for interface compatibility; not used here.
        """
        sent_emb = self.embed(sent_tensor)
        output, _ = self.lstm(sent_emb)

        # The attention query is the LSTM output at the last time step.
        # NOTE(review): for rows shorter than seq_len that step is a padded
        # position and sent_mask is ignored; left unchanged so results match
        # checkpoints trained with this exact computation.
        attn_output = self.attention_net(output, output[:, -1])
        logits = self.linear(attn_output)
        return logits
    


In [2]:
#-*- coding:utf-8 -*-
import random
import json
import torch
import torch.nn as nn
from torch.autograd import Variable
#from model import NewsClassification
import torch.nn.functional as F

opts = dict(
    # model
    embed_size=200,        # embedding dimension (size of each token vector)
    hidden_size=250,       # LSTM hidden dimension
    n_layers=2,            # number of stacked LSTM layers
    dropout=0.15,          # dropout rate (stored by the model; not applied in its forward pass)
    bidirectional=True,    # use a bidirectional LSTM
    n_category=6,          # number of output classes

    # train
    vocab_path='/vocab.json',    # vocabulary file path
    data_path='/dataset.json',   # dataset file path
    use_gpu=True,                # GPU flag (not read by any of the visible cells)
    epochs=50,
    batch_size=5,
    learning_rate=0.001,
)

class BatchGen:
    """Chunk a dataset into mini-batches and yield them as padded tensors.

    Every example is read by position: [0] text, [1] tokens, [2] list of
    token indices, [3] answer label.
    """

    def __init__(self, raw_data, batch_size, evaluation=False):
        self.batch_size = batch_size
        self.eval = evaluation

        examples = raw_data
        if not evaluation:
            # Training data is visited in random order; evaluation data keeps
            # the order it was given in.
            order = list(range(len(examples)))
            random.shuffle(order)
            examples = [examples[pos] for pos in order]

        # [[batch1], [batch2], ...]; the final chunk may be shorter.
        self.data = [
            examples[start:start + batch_size]
            for start in range(0, len(examples), batch_size)
        ]

    def __len__(self):
        """Number of batches (not number of examples)."""
        return len(self.data)

    def __iter__(self):
        """Yield [texts, tokens, sentence_tensor, sentence_mask, answer_tensor].

        sentence_tensor has shape (batch, max_len) and is right-padded with
        0 (the PAD index); sentence_mask marks the positions equal to 0.
        For index lists [1,2,3,4,5], [6], [7,8,9] the tensor rows become
            1, 2, 3, 4, 5
            6, 0, 0, 0, 0
            7, 8, 9, 0, 0
        """
        for batch in self.data:
            text_list = [example[0] for example in batch]
            token_list = [example[1] for example in batch]
            index_lists = [example[2] for example in batch]
            answer_tensor = torch.LongTensor([example[3] for example in batch])  # (batch,)

            # Width of the batch is the length of its longest sentence.
            longest = max(len(ids) for ids in index_lists)
            sentence_tensor = torch.zeros(len(batch), longest, dtype=torch.long)
            for row, ids in enumerate(index_lists):
                # Fill each row from the left; the tail stays 0.
                sentence_tensor[row, :len(ids)] = torch.LongTensor(ids)

            sentence_mask = sentence_tensor.eq(0)

            yield [text_list, token_list, sentence_tensor, sentence_mask, answer_tensor]

def load_data(vocab_path) :
    """Return the number of entries in the JSON vocabulary at ``vocab_path``.

    The file is opened explicitly as UTF-8 (the JSON interchange encoding)
    so the result does not depend on the platform's default text encoding.
    """
    with open(vocab_path, encoding='utf-8') as json_file :
        vocab = json.load(json_file)

    return len(vocab)



In [3]:
# 입력문장 >> 텐서로 변환

#-*- coding:utf-8 -*-

#from konlpy.tag import Okt#,Hannanum, Kkma, Komoran, ...
from eunjeon import Mecab
from collections import Counter
import json

# Module-level Mecab analyzer instance; text2pos() calls tagger.pos() on it.
tagger = Mecab()

def read_txt(input):
    """Wrap one input sentence as a single-example dataset.

    Returns a pair of fresh lists: ``[input]`` and ``[0]`` (the sentence is
    always paired with the constant label 0).
    """
    return [input], [0]

def remove_empty_review(X, Y):
    """Drop zero-length reviews from ``X`` and the matching labels from ``Y``.

    Both lists are modified in place and the same two objects are returned.
    """
    empty_positions = [pos for pos, review in enumerate(X) if len(review) == 0]

    # Delete from the back so earlier positions stay valid.
    for pos in reversed(empty_positions):
        del X[pos], Y[pos]

    return X, Y

def text2pos(text) :
    """Convert a sentence into a list of 'word/POS' token strings.

    Uses the module-level ``tagger``; each (word, pos) pair it returns
    becomes one entry.
    """
    return ['{}/{}'.format(word, pos) for word, pos in tagger.pos(text)]



def convert_token_to_idx(vocab, tokens):
    """Map each token to its vocabulary index.

    Tokens missing from ``vocab`` are mapped to ``vocab['<UNK>']``; that key
    is only looked up when an unknown token is actually encountered.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in tokens
    ]

# Load the token -> index vocabulary from the path configured in opts, so the
# lookup table and the embedding size always come from the same file.
vocab_path = opts['vocab_path']
with open(vocab_path, encoding='utf-8') as json_file :
    vocab = json.load(json_file)

# Same value load_data(opts['vocab_path']) returns, without parsing the file
# a second time.
vocab_size = len(vocab)

model = NewsClassification(opts, vocab_size)

# NOTE: torch.load unpickles the file; only load checkpoints from a trusted
# source. map_location="cpu" places the loaded tensors on the CPU.
prm = torch.load("84.2.prm", map_location="cpu")
model.load_state_dict(prm)
model.eval()

optimizer = torch.optim.Adam(model.parameters(), opts['learning_rate'])
criterion = nn.CrossEntropyLoss()

In [None]:
import time
import zmq

# Category names indexed by the class id the model predicts.
CATEGORY_LABELS = ("정치", "경제", "사회", "생활/문화", "세계", "IT/과학")


def _classify(text):
    """Return the predicted category label for ``text``.

    Returns an empty string when ``text`` is empty, because
    remove_empty_review() drops it and no batch is produced.
    """
    text_list, answer_list = read_txt(text)
    text_list, answer_list = remove_empty_review(text_list, answer_list)

    samples = []
    for sentence, answer in zip(text_list, answer_list):
        tokens = text2pos(sentence)
        idx = convert_token_to_idx(vocab, tokens)
        # Kept from the original: the sample is submitted twice so the batch
        # has two rows, working around a shape error the author observed in
        # the model's attention step with a single-row batch.
        samples.extend([(sentence, tokens, idx, answer)] * 2)

    label = ""
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in BatchGen(samples, opts['batch_size'], evaluation=True):
            logits = model(batch[2], batch[3])
            # Both rows hold the same sentence, so the first prediction is used.
            predicted = F.softmax(logits, dim=1).argmax(dim=1).tolist()[0]
            label = CATEGORY_LABELS[predicted]
            print(label)
    return label


context = zmq.Context()
socket = context.socket(zmq.REP)
try:
    socket.bind("tcp://127.0.0.1:5858")

    while True:
        # A REP socket must answer every request it receives.
        message = socket.recv()
        reply = _classify(message.decode('utf-8'))
        socket.send(reply.encode('utf-8'))  # text sent back to the client
finally:
    # Release the port even when the cell is interrupted or fails, so it can
    # be re-run without an "address already in use" error.
    socket.close(linger=0)
    context.term()