In [25]:
import pickle
import re
import unicodedata
from collections import Counter

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from torch import optim
from torch.autograd import Variable
# Seed PyTorch's random number generator with a fixed value for repeatable runs.
torch.manual_seed(1)
# Enable inline plotting
%matplotlib inline

In [26]:
# Pick the typed-tensor constructors once, based on whether CUDA is available,
# so the remaining cells can build tensors without checking the device again.
use_cuda = torch.cuda.is_available()
# Namespace that provides the typed tensor classes for the selected device.
_tensor_ns = torch.cuda if use_cuda else torch
FloatTensor = _tensor_ns.FloatTensor
LongTensor = _tensor_ns.LongTensor
ByteTensor = _tensor_ns.ByteTensor
# Convenience aliases used by later cells.
Tensor = FloatTensor
Variable = torch.autograd.Variable

In [27]:
class Vocabulary(object):
    """Word <-> index mapping that skips stop words.

    Indices are handed out in insertion order starting at 0. Looking up a
    word that was never added falls back to the index of ``'<unk>'``.
    """

    def __init__(self, stop_words, counter):
        """Create an empty vocabulary.

        Args:
            stop_words: iterable of tokens that must never be added or
                returned by ``get_text_indexes`` (``None`` means no stop words).
            counter: mapping from token to its count, read by
                ``get_text_indexes`` (e.g. a ``collections.Counter``).
        """
        # Keep the caller's stop words (as a set, for O(1) membership tests);
        # add_word() and get_text_indexes() rely on them for filtering.
        self.stop_words = set(stop_words) if stop_words is not None else set()
        self.word2idx = {}
        self.idx2word = {}
        self.counter = counter
        self.idx = 0  # next index to assign

    def add_word(self, word):
        """Assign the next free index to ``word``.

        Does nothing if the word is already known or is a stop word.
        """
        if word in self.word2idx or word in self.stop_words:
            return
        self.word2idx[word] = self.idx
        self.idx2word[self.idx] = word
        self.idx += 1

    def __call__(self, word):
        """Return the index of ``word``, or the ``'<unk>'`` index if unknown.

        Raises:
            KeyError: if ``word`` is unknown and ``'<unk>'`` was never added.
        """
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        """Return the number of words currently in the vocabulary."""
        return len(self.word2idx)

    def get_text_indexes(self, text):
        """Tokenise ``text`` and return ``(index, count)`` pairs.

        The text is lower-cased and tokenised with NLTK's ``word_tokenize``;
        stop words are dropped. ``index`` comes from ``self(token)`` and
        ``count`` from ``self.counter[token]``.
        """
        tokens = nltk.tokenize.word_tokenize(text.lower())
        return [(self(token), self.counter[token])
                for token in tokens
                if token not in self.stop_words]

    def get_text_string(self, indexes):
        """Map an iterable of indices back to their words (as a list)."""
        return [self.idx2word[index] for index in indexes]
    

def build_vocab(texts, stop_words, threshold = 2):
    """Build a ``Vocabulary`` from a collection of raw texts.

    Each text is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Tokens that are stop words or a single character long are not counted.

    Args:
        texts: sized sequence of strings (``len(texts)`` is used for the
            progress message).
        stop_words: iterable of tokens to ignore.
        threshold: minimum count a token needs to enter the vocabulary.

    Returns:
        A ``Vocabulary`` to which ``'<pad>'``, ``'<start>'``, ``'<end>'`` and
        ``'<unk>'`` are added first, followed by every counted token whose
        count is at least ``threshold``. The token counter is attached to it.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop_set = set(stop_words)
    total = len(texts)

    counter = Counter()
    for i, text in enumerate(texts):
        tokens = [t for t in nltk.tokenize.word_tokenize(text.lower())
                  if len(t) > 1 and t not in stop_set]
        counter.update(tokens)
        if i % 1000 == 0:
            print("[%d/%d] Tokenized the texts." %(i, total))

    vocab = Vocabulary(stop_words, counter)
    # Special tokens go in first so they receive the lowest indices.
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    for word, cnt in counter.items():
        if cnt >= threshold:
            vocab.add_word(word)
    return vocab
    
class Model(torch.nn.Module) :
    """Embedding -> LSTM -> linear classifier for one token sequence.

    ``forward`` processes a single sequence (batch size 1). Each time step
    receives the token embedding concatenated with one extra scalar taken
    from ``bow``, which is why the LSTM input size is ``embedding_dim + 1``.
    """
    def __init__(self,embedding_dim,hidden_dim,vocab_limit,number_of_class) :
        """
        Args:
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_limit: number of rows in the embedding table.
            number_of_class: number of output classes.
        """
        super(Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_limit, embedding_dim)
        # NOTE(review): per the nn.LSTM docs, dropout is applied between
        # stacked layers only; with the default single layer it should have
        # no effect. Left unchanged.
        self.lstm1 = nn.LSTM(embedding_dim + 1,hidden_dim,dropout=0.5)
        self.linear1 = nn.Linear(hidden_dim,number_of_class)
    def forward(self,inputs,bow,hidden) :
        """Return ``(log_probs, lstm_state)`` for one sequence.

        Args:
            inputs: 1-D tensor of token indices.
            bow: one scalar per token; reshaped to ``(seq_len, 1, 1)``.
            hidden: initial LSTM state, e.g. from ``init_hidden()``.

        Returns:
            ``log_probs`` of shape ``(1, number_of_class)`` computed from the
            last time step's LSTM output, and the final LSTM state.
        """
        seq_len = len(inputs)
        embedded = self.embeddings(inputs).view(seq_len,1,-1)
        extra = bow.view(embedded.size(0),embedded.size(1),1)
        x = torch.cat([embedded,extra], dim=2)
        lstm_out1,lstm_h1 = self.lstm1(x,hidden)
        x = lstm_out1[-1]
        x = self.linear1(x)
        # x is (1, number_of_class); name the class dimension explicitly
        # instead of relying on the deprecated implicit choice.
        x = F.log_softmax(x, dim=1)
        return x,lstm_h1
    def init_hidden(self) :
        """Return a zero ``(h0, c0)`` pair, each of shape ``(1, 1, hidden_dim)``."""
        h0 = Variable(torch.zeros(1, 1, self.hidden_dim).type(FloatTensor))
        c0 = Variable(torch.zeros(1, 1, self.hidden_dim).type(FloatTensor))
        return (h0, c0)

In [28]:
# Load the stop-word list: one entry per line, surrounding whitespace removed.
# The encoding is given explicitly so decoding does not depend on the
# platform default. NOTE(review): assumes the file is UTF-8 — confirm.
with open('./data/vietnamese-stopwords.txt', 'r', encoding='utf-8') as file_stop_words:
    stop_words = [line.strip() for line in file_stop_words]

In [59]:
# Restore the vocabulary, the label encoder classes and the trained weights.
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# vocabulary file that you produced yourself.
with open('./data/vocabs.pkl','rb') as f:
    vocabs = pickle.load(f)
# Attach the stop-word list loaded in the previous cell to the vocabulary.
vocabs.stop_words = stop_words
vocab_limit = len(vocabs)

# Rebuild the label encoder from its saved class array.
le = preprocessing.LabelEncoder()
le.classes_ = np.load('./data/classes.npy')
number_of_class = le.classes_.size

# Positional arguments are embedding_dim=80 and hidden_dim=100; the layer
# shapes have to match the checkpoint for load_state_dict to succeed.
model = Model(80,100, vocab_limit, number_of_class)
# map_location keeps every stored tensor on the CPU while loading, so a
# checkpoint written on a GPU machine still loads on a CPU-only one. The
# model is moved to the GPU afterwards when one is available.
state_dict = torch.load('./data/model_last.pth', map_location=lambda storage, loc: storage)
model.load_state_dict(state_dict)
if use_cuda:
    model.cuda()

In [60]:
# Parse the evaluation file. Each line is split at the first space: the part
# before it carries the label (second field when split on '__'), the rest is
# the text.
data = []
X = [] # text
y = [] # label (text)
# The with-block closes the file handle when reading is done.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('./data/DevTesting.txt', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # A blank line has no label/text pair and would raise IndexError.
            continue
        row = line.split(' ', 1)
        text = row[1].strip()
        label = row[0].split('__')[1]
        data.append((text, label))
        X.append(text)
        y.append(label)

In [61]:
# Run the loaded model over every evaluation text, one sequence at a time,
# and collect the predicted class id for each.
X_sample = X
Y_real = le.transform(y)  # string labels -> integer class ids
Y_pred = []
# Sum of all token counts in the vocabulary counter; used to scale the
# per-token count feature below.
total_freq_words = sum(vocabs.counter.values())
model.eval() # to disable drop out
for idx, x in enumerate(X_sample):
    # get_text_indexes returns (vocab index, count) pairs; unzip them into
    # two parallel tuples.
    # NOTE(review): if a text yields no tokens this unpacking raises
    # ValueError — confirm the evaluation set never contains such a text.
    input_data, input_data_bow = zip(*vocabs.get_text_indexes(x))
    input_data = Variable(LongTensor(input_data))
    # Alternative scaling that was tried and disabled:
    # input_data_bow = F.normalize(FloatTensor(input_data_bow),dim=0)
    input_data_bow = FloatTensor(input_data_bow) / total_freq_words
    input_data_bow = Variable(input_data_bow)
    hidden = model.init_hidden()  # fresh zero state for every text
    y_pred,_ = model(input_data,input_data_bow,hidden)
    # max(1)[1] is the argmax over the class dimension.
    Y_pred.append(int(y_pred.max(1)[1]))



In [62]:
# Turn the collected class ids into a NumPy array for element-wise comparison.
Y_pred = np.asarray(Y_pred)

In [63]:
# Accuracy: fraction of texts whose predicted class id equals the true one.
# Check the lengths first so a mismatch fails loudly instead of producing a
# meaningless comparison.
assert len(Y_real) == len(Y_pred), "number of predictions differs from number of labels"
precise = np.count_nonzero(Y_real == Y_pred) / Y_real.size
print(precise)

0.832


In [64]:
def predict(text):
    """Classify one raw text and return its label.

    Uses the notebook-level ``vocabs``, ``model`` and ``le`` objects.

    Args:
        text: raw input string; it is tokenised and stop-word filtered by
            ``vocabs.get_text_indexes``.

    Returns:
        The label that ``le`` maps the predicted class id back to.

    Raises:
        ValueError: if no tokens remain after filtering, so there is nothing
            to feed to the model.
    """
    pairs = vocabs.get_text_indexes(text)
    if not pairs:
        raise ValueError("text contains no usable tokens after stop-word filtering")
    input_data, input_data_bow = zip(*pairs)
    input_data = Variable(LongTensor(input_data))
    # Scale the counts exactly like the evaluation loop does (divide by the
    # total token count) so predictions match the measured accuracy.
    # NOTE(review): confirm this is also the scaling used during training.
    input_data_bow = FloatTensor(input_data_bow) / sum(vocabs.counter.values())
    input_data_bow = Variable(input_data_bow)
    model.eval()  # inference mode, independent of which cells ran before
    hidden = model.init_hidden()
    y_pred,_ = model(input_data,input_data_bow,hidden)
    class_id = int(y_pred.max(1)[1])
    # inverse_transform works on array-likes; wrap the id and unwrap the result.
    return le.inverse_transform([class_id])[0]

In [65]:
# Try predict() on one sample Vietnamese text; the cell displays the returned label.
predict("Kiểm tra đột xuất, cơ quan chức năng phát hiện cơ sở thẩm mỹ không phép trong chung cư đang chuẩn bị “dao kéo” trên cơ thể người bệnh. Tại đây, nhiều loại thuốc không rõ nguồn gốc xuất xứ đã bị niêm phong chờ xử lý.")

  if diff:


'CTXH'