In [4]:
import re
import unicodedata
import torch
import torch.nn as nn
import torch
from torch.autograd import Variable
from torch import optim
from collections import Counter
import torch.nn.functional as F
import pickle
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
# Seed torch's random number generator with a fixed value.
torch.manual_seed(10)
# Download the 'punkt' tokenizer data (presumably what nltk.tokenize.word_tokenize,
# used further down, relies on — confirm).
nltk.download('punkt')
# Enable inline plotting
%matplotlib inline

[nltk_data] Downloading package punkt to /home/computer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Pick the tensor constructors once: CUDA-backed types when a GPU is available,
# CPU types otherwise. Later cells build their inputs through these aliases.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor, LongTensor, ByteTensor = (
        torch.cuda.FloatTensor,
        torch.cuda.LongTensor,
        torch.cuda.ByteTensor,
    )
else:
    FloatTensor, LongTensor, ByteTensor = (
        torch.FloatTensor,
        torch.LongTensor,
        torch.ByteTensor,
    )
# Generic alias for the floating-point tensor type selected above.
Tensor = FloatTensor
Variable = torch.autograd.Variable

In [6]:
# Load the training corpus. Every non-blank line is split at its first space:
# the left part carries the label as its second '__'-separated field, and the
# right part is the text of the sample.
data = []  # (text, label) pairs
X = []  # text
y = []  # label (text)
# The context manager closes the handle even when a line fails to parse. The
# encoding is explicit so decoding does not depend on the platform default
# (assumes the file is UTF-8 — confirm).
with open('./data/Training_shuf.txt', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines have no label/text parts and would raise IndexError below.
            continue
        row = line.split(' ', 1)
        text = row[1].strip()
        label = row[0].split('__')[1]
        data.append((text, label))
        X.append(text)
        y.append(label)

In [7]:
# Encode the string labels in `y` as integer class ids and record how many
# distinct classes were seen.
le = preprocessing.LabelEncoder()
Y_train = le.fit_transform(y)
number_of_class = len(le.classes_)

In [8]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
# The encoding is explicit so decoding does not depend on the platform default
# (assumes the file is UTF-8 — confirm).
with open('./data/vietnamese-stopwords.txt', 'r', encoding='utf-8') as file_stop_words:
    stop_words = [line.strip() for line in file_stop_words]

In [9]:
# Write the fitted encoder's class array to disk (presumably so class ids can
# be mapped back to label strings outside this notebook — confirm).
np.save('./data/classes.npy', le.classes_)

In [22]:
class Vocabulary(object):
    """Word <-> index mapping that refuses to index stop words.

    Attributes:
        stop_words: container of words that are never added to the vocabulary
            and are dropped when a text is converted to indexes.
        word2idx: dict mapping each known word to its integer index.
        idx2word: dict mapping each integer index back to its word.
        counter: mapping from token to a count; looked up per token in
            get_text_indexes.
        idx: index that will be given to the next new word.
    """
    def __init__(self, stop_words, counter):
        # Keep the caller's stop words. The previous version reassigned this
        # attribute to an empty dict right after storing it, which silently
        # disabled every stop-word check in this class.
        self.stop_words = stop_words if stop_words is not None else ()
        self.word2idx = {}
        self.idx2word = {}
        self.counter = counter
        self.idx = 0

    def add_word(self, word):
        """Register `word` under the next free index unless it is already known or is a stop word."""
        if word not in self.word2idx and word not in self.stop_words:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        """Return the index of `word`, or the index of '<unk>' when the word is unknown.

        Raises:
            KeyError: if the word is unknown and '<unk>' was never added.
        """
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        """Return the number of indexed words."""
        return len(self.word2idx)

    def get_text_indexes(self, text):
        """Tokenize the lower-cased `text` and return (index, count) pairs.

        Stop words are dropped; every remaining token yields its vocabulary
        index (or the '<unk>' index) together with `self.counter[token]`.
        """
        tokens = nltk.tokenize.word_tokenize(text.lower())
        return [(self(token), self.counter[token]) for token in tokens if token not in self.stop_words]

    def get_text_string(self, indexes):
        """Map an iterable of indexes back to their words."""
        return [self.idx2word[index] for index in indexes]
    

def build_vocab(texts, stop_words, threshold = 2):
    """Count tokens over `texts` and build a Vocabulary of the frequent ones.

    Each text is lower-cased and tokenized with nltk; tokens that are stop
    words or have two characters or fewer are not counted.

    Args:
        texts: sized sequence of strings to tokenize.
        stop_words: iterable of words to exclude from counting; also handed
            to the Vocabulary unchanged.
        threshold: minimum count a token needs to receive an index.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>', '<unk>' (added in
        that order, before any corpus word) followed by every counted token
        whose count is at least `threshold`. The full Counter is attached to
        the vocabulary.
    """
    # Hash-based membership test: the stop-word check runs once per token, so
    # scanning a list each time would cost time proportional to the list length.
    stop_word_set = set(stop_words)
    counter = Counter()
    total = len(texts)
    for i, text in enumerate(texts):
        tokens = [t for t in nltk.tokenize.word_tokenize(text.lower()) if t not in stop_word_set and len(t) > 2]
        counter.update(tokens)
        if i % 1000 == 0:
            print("[%d/%d] Tokenized the texts." %(i, total))

    vocab = Vocabulary(stop_words, counter)
    for special_token in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special_token)

    for word, cnt in counter.items():
        if cnt >= threshold:
            vocab.add_word(word)
    return vocab
    
class Model(torch.nn.Module) :
    """LSTM classifier over a token-id sequence plus one extra scalar per token.

    Each token id is embedded, the per-token scalar is appended to the
    embedding, the sequence is run through an LSTM, and the last time step is
    projected to class log-probabilities.

    Args:
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        vocab_limit: number of rows in the embedding table.
        number_of_class: number of output classes.
    """
    def __init__(self,embedding_dim,hidden_dim,vocab_limit,number_of_class) :
        super(Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_limit, embedding_dim)
        # Input width is embedding_dim + 1 because forward() appends one scalar per token.
        # NOTE(review): PyTorch documents LSTM dropout as acting between stacked
        # layers, so with the default single layer it is expected to be inactive.
        # The argument is kept as in the original — confirm the intent.
        self.lstm1 = nn.LSTM(embedding_dim + 1,hidden_dim,dropout=0.5)
        self.linear1 = nn.Linear(hidden_dim,number_of_class)

    def forward(self,inputs,bow,hidden) :
        """Classify one sequence.

        Args:
            inputs: token ids for a single text; reshaped to (len(inputs), 1, -1)
                after embedding, i.e. treated as a batch of one.
            bow: one scalar per token; reshaped to (len(inputs), 1, 1).
            hidden: (h0, c0) tuple as produced by init_hidden().

        Returns:
            (log_probs, lstm_state) where log_probs has shape (1, number_of_class).
        """
        x = self.embeddings(inputs).view(len(inputs),1,-1)
        x = torch.cat([x,bow.view(x.size(0),x.size(1),1)], dim=2)
        lstm_out1,lstm_h1 = self.lstm1(x,hidden)
        # Only the output of the final time step feeds the classifier.
        x = self.linear1(lstm_out1[-1])
        # Normalise over the class axis explicitly; the implicit-dim form is deprecated.
        x = F.log_softmax(x, dim=1)
        return x,lstm_h1

    def init_hidden(self) :
        """Return a zero (h0, c0) pair of shape (1, 1, hidden_dim) each.

        The tensors are created from a model parameter so they share the
        model's device and dtype instead of depending on a module-level
        tensor-type alias.
        """
        reference = self.linear1.weight
        h0 = reference.new_zeros(1, 1, self.hidden_dim)
        c0 = reference.new_zeros(1, 1, self.hidden_dim)
        return (h0, c0)

In [37]:
%%time
# True: reuse the vocabulary cached on disk. False: rebuild it from X and
# overwrite the cache.
use_preload_vocabs = True
if not use_preload_vocabs:
    vocabs = build_vocab(X, stop_words)
    with open('./data/vocabs.pkl','wb') as f:
        pickle.dump(vocabs, f)
else:
    # NOTE(security): unpickling can execute arbitrary code, so only load a
    # cache file that this notebook (or another trusted source) produced.
    with open('./data/vocabs.pkl','rb') as f:
        vocabs = pickle.load(f)
# Number of indexed words; used as the embedding table size.
vocab_limit = len(vocabs)

CPU times: user 28.4 ms, sys: 0 ns, total: 28.4 ms
Wall time: 25.9 ms


In [38]:
# Attach the stop-word list loaded in this session to the vocabulary, so that
# get_text_indexes filters tokens against it regardless of what the (possibly
# unpickled) object was carrying.
vocabs.stop_words = stop_words

In [39]:
# Instantiate the classifier (80-dim embeddings, 100 hidden units) and move it
# to the GPU when one is in use.
model = Model(
    embedding_dim=80,
    hidden_dim=100,
    vocab_limit=vocab_limit,
    number_of_class=number_of_class,
)
if use_cuda:
    model.cuda()

# Negative log-likelihood pairs with the log-probabilities the model returns.
loss_function = nn.NLLLoss()
# The optimizer is created after the optional .cuda() call above.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [40]:
%%time
epochs = 20

# Checkpoint 0 holds the untrained weights.
torch.save(model.state_dict(), './data/model' + str(0)+'.pth')
print('starting training')
X_sample = X#[2000:4000]
Y_train_sample = Y_train#[2000:4000]
# Normaliser for the per-token count feature: total number of counted tokens.
total_freq_words = sum(vocabs.counter.values())
for i in range(epochs):
    epoch_loss = 0.0
    steps = 0  # samples actually trained on in this epoch
    for idx, (x, y_train) in enumerate(zip(X_sample, Y_train_sample)):
        # NOTE(review): the first sample of every epoch is skipped, exactly as
        # before; the reason is not visible here — confirm it is intentional.
        if idx == 0:
            continue

        index_count_pairs = vocabs.get_text_indexes(x)
        if not index_count_pairs:
            # A text made only of stop words yields nothing to unpack or feed
            # to the LSTM; skip it instead of crashing.
            continue
        input_data, input_data_bow = zip(*index_count_pairs)
        input_data = LongTensor(input_data)
        # One scalar per token: its corpus count scaled by the total count.
        input_data_bow = FloatTensor(input_data_bow) / total_freq_words

        target_data = LongTensor([int(y_train)])
        # Each text is an independent sequence, so start from a zero state.
        hidden = model.init_hidden()
        model.zero_grad()
        y_pred,_ = model(input_data,input_data_bow,hidden)
        loss = loss_function(y_pred,target_data)
        # .item() reads the scalar; indexing a 0-dim tensor with [0] is an
        # error in current PyTorch.
        loss_value = loss.item()
        epoch_loss += loss_value
        steps += 1

        if idx%500 == 0 or idx == 1:
            print('epoch :%d iterations :%d loss :%g'%(i,idx,loss_value))

        loss.backward()
        optimizer.step()
    # Same 'model<N>.pth' naming as checkpoint 0, so all checkpoints line up.
    torch.save(model.state_dict(), './data/model' + str(i+1)+'.pth')
    # Average over the samples that were trained on, not over the full list.
    avg_loss = epoch_loss / max(steps, 1)
    print('the average loss after completion of %d epochs is %g'%((i+1),avg_loss))

starting training
epoch :0 iterations :1 loss :2.35323




epoch :0 iterations :500 loss :2.57905
epoch :0 iterations :1000 loss :2.14521
epoch :0 iterations :1500 loss :2.12214
epoch :0 iterations :2000 loss :2.10907
epoch :0 iterations :2500 loss :2.13964
epoch :0 iterations :3000 loss :2.19514
epoch :0 iterations :3500 loss :2.78045
epoch :0 iterations :4000 loss :1.47368
epoch :0 iterations :4500 loss :2.68557
epoch :0 iterations :5000 loss :2.22539
epoch :0 iterations :5500 loss :2.84578
epoch :0 iterations :6000 loss :1.10089
epoch :0 iterations :6500 loss :1.74738
epoch :0 iterations :7000 loss :0.925474
epoch :0 iterations :7500 loss :0.340396
epoch :0 iterations :8000 loss :0.489485
epoch :0 iterations :8500 loss :1.79475
epoch :0 iterations :9000 loss :0.18229
epoch :0 iterations :9500 loss :1.98812
the average loss after completion of 1 epochs is 1.70931
epoch :1 iterations :1 loss :1.07808
epoch :1 iterations :500 loss :1.30159
epoch :1 iterations :1000 loss :0.611976
epoch :1 iterations :1500 loss :2.44001
epoch :1 iterations :200

epoch :9 iterations :2000 loss :9.53674e-06
epoch :9 iterations :2500 loss :0.000170708
epoch :9 iterations :3000 loss :4.48227e-05
epoch :9 iterations :3500 loss :5.72205e-06
epoch :9 iterations :4000 loss :0.000107765
epoch :9 iterations :4500 loss :0.0194364
epoch :9 iterations :5000 loss :0.000354767
epoch :9 iterations :5500 loss :0.00366163
epoch :9 iterations :6000 loss :0.000216484
epoch :9 iterations :6500 loss :7.72476e-05
epoch :9 iterations :7000 loss :8.39233e-05
epoch :9 iterations :7500 loss :9.53674e-06
epoch :9 iterations :8000 loss :0.00265598
epoch :9 iterations :8500 loss :0.00021553
epoch :9 iterations :9000 loss :8.2016e-05
epoch :9 iterations :9500 loss :3.8147e-06
the average loss after completion of 10 epochs is 0.0350467
epoch :10 iterations :1 loss :3.8147e-05
epoch :10 iterations :500 loss :2.03514
epoch :10 iterations :1000 loss :9.53674e-07
epoch :10 iterations :1500 loss :0.000243187
epoch :10 iterations :2000 loss :1.90735e-06
epoch :10 iterations :2500 

epoch :18 iterations :1500 loss :1.71661e-05
epoch :18 iterations :2000 loss :0
epoch :18 iterations :2500 loss :0.000257492
epoch :18 iterations :3000 loss :0
epoch :18 iterations :3500 loss :0
epoch :18 iterations :4000 loss :7.62939e-06
epoch :18 iterations :4500 loss :7.62939e-05
epoch :18 iterations :5000 loss :0.000117302
epoch :18 iterations :5500 loss :0.20551
epoch :18 iterations :6000 loss :2.09808e-05
epoch :18 iterations :6500 loss :4.48227e-05
epoch :18 iterations :7000 loss :5.91278e-05
epoch :18 iterations :7500 loss :0
epoch :18 iterations :8000 loss :0.00021553
epoch :18 iterations :8500 loss :0.000352859
epoch :18 iterations :9000 loss :7.34329e-05
epoch :18 iterations :9500 loss :0
the average loss after completion of 19 epochs is 0.0156473
epoch :19 iterations :1 loss :0
epoch :19 iterations :500 loss :9.53674e-07
epoch :19 iterations :1000 loss :0
epoch :19 iterations :1500 loss :1.90735e-06
epoch :19 iterations :2000 loss :0
epoch :19 iterations :2500 loss :0.0133

In [41]:
# Save the model's current weights (state_dict only) under a fixed file name.
torch.save(model.state_dict(), './data/model_last.pth')

In [42]:
# Show the 20 most frequent tokens. The method has to be called: a bare
# reference only displays the bound-method repr, which embeds the whole Counter.
vocabs.counter.most_common(20)

