In [1]:
from gensim.models.wrappers import FastText
from gensim.utils import tokenize

In [2]:
# Load the pretrained FastText model through gensim's FastText wrapper.
# NOTE(review): this is an absolute, machine-specific mount path; it will not
# resolve on another machine unless the same mount exists — consider a
# configurable data directory.
fasttext_model = FastText.load('/mnt/A20CC3B20CC37FB1/cc.vi.300.gs')

In [3]:
import re
import unicodedata
import torch
import torch.nn as nn
import torch
from torch.autograd import Variable
from torch import optim
from collections import Counter
import torch.nn.functional as F
import pickle
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
# Seed PyTorch's random number generator with a fixed value
torch.manual_seed(10)
# Download the NLTK 'punkt' package
nltk.download('punkt')
# Enable inline plotting
%matplotlib inline

[nltk_data] Downloading package punkt to /home/computer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Pick the tensor constructors that match the available device:
# the torch.cuda.* types when CUDA is available, the CPU types otherwise.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
# Generic aliases used by the rest of the notebook
Tensor = FloatTensor
Variable = torch.autograd.Variable

In [5]:
# Parse the training file. Each line is split once on the first space:
# the first field carries the label as its second '__'-separated piece,
# the remainder of the line is the text.
data = []
X = [] # text
y = [] # label (text)
# `with` guarantees the handle is closed once parsing finishes (the original
# left the file open for the lifetime of the kernel).
# NOTE(review): no explicit encoding is passed, so the platform default
# applies — confirm it matches the file's encoding.
with open('./data/Training_shuf.txt', 'r') as file:
    for line in file:
        # A blank line has no label/text pair and would raise IndexError below.
        if not line.strip():
            continue
        row = line.split(' ', 1)
        text = row[1].strip()
        label = row[0].split('__')[1]
        data.append((text, label))
        X.append(text)
        y.append(label)

In [6]:
# Encode the string labels as integer class ids, then write the fitted
# class array to disk.
le = preprocessing.LabelEncoder()
Y_train = le.fit_transform(y)
number_of_class = le.classes_.size
np.save('./data/classes.npy', le.classes_)

In [7]:
def from_pretrained(embeddings, freeze=True):
    """Build a ``torch.nn.Embedding`` whose weight matrix is ``embeddings``.

    Args:
        embeddings: 2-dimensional tensor; one row per entry, one column per
            embedding dimension.
        freeze: when True (default) the weight does not require gradients.

    Returns:
        The embedding layer wrapping ``embeddings`` as its weight.

    Raises:
        AssertionError: if ``embeddings`` is not 2-dimensional.
    """
    assert embeddings.dim() == 2, \
         'Embeddings parameter is expected to be 2-dimensional'
    num_rows, num_cols = embeddings.size()
    layer = torch.nn.Embedding(num_embeddings=num_rows, embedding_dim=num_cols)
    # Replace the freshly initialised weight with the supplied matrix
    layer.weight = torch.nn.Parameter(embeddings, requires_grad=not freeze)
    return layer
class Model(torch.nn.Module):
    """LSTM text classifier on top of pretrained embeddings.

    A token-index sequence is embedded, run through one LSTM, and the LSTM
    output at the last time step is projected to per-class log-probabilities.
    The model processes one sequence at a time (batch size 1).
    """

    def __init__(self, weights, hidden_dim, number_of_class):
        """
        Args:
            weights: 2-D tensor of pretrained embeddings (rows x embedding_dim).
            hidden_dim: size of the LSTM hidden state.
            number_of_class: number of output classes.
        """
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        embedding_dim = weights.shape[1]
        # from_pretrained is called with its default freeze=True
        self.embeddings = from_pretrained(weights)
        # NOTE: per the nn.LSTM documentation, dropout is applied between
        # stacked layers only, so with the default single layer this value
        # has no effect; it is kept for backward compatibility.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, dropout=0.2)
        self.linear1 = nn.Linear(hidden_dim, number_of_class)

    def forward(self, inputs, hidden):
        """Classify one token-index sequence.

        Args:
            inputs: 1-D tensor of token indexes.
            hidden: initial (h, c) LSTM state, e.g. from ``init_hidden()``.

        Returns:
            ``(log_probs, lstm_state)`` where ``log_probs`` has shape
            (1, number_of_class) and ``lstm_state`` is the final LSTM state.
        """
        # Reshape to (seq_len, batch=1, embedding_dim) as expected by nn.LSTM
        x = self.embeddings(inputs).view(len(inputs), 1, -1)
        lstm_out1, lstm_h1 = self.lstm1(x, hidden)
        # Keep only the output of the last time step
        x = lstm_out1[-1]
        x = self.linear1(x)
        # Explicit dim: the implicit-dimension form is deprecated; for this
        # 2-D input the class axis is dim 1.
        x = F.log_softmax(x, dim=1)
        return x, lstm_h1

    def init_hidden(self):
        """Return a zero-filled (h, c) state of shape (1, 1, hidden_dim) each."""
        h0 = Variable(torch.zeros(1, 1, self.hidden_dim).type(FloatTensor))
        c0 = Variable(torch.zeros(1, 1, self.hidden_dim).type(FloatTensor))
        return (h0, c0)
def get_indexes(text):
    """Return the FastText vocabulary indexes of the tokens of ``text``.

    Tokens that are not in the model's vocabulary are dropped; the order of
    the remaining tokens is preserved.
    """
    vocab = fasttext_model.wv.vocab
    indexes = []
    for token in tokenize(text):
        if token in vocab:
            indexes.append(vocab.get(token).index)
    return indexes
def save_params(model, i):
    """Save every parameter of ``model`` except the first to disk.

    The first parameter is skipped; the rest are written as a list to
    './data/params-<i>.dat'.

    Args:
        model: module whose parameters are saved.
        i: integer tag used in the file name (e.g. the epoch number).
    """
    # Slice instead of an enumerate-filter whose loop variable was also named
    # `i`: that shadowed the argument and, on Python 2 (where comprehension
    # variables leak), corrupted the file name.
    pl = list(model.parameters())[1:]
    torch.save(pl,'./data/params-%d.dat' % i)
def load_params(model, params_path):
    """Restore parameters previously written to ``params_path``.

    The first parameter of ``model`` is left untouched; the remaining ones
    receive, in order, the data of the entries loaded from the file.
    """
    targets = list(model.parameters())[1:]
    stored = torch.load(params_path)
    for target, source in zip(targets, stored):
        target.data = source.data

In [8]:
# Copy the FastText embedding matrix (wv.syn0) into a FloatTensor
weights = FloatTensor(fasttext_model.wv.syn0)

In [9]:
# Build the classifier: pretrained embedding weights, hidden size 80,
# one output per label class.
model = Model(weights,80,number_of_class)

In [10]:
# Move the model to the GPU when CUDA is available
if use_cuda:
    model.cuda()

In [11]:
# Negative log-likelihood loss (the model emits log-probabilities) and an
# Adam optimizer restricted to the parameters that require gradients.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=1e-3,
)

In [12]:
%%time
epochs = 30

print('starting training')
X_sample = X#[2000:4000]
Y_train_sample = Y_train#[2000:4000]
for i in range(epochs):
    avg_loss = 0.0
    for idx, (x, y_train) in enumerate(zip(X_sample, Y_train_sample)):
        if not idx == 0:
            input_data = get_indexes(x)
            input_data = Variable(LongTensor(input_data))
            y_train = y_train.item()
            target_data = Variable(LongTensor([y_train]))
            hidden = model.init_hidden()
            model.zero_grad()
            y_pred,_ = model(input_data,hidden)
            loss = loss_function(y_pred,target_data)
            avg_loss += loss.data[0]
            
            if idx%500 == 0 or idx == 1:
                print('epoch :%d iterations :%d loss :%g'%(i,idx,loss.data[0]))
            loss.backward()
            optimizer.step()
    save_params(model, i)
    print('the average loss after completion of %d epochs is %g'%((i+1),(avg_loss/len(X_sample))))    

starting training




epoch :0 iterations :1 loss :2.19914
epoch :0 iterations :500 loss :1.82093
epoch :0 iterations :1000 loss :2.09222
epoch :0 iterations :1500 loss :1.01968
epoch :0 iterations :2000 loss :1.8425
epoch :0 iterations :2500 loss :2.09945
epoch :0 iterations :3000 loss :2.22924
epoch :0 iterations :3500 loss :1.68119
epoch :0 iterations :4000 loss :1.85766
epoch :0 iterations :4500 loss :1.75222
epoch :0 iterations :5000 loss :1.7385
epoch :0 iterations :5500 loss :2.02257
epoch :0 iterations :6000 loss :2.5215
epoch :0 iterations :6500 loss :1.48666
epoch :0 iterations :7000 loss :0.507241
epoch :0 iterations :7500 loss :2.02556
epoch :0 iterations :8000 loss :0.96921
epoch :0 iterations :8500 loss :3.38435
epoch :0 iterations :9000 loss :0.642102
epoch :0 iterations :9500 loss :1.51792
the average loss after completion of 1 epochs is 1.90912
epoch :1 iterations :1 loss :1.71657
epoch :1 iterations :500 loss :2.87758
epoch :1 iterations :1000 loss :1.13411
epoch :1 iterations :1500 loss :

epoch :9 iterations :3500 loss :6.38962e-05
epoch :9 iterations :4000 loss :0.0509868
epoch :9 iterations :4500 loss :0.0184765
epoch :9 iterations :5000 loss :0.117779
epoch :9 iterations :5500 loss :0.0212955
epoch :9 iterations :6000 loss :0.000110626
epoch :9 iterations :6500 loss :0.000488281
epoch :9 iterations :7000 loss :0.0141387
epoch :9 iterations :7500 loss :0.0054059
epoch :9 iterations :8000 loss :0.00305653
epoch :9 iterations :8500 loss :0.153145
epoch :9 iterations :9000 loss :0.00282669
epoch :9 iterations :9500 loss :0.0382972
the average loss after completion of 10 epochs is 0.201455
epoch :10 iterations :1 loss :0.000548363
epoch :10 iterations :500 loss :0.011425
epoch :10 iterations :1000 loss :0.000112534
epoch :10 iterations :1500 loss :0.000841141
epoch :10 iterations :2000 loss :0.000193596
epoch :10 iterations :2500 loss :0.00616074
epoch :10 iterations :3000 loss :7.15256e-05
epoch :10 iterations :3500 loss :6.67572e-05
epoch :10 iterations :4000 loss :0.02

epoch :18 iterations :1500 loss :5.72205e-06
epoch :18 iterations :2000 loss :9.53674e-07
epoch :18 iterations :2500 loss :0.00257492
epoch :18 iterations :3000 loss :8.58307e-06
epoch :18 iterations :3500 loss :8.58307e-06
epoch :18 iterations :4000 loss :0.00216866
epoch :18 iterations :4500 loss :2.86102e-06
epoch :18 iterations :5000 loss :0.00100422
epoch :18 iterations :5500 loss :0.000691414
epoch :18 iterations :6000 loss :0
epoch :18 iterations :6500 loss :9.53674e-07
epoch :18 iterations :7000 loss :5.72205e-06
epoch :18 iterations :7500 loss :1.43051e-05
epoch :18 iterations :8000 loss :1.23978e-05
epoch :18 iterations :8500 loss :0.00051403
epoch :18 iterations :9000 loss :0.00275707
epoch :18 iterations :9500 loss :0.00235271
the average loss after completion of 19 epochs is 0.0338723
epoch :19 iterations :1 loss :3.8147e-05
epoch :19 iterations :500 loss :0.00142193
epoch :19 iterations :1000 loss :0
epoch :19 iterations :1500 loss :1.52588e-05
epoch :19 iterations :2000 

epoch :27 iterations :3000 loss :0
epoch :27 iterations :3500 loss :0
epoch :27 iterations :4000 loss :0.000255108
epoch :27 iterations :4500 loss :2.67029e-05
epoch :27 iterations :5000 loss :0
epoch :27 iterations :5500 loss :0.0481024
epoch :27 iterations :6000 loss :0
epoch :27 iterations :6500 loss :9.53674e-07
epoch :27 iterations :7000 loss :1.23978e-05
epoch :27 iterations :7500 loss :9.53674e-07
epoch :27 iterations :8000 loss :7.62939e-05
epoch :27 iterations :8500 loss :0
epoch :27 iterations :9000 loss :0.0013032
epoch :27 iterations :9500 loss :0.150668
the average loss after completion of 28 epochs is 0.0196444
epoch :28 iterations :1 loss :1.90735e-06
epoch :28 iterations :500 loss :0.00227356
epoch :28 iterations :1000 loss :0
epoch :28 iterations :1500 loss :9.53674e-07
epoch :28 iterations :2000 loss :5.53131e-05
epoch :28 iterations :2500 loss :0
epoch :28 iterations :3000 loss :0
epoch :28 iterations :3500 loss :0
epoch :28 iterations :4000 loss :0.00111198
epoch :2

In [13]:
# Save the final parameters; the tag -1 produces './data/params--1.dat',
# which is distinct from the per-epoch checkpoint files.
save_params(model, -1)

In [14]:
# torch.save(model.state_dict(), './data/model_last.pth')
# torch.save(model, './data/model_last.pth')

In [15]:
# load_params(model, './data/params.dat')