In [28]:
import nltk
from models import InferSent
import torch
import torch.nn as nn
from torchtext.data import Field, TabularDataset
import torch.optim as optim
import copy
import numpy as np

In [11]:
# Column schemas for the QQP TSV files.
# Fields created with use_vocab=False are declared as already numerical; the
# two sentence fields use a vocab, are not tokenized (sequential=False) and
# keep their original casing (lower=False). No custom tokenizer is passed.
numeric_kwargs = dict(sequential=False, use_vocab=False)
text_kwargs = dict(sequential=False, use_vocab=True, lower=False)

id = Field(**numeric_kwargs)
id1 = Field(**numeric_kwargs)
id2 = Field(**numeric_kwargs)
sent1 = Field(**text_kwargs)
sent2 = Field(**text_kwargs)
labels = Field(**numeric_kwargs)

# (attribute name, Field) pairs; the order is presumably the column order of
# the TSV files -- TODO confirm against the data.
fields = [
    ('i', id),
    ('i1', id1),
    ('i2', id2),
    ('sent1', sent1),
    ('sent2', sent2),
    ('l', labels),
]

# Open the three split files and build one TabularDataset per split using the
# field specifications above.
QQP_DIR = '/home/bozyurt20/Desktop/COMP542/Project/GLUE-baselines/glue_data/QQP/'
trn_data, dev_data, tst_data = TabularDataset.splits(
    path=QQP_DIR,
    train='train.tsv',
    validation='dev.tsv',
    test='test.tsv',
    fields=fields,
    format='tsv',
)

#train_data, dev_data = train_var_data.split(split_ratio = 0.8)

In [15]:
def head_rows(column, count):
    """Return `count` items of `column`, skipping the item at index 0.

    Index 0 is presumably the TSV header row, which the dataset loader was not
    told to skip -- TODO confirm.
    """
    return list(column)[1:count + 1]


# Number of sentence pairs kept from each split.
TRN_ROWS = 10000
DEV_ROWS = 2000

trn_sentences1 = head_rows(trn_data.sent1, TRN_ROWS)
trn_sentences2 = head_rows(trn_data.sent2, TRN_ROWS)
trn_labels = head_rows(trn_data.l, TRN_ROWS)

dev_sentences1 = head_rows(dev_data.sent1, DEV_ROWS)
dev_sentences2 = head_rows(dev_data.sent2, DEV_ROWS)
dev_labels = head_rows(dev_data.l, DEV_ROWS)

'tst_sentences1 = list(tst_data.sent1)\ntst_sentences2 = list(tst_data.sent2)\ntst_labels = list(tst_data.l)'

In [16]:
# Pool both sides of every training pair into one list; it is only used to
# build the encoder vocabulary.
trn_sentences = [*trn_sentences1, *trn_sentences2]

In [17]:
# InferSent version: selects the checkpoint file name and is also passed to
# the model as its 'version' parameter.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V,
}

# Build the encoder, then restore its weights from the checkpoint on disk.
infersent = InferSent(params_model)
pretrained_state = torch.load(MODEL_PATH)
infersent.load_state_dict(pretrained_state)

# Point the encoder at the word-vector file.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

# Build the vocabulary from the pooled training sentences only.
infersent.build_vocab(trn_sentences, tokenize=True)


Found 17368(/18644) words with w2v vectors
Vocab size : 17368


In [38]:
# Encode the first and second sentence of every training pair, in that order,
# with identical settings.
trn_embeddings1, trn_embeddings2 = (
    infersent.encode(sentences, tokenize=True)
    for sentences in (trn_sentences1, trn_sentences2)
)

In [39]:
# Encode the first and second sentence of every dev pair, in that order, with
# identical settings.
dev_embeddings1, dev_embeddings2 = (
    infersent.encode(sentences, tokenize=True)
    for sentences in (dev_sentences1, dev_sentences2)
)

In [83]:
class Classifier(nn.Module):
    """Logistic-regression head: one linear layer followed by a sigmoid.

    Args:
        input_dim: Length of each input feature vector. When omitted, it
            falls back to ``params_model['enc_lstm_dim'] * 8``, the width the
            layer was previously hard-coded to, so ``Classifier()`` behaves as
            before.

    ``forward`` returns sigmoid activations, i.e. values in (0, 1) rather than
    raw logits. Pair it with a loss that expects probabilities (for example
    ``nn.BCELoss``), not one that applies its own sigmoid.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Resolved at construction time so the class itself does not need
            # params_model to exist when an explicit width is supplied.
            input_dim = params_model['enc_lstm_dim'] * 8
        self.layer = nn.Linear(input_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, X):
        """Map features ``X`` (last dimension ``input_dim``) to probabilities."""
        return self.activation(self.layer(X))
    
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Classifier()
model.to(device)
# Classifier.forward already ends in a Sigmoid, so the model emits
# probabilities. BCELoss consumes probabilities directly; BCEWithLogitsLoss
# would apply a second sigmoid on top of them and distort both the loss and
# its gradients.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [21]:
def binary_acc(y_pred, y_test):
    """Return the accuracy of `y_pred` against `y_test` as a whole-number percentage.

    `y_pred` is rounded to the nearest integer to obtain predicted labels,
    which are compared element-wise with `y_test`. The number of matches is
    divided by the size of the first dimension of `y_test`, scaled to 0-100
    and rounded. The result is a 0-dim float tensor.
    """
    predicted_labels = y_pred.round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.shape[0]
    return torch.round(fraction_correct * 100)

In [22]:
# Number of sentence pairs per split, taken from the first dimension of the
# first-sentence embeddings.
no_trn_pairs = len(trn_embeddings1)
no_dev_pairs = len(dev_embeddings1)

In [24]:
# Convert the label values read from the TSV files to integers and place them
# on the training device.
trn_labels = torch.tensor(list(map(int, trn_labels)), device=device)
dev_labels = torch.tensor(list(map(int, dev_labels)), device=device)

# The sentence embeddings are intentionally left exactly as infersent.encode
# returned them. The feature-building cells that follow combine them with
# NumPy, and np.array() cannot read a tensor that lives on a CUDA device; the
# combined feature matrices are moved to `device` after they are built.
# (The previous, stray-indented conversion of only the training embeddings was
# an IndentationError and left the train and dev splits inconsistent.)


In [68]:
# Build the InferSent pair features (u, v, |u - v|, u * v) for every training
# pair in one vectorized step.
# Assumes both embedding matrices are 2-D (pairs, dim) with matching shapes --
# TODO confirm. Tensors are brought back to host memory first, because NumPy
# cannot read a CUDA tensor.
trn_u, trn_v = (
    emb.detach().cpu().numpy() if torch.is_tensor(emb) else np.asarray(emb)
    for emb in (trn_embeddings1, trn_embeddings2)
)
# The difference feature is the element-wise ABSOLUTE difference; the earlier
# loop stored the signed difference under a name that said "abs".
trn_u_v = np.concatenate(
    (trn_u, trn_v, np.abs(trn_u - trn_v), trn_u * trn_v),
    axis=1,
)

In [76]:
# Build the InferSent pair features (u, v, |u - v|, u * v) for every dev pair
# in one vectorized step.
# Assumes both embedding matrices are 2-D (pairs, dim) with matching shapes --
# TODO confirm. Tensors are brought back to host memory first, because NumPy
# cannot read a CUDA tensor.
dev_u, dev_v = (
    emb.detach().cpu().numpy() if torch.is_tensor(emb) else np.asarray(emb)
    for emb in (dev_embeddings1, dev_embeddings2)
)
# The difference feature is the element-wise ABSOLUTE difference; the earlier
# loop stored the signed difference under a name that said "abs".
dev_u_v = np.concatenate(
    (dev_u, dev_v, np.abs(dev_u - dev_v), dev_u * dev_v),
    axis=1,
)

In [78]:
def to_feature_tensor(rows, device):
    """Return `rows` as a float32 tensor on `device`.

    `rows` may be a list of 1-D NumPy arrays, a 2-D NumPy array, or a tensor.
    Anything that is not already a tensor is stacked into a single NumPy array
    first: handing torch a Python list of ndarrays makes it convert the rows
    one element at a time, which is very slow for large feature matrices.
    Accepting tensors keeps the cell safe to run more than once.
    """
    if not torch.is_tensor(rows):
        rows = np.asarray(rows)
    return torch.as_tensor(rows, dtype=torch.float32, device=device)


trn_u_v = to_feature_tensor(trn_u_v, device)
dev_u_v = to_feature_tensor(dev_u_v, device)

In [None]:
# Train for 100 epochs, one optimizer step per sentence pair, and evaluate on
# the dev pairs after every epoch. The weights from the epoch with the highest
# summed dev accuracy are kept in best_model_wts.
best_acc = 0.0
for e in range(1, 101):
    # ---- training pass ----
    model.train()
    epoch_loss = epoch_acc = 0.0
    for i in range(no_trn_pairs):
        optimizer.zero_grad()
        sent = trn_u_v[i]
        with torch.set_grad_enabled(True):
            y_pred = model(sent)
            # Cast the label to the prediction's dtype and give it the same
            # one-element shape before comparing.
            label = trn_labels[i].type_as(y_pred)
            target = label.unsqueeze(0)
            loss = criterion(y_pred, target)
            acc = binary_acc(y_pred, target)
            loss.backward()
            optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # ---- evaluation pass (no gradients tracked) ----
    model.eval()
    dev_epoch_loss = dev_epoch_acc = 0.0
    with torch.no_grad():
        for i in range(no_dev_pairs):
            sent = dev_u_v[i]
            y_pred = model(sent)
            label = dev_labels[i].type_as(y_pred)
            target = label.unsqueeze(0)
            loss = criterion(y_pred, target)
            acc = binary_acc(y_pred, target)

            dev_epoch_loss += loss.item()
            dev_epoch_acc += acc.item()

    # Snapshot the weights whenever the dev accuracy total improves.
    if dev_epoch_acc > best_acc:
        best_acc = dev_epoch_acc
        best_model_wts = copy.deepcopy(model.state_dict())

    # Report per-example averages for this epoch.
    print(f'Epoch {e+0:03}: | trn-Loss: {epoch_loss/no_trn_pairs:.5f} | trn-Acc: {epoch_acc/no_trn_pairs:.3f}')
    print(f'Epoch {e+0:03}: | dev-Loss: {dev_epoch_loss/no_dev_pairs:.5f} | dev-Acc: {dev_epoch_acc/no_dev_pairs:.3f}')