In [1]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
from common import *

%reload_ext autoreload

In [2]:
# load data
TEXT = NamedField(names=('seqlen',)) # Our input $x$
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=128, device=torch.device("cuda"), repeat=False)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [3]:
# build the vocabulary with word embeddings
# out-of-vocabulary words are hashed to one of 100 random embeddings each initialized to mean 0, stdev 1 (Sec 5.1)
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
vectors = vectors / vectors.norm(dim=1, keepdim=True) # normalized to have l_2 norm of 1
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)
weights = TEXT.vocab.vectors.values.cuda()

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


In [4]:
# dimensions
input_size = TEXT.vocab.vectors.shape['word']
embed_size = TEXT.vocab.vectors.shape['embedding']
hidden_size1 = 200
hidden_size2 = hidden_size1 * 2
output_size = len(LABEL.vocab)
print('DIMS - input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

DIMS - input: 62998, embed: 300, hidden1: 200, hidden2: 400, output: 4


In [5]:
def training_loop(e, train_iter, network, criterion, optimizer):
    network.train()
    for ix,batch in enumerate(train_iter):
        optimizer.zero_grad()
        sent1 = prepend_null(batch.premise.values.transpose(0,1))
        sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
        target = batch.label.values
        output = network(sent1, sent2)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        if ix % 1000 == 0:
            acc = torch.sum(torch.argmax(output, dim=1) == target).item() / target.shape[0]
            print('Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'.format(e, ix, loss.cpu().detach(), acc))

In [6]:
def validation_loop(e, val_iter, network, criterion):
    network.train()
    total_loss = 0
    total_sent = 0
    total_correct = 0
    
    for ix,batch in enumerate(val_iter):
        sent1 = prepend_null(batch.premise.values.transpose(0,1))
        sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
        target = batch.label.values
        output = network(sent1, sent2)
        
        loss = criterion(output, target).item()
        sent = sent1.shape[0]
        correct = torch.sum(torch.argmax(output, dim=1) == target).item()
        
        total_loss += loss*sent
        total_sent += sent
        total_correct += correct
    
    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/total_sent, total_correct/total_sent))
    return total_loss

In [7]:
FFA_net = Decomposable_Attn_Network(input_size,embed_size,hidden_size1,output_size,weights)
FFA_net.cuda()

Decomposable_Attn_Network(
  (Embedding_layer): EmbedProject(
    (embed): Embedding(62998, 300)
    (linear): Linear(in_features=300, out_features=200, bias=True)
  )
  (F): FeedForward_layer(
    (d): Dropout(p=0.2)
    (m): ReLU()
    (linear1): Linear(in_features=200, out_features=200, bias=True)
    (linear2): Linear(in_features=200, out_features=200, bias=True)
  )
  (G): FeedForward_layer(
    (d): Dropout(p=0.2)
    (m): ReLU()
    (linear1): Linear(in_features=400, out_features=200, bias=True)
    (linear2): Linear(in_features=200, out_features=200, bias=True)
  )
  (H): FeedForward_layer(
    (d): Dropout(p=0.2)
    (m): ReLU()
    (linear1): Linear(in_features=400, out_features=200, bias=True)
    (linear2): Linear(in_features=200, out_features=200, bias=True)
  )
  (linear): Linear(in_features=200, out_features=4, bias=True)
)

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(FFA_net.parameters(), lr=0.05, initial_accumulator_value=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=4)

In [None]:
best_loss = 1e8

In [11]:
for e in range(50):
    training_loop(e, train_iter, FFA_net, criterion, optimizer)
    loss = validation_loop(e, val_iter,FFA_net, criterion)
    scheduler.step(loss)
    print('LR = {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))
    if loss < best_loss:
        torch.save(FFA_net.state_dict(),'best_FFA_net.pt')
        best_loss = loss
        print('WROTE MODEL')

Epoch: 0, Batch: 0, Train NLL: 0.6408, Train Acc:0.7422
Epoch: 0, Batch: 1000, Train NLL: 0.6840, Train Acc:0.7500
Epoch: 0, Batch: 2000, Train NLL: 0.5563, Train Acc:0.8203
Epoch: 0, Batch: 3000, Train NLL: 0.6455, Train Acc:0.7109
Epoch: 0, Batch: 4000, Train NLL: 0.5970, Train Acc:0.7344
Epoch: 0, Val NLL: 0.6281, Val Acc: 0.7360
LR = 0.0005000000000000001
Epoch: 1, Batch: 0, Train NLL: 0.5822, Train Acc:0.7734
Epoch: 1, Batch: 1000, Train NLL: 0.5847, Train Acc:0.7344
Epoch: 1, Batch: 2000, Train NLL: 0.5953, Train Acc:0.7734
Epoch: 1, Batch: 3000, Train NLL: 0.6449, Train Acc:0.7422
Epoch: 1, Batch: 4000, Train NLL: 0.6546, Train Acc:0.7031
Epoch: 1, Val NLL: 0.6229, Val Acc: 0.7382
LR = 0.0005000000000000001
WROTE MODEL
Epoch: 2, Batch: 0, Train NLL: 0.6175, Train Acc:0.7109
Epoch: 2, Batch: 1000, Train NLL: 0.7402, Train Acc:0.6406
Epoch: 2, Batch: 2000, Train NLL: 0.7072, Train Acc:0.7266
Epoch: 2, Batch: 3000, Train NLL: 0.5294, Train Acc:0.8125
Epoch: 2, Batch: 4000, Train NL

Epoch: 22, Batch: 3000, Train NLL: 0.6336, Train Acc:0.7344
Epoch: 22, Batch: 4000, Train NLL: 0.4242, Train Acc:0.8281
Epoch: 22, Val NLL: 0.6302, Val Acc: 0.7371
LR = 5.000000000000002e-07
Epoch: 23, Batch: 0, Train NLL: 0.6479, Train Acc:0.6953
Epoch: 23, Batch: 1000, Train NLL: 0.5799, Train Acc:0.7578
Epoch: 23, Batch: 2000, Train NLL: 0.6442, Train Acc:0.7500
Epoch: 23, Batch: 3000, Train NLL: 0.7112, Train Acc:0.7031
Epoch: 23, Batch: 4000, Train NLL: 0.6425, Train Acc:0.7031
Epoch: 23, Val NLL: 0.6275, Val Acc: 0.7396
LR = 5.0000000000000024e-08
Epoch: 24, Batch: 0, Train NLL: 0.6747, Train Acc:0.6484
Epoch: 24, Batch: 1000, Train NLL: 0.6049, Train Acc:0.7578
Epoch: 24, Batch: 2000, Train NLL: 0.6387, Train Acc:0.7266
Epoch: 24, Batch: 3000, Train NLL: 0.6466, Train Acc:0.7578
Epoch: 24, Batch: 4000, Train NLL: 0.6205, Train Acc:0.7031
Epoch: 24, Val NLL: 0.6269, Val Acc: 0.7359
LR = 5.0000000000000024e-08
Epoch: 25, Batch: 0, Train NLL: 0.6278, Train Acc:0.7656
Epoch: 25, Bat

Epoch: 44, Val NLL: 0.6296, Val Acc: 0.7415
LR = 5.0000000000000026e-09
Epoch: 45, Batch: 0, Train NLL: 0.4848, Train Acc:0.7891
Epoch: 45, Batch: 1000, Train NLL: 0.5683, Train Acc:0.7578
Epoch: 45, Batch: 2000, Train NLL: 0.6035, Train Acc:0.7344
Epoch: 45, Batch: 3000, Train NLL: 0.5892, Train Acc:0.7891
Epoch: 45, Batch: 4000, Train NLL: 0.7232, Train Acc:0.6953
Epoch: 45, Val NLL: 0.6269, Val Acc: 0.7374
LR = 5.0000000000000026e-09
Epoch: 46, Batch: 0, Train NLL: 0.6350, Train Acc:0.7188
Epoch: 46, Batch: 1000, Train NLL: 0.5315, Train Acc:0.7812
Epoch: 46, Batch: 2000, Train NLL: 0.6376, Train Acc:0.7109
Epoch: 46, Batch: 3000, Train NLL: 0.6119, Train Acc:0.7500
Epoch: 46, Batch: 4000, Train NLL: 0.6238, Train Acc:0.7422
Epoch: 46, Val NLL: 0.6237, Val Acc: 0.7451
LR = 5.0000000000000026e-09
Epoch: 47, Batch: 0, Train NLL: 0.6050, Train Acc:0.7031
Epoch: 47, Batch: 1000, Train NLL: 0.6534, Train Acc:0.7109
Epoch: 47, Batch: 2000, Train NLL: 0.5962, Train Acc:0.7500
Epoch: 47, Ba