# CS 287 - HW 4 - Latent

In [1]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

from common import *
%reload_ext autoreload

In [2]:
# Load SNLI through torchtext.
# TEXT holds the token sequences (input $x$, named axis 'seqlen');
# LABEL holds one non-sequential class per example (label $y$).
TEXT = NamedField(names=('seqlen',))
LABEL = NamedField(sequential=False, names=())

train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))

# Mini-batches of 32 examples, created directly on the GPU; repeat=False makes
# each iterator stop after a single pass over its split.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test),
    batch_size=32,
    device=torch.device("cuda"),
    repeat=False,
)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [3]:
# Attach pretrained GloVe word embeddings to the vocabulary.
# Words without a GloVe entry each receive one of 100 fixed random vectors
# (entries drawn from a standard normal), chosen uniformly at random (cf. Sec 5.1).
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
# Scale every row to unit l_2 norm. F.normalize divides by max(norm, eps), so an
# all-zero row stays zero instead of becoming NaN through a 0/0 division.
vectors = F.normalize(vectors, p=2, dim=1)
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)
# Plain (unnamed) tensor copy on the GPU, handed to the networks below.
weights = TEXT.vocab.vectors.values.cuda()

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


In [4]:
# Peek at a single training batch and report the named shape of each field.
batch = next(iter(train_iter))
for caption, field_batch in (
    ("Size of premise batch:", batch.premise),
    ("Size of hypothesis batch:", batch.hypothesis),
    ("Size of label batch:", batch.label),
):
    print(caption, field_batch.shape)

Size of premise batch: OrderedDict([('seqlen', 38), ('batch', 32)])
Size of hypothesis batch: OrderedDict([('seqlen', 19), ('batch', 32)])
Size of label batch: OrderedDict([('batch', 32)])


In [5]:
# Model dimensions, read off the embedding table and the label vocabulary.
embedding_shape = TEXT.vocab.vectors.shape
input_size = embedding_shape['word']        # number of rows in the embedding table
embed_size = embedding_shape['embedding']   # width of each embedding vector
hidden_size1 = 200
output_size = len(LABEL.vocab)              # number of entries in the label vocabulary
print('DIMS - input: %d, embed: %d, hidden1: %d, output: %d'%(input_size, embed_size, hidden_size1, output_size))

DIMS - input: 62998, embed: 300, hidden1: 200, output: 4


## Latent Variable Mixture Model

In [6]:
def training_loop(e, train_iter, networks, criterion, optimizer):
    """Run one pass over `train_iter`, training the uniform mixture of `networks`.

    For every batch each network scores the (premise, hypothesis) pair. The
    mixture log-probability of a class is
    ``logsumexp_c(log_softmax(network_c(sent1, sent2))) + log(1/K)``, i.e. the
    K components are averaged in probability space. `criterion` is evaluated on
    that log-probability and `optimizer` takes one step per batch.

    Args:
        e: epoch index; only used in the progress message.
        train_iter: iterable of batches exposing `.premise.values`,
            `.hypothesis.values` and `.label.values`.
        networks: non-empty sequence of modules called as `network(sent1, sent2)`.
        criterion: loss called as `criterion(log_probs, target)`.
        optimizer: optimizer whose `zero_grad()` / `step()` run once per batch.

    Raises:
        ValueError: if `networks` is empty.
    """
    K = len(networks)
    if K == 0:
        raise ValueError('networks must contain at least one model')
    for network in networks:
        network.train()

    # log(1/K): the mixture prior is uniform and loop-invariant, so build it once.
    log_prior = torch.log(torch.tensor(1/K))

    for ix, batch in enumerate(train_iter):
        optimizer.zero_grad()
        # Swap the first two axes before prepending the null token
        # (assumes the iterator yields seqlen x batch -- TODO confirm).
        sent1 = prepend_null(batch.premise.values.transpose(0,1))
        sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
        target = batch.label.values

        # K x BATCH x OUTPUT_SIZE. Stacking keeps the result on whatever device
        # and with whatever width the networks produce, so no hard-coded 'cuda'
        # buffer or global output size is needed.
        component_log_probs = torch.stack(
            [F.log_softmax(network(sent1, sent2), dim=1) for network in networks])
        output = torch.logsumexp(component_log_probs, dim=0) + log_prior
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if ix % 1000 == 0:
            predictions = torch.argmax(output.detach(), dim=1)
            acc = torch.sum(predictions == target).item() / target.shape[0]
            print('Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'.format(e, ix, loss.item(), acc))

In [7]:
def validation_loop(e, val_iter, networks, criterion):
    """Evaluate the uniform mixture of `networks` on `val_iter`.

    Every network is switched to eval mode and the whole pass runs under
    `torch.no_grad()`, so no autograd graph is built or kept during evaluation.
    The mixture log-probability is
    ``logsumexp_c(log_softmax(network_c(sent1, sent2))) + log(1/K)``.

    Prints the per-sentence NLL and accuracy for epoch `e`.

    Args:
        e: epoch index; only used in the summary message.
        val_iter: iterable of batches exposing `.premise.values`,
            `.hypothesis.values` and `.label.values`.
        networks: non-empty sequence of modules called as `network(sent1, sent2)`.
        criterion: loss called as `criterion(log_probs, target)`.

    Returns:
        The accumulated loss: the sum over batches of
        `criterion(...).item() * batch_size` (not divided by the number of
        sentences). It is 0 when `val_iter` yields nothing.

    Raises:
        ValueError: if `networks` is empty.
    """
    K = len(networks)
    if K == 0:
        raise ValueError('networks must contain at least one model')
    for network in networks:
        network.eval()

    # log(1/K): uniform mixture prior, identical for every batch.
    log_prior = torch.log(torch.tensor(1/K))

    total_loss = 0
    total_sent = 0
    total_correct = 0

    with torch.no_grad():
        for batch in val_iter:
            sent1 = prepend_null(batch.premise.values.transpose(0,1))
            sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
            target = batch.label.values

            # K x BATCH x OUTPUT_SIZE, on whatever device the networks use.
            component_log_probs = torch.stack(
                [F.log_softmax(network(sent1, sent2), dim=1) for network in networks])
            output = torch.logsumexp(component_log_probs, dim=0) + log_prior

            loss = criterion(output, target).item()
            sent = sent1.shape[0]
            correct = torch.sum(torch.argmax(output, dim=1) == target).item()

            # Weight the batch loss by its size so the running total can be
            # turned into a per-sentence figure below.
            total_loss += loss*sent
            total_sent += sent
            total_correct += correct

    # Avoid a ZeroDivisionError when the iterator produced no batches.
    denominator = max(total_sent, 1)
    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/denominator, total_correct/denominator))
    return total_loss

In [8]:
# The three mixture components, all built with the same constructor arguments
# and moved to the GPU. The individual names are kept for the cells that use them.
networks = [
    Decomposable_Attn_Network(input_size, embed_size, hidden_size1, output_size, weights).cuda()
    for _ in range(3)
]
FFA_net1, FFA_net2, FFA_net3 = networks

In [9]:
# Flatten the parameters of every mixture component into one list so a single
# optimizer updates all of them. Deriving the list from `networks` (instead of
# naming each model again) keeps it in sync when components are added or removed.
parameters = [param for network in networks for param in network.parameters()]

In [10]:
# Loss, optimizer and learning-rate schedule shared by all mixture components.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adagrad(
    parameters,
    lr=0.05,
    initial_accumulator_value=0.1,
)
# mode="min": the value passed to scheduler.step() is expected to decrease.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    patience=4,
)

In [None]:
# Train for 100 epochs. After each epoch the validation loss drives the LR
# scheduler, and every component is checkpointed whenever that loss improves.
best_loss = 1e8
K = len(networks)

for e in range(100):
    training_loop(e, train_iter, networks, criterion, optimizer)
    loss = validation_loop(e, val_iter, networks, criterion)
    scheduler.step(loss)
    print('LR = {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))

    # Nothing to save unless this epoch beat the best validation loss so far.
    if not loss < best_loss:
        continue

    # One checkpoint file per component: best_FFA_net0.pt, best_FFA_net1.pt, ...
    for c, network in enumerate(networks):
        torch.save(network.state_dict(), 'best_FFA_net' + str(c) + '.pt')
    best_loss = loss
    print('WROTE MODEL')

Epoch: 0, Batch: 0, Train NLL: 1.3822, Train Acc:0.2812
Epoch: 0, Batch: 1000, Train NLL: 1.1061, Train Acc:0.2812
Epoch: 0, Batch: 2000, Train NLL: 1.1057, Train Acc:0.3438
Epoch: 0, Batch: 3000, Train NLL: 1.0938, Train Acc:0.4688
Epoch: 0, Batch: 4000, Train NLL: 1.0358, Train Acc:0.3750
Epoch: 0, Batch: 5000, Train NLL: 1.0422, Train Acc:0.5000
Epoch: 0, Batch: 6000, Train NLL: 1.1214, Train Acc:0.3125
Epoch: 0, Batch: 7000, Train NLL: 1.0393, Train Acc:0.5000
Epoch: 0, Batch: 8000, Train NLL: 1.1032, Train Acc:0.3750
Epoch: 0, Batch: 9000, Train NLL: 1.0455, Train Acc:0.5625
Epoch: 0, Batch: 10000, Train NLL: 1.0915, Train Acc:0.5000
Epoch: 0, Batch: 11000, Train NLL: 1.0621, Train Acc:0.5000
Epoch: 0, Batch: 12000, Train NLL: 0.8859, Train Acc:0.6875
Epoch: 0, Batch: 13000, Train NLL: 0.9469, Train Acc:0.5625
Epoch: 0, Batch: 14000, Train NLL: 0.9406, Train Acc:0.5938
Epoch: 0, Batch: 15000, Train NLL: 1.0251, Train Acc:0.5000
Epoch: 0, Batch: 16000, Train NLL: 1.0496, Train Acc:

Epoch: 7, Batch: 5000, Train NLL: 0.6991, Train Acc:0.7500
Epoch: 7, Batch: 6000, Train NLL: 0.7824, Train Acc:0.6562
Epoch: 7, Batch: 7000, Train NLL: 0.6791, Train Acc:0.7500
Epoch: 7, Batch: 8000, Train NLL: 0.7781, Train Acc:0.5938
Epoch: 7, Batch: 9000, Train NLL: 0.8550, Train Acc:0.6250
Epoch: 7, Batch: 10000, Train NLL: 0.8553, Train Acc:0.6250
Epoch: 7, Batch: 11000, Train NLL: 0.8018, Train Acc:0.6562
Epoch: 7, Batch: 12000, Train NLL: 0.6868, Train Acc:0.7188
Epoch: 7, Batch: 13000, Train NLL: 0.6601, Train Acc:0.7500
Epoch: 7, Batch: 14000, Train NLL: 0.7282, Train Acc:0.7188
Epoch: 7, Batch: 15000, Train NLL: 0.6236, Train Acc:0.8125
Epoch: 7, Batch: 16000, Train NLL: 0.7829, Train Acc:0.6875
Epoch: 7, Batch: 17000, Train NLL: 0.8425, Train Acc:0.5938
Epoch: 7, Val NLL: 0.7781, Val Acc: 0.6607
LR = 0.05
WROTE MODEL
Epoch: 8, Batch: 0, Train NLL: 0.6710, Train Acc:0.7812
Epoch: 8, Batch: 1000, Train NLL: 0.7644, Train Acc:0.6562
Epoch: 8, Batch: 2000, Train NLL: 0.7023, Tra

Epoch: 14, Batch: 9000, Train NLL: 0.7441, Train Acc:0.6562
Epoch: 14, Batch: 10000, Train NLL: 0.7047, Train Acc:0.7188
Epoch: 14, Batch: 11000, Train NLL: 0.7579, Train Acc:0.6250
Epoch: 14, Batch: 12000, Train NLL: 0.9016, Train Acc:0.6562
Epoch: 14, Batch: 13000, Train NLL: 0.6455, Train Acc:0.6562
Epoch: 14, Batch: 14000, Train NLL: 0.5783, Train Acc:0.8125
Epoch: 14, Batch: 15000, Train NLL: 0.7429, Train Acc:0.6250
Epoch: 14, Batch: 16000, Train NLL: 0.7772, Train Acc:0.6250
Epoch: 14, Batch: 17000, Train NLL: 0.4943, Train Acc:0.8750
Epoch: 14, Val NLL: 0.6915, Val Acc: 0.7112
LR = 0.05
WROTE MODEL
Epoch: 15, Batch: 0, Train NLL: 0.6751, Train Acc:0.7188
Epoch: 15, Batch: 1000, Train NLL: 0.7078, Train Acc:0.7500
Epoch: 15, Batch: 2000, Train NLL: 0.8742, Train Acc:0.5938
Epoch: 15, Batch: 3000, Train NLL: 0.6082, Train Acc:0.8125
Epoch: 15, Batch: 4000, Train NLL: 0.7054, Train Acc:0.6562
