# CS 287 - HW 4

In [None]:
!pip install -q torch torchtext opt_einsum git+https://github.com/harvardnlp/namedtensor

In [None]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

In [None]:
# load data
# Text field for premises/hypotheses; its tensors carry one named dimension, 'seqlen'.
TEXT = NamedField(names=('seqlen',)) # Our input $x$
# Label field: declared non-sequential, with no named dimensions.
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
# SNLI train / validation / test splits, read through the two fields above.
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
# Both vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
# One iterator per split, batch size 16, tensors placed on the CUDA device.
# NOTE(review): repeat=False presumably makes each iterator stop after one pass
# over its split -- confirm against the torchtext version in use.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=16, device=torch.device("cuda"), repeat=False)

In [None]:
# build the vocabulary with word embeddings
# out-of-vocabulary words receive one of 100 random embeddings, each drawn with mean 0, stdev 1 (Sec 5.1)
# NOTE(review): the pick is made with random.choice, not with a hash of the word.
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
# Row-wise division by the L2 norm. A row of all zeros would produce NaN here;
# this assumes every row is non-zero -- TODO confirm.
vectors = vectors / vectors.norm(dim=1, keepdim=True) # normalized to have l_2 norm of 1
# Wrap as a NamedTensor with dimensions ('word', 'embedding') and store it back on the vocab.
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)

In [None]:
# here's an example of a training example
# Pull the first batch from the training iterator; `batch` is reused by the
# development cells further down.
batch = next(iter(train_iter))
print("Size of premise batch:", batch.premise.shape)
print("Size of hypothesis batch:", batch.hypothesis.shape)
print("Size of label batch:", batch.label.shape)

## Vanilla Decomposable Attention Model

In [None]:
class EmbedProject(torch.nn.Module):
    """Frozen pre-trained embedding lookup followed by a linear projection.

    Args:
        weights: pre-trained embedding matrix (input_size x embed_size).
        embed_size: width of one embedding vector.
        project_size: width of the projected output.
    """

    def __init__(self, weights, embed_size, project_size):
        super().__init__()
        # freeze=True: the embedding table is not updated by training.
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True)
        self.linear = nn.Linear(in_features=embed_size, out_features=project_size)
        # Only the projection weight is re-drawn from N(0, 0.01); the bias keeps
        # nn.Linear's own initialisation.
        nn.init.normal_(self.linear.weight, mean=0.0, std=0.01)

    def forward(self, inputs):
        """Look up the embeddings of the token ids in `inputs` and project them."""
        looked_up = self.embed(inputs)
        return self.linear(looked_up)

In [None]:
class FeedForwardF(torch.nn.Module):
    """Two-layer feed-forward network: dropout -> linear -> ReLU, applied twice.

    Args:
        input_size: width of the input features.
        hidden_size: width of the hidden layer.
        output_size: width of the output features.
        dropout: dropout probability used before each linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(p=dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        # Every weight and bias is re-drawn from N(0, 0.01), in registration order.
        for tensor in (self.linear1.weight, self.linear1.bias,
                       self.linear2.weight, self.linear2.bias):
            nn.init.normal_(tensor, mean=0.0, std=0.01)

    def forward(self, inputs):
        """Return the ReLU-activated output of the second layer."""
        dropped_inputs = self.d(inputs)
        hidden = self.m(self.linear1(dropped_inputs))
        dropped_hidden = self.d(hidden)
        return self.m(self.linear2(dropped_hidden))

In [None]:
class FeedForwardG(torch.nn.Module):
    """Two-layer feed-forward network: dropout -> linear -> ReLU, applied twice.

    Args:
        input_size: width of the input features.
        hidden_size: width of the hidden layer.
        output_size: width of the output features.
        dropout: dropout probability used before each linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(p=dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        # Every weight and bias is re-drawn from N(0, 0.01), in registration order.
        for tensor in (self.linear1.weight, self.linear1.bias,
                       self.linear2.weight, self.linear2.bias):
            nn.init.normal_(tensor, mean=0.0, std=0.01)

    def forward(self, inputs):
        """Return the ReLU-activated output of the second layer."""
        first = self.linear1(self.d(inputs))
        hidden = self.m(first)
        second = self.linear2(self.d(hidden))
        return self.m(second)

In [None]:
class FeedForwardH(torch.nn.Module):
    """Three-layer feed-forward network whose last layer is a plain linear map.

    The first two layers are dropout -> linear -> ReLU; the third layer is
    applied without dropout or activation.

    Args:
        input_size: width of the input features.
        hidden_size: width of both hidden layers.
        output_size: width of the output.
        dropout: dropout probability used before the first two linear layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(p=dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.linear3 = nn.Linear(in_features=hidden_size, out_features=output_size)
        # Every weight and bias is re-drawn from N(0, 0.01), in registration order.
        for layer in (self.linear1, self.linear2, self.linear3):
            nn.init.normal_(layer.weight, mean=0.0, std=0.01)
            nn.init.normal_(layer.bias, mean=0.0, std=0.01)

    def forward(self, inputs):
        """Return the un-activated output of the third layer."""
        activations = inputs
        for layer in (self.linear1, self.linear2):
            activations = self.m(layer(self.d(activations)))
        return self.linear3(activations)

## Global Vars

In [None]:
# dimensions
# Vocabulary size and embedding width, read from the named dimensions of the vector table.
input_size = TEXT.vocab.vectors.shape['word']
embed_size = TEXT.vocab.vectors.shape['embedding']
hidden_size1 = 200
# Twice hidden_size1: the width of two hidden_size1-wide tensors concatenated
# (the input size given to FeedForwardG and FeedForwardH below).
hidden_size2 = hidden_size1 * 2
# One output per label in the label vocabulary.
output_size = len(LABEL.vocab)
print('DIMS - input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

In [None]:
# pre-trained embeddings
# Underlying plain tensor of the NamedTensor vector table, moved to the GPU.
weights = TEXT.vocab.vectors.values.cuda()
print('DIMS - pre-trained:', weights.shape)

In [None]:
# tokens
# Vocabulary index of the padding token (a plain Python value from stoi).
pad_tkn = TEXT.vocab.stoi['<pad>']
# Vocabulary index of the word 'null', kept as a GPU tensor so it can be
# concatenated in front of the sentence tensors.
# NOTE(review): what stoi returns if 'null' is not in the vocabulary is not
# visible here -- confirm.
null_tkn = torch.tensor(TEXT.vocab.stoi['null'], device='cuda')
print('<pad>:', pad_tkn, ', null:', null_tkn)

## Development

In [None]:
# Development instances of the four sub-networks, all moved to the GPU.
# Embedding lookup + projection from embed_size down to hidden_size1.
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
EP1

In [None]:
# F network: hidden_size1 -> hidden_size1 -> hidden_size1.
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
F1

In [None]:
# G network: hidden_size2 (two concatenated hidden_size1 vectors) -> hidden_size1.
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
G1

In [None]:
# H network: hidden_size2 -> hidden_size1 -> output_size (one value per label).
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()
H1

In [None]:
# Collect the parameters of all four modules into one flat list, printing the
# running count after each module is added. The trailing comments list the
# parameters each module contributes, in order.
parameters = [param for param in EP1.parameters()] # embed, lnr, bias
print(len(parameters))
parameters.extend([param for param in F1.parameters()]) # lnr1, bias1, lnr2, bias2
print(len(parameters))
parameters.extend([param for param in G1.parameters()]) # lnr1, bias1, lnr2, bias2
print(len(parameters))
parameters.extend([param for param in H1.parameters()]) # lnr1, bias1, lnr2, bias2, lnr3, bias3
print(len(parameters))

In [None]:
# Token-id tensors of the example batch with dimensions 0 and 1 swapped, so that
# dimension 0 is the one treated as the batch dimension below
# (presumably (seqlen, batch) -> (batch, seqlen) -- TODO confirm).
raw_sent1 = batch.premise.values.transpose(0,1)#[0,:-1].unsqueeze(0)
raw_sent2 = batch.hypothesis.values.transpose(0,1)#[0,:-1].unsqueeze(0)
raw_sent1.shape, raw_sent2.shape

In [None]:
# One 'null' token id per row, prepended as an extra first column to both sentences.
null_tkns = null_tkn.repeat(raw_sent1.shape[0], 1)
sent1 = torch.cat((null_tkns, raw_sent1), 1)
sent2 = torch.cat((null_tkns, raw_sent2), 1)
sent1.shape, sent2.shape

In [None]:
# Embed and project every token of both sentences with the shared EP1 module.
proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

In [None]:
# Pass both projections through the shared F network.
f1 = F1(proj1)
f2 = F1(proj2)
f1.shape, f2.shape

In [None]:
# Batched dot products between every sent1 position and every sent2 position.
# score1 is indexed (batch, sent1 position, sent2 position); score2 is the same
# product with the roles of the two sentences swapped.
score1 = torch.bmm(f1, f2.transpose(1,2))
score2 = torch.bmm(f2, f1.transpose(1,2))
score1.shape, score2.shape

In [None]:
# True wherever a position holds the padding token.
mask1 = (sent1 == pad_tkn)
mask2 = (sent2 == pad_tkn)
mask1.shape, mask2.shape

In [None]:
# Repeat each padding mask along a new dimension 1 and convert it to float so it
# can be combined arithmetically with a score matrix: mask2a lines up with
# score1 (its last dimension runs over sent2 positions), mask1a with score2.
mask1a = mask1.unsqueeze(1).expand(-1, sent2.shape[1], -1).float()
mask2a = mask2.unsqueeze(1).expand(-1, sent1.shape[1], -1).float()
mask1a.shape, mask2a.shape

In [None]:
#score1 = score1.masked_fill_(mask2a, -1e8)
#score2 = score2.masked_fill_(mask1a, -1e8)
# Arithmetic masking: keep the score where the mask is 0 and replace it with
# -1e8 where the mask is 1, so padded columns get ~0 weight in the softmax below.
score1 = score1 * (1 - mask2a) + (mask2a * -1e8)
score2 = score2 * (1 - mask1a) + (mask1a * -1e8)
score1.shape, score2.shape

In [None]:
# MASK
# Earlier per-example loop version of the same masking, kept for reference.
#for i in range(sent1.shape[0]):
#    score1[i,:,(sent2[i,:] == pad_tkn).nonzero()] = -1e8
#    score2[i,:,(sent1[i,:] == pad_tkn).nonzero()] = -1e8

In [None]:
# Normalise each row of scores over the positions of the other sentence.
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

In [None]:
# Soft alignment: proj1_soft holds, for every sent2 position, the prob2-weighted
# average of the sent1 projections; proj2_soft is the converse.
proj1_soft = torch.bmm(prob2, proj1)
proj2_soft = torch.bmm(prob1, proj2)
proj1_soft.shape, proj2_soft.shape

In [None]:
# Pair every token projection with the aligned summary of the other sentence,
# concatenated along the feature dimension.
proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
proj1_combined.shape, proj2_combined.shape

In [None]:
# Pass each concatenated pair through the shared G network.
g1 = G1(proj1_combined)
g2 = G1(proj2_combined)
g1.shape, g2.shape

In [None]:
# Padding masks repeated across hidden_size1 features, as floats, to match g1 / g2.
mask1b = mask1.unsqueeze(2).expand(-1, -1, hidden_size1).float()
mask2b = mask2.unsqueeze(2).expand(-1, -1, hidden_size1).float()
mask1b.shape, mask2b.shape

In [None]:
#g1 = g1.masked_fill_(mask1b, 0)
#g2 = g2.masked_fill_(mask2b, 0)
# Zero the G outputs at padded positions so they do not contribute to the sums below.
g1 = g1 * (1 - mask1b)
g2 = g2 * (1 - mask2b)
g1.shape, g2.shape

In [None]:
# MASK
# Earlier per-example loop version of the same masking, kept for reference.
#for i in range(sent1.shape[0]):
#    g1[i,(sent1[i,:] == pad_tkn).nonzero(),:] = 0
#    g2[i,(sent2[i,:] == pad_tkn).nonzero(),:] = 0

In [None]:
# Sum over the position dimension: one vector per sentence.
g1_sum = g1.sum(dim=1)
g2_sum = g2.sum(dim=1)
g1_sum.shape, g2_sum.shape

In [None]:
# Concatenate the two sentence vectors along the feature dimension.
g_all = torch.cat((g1_sum, g2_sum), dim=1)
g_all.shape

In [None]:
# Pass the concatenated sentence vectors through H to get one score per label.
h_all = H1(g_all)
h_all.shape

In [None]:
# TEST
# Disabled experiment: a second, tiny EmbedProject instance.
#EP2 = EmbedProject(weights, 300, 4).cuda()

In [None]:
# TEST
# Disabled experiment: replace h_all with the summed output of EP2.
#out1 = EP2(sent1)
#h_all = torch.sum(out1, dim=1)
#h_all.shape

In [None]:
# Labels of the example batch as a plain tensor.
target = batch.label.values
#target = torch.tensor([1], device='cuda')
target.shape

In [None]:
# Cross-entropy between the H outputs and the labels of the example batch.
criterion = nn.CrossEntropyLoss()
loss = criterion(h_all, target)
loss

In [None]:
# Backpropagate the development loss and print the resulting gradients.
# retain_graph=True keeps the autograd graph alive: a later cell calls
# loss.backward() on this same loss a second time, which raises a RuntimeError
# if the graph's buffers were freed by this first call.
loss.backward(retain_graph=True)
for i, param in enumerate(parameters):
    # Entry 1 of `parameters` is not printed.
    if i != 1:
        print(param.grad)
        #param.grad.data.zero_()

In [None]:
# TEST
# Disabled: EP2 is only ever created in commented-out TEST code, so calling it
# here would raise a NameError. Re-enable together with the EP2 construction.
#EP2.zero_grad()

In [None]:
# Clear the gradients accumulated on all four modules.
EP1.zero_grad()
F1.zero_grad()
G1.zero_grad()
H1.zero_grad()

In [None]:
# Backpropagate the same `loss` again and print the gradients.
# NOTE(review): this is a second backward pass through the graph of `loss`; it
# needs that graph to still be available from the first pass -- confirm.
loss.backward()
i = 0
for param in parameters:
    # Entry 1 of `parameters` is not printed.
    if i != 1:
        print(param.grad)
    i += 1

In [None]:
# Adagrad over the collected parameter list, plus a scheduler that is driven by
# a metric to be minimised (mode="min") with a patience of 4.
optimizer = torch.optim.Adagrad(parameters, lr=0.05, initial_accumulator_value=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=4)

In [None]:
# Snapshot the first EP1 parameter (the embedding table) before the optimizer
# step, so the later `emb1 != emb2` comparison has something to compare against.
# The snapshot is a detached copy: binding the parameter object itself would make
# emb1 and emb2 the same tensor and the comparison could never report a change.
emb1 = next(EP1.parameters()).detach().clone()
# Print the last H1 parameter (index 5) as it is before the optimizer step.
for i, param in enumerate(H1.parameters()):
    if i == 5:
        print(param)

In [None]:
# One full optimisation step on the example batch.
# Put all modules in training mode and clear old gradients.
EP1.train()
F1.train()
G1.train()
H1.train()
optimizer.zero_grad()

# Unlike the step-by-step cells above, this version uses the sentences as they
# come from the batch: no null token is prepended and no padding mask is applied.
sent1 = batch.premise.values.transpose(0,1)
sent2 = batch.hypothesis.values.transpose(0,1)
target = batch.label.values

# Embed + project, then F.
proj1 = EP1(sent1)
proj2 = EP1(sent2)
f1 = F1(proj1)
f2 = F1(proj2)
# score2 is the transpose of score1 rather than a second batched product.
score1 = torch.bmm(f1, f2.transpose(1,2))
score2 = score1.transpose(1,2)
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
# Soft-align each sentence to the other and concatenate with the token projections.
proj1_soft = torch.bmm(prob2, proj1)
proj2_soft = torch.bmm(prob1, proj2)
proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
# G on every position, sum over positions, concatenate, then H.
g1 = G1(proj1_combined)
g2 = G1(proj2_combined)
g1_sum = g1.sum(dim=1)
g2_sum = g2.sum(dim=1)
g_all = torch.cat((g1_sum, g2_sum), dim=1)
h_all = H1(g_all)

# Loss, gradients, parameter update.
loss = criterion(h_all, target)
loss.backward()
optimizer.step()

In [None]:
# Grab the first EP1 parameter after the optimizer step.
for param in EP1.parameters():
    emb2 = param
    break
# Print the last H1 parameter (index 5) after the optimizer step.
i = 0
for param in H1.parameters():
    if i == 5:
        print(param)
    i += 1

In [None]:
# Number of entries in which emb1 and emb2 differ. emb1 must have been assigned
# by an earlier cell before this comparison runs.
torch.sum(emb1 != emb2)

In [None]:
# Fraction of the batch whose highest-scoring label equals the target.
acc = torch.sum(torch.argmax(h_all, dim=1) == target).item() / target.shape[0]
acc

In [None]:
loss

## Train

In [None]:
def get_output(sent1, sent2, EP1, F1, G1, H1):
    """Run the attend / compare / aggregate forward pass on a batch of sentence pairs.

    Positions whose token id equals the module-level ``pad_tkn`` are treated as
    padding: they receive a score of -1e8 before the softmax and their compare
    vectors are zeroed before the sum over positions.

    Args:
        sent1: 2-D tensor of token ids for the first sentences, indexed
            (batch, position).
        sent2: 2-D tensor of token ids for the second sentences, indexed
            (batch, position).
        EP1: module mapping token ids to projected embeddings.
        F1: module applied to the projections before the dot-product scores.
        G1: module applied to each token projection concatenated with its
            soft-aligned counterpart.
        H1: module applied to the two concatenated sentence vectors.

    Returns:
        The output of ``H1``: one row per sentence pair.
    """
    proj1 = EP1(sent1)
    proj2 = EP1(sent2)

    # Attend: dot-product scores between every pair of positions.
    f1 = F1(proj1)
    f2 = F1(proj2)
    score1 = torch.bmm(f1, f2.transpose(1, 2))   # (batch, len1, len2)
    score2 = score1.transpose(1, 2)              # (batch, len2, len1)

    pad1 = (sent1 == pad_tkn)
    pad2 = (sent2 == pad_tkn)
    # masked_fill broadcasts the (batch, 1, len) mask over the rows and overwrites
    # the padded columns outright. Unlike multiplying by (1 - mask), it cannot
    # turn an Inf/NaN score at a padded position into NaN, and it needs no
    # expanded float copies of the mask.
    score1 = score1.masked_fill(pad2.unsqueeze(1), -1e8)
    score2 = score2.masked_fill(pad1.unsqueeze(1), -1e8)

    prob1 = F.softmax(score1, dim=2)
    prob2 = F.softmax(score2, dim=2)
    # Soft alignment of each sentence against the other.
    proj1_soft = torch.bmm(prob2, proj1)         # (batch, len2, hidden)
    proj2_soft = torch.bmm(prob1, proj2)         # (batch, len1, hidden)
    proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
    proj2_combined = torch.cat((proj2, proj1_soft), dim=2)

    # Compare: zero the padded positions so they do not contribute to the sums.
    # The (batch, len, 1) mask broadcasts over whatever feature width G1
    # produces, so no global hidden size is needed here.
    g1 = G1(proj1_combined).masked_fill(pad1.unsqueeze(2), 0)
    g2 = G1(proj2_combined).masked_fill(pad2.unsqueeze(2), 0)

    # Aggregate: sum over positions, concatenate both sentences, classify.
    g_all = torch.cat((g1.sum(dim=1), g2.sum(dim=1)), dim=1)
    return H1(g_all)

In [None]:
def prepend_null(sent):
    """Return `sent` with the module-level `null_tkn` id added as a new first column.

    `sent` is a 2-D tensor; the result has one more entry along dimension 1.
    """
    batch_size = sent.shape[0]
    null_column = null_tkn.repeat(batch_size, 1)
    return torch.cat([null_column, sent], dim=1)

In [None]:
def training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer):
    """Train the four modules for one pass over `train_iter`.

    Args:
        e: epoch number, used only in the progress message.
        train_iter: iterable of batches exposing `premise`, `hypothesis` and `label`.
        EP1, F1, G1, H1: the modules forwarded to `get_output`.
        criterion: loss function called as `criterion(output, target)`.
        optimizer: optimizer stepped once per batch.

    Every 1000th batch (starting with batch 0) the loss and accuracy of that
    batch are printed.
    """
    for module in (EP1, F1, G1, H1):
        module.train()

    report_every = 1000
    for step, minibatch in enumerate(train_iter):
        # A 'null' token is prepended to both sentences of every pair.
        premise = prepend_null(minibatch.premise.values.transpose(0, 1))
        hypothesis = prepend_null(minibatch.hypothesis.values.transpose(0, 1))
        gold = minibatch.label.values

        optimizer.zero_grad()
        logits = get_output(premise, hypothesis, EP1, F1, G1, H1)
        batch_loss = criterion(logits, gold)
        batch_loss.backward()
        optimizer.step()

        if step % report_every != 0:
            continue
        n_correct = torch.sum(torch.argmax(logits, dim=1) == gold).item()
        acc = n_correct / gold.shape[0]
        print('Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'.format(e, step, batch_loss.cpu().detach(), acc))

In [None]:
def validation_loop(e, val_iter, EP1, F1, G1, H1, criterion):
    """Evaluate the four modules on `val_iter` and print loss and accuracy.

    Args:
        e: epoch number, used only in the summary message.
        val_iter: iterable of batches exposing `premise`, `hypothesis` and `label`.
        EP1, F1, G1, H1: the modules forwarded to `get_output`.
        criterion: loss function called as `criterion(output, target)`.

    Returns:
        The accumulated loss: each batch's loss value times its batch size,
        summed over all batches. This is the total, not the per-example mean
        that is printed.
    """
    for module in (EP1, F1, G1, H1):
        module.eval()

    total_loss = 0
    total_sent = 0
    total_correct = 0

    # Evaluation needs no gradients; no_grad avoids building an autograd graph
    # for every validation batch.
    with torch.no_grad():
        for batch in val_iter:
            sent1 = prepend_null(batch.premise.values.transpose(0,1))
            sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
            target = batch.label.values
            output = get_output(sent1, sent2, EP1, F1, G1, H1)

            loss = criterion(output, target).item()
            sent = sent1.shape[0]
            correct = torch.sum(torch.argmax(output, dim=1) == target).item()

            # Weight the batch loss by the batch size so batches of different
            # sizes contribute proportionally.
            total_loss += loss*sent
            total_sent += sent
            total_correct += correct

    # Guard the averages so an empty iterator reports zeros instead of raising
    # ZeroDivisionError.
    denom = max(total_sent, 1)
    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/denom, total_correct/denom))
    return total_loss

In [None]:
# Full training run: build fresh modules, train for 100 epochs, and checkpoint
# all four modules whenever the validation loss improves.
best_loss = float('inf')  # any first validation loss counts as an improvement
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()

# Only trainable parameters go to the optimizer. EP1's embedding table is frozen
# (requires_grad=False), and handing it to Adagrad would allocate a full-size
# accumulator for a tensor that is never updated.
parameters = [
    param
    for module in (EP1, F1, G1, H1)
    for param in module.parameters()
    if param.requires_grad
]

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(parameters, lr=0.05, initial_accumulator_value=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=4)

for e in range(100):
    training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer)
    # validation_loop returns the summed validation loss; it drives both the
    # learning-rate scheduler and the checkpointing decision.
    loss = validation_loop(e, val_iter, EP1, F1, G1, H1, criterion)
    scheduler.step(loss)
    print('LR = {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))
    if loss < best_loss:
        torch.save(EP1.state_dict(),'best_EP1_v02.pt')
        torch.save(F1.state_dict(),'best_F1_v02.pt')
        torch.save(G1.state_dict(),'best_G1_v02.pt')
        torch.save(H1.state_dict(),'best_H1_v02.pt')
        best_loss = loss
        print('WROTE MODEL')

In [None]:
#EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
#state_dict = torch.load('best_EP1.pt')
#EP1.load_state_dict(state_dict)