# CS 287 - HW 4 - Cont.

In [1]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

In [2]:
# load data
# Both fields are namedtensor fields: TEXT carries one named axis ('seqlen'),
# LABEL is non-sequential and carries no named axes.
TEXT = NamedField(names=('seqlen',)) # Our input $x$
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
# Vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
# Batches of 16 examples are placed directly on the GPU; repeat=False is
# presumably what makes each iterator stop after one pass — confirm against torchtext.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=16, device=torch.device("cuda"), repeat=False)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [3]:
# build the vocabulary with word embeddings
# out-of-vocabulary words each get one of 100 random embeddings drawn from N(0, 1) (Sec 5.1);
# the pick is made with random.choice, not a hash of the word
# NOTE(review): presumably unk_init is invoked once per token missing from GloVe — confirm against torchtext.
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
vectors = vectors / vectors.norm(dim=1, keepdim=True) # normalized to have l_2 norm of 1
# Wrap as a NamedTensor with axes ('word', 'embedding') and store it back on the vocab.
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


## Decomposable Intra-Sentence Attention Model

In [13]:
# Show the label strings behind the first four label-vocabulary indices.
[LABEL.vocab.itos[i] for i in [0,1,2,3]]

['<unk>', 'entailment', 'contradiction', 'neutral']

In [18]:
# Show the tokens behind the first four text-vocabulary indices.
[TEXT.vocab.itos[i] for i in [0,1,2,3]]

['<unk>', '<pad>', 'a', 'A']

In [15]:
# checking to make sure no <unk> labels
# Index 0 of the label vocabulary is '<unk>'; count how many training labels equal it.
total = 0
for batch in iter(train_iter):
    total += torch.sum(batch.label.values == 0)
# Bare expression so the notebook displays the final count.
total

tensor(0, device='cuda:0')

In [17]:
# no prepend with NULL!
# Print the tokens of the first premise and first hypothesis (column 0) of the
# first training batch to confirm that no NULL token is prepended to the sentences.
for batch in iter(train_iter):
    print([TEXT.vocab.itos[i] for i in batch.premise.values[:,0]])
    print([TEXT.vocab.itos[i] for i in batch.hypothesis.values[:,0]])
    break

['A', 'large', 'building', 'provides', 'the', 'backdrop', 'for', 'a', 'man', 'wearing', 'shorts', 'who', 'is', 'looking', 'at', 'his', 'phone', 'as', 'well', 'as', 'a', 'woman', 'who', 'is', 'wearing', 'a', 'black', 'skirt', 'and', 'white', 'shirt', 'as', 'she', 'walks', 'toward', 'the', 'man.']
['The', 'woman', 'is', 'looking', 'at', 'the', 'man.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [24]:
# Look up the vocabulary index of the literal token 'null'.
TEXT.vocab.stoi['null']

56690

In [4]:
# dimensions
input_size = TEXT.vocab.vectors.shape['word']  # vocabulary size
embed_size = TEXT.vocab.vectors.shape['embedding']  # width of the pre-trained vectors
hidden_size1 = 200
# Twice hidden_size1: the width of two hidden_size1 vectors concatenated.
hidden_size2 = hidden_size1 * 2
# Counts every entry of the label vocabulary, including '<unk>'.
output_size = len(LABEL.vocab)
print('DIMENSIONS -- input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

DIMENSIONS -- input: 62998, embed: 300, hidden1: 200, hidden2: 400, output: 4


In [5]:
# pre-trained embeddings
# Take the plain tensor out of the NamedTensor and move it to the GPU.
weights = TEXT.vocab.vectors.values.cuda()
# Bare expression so the notebook displays the shape.
weights.shape

torch.Size([62998, 300])

In [6]:
class EmbedProject(nn.Module):
    """Look tokens up in a frozen pre-trained embedding table, then project them.

    Args:
        weights: 2-D tensor of pre-trained embeddings (input_size x embed_size).
        embed_size: width of each pre-trained embedding.
        project_size: width of the projected representation.
    """

    def __init__(self, weights, embed_size, project_size):
        super().__init__()
        # The pre-trained table is kept fixed; only the projection is trained.
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True)
        self.linear = nn.Linear(embed_size, project_size)
        # Only the projection weight is re-drawn from N(0, 0.01); the bias
        # keeps nn.Linear's default initialisation.
        nn.init.normal_(self.linear.weight, mean=0, std=0.01)

    def forward(self, inputs):
        """Map a tensor of token indices to projected embeddings (one extra trailing axis)."""
        return self.linear(self.embed(inputs))

In [30]:
# Instantiate the embed-and-project module on the GPU and display its layers.
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
EP1

EmbedProject(
  (embed): Embedding(62998, 300)
  (linear): Linear(in_features=300, out_features=200, bias=True)
)

In [31]:
# Reuses `batch`, left over from the inspection loop in an earlier cell.
# Swap the two axes so the batch axis comes first (see the displayed shapes).
sent1 = batch.premise.values.transpose(0,1)
sent2 = batch.hypothesis.values.transpose(0,1)
sent1.shape, sent2.shape

(torch.Size([16, 37]), torch.Size([16, 13]))

In [32]:
# Embed and project both sentences with the same module (shared weights).
proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

(torch.Size([16, 37, 200]), torch.Size([16, 13, 200]))

In [7]:
class FeedForwardF(nn.Module):
    """Two-layer feed-forward net F: (dropout -> linear -> ReLU) applied twice.

    Args:
        input_size: width of the incoming features.
        hidden_size: width of the hidden layer.
        output_size: width of the returned features.
        dropout: dropout probability used before each linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Every parameter, biases included, is re-drawn from N(0, 0.01).
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Return the ReLU-activated output of the second layer."""
        x = self.d(inputs)
        x = self.m(self.linear1(x))
        x = self.d(x)
        return self.m(self.linear2(x))

In [34]:
# Instantiate the attention feed-forward net F on the GPU and display its layers.
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
F1

FeedForwardF(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=200, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [35]:
# Run both projected sentences through the same F network.
f1 = F1(proj1)
f2 = F1(proj2)
f1.shape, f2.shape

(torch.Size([16, 37, 200]), torch.Size([16, 13, 200]))

In [36]:
# intra-sentence attention!
score1 = torch.bmm(f1, f1.transpose(1,2))
score2 = torch.bmm(f2, f2.transpose(1,2))
score1.shape, score2.shape

(torch.Size([16, 37, 37]), torch.Size([16, 13, 13]))

In [37]:
# Normalise each row of scores into attention weights (softmax over the last axis).
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

(torch.Size([16, 37, 37]), torch.Size([16, 13, 13]))

In [38]:
# intra-sentence attention!
# Each token's self-attended summary: the attention-weighted sum of the
# projected tokens of its own sentence.
proj1_soft = torch.bmm(prob1, proj1)
proj2_soft = torch.bmm(prob2, proj2)
proj1_soft.shape, proj2_soft.shape

(torch.Size([16, 37, 200]), torch.Size([16, 13, 200]))

In [39]:
# intra-sentence attention!
# Concatenate each token's projection with its self-attended summary along the
# feature axis, doubling the feature width.
proj1_combined = torch.cat((proj1, proj1_soft), dim=2)
proj2_combined = torch.cat((proj2, proj2_soft), dim=2)
proj1_combined.shape, proj2_combined.shape

(torch.Size([16, 37, 400]), torch.Size([16, 13, 400]))

In [8]:
class FeedForwardG(nn.Module):
    """Two-layer feed-forward net G with dropout before, and ReLU after, each layer.

    Args:
        input_size: width of the incoming features.
        hidden_size: width of the hidden layer.
        output_size: width of the returned features.
        dropout: dropout probability used before each linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Weights and biases alike are initialised from N(0, 0.01).
        for weight_or_bias in self.parameters():
            nn.init.normal_(weight_or_bias, mean=0, std=0.01)

    def forward(self, inputs):
        """Return the ReLU-activated output of the second layer."""
        first = self.linear1(self.d(inputs))
        second = self.linear2(self.d(self.m(first)))
        return self.m(second)

In [9]:
class FeedForwardH(nn.Module):
    """Three-layer classifier H: two dropout/linear/ReLU stages, then a linear output.

    Args:
        input_size: width of the incoming features.
        hidden_size: width of both hidden layers.
        output_size: number of output scores.
        dropout: dropout probability used before the first two linear layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        # Weights and biases alike are initialised from N(0, 0.01).
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Return unnormalised scores; the last layer has no dropout and no activation."""
        x = inputs
        for layer in (self.linear1, self.linear2):
            x = self.m(layer(self.d(x)))
        return self.linear3(x)

In [10]:
class EmbedDist(nn.Module):
    """Learned embedding table indexed by an integer distance bucket.

    Args:
        num_embeddings: number of distance buckets (rows of the table).
        embedding_dim: width of the value stored per bucket.
    """

    def __init__(self, num_embeddings, embedding_dim): # num = 11, dim = 1
        super().__init__()
        table = nn.Embedding(num_embeddings, embedding_dim)
        # Start from small values drawn from N(0, 0.01).
        nn.init.normal_(table.weight, mean=0, std=0.01)
        self.embed = table

    def forward(self, inputs):
        """Return the table rows selected by the integer index tensor ``inputs``."""
        return self.embed(inputs)

In [None]:
# Scratch check of the distance-bias lookup used for intra-sentence attention.
dist = 10
seqlen = score1.shape[2]
# Build the indices on the GPU: ED1 is moved to CUDA below, and an embedding
# lookup needs its indices on the same device as its weight.
steps = torch.arange(0, seqlen, device=torch.device("cuda"))
# idx[i, j] = min(|i - j|, dist): the distance between token positions i and j,
# with every distance of `dist` or more sharing the last bucket.
# (The earlier flip-based version measured distance from the anti-diagonal.)
idx = torch.abs(steps.view(-1, 1) - steps.view(1, -1)).clamp(max=dist)

ED1 = EmbedDist(dist+1, 1).cuda()
# One scalar bias per (i, j) pair; squeeze drops the size-1 embedding axis.
ED1(idx).squeeze().shape

## Train

In [12]:
def get_dist_bias(seqlen, dist, ED1):
    """Return the (seqlen, seqlen) distance-sensitive bias for intra-sentence attention.

    Entry (i, j) is the scalar that ``ED1`` stores for the bucketed token
    distance ``min(|i - j|, dist)``, so every distance of ``dist`` or more
    shares one bias term.

    Unlike the previous version, this uses the ``ED1`` module that is passed in
    (instead of building a fresh, randomly initialised one on every call), so
    the bias is actually learned, and it measures distance between positions i
    and j rather than distance from the anti-diagonal.

    Args:
        seqlen: number of token positions in the sentence.
        dist: largest distance that gets its own bucket; ``ED1`` must hold at
            least ``dist + 1`` embeddings of width 1.
        ED1: the distance-embedding module being trained (e.g. an ``EmbedDist``).

    Returns:
        Float tensor of shape (seqlen, seqlen) on the same device as ``ED1``'s
        parameters, connected to the autograd graph so ``ED1`` gets gradients.

    Raises:
        ValueError: if ``ED1`` is None.
    """
    if ED1 is None:
        raise ValueError("get_dist_bias requires a distance-embedding module (ED1)")
    # Build the indices where the embedding lives so the lookup works on CPU or GPU.
    device = next(ED1.parameters()).device
    steps = torch.arange(seqlen, device=device)
    # Column vector minus row vector broadcasts to the full |i - j| matrix.
    idx = (steps.view(-1, 1) - steps.view(1, -1)).abs().clamp(max=dist)
    # Drop only the trailing width-1 embedding axis (keeps 2-D even if seqlen == 1).
    return ED1(idx).squeeze(-1)

In [13]:
def get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, ED1):
    """Run a sentence pair through the attend / compare / aggregate pipeline.

    Args:
        sent1, sent2: batch-first integer tensors of token indices.
        EP1: embed-and-project module, shared by both sentences.
        F1: network applied to the projections before attention scoring.
        G1: compare network applied to each token's concatenated features.
        H1: classifier applied to the two summed comparison vectors.
        intra: if true, each sentence attends over its own tokens (with a
            distance bias added to the scores); otherwise each sentence
            attends over the other one.
        dist, ED1: forwarded to ``get_dist_bias``; only used when ``intra``.

    Returns:
        Whatever ``H1`` returns for the concatenated sentence summaries
        (one row of scores per example).

    Note:
        With ``intra`` set, the two sentences are processed independently
        until their summaries are concatenated for ``H1``.
    """
    emb_a = EP1(sent1)
    emb_b = EP1(sent2)
    att_a = F1(emb_a)
    att_b = F1(emb_b)

    if not intra:
        # Cross-sentence attention: scores[b, i, j] pairs token i of sentence 1
        # with token j of sentence 2.
        scores = torch.bmm(att_a, att_b.transpose(1, 2))
        weights_over_b = F.softmax(scores, dim=2)
        weights_over_a = F.softmax(scores.transpose(1, 2), dim=2)
        # Summary of sentence 1 for every token of sentence 2, and vice versa.
        ctx_for_b = torch.bmm(weights_over_a, emb_a)
        ctx_for_a = torch.bmm(weights_over_b, emb_b)
        merged_a = torch.cat((emb_a, ctx_for_a), dim=2)
        merged_b = torch.cat((emb_b, ctx_for_b), dim=2)
    else:
        # Self-attention: each sentence is scored against itself.
        self_a = torch.bmm(att_a, att_a.transpose(1, 2))
        self_b = torch.bmm(att_b, att_b.transpose(1, 2))
        self_a = self_a + get_dist_bias(self_a.shape[2], dist, ED1)
        self_b = self_b + get_dist_bias(self_b.shape[2], dist, ED1)
        ctx_a = torch.bmm(F.softmax(self_a, dim=2), emb_a)
        ctx_b = torch.bmm(F.softmax(self_b, dim=2), emb_b)
        merged_a = torch.cat((emb_a, ctx_a), dim=2)
        merged_b = torch.cat((emb_b, ctx_b), dim=2)

    # Compare every token, then sum over the token axis to get one vector per sentence.
    summary_a = G1(merged_a).sum(dim=1)
    summary_b = G1(merged_b).sum(dim=1)
    return H1(torch.cat((summary_a, summary_b), dim=1))

In [14]:
def training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer, intra=False, dist=None, ED1=None):
    """Train for one pass over ``train_iter``, logging every 1000th batch.

    Args:
        e: epoch number, used only in the log line.
        train_iter: iterable of batches exposing ``premise``, ``hypothesis``
            and ``label``, each with a ``.values`` tensor.
        EP1, F1, G1, H1: model components; switched to training mode here.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer stepped once per batch.
        intra, dist, ED1: forwarded unchanged to ``get_output``.
    """
    for module in (EP1, F1, G1, H1):
        module.train()

    for step, batch in enumerate(train_iter):
        # Make the batch axis come first for both sentences.
        premise = batch.premise.values.transpose(0, 1)
        hypothesis = batch.hypothesis.values.transpose(0, 1)
        gold = batch.label.values

        optimizer.zero_grad()
        logits = get_output(premise, hypothesis, EP1, F1, G1, H1, intra, dist, ED1)
        loss = criterion(logits, gold)
        loss.backward()
        optimizer.step()

        if step % 1000 != 0:
            continue
        # Accuracy on this single batch only (a noisy progress signal).
        hits = (torch.argmax(logits, dim=1) == gold).sum().item()
        acc = hits / gold.shape[0]
        print('Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'.format(e, step, loss.cpu().detach(), acc))

In [15]:
def validation_loop(e, val_iter, EP1, F1, G1, H1, criterion, intra=False, dist=None, ED1=None):
    """Evaluate the model on ``val_iter`` and print the mean loss and accuracy.

    Evaluation runs under ``torch.no_grad()`` so no autograd graph is built;
    the computed numbers are unchanged.

    Args:
        e: epoch number, used only in the log line.
        val_iter: iterable of batches exposing ``premise``, ``hypothesis``
            and ``label``, each with a ``.values`` tensor.
        EP1, F1, G1, H1: model components; switched to eval mode here.
        criterion: loss called as ``criterion(output, target)``; assumed to
            return a per-batch mean, since it is re-weighted by batch size.
        intra, dist, ED1: forwarded unchanged to ``get_output``.

    Returns:
        The loss summed over all examples (not the mean that is printed).

    Raises:
        ZeroDivisionError: if ``val_iter`` yields no batches.
    """
    for module in (EP1, F1, G1, H1):
        module.eval()

    total_loss = 0
    total_sent = 0
    total_correct = 0

    with torch.no_grad():
        for batch in val_iter:
            sent1 = batch.premise.values.transpose(0,1)
            sent2 = batch.hypothesis.values.transpose(0,1)
            target = batch.label.values
            output = get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, ED1)

            loss = criterion(output, target).item()
            sent = sent1.shape[0]  # number of examples in this batch
            correct = torch.sum(torch.argmax(output, dim=1) == target).item()

            # Weight the batch loss by batch size so a smaller final batch
            # does not skew the average.
            total_loss += loss*sent
            total_sent += sent
            total_correct += correct

    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/total_sent, total_correct/total_sent))
    return total_loss

In [None]:
# Build the model components, then train and checkpoint on validation loss.
best_loss = 1e8
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()

intra = True
dist = 10
ED1 = EmbedDist(dist + 1, 1).cuda()

# Collect every module that is being trained; the distance embedding only
# takes part when intra-sentence attention is switched on.
modules = [EP1, F1, G1, H1]
if intra:
    modules.append(ED1)
# Leave frozen tensors (the pre-trained embedding table) out of the optimizer:
# they never receive gradients, so tracking them only costs memory.
parameters = [param for module in modules for param in module.parameters() if param.requires_grad]

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(parameters, lr=0.025, initial_accumulator_value=0.1)

for e in range(100):
    training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer, intra=intra, dist=dist, ED1=ED1)
    # validation_loop returns the loss summed over the validation set.
    loss = validation_loop(e, val_iter, EP1, F1, G1, H1, criterion, intra=intra, dist=dist, ED1=ED1)
    if loss < best_loss:
        torch.save(EP1.state_dict(),'best_EP1_intra.pt')
        torch.save(F1.state_dict(),'best_F1_intra.pt')
        torch.save(G1.state_dict(),'best_G1_intra.pt')
        torch.save(H1.state_dict(),'best_H1_intra.pt')
        if intra:
            # The distance embedding is part of the trained model, so it has
            # to be checkpointed too or the best model cannot be restored.
            torch.save(ED1.state_dict(),'best_ED1_intra.pt')
        best_loss = loss
        print('WROTE MODEL')

Epoch: 0, Batch: 0, Train NLL: 1.3938, Train Acc:0.1875
Epoch: 0, Batch: 1000, Train NLL: 1.1063, Train Acc:0.2500
Epoch: 0, Batch: 2000, Train NLL: 1.1036, Train Acc:0.2500
Epoch: 0, Batch: 3000, Train NLL: 1.0975, Train Acc:0.3125
Epoch: 0, Batch: 4000, Train NLL: 1.1153, Train Acc:0.1875
Epoch: 0, Batch: 5000, Train NLL: 1.0915, Train Acc:0.5000
Epoch: 0, Batch: 6000, Train NLL: 1.0928, Train Acc:0.5000
Epoch: 0, Batch: 7000, Train NLL: 1.1031, Train Acc:0.3750
Epoch: 0, Batch: 8000, Train NLL: 1.0816, Train Acc:0.4375
Epoch: 0, Batch: 9000, Train NLL: 1.0973, Train Acc:0.4375
Epoch: 0, Batch: 10000, Train NLL: 1.1004, Train Acc:0.2500
Epoch: 0, Batch: 11000, Train NLL: 1.1075, Train Acc:0.2500
Epoch: 0, Batch: 12000, Train NLL: 1.1043, Train Acc:0.2500
Epoch: 0, Batch: 13000, Train NLL: 1.0968, Train Acc:0.3750
Epoch: 0, Batch: 14000, Train NLL: 1.1022, Train Acc:0.3750
Epoch: 0, Batch: 15000, Train NLL: 1.0986, Train Acc:0.3125
Epoch: 0, Batch: 16000, Train NLL: 1.0986, Train Acc: