# CS 287 - HW 4

In [1]:
!pip install -q torch torchtext opt_einsum git+https://github.com/harvardnlp/namedtensor

In [1]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

In [2]:
# load data
# NamedField yields NamedTensors: token sequences get a 'seqlen' axis, while
# labels are non-sequential and declare no extra named axis.
TEXT = NamedField(names=('seqlen',)) # Our input $x$
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
# Both vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
# NOTE(review): the device is hard-coded to CUDA, so this cell needs a GPU.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=16, device=torch.device("cuda"), repeat=False)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [3]:
# build the vocabulary with word embeddings
# out-of-vocabulary words are hashed to one of 100 random embeddings each initialized to mean 0, stdev 1 (Sec 5.1)
# NOTE(review): `random.choice` picks a vector at random rather than hashing
# the token, and no seed is set in the visible cells, so the assignment of
# unknown words to vectors is not reproducible between runs.
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
# Row-wise division by the L2 norm; a row of all zeros would produce NaNs here.
vectors = vectors / vectors.norm(dim=1, keepdim=True) # normalized to have l_2 norm of 1
# Wrap as a NamedTensor with axes ('word', 'embedding') and store it back on the vocab.
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


In [4]:
# here's an example of a training example
batch = next(iter(train_iter))
print("Size of premise batch:", batch.premise.shape)
print("Size of hypothesis batch:", batch.hypothesis.shape)
print("Size of label batch:", batch.label.shape)

Size of premise batch: OrderedDict([('seqlen', 26), ('batch', 16)])
Size of hypothesis batch: OrderedDict([('seqlen', 11), ('batch', 16)])
Size of label batch: OrderedDict([('batch', 16)])


## Vanilla Decomposable Attention Model

In [5]:
# dimensions
# input_size / embed_size are read off the named axes of the embedding matrix.
input_size = TEXT.vocab.vectors.shape['word']
embed_size = TEXT.vocab.vectors.shape['embedding']
hidden_size1 = 200
# Twice hidden_size1: used as the input width of the networks that consume
# two hidden_size1 vectors concatenated together.
hidden_size2 = hidden_size1 * 2
# One output per entry of the label vocabulary.
output_size = len(LABEL.vocab)
print('DIMENSIONS -- input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

DIMENSIONS -- input: 62998, embed: 300, hidden1: 200, hidden2: 400, output: 4


In [61]:
# pre-trained embeddings
# `.values` unwraps the NamedTensor into a plain torch tensor, which is then
# moved to the GPU for use by nn.Embedding.
weights = TEXT.vocab.vectors.values.cuda()
weights.shape

torch.Size([62998, 300])

In [26]:
pad_tkn = TEXT.vocab.stoi['<pad>']
#weights[pad_tkn,:] = 0

In [27]:
class EmbedProject(nn.Module):
    """Frozen pre-trained embedding lookup followed by a learned linear projection.

    Args:
        weights: 2-D float tensor of pre-trained embeddings, one row per
            vocabulary entry. It is loaded with ``freeze=True`` and is
            therefore not updated by the optimizer.
        embed_size: width of one embedding row (input size of the projection).
        project_size: output size of the projection.
    """

    def __init__(self, weights, embed_size, project_size):
        super().__init__()
        # weights: input_size x embed_size
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True)
        self.linear = nn.Linear(embed_size, project_size)
        # Only the projection weight is re-initialised; the bias keeps the
        # default nn.Linear initialisation.
        nn.init.normal_(self.linear.weight, mean=0, std=0.01)

    def forward(self, inputs):
        """Look up the integer token ids in ``inputs`` and project each embedding."""
        return self.linear(self.embed(inputs))

In [28]:
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
EP1

EmbedProject(
  (embed): Embedding(62998, 300)
  (linear): Linear(in_features=300, out_features=200, bias=True)
)

In [29]:
# Batches arrive as (seqlen, batch); transpose to (batch, seqlen).
raw_sent1 = batch.premise.values.transpose(0,1)
raw_sent2 = batch.hypothesis.values.transpose(0,1)
# Vocabulary index looked up for the literal token 'null'.
null_tkn = torch.tensor(TEXT.vocab.stoi['null'], device='cuda')
# One copy of that index per example, laid out as a (batch, 1) column.
null_tkns = null_tkn.repeat(raw_sent1.shape[0],1)
# Prepend the column so that every sentence starts with the 'null' token.
# NOTE(review): get_output / the training loops below feed the raw sentences
# and do not perform this prepending step.
sent1 = torch.cat((null_tkns, raw_sent1), 1)
sent2 = torch.cat((null_tkns, raw_sent2), 1)
raw_sent1.shape, raw_sent2.shape, sent1.shape, sent2.shape

(torch.Size([16, 26]),
 torch.Size([16, 11]),
 torch.Size([16, 27]),
 torch.Size([16, 12]))

In [32]:
proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

(torch.Size([16, 27, 200]), torch.Size([16, 12, 200]))

In [None]:
'''temp_embed = nn.Embedding.from_pretrained(weights, freeze=True)
temp_linear = nn.Linear(300, 200).cuda()
temp_embedding = temp_embed(sent1)
#temp_embedding[0,-1,:]
temp_output = temp_linear(temp_embedding)
mask = sent1 != pad_tkn
mask.shape, temp_output.shape
temp_temp_output = mask * temp_output
temp_temp_output.shape'''

In [7]:
class FeedForwardF(nn.Module):
    """Two-layer ReLU network with dropout on the input of each layer.

    Args:
        input_size: width of the incoming feature vectors.
        hidden_size: width of the intermediate layer.
        output_size: width of the returned feature vectors.
        dropout: drop probability used before both linear layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Weights and biases alike are redrawn from a normal with std 0.01.
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Apply dropout -> linear -> ReLU twice and return the result."""
        hidden = self.linear1(self.d(inputs))
        hidden = self.m(hidden)
        output = self.linear2(self.d(hidden))
        return self.m(output)

In [97]:
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
F1

FeedForwardF(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=200, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [98]:
# Run the same F network over every token of both projected sentences.
f1 = F1(proj1)
f2 = F1(proj2)
f1.shape, f2.shape

(torch.Size([16, 26, 200]), torch.Size([16, 22, 200]))

In [99]:
# Unnormalised attention scores: batched dot product of every sentence-1 token
# feature with every sentence-2 token feature -> (batch, len1, len2).
score1 = torch.bmm(f1, f2.transpose(1,2))
# The same scores indexed from the sentence-2 side -> (batch, len2, len1).
score2 = score1.transpose(1,2)
score1.shape, score2.shape

(torch.Size([16, 26, 22]), torch.Size([16, 22, 26]))

In [100]:
# Normalise over the last axis, i.e. over the tokens of the *other* sentence.
# NOTE(review): no padding mask is applied, so padded positions also receive weight.
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

(torch.Size([16, 26, 22]), torch.Size([16, 22, 26]))

In [101]:
# Soft alignments: proj1_soft has one row per sentence-2 token, each an
# attention-weighted mix of sentence-1 projections; proj2_soft is the converse.
proj1_soft = torch.bmm(prob2, proj1)
proj2_soft = torch.bmm(prob1, proj2)
proj1_soft.shape, proj2_soft.shape

(torch.Size([16, 22, 200]), torch.Size([16, 26, 200]))

In [102]:
# Pair every token projection with its soft-aligned counterpart from the other
# sentence by concatenating along the feature axis (doubling its width).
proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
proj1_combined.shape, proj2_combined.shape

(torch.Size([16, 26, 400]), torch.Size([16, 22, 400]))

In [8]:
class FeedForwardG(nn.Module):
    """Two-layer ReLU network with dropout before each linear layer.

    Structurally identical to ``FeedForwardF``; in this notebook it consumes
    the concatenation of a token projection and its soft-aligned counterpart.

    Args:
        input_size: width of the incoming feature vectors.
        hidden_size: width of the intermediate layer.
        output_size: width of the returned feature vectors.
        dropout: drop probability used before both linear layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Every parameter (weights and biases) is redrawn with std 0.01.
        for weight_or_bias in self.parameters():
            nn.init.normal_(weight_or_bias, mean=0, std=0.01)

    def forward(self, inputs):
        """Apply (dropout -> linear -> ReLU) for each of the two layers in turn."""
        activations = inputs
        for layer in (self.linear1, self.linear2):
            activations = self.m(layer(self.d(activations)))
        return activations

In [104]:
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
G1

FeedForwardG(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [105]:
# Run the same G network over every (token, aligned counterpart) pair of both sentences.
g1 = G1(proj1_combined)
g2 = G1(proj2_combined)
g1.shape, g2.shape

(torch.Size([16, 26, 200]), torch.Size([16, 22, 200]))

In [106]:
# Pool each sentence into a single vector by summing over its token axis.
# NOTE(review): padded positions, if any, are included in the sum.
g1_sum = g1.sum(dim=1)
g2_sum = g2.sum(dim=1)
g1_sum.shape, g2_sum.shape

(torch.Size([16, 200]), torch.Size([16, 200]))

In [107]:
g_all = torch.cat((g1_sum, g2_sum), dim=1)
g_all.shape

torch.Size([16, 400])

In [9]:
class FeedForwardH(nn.Module):
    """Classifier head: two dropout/ReLU hidden layers and a linear output layer.

    Args:
        input_size: width of the incoming feature vectors.
        hidden_size: width of both hidden layers.
        output_size: number of returned scores (logits) per example.
        dropout: drop probability used before the first two linear layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        # Weights and biases of all three layers are redrawn with std 0.01.
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Return raw logits; the last layer has neither dropout nor ReLU."""
        hidden = inputs
        for layer in (self.linear1, self.linear2):
            hidden = self.m(layer(self.d(hidden)))
        return self.linear3(hidden)

In [109]:
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()
H1

FeedForwardH(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
  (linear3): Linear(in_features=200, out_features=4, bias=True)
)

In [110]:
h_all = H1(g_all)
h_all.shape

torch.Size([16, 4])

In [112]:
target = batch.label.values
target.shape

torch.Size([16])

In [113]:
# Cross-entropy between the classifier logits and the gold labels.
criterion = nn.CrossEntropyLoss()
# The logits produced by H1 live in `h_all`; no `out_all` is defined anywhere
# in the notebook, so using that name fails on a fresh kernel.
loss = criterion(h_all, target)
loss

tensor(1.3871, device='cuda:0', grad_fn=<NllLossBackward>)

In [121]:
# Gather the parameters of all four sub-networks into one flat list for the
# optimizer, printing the running count after each module is added.
# NOTE(review): the first EP1 parameter is the embedding matrix, which is
# created with freeze=True and therefore never receives a gradient.
parameters = [param for param in EP1.parameters()] # embed, lnr, bias
print(len(parameters))
parameters.extend([param for param in F1.parameters()]) # lnr1, bias1, lnr2, bias2
print(len(parameters))
parameters.extend([param for param in G1.parameters()]) # lnr1, bias1, lnr2, bias2
print(len(parameters))
parameters.extend([param for param in H1.parameters()]) # lnr1, bias1, lnr2, bias2, lnr3, bias3
print(len(parameters))

3
7
11
17


In [122]:
optimizer = torch.optim.Adagrad(parameters, lr=0.05, initial_accumulator_value=0.1)

In [136]:
# Snapshot the first EP1 parameter (the embedding matrix) before the optimizer
# step. A detached clone is required: keeping only a reference would alias the
# live Parameter, making a later `emb1 != emb2` comparison trivially all-equal.
emb1 = next(iter(EP1.parameters())).detach().clone()
# Print the H1 parameter at index 5 (the last one, the bias of linear3) so it
# can be compared with its value after the step.
for i, param in enumerate(H1.parameters()):
    if i == 5:
        print(param)

Parameter containing:
tensor([-0.1111, -0.0486, -0.0384,  0.1044], device='cuda:0',
       requires_grad=True)


In [137]:
# One manual optimisation step on the example batch, written out inline.
# Put every sub-network in training mode (enables dropout) and clear stale gradients.
EP1.train()
F1.train()
G1.train()
H1.train()
optimizer.zero_grad()

# Transpose from (seqlen, batch) to (batch, seqlen).
# NOTE(review): unlike the earlier exploratory cell, no 'null' token is prepended here.
sent1 = batch.premise.values.transpose(0,1)
sent2 = batch.hypothesis.values.transpose(0,1)
target = batch.label.values

# Embed + project, then compute per-token attention features with F.
proj1 = EP1(sent1)
proj2 = EP1(sent2)
f1 = F1(proj1)
f2 = F1(proj2)
# Pairwise scores between the two sentences and their softmax over the other sentence.
score1 = torch.bmm(f1, f2.transpose(1,2))
score2 = score1.transpose(1,2)
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
# Soft-aligned counterparts, concatenated with the original projections.
proj1_soft = torch.bmm(prob2, proj1)
proj2_soft = torch.bmm(prob1, proj2)
proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
# Compare with G, sum over tokens, concatenate both sentence vectors, classify with H.
g1 = G1(proj1_combined)
g2 = G1(proj2_combined)
g1_sum = g1.sum(dim=1)
g2_sum = g2.sum(dim=1)
g_all = torch.cat((g1_sum, g2_sum), dim=1)
h_all = H1(g_all)

# Backpropagate the cross-entropy loss and apply one optimizer update.
loss = criterion(h_all, target)
loss.backward()
optimizer.step()

In [138]:
# Capture the first EP1 parameter again, now that optimizer.step() has run.
for param in EP1.parameters():
    emb2 = param
    break
# Re-print the H1 parameter at index 5 to compare with the value printed
# before the step.
i = 0
for param in H1.parameters():
    if i == 5:
        print(param)
    i += 1

Parameter containing:
tensor([-0.1295, -0.0567, -0.0467,  0.1243], device='cuda:0',
       requires_grad=True)


In [142]:
torch.sum(emb1 != emb2)

tensor(0, device='cuda:0')

In [156]:
acc = torch.sum(torch.argmax(h_all, dim=1) == target).item() / target.shape[0]
acc

0.625

In [155]:
loss

tensor(1.3147, device='cuda:0', grad_fn=<NllLossBackward>)

## Train

In [10]:
def get_output(sent1, sent2, EP1, F1, G1, H1, pad_idx=None):
    """Decomposable-attention forward pass for a batch of sentence pairs.

    Args:
        sent1, sent2: integer token-id tensors laid out as (batch, length);
            the two lengths may differ.
        EP1: module mapping token ids to projected embeddings.
        F1: "attend" network applied to every token; its outputs are
            dot-multiplied to obtain the attention scores.
        G1: "compare" network applied to each token projection concatenated
            with its soft-aligned counterpart from the other sentence.
        H1: "aggregate" network mapping the two concatenated sentence vectors
            to the final scores.
        pad_idx: optional id of the padding token. When given, padded
            positions receive no attention weight and are excluded from the
            per-sentence sums, so the result no longer depends on how much
            padding the batch contains. With the default ``None`` no masking
            is done and every position is treated as a real token.

    Returns:
        Whatever ``H1`` returns for the batch (one row of logits per pair).
    """
    proj1 = EP1(sent1)
    proj2 = EP1(sent2)
    attend1 = F1(proj1)
    attend2 = F1(proj2)

    # score1[b, i, j]: affinity of token i of sent1 with token j of sent2.
    score1 = torch.bmm(attend1, attend2.transpose(1, 2))
    score2 = score1.transpose(1, 2)

    if pad_idx is not None:
        is_pad1 = sent1 == pad_idx  # (batch, len1)
        is_pad2 = sent2 == pad_idx  # (batch, len2)
        # A large finite negative (rather than -inf) keeps the softmax free of
        # NaNs even if a sequence were to consist of padding only.
        neg_fill = -1e9
        score1 = score1.masked_fill(is_pad2.unsqueeze(1), neg_fill)
        score2 = score2.masked_fill(is_pad1.unsqueeze(1), neg_fill)

    # Each token's distribution over the tokens of the other sentence.
    prob1 = F.softmax(score1, dim=2)
    prob2 = F.softmax(score2, dim=2)

    # Soft alignments: attention-weighted mixes of the other sentence.
    proj1_soft = torch.bmm(prob2, proj1)
    proj2_soft = torch.bmm(prob1, proj2)

    g1 = G1(torch.cat((proj1, proj2_soft), dim=2))
    g2 = G1(torch.cat((proj2, proj1_soft), dim=2))

    if pad_idx is not None:
        # Zero the comparison vectors of padded tokens before pooling.
        g1 = g1.masked_fill(is_pad1.unsqueeze(2), 0.0)
        g2 = g2.masked_fill(is_pad2.unsqueeze(2), 0.0)

    # Sum over tokens, join both sentence vectors, and classify.
    g_all = torch.cat((g1.sum(dim=1), g2.sum(dim=1)), dim=1)
    return H1(g_all)

In [11]:
def training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer):
    """Run one pass over ``train_iter``, updating the model after every batch.

    Args:
        e: epoch number, used only in the progress message.
        train_iter: iterable of batches exposing ``premise``, ``hypothesis``
            and ``label`` named tensors.
        EP1, F1, G1, H1: the four sub-networks passed on to ``get_output``.
        criterion: loss callable taking (logits, target).
        optimizer: optimizer holding the parameters to update.

    Every 1000th batch (starting with the first) the loss and accuracy of that
    single batch are printed.
    """
    # Training mode enables dropout in the sub-networks.
    for module in (EP1, F1, G1, H1):
        module.train()

    log_line = 'Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'
    for step, batch in enumerate(train_iter):
        optimizer.zero_grad()
        # Batches arrive as (seqlen, batch); the model expects (batch, seqlen).
        premise = batch.premise.values.transpose(0, 1)
        hypothesis = batch.hypothesis.values.transpose(0, 1)
        target = batch.label.values

        output = get_output(premise, hypothesis, EP1, F1, G1, H1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if step % 1000 != 0:
            continue
        n_correct = torch.sum(torch.argmax(output, dim=1) == target).item()
        acc = n_correct / target.shape[0]
        print(log_line.format(e, step, loss.cpu().detach(), acc))

In [12]:
def validation_loop(e, val_iter, EP1, F1, G1, H1, criterion):
    """Evaluate the model on ``val_iter`` and print mean loss and accuracy.

    Args:
        e: epoch number, used only in the progress message.
        val_iter: iterable of batches exposing ``premise``, ``hypothesis``
            and ``label`` named tensors.
        EP1, F1, G1, H1: the four sub-networks passed on to ``get_output``.
        criterion: loss callable taking (logits, target); assumed to return
            the mean over the batch, which is why each batch loss is weighted
            by the batch size below.

    Returns:
        The *summed* (not averaged) loss over all validation examples.
    """
    # Evaluation mode disables dropout.
    EP1.eval()
    F1.eval()
    G1.eval()
    H1.eval()

    total_loss = 0
    total_sent = 0
    total_correct = 0

    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every validation batch.
    with torch.no_grad():
        for batch in val_iter:
            # Batches arrive as (seqlen, batch); the model expects (batch, seqlen).
            sent1 = batch.premise.values.transpose(0,1)
            sent2 = batch.hypothesis.values.transpose(0,1)
            target = batch.label.values
            output = get_output(sent1, sent2, EP1, F1, G1, H1)

            sent = sent1.shape[0]  # number of examples in this batch
            total_loss += criterion(output, target).item() * sent
            total_sent += sent
            total_correct += torch.sum(torch.argmax(output, dim=1) == target).item()

    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/total_sent, total_correct/total_sent))
    return total_loss

In [13]:
# Set to True to write best_*.pt checkpoints whenever validation loss improves.
# Left off by default so that existing checkpoint files are never overwritten
# unintentionally.
SAVE_BEST = False

best_loss = 1e8
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()

# Hand only trainable tensors to the optimizer. The frozen embedding matrix
# never receives a gradient, yet Adagrad would still allocate a full-size
# accumulator for it.
parameters = [
    param
    for module in (EP1, F1, G1, H1)
    for param in module.parameters()
    if param.requires_grad
]

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(parameters, lr=0.05, initial_accumulator_value=0.1)

for e in range(100):
    training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer)
    # validation_loop returns the summed validation loss; lower is better.
    loss = validation_loop(e, val_iter, EP1, F1, G1, H1, criterion)
    if loss < best_loss:
        best_loss = loss
        if SAVE_BEST:
            torch.save(EP1.state_dict(), 'best_EP1.pt')
            torch.save(F1.state_dict(), 'best_F1.pt')
            torch.save(G1.state_dict(), 'best_G1.pt')
            torch.save(H1.state_dict(), 'best_H1.pt')
            print('WROTE MODEL')
        else:
            # Do not claim a write that did not happen.
            print('NEW BEST (checkpointing disabled)')

Epoch: 0, Batch: 0, Train NLL: 1.3805, Train Acc:0.4375
Epoch: 0, Batch: 1000, Train NLL: 1.0982, Train Acc:0.4375
Epoch: 0, Batch: 2000, Train NLL: 1.1028, Train Acc:0.3125
Epoch: 0, Batch: 3000, Train NLL: 1.0868, Train Acc:0.3750
Epoch: 0, Batch: 4000, Train NLL: 1.1005, Train Acc:0.2500
Epoch: 0, Batch: 5000, Train NLL: 1.1006, Train Acc:0.3125
Epoch: 0, Batch: 6000, Train NLL: 1.1002, Train Acc:0.2500
Epoch: 0, Batch: 7000, Train NLL: 1.1000, Train Acc:0.3125
Epoch: 0, Batch: 8000, Train NLL: 1.1074, Train Acc:0.2500
Epoch: 0, Batch: 9000, Train NLL: 1.0958, Train Acc:0.5000
Epoch: 0, Batch: 10000, Train NLL: 1.0928, Train Acc:0.3125
Epoch: 0, Batch: 11000, Train NLL: 1.0979, Train Acc:0.3125
Epoch: 0, Batch: 12000, Train NLL: 1.1029, Train Acc:0.1250
Epoch: 0, Batch: 13000, Train NLL: 1.1009, Train Acc:0.1875
Epoch: 0, Batch: 14000, Train NLL: 1.0981, Train Acc:0.3125
Epoch: 0, Batch: 15000, Train NLL: 1.1042, Train Acc:0.3125
Epoch: 0, Batch: 16000, Train NLL: 1.0985, Train Acc:

KeyboardInterrupt: 

In [None]:
#EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
#state_dict = torch.load('best_EP1.pt')
#EP1.load_state_dict(state_dict)