# CS 287 - HW 4

In [1]:
!pip install -q torch torchtext opt_einsum git+https://github.com/harvardnlp/namedtensor

In [8]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

In [3]:
# load data
# TEXT holds token sequences along a named 'seqlen' dimension; LABEL is a
# non-sequential field with no named dimensions of its own.
TEXT = NamedField(names=('seqlen',)) # Our input $x$
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
# SNLI train / validation / test splits, parsed with the two fields above.
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
# Both vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
# One iterator per split, 16 examples per batch.
# NOTE(review): the device is hard-coded to CUDA, so this cell needs a GPU.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=16, device=torch.device("cuda"), repeat=False)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [4]:
# build the vocabulary with word embeddings
# out-of-vocabulary words are each assigned one of 100 random embeddings, each
# initialized to mean 0, stdev 1 (Sec 5.1). The assignment is made with
# random.choice, so it is a random pick rather than a deterministic hash.
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
# normalize every row to l_2 norm 1; clamping the norm keeps an all-zero row at
# zero instead of turning it into NaN through 0/0 (non-zero rows are unaffected)
row_norms = vectors.norm(dim=1, keepdim=True).clamp(min=1e-12)
vectors = vectors / row_norms
# wrap with dimension names so later cells can look sizes up by name
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


In [7]:
# here's an example of a training example
# Pull a single batch from the training iterator; the cells below reuse `batch`
# to walk through the model one step at a time.
batch = next(iter(train_iter))
# Each .shape is an ordered mapping from dimension name to size.
print("Size of premise batch:", batch.premise.shape)
print("Size of hypothesis batch:", batch.hypothesis.shape)
print("Size of label batch:", batch.label.shape)

Size of premise batch: OrderedDict([('seqlen', 26), ('batch', 16)])
Size of hypothesis batch: OrderedDict([('seqlen', 22), ('batch', 16)])
Size of label batch: OrderedDict([('batch', 16)])


## Vanilla Decomposable Attention Model

In [90]:
# dimensions
# Vocabulary size and embedding width are read off the loaded embedding matrix
# by dimension name.
input_size = TEXT.vocab.vectors.shape['word']
embed_size = TEXT.vocab.vectors.shape['embedding']
hidden_size1 = 200  # width of the projection and of the feed-forward layers built below
hidden_size2 = hidden_size1 * 2  # width after two hidden_size1-wide tensors are concatenated
output_size = len(LABEL.vocab)  # one output per entry in the label vocabulary
print('DIMENSIONS -- input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

DIMENSIONS -- input: 62998, embed: 300, hidden1: 200, hidden2: 400, output: 4


In [91]:
# pre-trained embeddings
# .values takes the underlying tensor out of the NamedTensor wrapper; .cuda()
# then moves it to the GPU so it can seed the embedding layer.
weights = TEXT.vocab.vectors.values.cuda()
weights.shape

torch.Size([62998, 300])

In [92]:
class EmbedProject(nn.Module):
    """Frozen pre-trained embedding lookup followed by a linear projection.

    Args:
        weights: pre-trained embedding matrix, input_size x embed_size.
        embed_size: width of one embedding row (input width of the projection).
        project_size: width of the projected output.
    """

    def __init__(self, weights, embed_size, project_size):
        super().__init__()
        # freeze=True keeps the embedding table out of training.
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True)
        self.linear = nn.Linear(embed_size, project_size)
        # Only the projection weight is re-drawn from N(0, 0.01^2); the bias
        # keeps whatever nn.Linear initialised it to.
        nn.init.normal_(self.linear.weight, mean=0, std=0.01)

    def forward(self, inputs):
        """Return the projected embedding for every index in `inputs`."""
        return self.linear(self.embed(inputs))

In [93]:
# Build the embed-and-project layer (embed_size -> hidden_size1) on the GPU;
# the bare name on the last line displays the module structure.
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
EP1

EmbedProject(
  (embed): Embedding(62998, 300)
  (linear): Linear(in_features=300, out_features=200, bias=True)
)

In [94]:
# Drop the dimension names and flip the 2-D index tensors from
# (seqlen, batch) to (batch, seqlen), which is the layout the cells below use.
premise_ids = batch.premise.values
hypothesis_ids = batch.hypothesis.values
sent1 = premise_ids.t()
sent2 = hypothesis_ids.t()
sent1.shape, sent2.shape

(torch.Size([16, 26]), torch.Size([16, 22]))

In [95]:
# Embed and project both sentences with the same module, so premise and
# hypothesis share the embedding table and the projection parameters.
proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

(torch.Size([16, 26, 200]), torch.Size([16, 22, 200]))

In [96]:
class FeedForwardF(nn.Module):
    """Two-layer ReLU feed-forward network with dropout ahead of each layer.

    Every parameter, weights and biases alike, is drawn from N(0, 0.01^2).

    Args:
        input_size: width of the incoming features.
        hidden_size: width of the hidden layer.
        output_size: width of the returned features.
        dropout: dropout probability applied before each linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Replace the default nn.Linear initialisation of every parameter.
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Apply dropout -> linear1 -> ReLU -> dropout -> linear2 -> ReLU."""
        first = self.linear1(self.d(inputs))
        hidden = self.m(first)
        second = self.linear2(self.d(hidden))
        return self.m(second)

In [97]:
# F network with input, hidden and output widths all equal to hidden_size1,
# moved to the GPU; the bare name displays the module structure.
F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
F1

FeedForwardF(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=200, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [98]:
# Run the same F network over the projected premise and hypothesis; the
# sequence length of each tensor is unchanged (see the shapes printed below).
f1 = F1(proj1)
f2 = F1(proj2)
f1.shape, f2.shape

(torch.Size([16, 26, 200]), torch.Size([16, 22, 200]))

In [99]:
# Unnormalised alignment scores: a batched dot product between every premise
# position and every hypothesis position.
f2_transposed = f2.transpose(1, 2)
score1 = f1.bmm(f2_transposed)
# Same scores seen from the hypothesis side (last two dimensions swapped).
score2 = score1.transpose(2, 1)
score1.shape, score2.shape

(torch.Size([16, 26, 22]), torch.Size([16, 22, 26]))

In [100]:
# Shorter sentences in a batch are padded, and padding positions must not
# receive attention weight. Their scores are pushed to a very large negative
# value before the softmax, so their probability underflows to exactly 0 and
# the remaining weights still sum to 1 over the real tokens.
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
pad1 = (sent1 == pad_idx).unsqueeze(1)  # batch x 1 x premise length
pad2 = (sent2 == pad_idx).unsqueeze(1)  # batch x 1 x hypothesis length
# prob1: for each premise position, a distribution over hypothesis positions.
prob1 = F.softmax(score1.masked_fill(pad2, -1e9), dim=2)
# prob2: for each hypothesis position, a distribution over premise positions.
prob2 = F.softmax(score2.masked_fill(pad1, -1e9), dim=2)
prob1.shape, prob2.shape

(torch.Size([16, 26, 22]), torch.Size([16, 22, 26]))

In [101]:
# Soft alignment: each position of one sentence gets the attention-weighted
# average of the other sentence's projected tokens.
proj1_soft = prob2.bmm(proj1)  # premise summarised for every hypothesis position
proj2_soft = prob1.bmm(proj2)  # hypothesis summarised for every premise position
proj1_soft.shape, proj2_soft.shape

(torch.Size([16, 22, 200]), torch.Size([16, 26, 200]))

In [102]:
# Pair every token with its soft-aligned counterpart by concatenating along
# the feature (last) dimension, which doubles the feature width.
proj1_combined = torch.cat([proj1, proj2_soft], dim=-1)
proj2_combined = torch.cat([proj2, proj1_soft], dim=-1)
proj1_combined.shape, proj2_combined.shape

(torch.Size([16, 26, 400]), torch.Size([16, 22, 400]))

In [103]:
class FeedForwardG(nn.Module):
    """Two-layer ReLU feed-forward network with dropout ahead of each layer.

    Every parameter, weights and biases alike, is drawn from N(0, 0.01^2).

    Args:
        input_size: width of the incoming features.
        hidden_size: width of the hidden layer.
        output_size: width of the returned features.
        dropout: dropout probability applied before each linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Replace the default nn.Linear initialisation of every parameter.
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Apply dropout -> linear1 -> ReLU -> dropout -> linear2 -> ReLU."""
        first = self.linear1(self.d(inputs))
        hidden = self.m(first)
        second = self.linear2(self.d(hidden))
        return self.m(second)

In [104]:
# G network: takes the concatenated (hidden_size2-wide) features and returns
# hidden_size1-wide features; moved to the GPU and displayed.
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
G1

FeedForwardG(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [105]:
# Run the same G network over the combined premise and hypothesis features;
# sequence lengths are unchanged (see the shapes printed below).
g1 = G1(proj1_combined)
g2 = G1(proj2_combined)
g1.shape, g2.shape

(torch.Size([16, 26, 200]), torch.Size([16, 22, 200]))

In [106]:
# Collapse each sentence to one vector by summing over its positions.
# Padding positions still yield non-zero rows after G (bias terms and ReLU),
# so they are zeroed first; otherwise shorter sentences in a batch would have
# padding mixed into their summary.
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
keep1 = (sent1 != pad_idx).unsqueeze(2).to(g1.dtype)  # batch x premise length x 1
keep2 = (sent2 != pad_idx).unsqueeze(2).to(g2.dtype)  # batch x hypothesis length x 1
g1_sum = (g1 * keep1).sum(dim=1)
g2_sum = (g2 * keep2).sum(dim=1)
g1_sum.shape, g2_sum.shape

(torch.Size([16, 200]), torch.Size([16, 200]))

In [107]:
# Join the two sentence summaries along the feature (last) dimension to form
# the classifier input.
g_all = torch.cat([g1_sum, g2_sum], dim=-1)
g_all.shape

torch.Size([16, 400])

In [108]:
class FeedForwardH(nn.Module):
    """Three-layer classifier head: two ReLU hidden layers, then a linear output.

    Dropout is applied ahead of the first two linear layers only, and the
    output layer has no activation. Every parameter, weights and biases alike,
    is drawn from N(0, 0.01^2).

    Args:
        input_size: width of the incoming features.
        hidden_size: width of both hidden layers.
        output_size: number of output scores.
        dropout: dropout probability applied before the first two layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        # Replace the default nn.Linear initialisation of every parameter.
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=0.01)

    def forward(self, inputs):
        """Return the raw (unnormalised) output scores for `inputs`."""
        features = self.m(self.linear1(self.d(inputs)))
        features = self.m(self.linear2(self.d(features)))
        return self.linear3(features)

In [109]:
# H network (classifier head): hidden_size2-wide input, hidden_size1-wide
# hidden layers, one output per label-vocabulary entry; moved to the GPU.
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()
H1

FeedForwardH(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
  (linear3): Linear(in_features=200, out_features=4, bias=True)
)

In [110]:
# Final scores for the example batch: one row per example, one column per
# label-vocabulary entry. No softmax is applied, so these are raw scores.
out_all = H1(g_all)
out_all.shape

torch.Size([16, 4])