# CS 287 - HW 4 - Cont.

In [1]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

In [2]:
# load data
TEXT = NamedField(names=('seqlen',)) # Our input $x$
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=16, device=torch.device("cuda"), repeat=False)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [3]:
# build the vocabulary with word embeddings
# out-of-vocabulary words are hashed to one of 100 random embeddings each initialized to mean 0, stdev 1 (Sec 5.1)
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
vectors = vectors / vectors.norm(dim=1, keepdim=True) # normalized to have l_2 norm of 1
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


## Decomposable Intra-Sentence Attention Model

In [4]:
class EmbedProject(torch.nn.Module):
    def __init__(self, weights, embed_size, project_size):
        super(EmbedProject, self).__init__()
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True) # weights: input_size x embed_size
        self.linear = nn.Linear(embed_size, project_size)
        torch.nn.init.normal_(self.linear.weight, mean=0, std=0.01)
    def forward(self, inputs):
        embedding = self.embed(inputs)
        output = self.linear(embedding)
        return output

In [5]:
class FeedForwardFIntra(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardFIntra, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden = self.m(self.linear1(self.d(inputs)))
        output = self.m(self.linear2(self.d(hidden)))
        return output

In [6]:
class EmbedDist(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_dim): # num = 11, dim = 1
        super(EmbedDist, self).__init__()
        self.embed = nn.Embedding(num_embeddings, embedding_dim)
        torch.nn.init.normal_(self.embed.weight, mean=0, std=0.01)
    def forward(self, inputs):
        output = self.embed(inputs)
        return output

In [7]:
class FeedForwardF(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardF, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden = self.m(self.linear1(self.d(inputs)))
        output = self.m(self.linear2(self.d(hidden)))
        return output

In [8]:
class FeedForwardG(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardG, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden = self.m(self.linear1(self.d(inputs)))
        output = self.m(self.linear2(self.d(hidden)))
        return output

In [9]:
class FeedForwardH(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardH, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden1 = self.m(self.linear1(self.d(inputs)))
        hidden2 = self.m(self.linear2(self.d(hidden1)))
        output = self.linear3(hidden2)
        return output

## Global Vars

In [10]:
# dimensions
input_size = TEXT.vocab.vectors.shape['word']
embed_size = TEXT.vocab.vectors.shape['embedding']
hidden_size1 = 200
hidden_size2 = hidden_size1 * 2
output_size = len(LABEL.vocab)
print('DIMENSIONS -- input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

DIMENSIONS -- input: 62998, embed: 300, hidden1: 200, hidden2: 400, output: 4


In [11]:
# pre-trained embeddings
weights = TEXT.vocab.vectors.values.cuda()
weights.shape

torch.Size([62998, 300])

In [12]:
pad_tkn = TEXT.vocab.stoi['<pad>']
null_tkn = torch.tensor(TEXT.vocab.stoi['null'], device='cuda')
print('<pad>:', pad_tkn, ', null:', null_tkn)

<pad>: 1 , null: tensor(56690, device='cuda:0')


## Development

In [None]:
[LABEL.vocab.itos[i] for i in [0,1,2,3]]

In [None]:
[TEXT.vocab.itos[i] for i in [0,1,2,3]]

In [None]:
# checking to make sure no <unk> labels
total = 0
for batch in iter(train_iter):
    total += torch.sum(batch.label.values == 0)
total

In [None]:
# no prepend with NULL!
for batch in iter(train_iter):
    print([TEXT.vocab.itos[i] for i in batch.premise.values[:,0]])
    print([TEXT.vocab.itos[i] for i in batch.hypothesis.values[:,0]])
    break

In [None]:
TEXT.vocab.stoi['null']

In [25]:
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
EP1

EmbedProject(
  (embed): Embedding(62998, 300)
  (linear): Linear(in_features=300, out_features=200, bias=True)
)

In [26]:
FI1 = FeedForwardFIntra(hidden_size1, hidden_size1, hidden_size1).cuda()
FI1

FeedForwardFIntra(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=200, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [27]:
ED1 = EmbedDist(dist + 1, 1).cuda()
ED1

EmbedDist(
  (embed): Embedding(11, 1)
)

In [30]:
F1 = FeedForwardF(hidden_size2, hidden_size1, hidden_size1).cuda()
F1

FeedForwardF(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [51]:
G1 = FeedForwardG(hidden_size2 * 2, hidden_size1, hidden_size1).cuda()
G1

FeedForwardG(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=800, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [29]:
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()
H1

FeedForwardH(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
  (linear3): Linear(in_features=200, out_features=4, bias=True)
)

In [None]:
'''proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

fi1 = FI1(proj1)
fi2 = FI1(proj2)
fi1.shape, fi2.shape

# intra-sentence attention!
score1 = torch.bmm(fi1, fi1.transpose(1,2))
score2 = torch.bmm(fi2, fi2.transpose(1,2))
score1.shape, score2.shape

prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

# intra-sentence attention!
proj1_soft = torch.bmm(prob1, proj1)
proj2_soft = torch.bmm(prob2, proj2)
proj1_soft.shape, proj2_soft.shape

# intra-sentence attention!
proj1_intra = torch.cat((proj1, proj1_soft), dim=2)
proj2_intra = torch.cat((proj2, proj2_soft), dim=2)
proj1_intra.shape, proj2_intra.shape

dist = 10
seqlen = score1.shape[2]
steps = torch.arange(0, seqlen)
mat_steps = steps.repeat(seqlen, 1)
flip_steps = torch.flip(steps, [0]).view(-1, 1)
idx = torch.min(torch.abs(mat_steps - flip_steps), torch.tensor(dist))
ED1(idx).squeeze().shape'''

In [31]:
batch = next(iter(train_iter))

In [32]:
raw_sent1 = batch.premise.values.transpose(0,1)
raw_sent2 = batch.hypothesis.values.transpose(0,1)
raw_sent1.shape, raw_sent2.shape

(torch.Size([16, 19]), torch.Size([16, 14]))

In [33]:
null_tkns = null_tkn.repeat(raw_sent1.shape[0], 1)
sent1 = torch.cat((null_tkns, raw_sent1), 1)
sent2 = torch.cat((null_tkns, raw_sent2), 1)
sent1.shape, sent2.shape

(torch.Size([16, 20]), torch.Size([16, 15]))

In [34]:
proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [35]:
mask1 = (sent1 == pad_tkn)
mask2 = (sent2 == pad_tkn)
mask1.shape, mask2.shape

(torch.Size([16, 20]), torch.Size([16, 15]))

In [36]:
fi1 = FI1(proj1)
fi2 = FI1(proj2)
fi1.shape, fi2.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [37]:
score1 = torch.bmm(fi1, fi1.transpose(1,2))
score2 = torch.bmm(fi2, fi2.transpose(1,2))
score1.shape, score2.shape

(torch.Size([16, 20, 20]), torch.Size([16, 15, 15]))

In [38]:
score1 += get_dist_bias(score1.shape[2], dist, ED1)
score2 += get_dist_bias(score2.shape[2], dist, ED1)
score1.shape, score2.shape

(torch.Size([16, 20, 20]), torch.Size([16, 15, 15]))

In [39]:
mask1c = mask1.unsqueeze(1).expand(-1, sent1.shape[1], -1).float()
mask2c = mask2.unsqueeze(1).expand(-1, sent2.shape[1], -1).float()
mask1c.shape, mask2c.shape

(torch.Size([16, 20, 20]), torch.Size([16, 15, 15]))

In [40]:
score1 = score1 * (1 - mask1c) + (mask1c * -1e8)
score2 = score2 * (1 - mask2c) + (mask2c * -1e8)
score1.shape, score2.shape

(torch.Size([16, 20, 20]), torch.Size([16, 15, 15]))

In [41]:
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

(torch.Size([16, 20, 20]), torch.Size([16, 15, 15]))

In [42]:
proj1_soft = torch.bmm(prob1, proj1)
proj2_soft = torch.bmm(prob2, proj2)
proj1_soft.shape, proj2_soft.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [43]:
proj1 = torch.cat((proj1, proj1_soft), dim=2)
proj2 = torch.cat((proj2, proj2_soft), dim=2) 
proj1.shape, proj2.shape

(torch.Size([16, 20, 400]), torch.Size([16, 15, 400]))

In [44]:
f1 = F1(proj1)
f2 = F1(proj2)
f1.shape, f2.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [45]:
score1 = torch.bmm(f1, f2.transpose(1,2))
score2 = torch.bmm(f2, f1.transpose(1,2))
score1.shape, score2.shape

(torch.Size([16, 20, 15]), torch.Size([16, 15, 20]))

In [46]:
mask1a = mask1.unsqueeze(1).expand(-1, sent2.shape[1], -1).float()
mask2a = mask2.unsqueeze(1).expand(-1, sent1.shape[1], -1).float()
mask1a.shape, mask2a.shape

(torch.Size([16, 15, 20]), torch.Size([16, 20, 15]))

In [47]:
score1 = score1 * (1 - mask2a) + (mask2a * -1e8)
score2 = score2 * (1 - mask1a) + (mask1a * -1e8)
score1.shape, score2.shape

(torch.Size([16, 20, 15]), torch.Size([16, 15, 20]))

In [48]:
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

(torch.Size([16, 20, 15]), torch.Size([16, 15, 20]))

In [49]:
proj1_soft = torch.bmm(prob2, proj1)
proj2_soft = torch.bmm(prob1, proj2)
proj1_soft.shape, proj2_soft.shape

(torch.Size([16, 15, 400]), torch.Size([16, 20, 400]))

In [50]:
proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
proj1_combined.shape, proj2_combined.shape

(torch.Size([16, 20, 800]), torch.Size([16, 15, 800]))

In [52]:
g1 = G1(proj1_combined)
g2 = G1(proj2_combined)
g1.shape, g2.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [53]:
mask1b = mask1.unsqueeze(2).expand(-1, -1, hidden_size1).float()
mask2b = mask2.unsqueeze(2).expand(-1, -1, hidden_size1).float()
mask1b.shape, mask2b.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [54]:
g1 = g1 * (1 - mask1b)
g2 = g2 * (1 - mask2b)
g1.shape, g2.shape

(torch.Size([16, 20, 200]), torch.Size([16, 15, 200]))

In [55]:
g1_sum = g1.sum(dim=1)
g2_sum = g2.sum(dim=1)
g1_sum.shape, g2_sum.shape

(torch.Size([16, 200]), torch.Size([16, 200]))

In [56]:
g_all = torch.cat((g1_sum, g2_sum), dim=1)
g_all.shape

torch.Size([16, 400])

In [57]:
h_all = H1(g_all)
h_all.shape

torch.Size([16, 4])

## Train

In [13]:
def get_dist_bias(seqlen, dist, ED1):
    steps = torch.arange(0, seqlen)
    mat_steps = steps.repeat(seqlen, 1)
    flip_steps = torch.flip(steps, [0]).view(-1, 1)
    idx = torch.min(torch.abs(mat_steps - flip_steps), torch.tensor(dist)).cuda()
    return ED1(idx).squeeze()

In [14]:
def get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, FI1, ED1):
    proj1 = EP1(sent1)
    proj2 = EP1(sent2)
    
    mask1 = (sent1 == pad_tkn)
    mask2 = (sent2 == pad_tkn)
        
    if intra:
        fi1 = FI1(proj1)
        fi2 = FI1(proj2)
        score1 = torch.bmm(fi1, fi1.transpose(1,2))
        score2 = torch.bmm(fi2, fi2.transpose(1,2))
        score1 += get_dist_bias(score1.shape[2], dist, ED1)
        score2 += get_dist_bias(score2.shape[2], dist, ED1)
        mask1c = mask1.unsqueeze(1).expand(-1, sent1.shape[1], -1).float()
        mask2c = mask2.unsqueeze(1).expand(-1, sent2.shape[1], -1).float()
        score1 = score1 * (1 - mask1c) + (mask1c * -1e8)
        score2 = score2 * (1 - mask2c) + (mask2c * -1e8)
        prob1 = F.softmax(score1, dim=2)
        prob2 = F.softmax(score2, dim=2)
        proj1_soft = torch.bmm(prob1, proj1)
        proj2_soft = torch.bmm(prob2, proj2)
        proj1 = torch.cat((proj1, proj1_soft), dim=2)
        proj2 = torch.cat((proj2, proj2_soft), dim=2) 
        
    f1 = F1(proj1)
    f2 = F1(proj2)
    
    score1 = torch.bmm(f1, f2.transpose(1,2))
    score2 = torch.bmm(f2, f1.transpose(1,2))
    mask1a = mask1.unsqueeze(1).expand(-1, sent2.shape[1], -1).float()
    mask2a = mask2.unsqueeze(1).expand(-1, sent1.shape[1], -1).float()
    score1 = score1 * (1 - mask2a) + (mask2a * -1e8)
    score2 = score2 * (1 - mask1a) + (mask1a * -1e8)
    
    prob1 = F.softmax(score1, dim=2)
    prob2 = F.softmax(score2, dim=2)
    proj1_soft = torch.bmm(prob2, proj1)
    proj2_soft = torch.bmm(prob1, proj2)
    proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
    proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
    
    g1 = G1(proj1_combined)
    g2 = G1(proj2_combined)
    mask1b = mask1.unsqueeze(2).expand(-1, -1, hidden_size1).float()
    mask2b = mask2.unsqueeze(2).expand(-1, -1, hidden_size1).float()
    g1 = g1 * (1 - mask1b)
    g2 = g2 * (1 - mask2b)
    
    g1_sum = g1.sum(dim=1)
    g2_sum = g2.sum(dim=1)
    g_all = torch.cat((g1_sum, g2_sum), dim=1)
    h_all = H1(g_all)
    return h_all

In [15]:
def prepend_null(sent):
    null_tkns = null_tkn.repeat(sent.shape[0], 1)
    return torch.cat((null_tkns, sent), 1)

In [16]:
def training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer, intra=False, dist=None, FI1=None, ED1=None):
    EP1.train()
    F1.train()
    G1.train()
    H1.train()
    if intra:
        FI1.train()
        ED1.train()
    
    for ix,batch in enumerate(train_iter):
        optimizer.zero_grad()
        sent1 = prepend_null(batch.premise.values.transpose(0,1))
        sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
        target = batch.label.values
        output = get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, FI1, ED1)
        
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        if ix % 1000 == 0:
            acc = torch.sum(torch.argmax(output, dim=1) == target).item() / target.shape[0]
            print('Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'.format(e, ix, loss.cpu().detach(), acc))

In [17]:
def validation_loop(e, val_iter, EP1, F1, G1, H1, criterion, intra=False, dist=None, FI1=None, ED1=None):
    EP1.eval()
    F1.eval()
    G1.eval()
    H1.eval()
    if intra:
        FI1.eval()
        ED1.eval()
    
    total_loss = 0
    total_sent = 0
    total_correct = 0
    
    for ix,batch in enumerate(val_iter):
        sent1 = prepend_null(batch.premise.values.transpose(0,1))
        sent2 = prepend_null(batch.hypothesis.values.transpose(0,1))
        target = batch.label.values
        output = get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, FI1, ED1)
        
        loss = criterion(output, target).item()
        sent = sent1.shape[0]
        correct = torch.sum(torch.argmax(output, dim=1) == target).item()
        
        total_loss += loss*sent
        total_sent += sent
        total_correct += correct
    
    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/total_sent, total_correct/total_sent))
    return total_loss

In [18]:
best_loss = 1e8
intra = True

if intra:
    dist = 10
    num_embeddings = dist + 1
    embedding_dim = 1
    FI1 = FeedForwardFIntra(hidden_size1, hidden_size1, hidden_size1).cuda()
    ED1 = EmbedDist(num_embeddings, embedding_dim).cuda()
    F1 = FeedForwardF(hidden_size1 * 2, hidden_size1, hidden_size1).cuda()
    G1 = FeedForwardG(hidden_size2 * 2, hidden_size1, hidden_size1).cuda()
else:
    dist = None    
    FI1 = None
    ED1 = None
    F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
    G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()

EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()

parameters = [param for param in EP1.parameters()] # embed, lnr, bias
parameters.extend([param for param in F1.parameters()]) # lnr1, bias1, lnr2, bias2
parameters.extend([param for param in G1.parameters()]) # lnr1, bias1, lnr2, bias2
parameters.extend([param for param in H1.parameters()]) # lnr1, bias1, lnr2, bias2, lnr3, bias3
if intra:
    parameters.extend([param for param in FI1.parameters()]) # lnr1, bias1, lnr2, bias2
    parameters.extend([param for param in ED1.parameters()]) # embed

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(parameters, lr=0.025, initial_accumulator_value=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=4)

for e in range(100):
    training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer, intra=intra, dist=dist, FI1=FI1, ED1=ED1)
    loss = validation_loop(e, val_iter, EP1, F1, G1, H1, criterion, intra=intra, dist=dist, FI1=FI1, ED1=ED1)
    scheduler.step(loss)
    print('LR = {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))
    if loss < best_loss:
        torch.save(EP1.state_dict(),'best_EP1_intra.pt')
        torch.save(F1.state_dict(),'best_F1_intra.pt')
        torch.save(G1.state_dict(),'best_G1_intra.pt')
        torch.save(H1.state_dict(),'best_H1_intra.pt')
        best_loss = loss
        print('WROTE MODEL')

Epoch: 0, Batch: 0, Train NLL: 1.3825, Train Acc:0.2500
Epoch: 0, Batch: 1000, Train NLL: 1.0682, Train Acc:0.5000
Epoch: 0, Batch: 2000, Train NLL: 1.0618, Train Acc:0.5000
Epoch: 0, Batch: 3000, Train NLL: 1.0612, Train Acc:0.3750
Epoch: 0, Batch: 4000, Train NLL: 1.1269, Train Acc:0.1875
Epoch: 0, Batch: 5000, Train NLL: 1.1323, Train Acc:0.6250
Epoch: 0, Batch: 6000, Train NLL: 1.1065, Train Acc:0.2500
Epoch: 0, Batch: 7000, Train NLL: 1.0678, Train Acc:0.4375
Epoch: 0, Batch: 8000, Train NLL: 0.9472, Train Acc:0.6875
Epoch: 0, Batch: 9000, Train NLL: 0.9522, Train Acc:0.5000
Epoch: 0, Batch: 10000, Train NLL: 0.9979, Train Acc:0.5000
Epoch: 0, Batch: 11000, Train NLL: 1.1684, Train Acc:0.1875
Epoch: 0, Batch: 12000, Train NLL: 0.9229, Train Acc:0.6250
Epoch: 0, Batch: 13000, Train NLL: 1.0132, Train Acc:0.4375
Epoch: 0, Batch: 14000, Train NLL: 1.0711, Train Acc:0.3125
Epoch: 0, Batch: 15000, Train NLL: 1.0289, Train Acc:0.5000
Epoch: 0, Batch: 16000, Train NLL: 0.9860, Train Acc:

Epoch: 4, Batch: 6000, Train NLL: 0.7437, Train Acc:0.7500
Epoch: 4, Batch: 7000, Train NLL: 0.8765, Train Acc:0.7500
Epoch: 4, Batch: 8000, Train NLL: 1.1986, Train Acc:0.3125
Epoch: 4, Batch: 9000, Train NLL: 0.5942, Train Acc:0.7500
Epoch: 4, Batch: 10000, Train NLL: 1.0224, Train Acc:0.5625
Epoch: 4, Batch: 11000, Train NLL: 0.8691, Train Acc:0.6875
Epoch: 4, Batch: 12000, Train NLL: 0.7700, Train Acc:0.6875
Epoch: 4, Batch: 13000, Train NLL: 0.4911, Train Acc:0.8125
Epoch: 4, Batch: 14000, Train NLL: 0.4845, Train Acc:0.8750
Epoch: 4, Batch: 15000, Train NLL: 0.8889, Train Acc:0.6250
Epoch: 4, Batch: 16000, Train NLL: 0.6050, Train Acc:0.7500
Epoch: 4, Batch: 17000, Train NLL: 0.9947, Train Acc:0.5000
Epoch: 4, Batch: 18000, Train NLL: 0.9545, Train Acc:0.6875
Epoch: 4, Batch: 19000, Train NLL: 0.6217, Train Acc:0.7500
Epoch: 4, Batch: 20000, Train NLL: 0.7094, Train Acc:0.6875
Epoch: 4, Batch: 21000, Train NLL: 0.6161, Train Acc:0.7500
Epoch: 4, Batch: 22000, Train NLL: 0.5051, T

Epoch: 7, Val NLL: 0.6942, Val Acc: 0.6990
LR = 0.025
WROTE MODEL
Epoch: 8, Batch: 0, Train NLL: 0.5743, Train Acc:0.7500
Epoch: 8, Batch: 1000, Train NLL: 0.5217, Train Acc:0.8750
Epoch: 8, Batch: 2000, Train NLL: 1.1333, Train Acc:0.4375
Epoch: 8, Batch: 3000, Train NLL: 0.6934, Train Acc:0.6875
Epoch: 8, Batch: 4000, Train NLL: 0.7012, Train Acc:0.6250
Epoch: 8, Batch: 5000, Train NLL: 0.5562, Train Acc:0.8125
Epoch: 8, Batch: 6000, Train NLL: 0.6984, Train Acc:0.7500
Epoch: 8, Batch: 7000, Train NLL: 0.6551, Train Acc:0.6875
Epoch: 8, Batch: 8000, Train NLL: 0.6037, Train Acc:0.6875
Epoch: 8, Batch: 9000, Train NLL: 0.7181, Train Acc:0.6250
Epoch: 8, Batch: 10000, Train NLL: 0.4593, Train Acc:0.8750
Epoch: 8, Batch: 11000, Train NLL: 0.6422, Train Acc:0.7500
Epoch: 8, Batch: 12000, Train NLL: 0.8122, Train Acc:0.6250
Epoch: 8, Batch: 13000, Train NLL: 1.0246, Train Acc:0.5000
Epoch: 8, Batch: 14000, Train NLL: 0.5933, Train Acc:0.6875
Epoch: 8, Batch: 15000, Train NLL: 0.5169, Trai

Epoch: 11, Batch: 27000, Train NLL: 0.7417, Train Acc:0.7500
Epoch: 11, Batch: 28000, Train NLL: 0.8635, Train Acc:0.6875
Epoch: 11, Batch: 29000, Train NLL: 0.4487, Train Acc:0.8125
Epoch: 11, Batch: 30000, Train NLL: 0.6280, Train Acc:0.8125
Epoch: 11, Batch: 31000, Train NLL: 0.5261, Train Acc:0.7500
Epoch: 11, Batch: 32000, Train NLL: 0.8044, Train Acc:0.6250
Epoch: 11, Batch: 33000, Train NLL: 0.7562, Train Acc:0.7500
Epoch: 11, Batch: 34000, Train NLL: 0.3704, Train Acc:0.9375
Epoch: 11, Val NLL: 0.6575, Val Acc: 0.7223
LR = 0.025
WROTE MODEL
Epoch: 12, Batch: 0, Train NLL: 0.6526, Train Acc:0.6875
Epoch: 12, Batch: 1000, Train NLL: 0.9073, Train Acc:0.3750
Epoch: 12, Batch: 2000, Train NLL: 0.3156, Train Acc:0.8750
Epoch: 12, Batch: 3000, Train NLL: 0.5001, Train Acc:0.8125
Epoch: 12, Batch: 4000, Train NLL: 0.6214, Train Acc:0.8125
Epoch: 12, Batch: 5000, Train NLL: 0.5829, Train Acc:0.7500
Epoch: 12, Batch: 6000, Train NLL: 0.4710, Train Acc:0.8125
Epoch: 12, Batch: 7000, Trai

Epoch: 15, Batch: 31000, Train NLL: 0.6076, Train Acc:0.6875
Epoch: 15, Batch: 32000, Train NLL: 0.4493, Train Acc:0.8750
Epoch: 15, Batch: 33000, Train NLL: 0.5812, Train Acc:0.8125
Epoch: 15, Batch: 34000, Train NLL: 0.5834, Train Acc:0.7500
Epoch: 15, Val NLL: 0.6428, Val Acc: 0.7298
LR = 0.025
WROTE MODEL
Epoch: 16, Batch: 0, Train NLL: 0.7981, Train Acc:0.6250
Epoch: 16, Batch: 1000, Train NLL: 0.9806, Train Acc:0.7500
Epoch: 16, Batch: 2000, Train NLL: 0.4446, Train Acc:0.8750
Epoch: 16, Batch: 3000, Train NLL: 0.8493, Train Acc:0.6875
Epoch: 16, Batch: 4000, Train NLL: 0.4982, Train Acc:0.6875
Epoch: 16, Batch: 5000, Train NLL: 0.4137, Train Acc:0.8750
Epoch: 16, Batch: 6000, Train NLL: 0.6240, Train Acc:0.8125
Epoch: 16, Batch: 7000, Train NLL: 0.5927, Train Acc:0.7500
Epoch: 16, Batch: 8000, Train NLL: 0.5110, Train Acc:0.7500
Epoch: 16, Batch: 9000, Train NLL: 0.8850, Train Acc:0.5625
Epoch: 16, Batch: 10000, Train NLL: 0.4934, Train Acc:0.8750
Epoch: 16, Batch: 11000, Train 

Epoch: 19, Batch: 23000, Train NLL: 0.6887, Train Acc:0.7500
Epoch: 19, Batch: 24000, Train NLL: 0.7144, Train Acc:0.5625
Epoch: 19, Batch: 25000, Train NLL: 0.8779, Train Acc:0.5625
Epoch: 19, Batch: 26000, Train NLL: 0.4751, Train Acc:0.7500
Epoch: 19, Batch: 27000, Train NLL: 0.4776, Train Acc:0.8750
Epoch: 19, Batch: 28000, Train NLL: 0.5124, Train Acc:0.7500
Epoch: 19, Batch: 29000, Train NLL: 0.3377, Train Acc:0.9375
Epoch: 19, Batch: 30000, Train NLL: 0.6344, Train Acc:0.6875
Epoch: 19, Batch: 31000, Train NLL: 0.5938, Train Acc:0.8750
Epoch: 19, Batch: 32000, Train NLL: 0.5541, Train Acc:0.8125
Epoch: 19, Batch: 33000, Train NLL: 0.4515, Train Acc:0.8125
Epoch: 19, Batch: 34000, Train NLL: 0.4023, Train Acc:0.9375
Epoch: 19, Val NLL: 0.6278, Val Acc: 0.7374
LR = 0.025
WROTE MODEL
Epoch: 20, Batch: 0, Train NLL: 0.7327, Train Acc:0.6250
Epoch: 20, Batch: 1000, Train NLL: 0.6520, Train Acc:0.7500
Epoch: 20, Batch: 2000, Train NLL: 0.5941, Train Acc:0.7500
Epoch: 20, Batch: 3000, 

Epoch: 23, Batch: 14000, Train NLL: 0.8116, Train Acc:0.6250
Epoch: 23, Batch: 15000, Train NLL: 0.4118, Train Acc:0.8750
Epoch: 23, Batch: 16000, Train NLL: 0.7189, Train Acc:0.7500
Epoch: 23, Batch: 17000, Train NLL: 0.6969, Train Acc:0.6875
Epoch: 23, Batch: 18000, Train NLL: 0.9099, Train Acc:0.5625
Epoch: 23, Batch: 19000, Train NLL: 0.3078, Train Acc:0.8750
Epoch: 23, Batch: 20000, Train NLL: 0.6288, Train Acc:0.6875
Epoch: 23, Batch: 21000, Train NLL: 0.7661, Train Acc:0.6875
Epoch: 23, Batch: 22000, Train NLL: 0.8289, Train Acc:0.5625
Epoch: 23, Batch: 23000, Train NLL: 0.7883, Train Acc:0.6875
Epoch: 23, Batch: 24000, Train NLL: 0.5754, Train Acc:0.8125
Epoch: 23, Batch: 25000, Train NLL: 0.4289, Train Acc:0.8750
Epoch: 23, Batch: 26000, Train NLL: 0.5018, Train Acc:0.7500
Epoch: 23, Batch: 27000, Train NLL: 0.4411, Train Acc:0.9375
Epoch: 23, Batch: 28000, Train NLL: 0.7542, Train Acc:0.6250
Epoch: 23, Batch: 29000, Train NLL: 0.4606, Train Acc:0.8750
Epoch: 23, Batch: 30000,

Epoch: 27, Batch: 5000, Train NLL: 0.8396, Train Acc:0.7500
Epoch: 27, Batch: 6000, Train NLL: 0.7883, Train Acc:0.6875
Epoch: 27, Batch: 7000, Train NLL: 0.4496, Train Acc:0.8125
Epoch: 27, Batch: 8000, Train NLL: 0.4594, Train Acc:0.8125
Epoch: 27, Batch: 9000, Train NLL: 0.4099, Train Acc:0.8750
Epoch: 27, Batch: 10000, Train NLL: 0.4368, Train Acc:0.8750
Epoch: 27, Batch: 11000, Train NLL: 0.6231, Train Acc:0.6250
Epoch: 27, Batch: 12000, Train NLL: 0.4981, Train Acc:0.8750
Epoch: 27, Batch: 13000, Train NLL: 0.5261, Train Acc:0.8125
Epoch: 27, Batch: 14000, Train NLL: 0.7683, Train Acc:0.6875
Epoch: 27, Batch: 15000, Train NLL: 0.6209, Train Acc:0.8125
Epoch: 27, Batch: 16000, Train NLL: 0.7906, Train Acc:0.7500
Epoch: 27, Batch: 17000, Train NLL: 0.5276, Train Acc:0.6875
Epoch: 27, Batch: 18000, Train NLL: 0.6693, Train Acc:0.7500
Epoch: 27, Batch: 19000, Train NLL: 0.7052, Train Acc:0.7500
Epoch: 27, Batch: 20000, Train NLL: 0.6671, Train Acc:0.6875
Epoch: 27, Batch: 21000, Trai

Epoch: 30, Batch: 33000, Train NLL: 0.4444, Train Acc:0.8125
Epoch: 30, Batch: 34000, Train NLL: 0.7760, Train Acc:0.6250
Epoch: 30, Val NLL: 0.6119, Val Acc: 0.7463
LR = 0.025
WROTE MODEL
Epoch: 31, Batch: 0, Train NLL: 0.2546, Train Acc:0.9375
Epoch: 31, Batch: 1000, Train NLL: 0.4059, Train Acc:0.8125
Epoch: 31, Batch: 2000, Train NLL: 0.3599, Train Acc:0.8750
Epoch: 31, Batch: 3000, Train NLL: 0.5894, Train Acc:0.6250
Epoch: 31, Batch: 4000, Train NLL: 0.5441, Train Acc:0.6875
Epoch: 31, Batch: 5000, Train NLL: 1.0278, Train Acc:0.5625
Epoch: 31, Batch: 6000, Train NLL: 0.3512, Train Acc:0.9375
Epoch: 31, Batch: 7000, Train NLL: 0.4621, Train Acc:0.8125
Epoch: 31, Batch: 8000, Train NLL: 0.5124, Train Acc:0.8125
Epoch: 31, Batch: 9000, Train NLL: 0.5855, Train Acc:0.7500
Epoch: 31, Batch: 10000, Train NLL: 0.5418, Train Acc:0.7500
Epoch: 31, Batch: 11000, Train NLL: 0.7057, Train Acc:0.6250
Epoch: 31, Batch: 12000, Train NLL: 0.6028, Train Acc:0.7500
Epoch: 31, Batch: 13000, Train 

Epoch: 34, Batch: 24000, Train NLL: 0.4179, Train Acc:0.8750
Epoch: 34, Batch: 25000, Train NLL: 0.5570, Train Acc:0.6875
Epoch: 34, Batch: 26000, Train NLL: 0.4032, Train Acc:0.8750
Epoch: 34, Batch: 27000, Train NLL: 0.4390, Train Acc:0.8750
Epoch: 34, Batch: 28000, Train NLL: 1.1542, Train Acc:0.3750
Epoch: 34, Batch: 29000, Train NLL: 0.4051, Train Acc:0.8125
Epoch: 34, Batch: 30000, Train NLL: 0.5627, Train Acc:0.6875
Epoch: 34, Batch: 31000, Train NLL: 0.7093, Train Acc:0.7500
Epoch: 34, Batch: 32000, Train NLL: 0.3965, Train Acc:0.8750
Epoch: 34, Batch: 33000, Train NLL: 0.7599, Train Acc:0.6875
Epoch: 34, Batch: 34000, Train NLL: 0.8244, Train Acc:0.5625
Epoch: 34, Val NLL: 0.6036, Val Acc: 0.7540
LR = 0.025
WROTE MODEL
Epoch: 35, Batch: 0, Train NLL: 0.5061, Train Acc:0.8125
Epoch: 35, Batch: 1000, Train NLL: 0.4188, Train Acc:0.8125
Epoch: 35, Batch: 2000, Train NLL: 0.6337, Train Acc:0.6875
Epoch: 35, Batch: 3000, Train NLL: 0.2854, Train Acc:0.9375
Epoch: 35, Batch: 4000, T

Epoch: 38, Batch: 15000, Train NLL: 0.3446, Train Acc:0.8750
Epoch: 38, Batch: 16000, Train NLL: 0.4012, Train Acc:0.8125
Epoch: 38, Batch: 17000, Train NLL: 0.6096, Train Acc:0.7500
Epoch: 38, Batch: 18000, Train NLL: 0.4776, Train Acc:0.8750
Epoch: 38, Batch: 19000, Train NLL: 1.0412, Train Acc:0.4375
Epoch: 38, Batch: 20000, Train NLL: 0.5343, Train Acc:0.7500
Epoch: 38, Batch: 21000, Train NLL: 0.4460, Train Acc:0.8125
Epoch: 38, Batch: 22000, Train NLL: 0.8368, Train Acc:0.6250
Epoch: 38, Batch: 23000, Train NLL: 0.5507, Train Acc:0.6875
Epoch: 38, Batch: 24000, Train NLL: 0.6544, Train Acc:0.6875
Epoch: 38, Batch: 25000, Train NLL: 0.4941, Train Acc:0.8125
Epoch: 38, Batch: 26000, Train NLL: 0.7067, Train Acc:0.6875
Epoch: 38, Batch: 27000, Train NLL: 0.4167, Train Acc:0.8125
Epoch: 38, Batch: 28000, Train NLL: 0.3521, Train Acc:0.9375
Epoch: 38, Batch: 29000, Train NLL: 0.6462, Train Acc:0.7500
Epoch: 38, Batch: 30000, Train NLL: 0.7063, Train Acc:0.6875
Epoch: 38, Batch: 31000,

KeyboardInterrupt: 

In [19]:
torch.save(ED1.state_dict(),'best_ED1_intra.pt')
torch.save(FI1.state_dict(),'best_FI1_intra.pt')