# CS 287 - HW 4 - Cont.

In [1]:
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

In [2]:
# load data
TEXT = NamedField(names=('seqlen',)) # Our input $x$
LABEL = NamedField(sequential=False, names=()) # Our labels $y$
train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)
print('len(train)', len(train))
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=16, device=torch.device("cuda"), repeat=False)

len(train) 549367
len(TEXT.vocab) 62998
len(LABEL.vocab) 4


In [3]:
# build the vocabulary with word embeddings
# out-of-vocabulary words are hashed to one of 100 random embeddings each initialized to mean 0, stdev 1 (Sec 5.1)
unk_vectors = [torch.randn(300) for _ in range(100)]
TEXT.vocab.load_vectors(vectors='glove.6B.300d', unk_init=lambda x:random.choice(unk_vectors))
vectors = TEXT.vocab.vectors
vectors = vectors / vectors.norm(dim=1, keepdim=True) # normalized to have l_2 norm of 1
vectors = NamedTensor(vectors, ('word', 'embedding'))
TEXT.vocab.vectors = vectors
print("word embeddings shape:", TEXT.vocab.vectors.shape)

word embeddings shape: OrderedDict([('word', 62998), ('embedding', 300)])


## Decomposable Intra-Sentence Attention Model

In [13]:
[LABEL.vocab.itos[i] for i in [0,1,2,3]]

['<unk>', 'entailment', 'contradiction', 'neutral']

In [18]:
[TEXT.vocab.itos[i] for i in [0,1,2,3]]

['<unk>', '<pad>', 'a', 'A']

In [15]:
# checking to make sure no <unk> labels
total = 0
for batch in iter(train_iter):
    total += torch.sum(batch.label.values == 0)
total

tensor(0, device='cuda:0')

In [17]:
# no prepend with NULL!
for batch in iter(train_iter):
    print([TEXT.vocab.itos[i] for i in batch.premise.values[:,0]])
    print([TEXT.vocab.itos[i] for i in batch.hypothesis.values[:,0]])
    break

['A', 'large', 'building', 'provides', 'the', 'backdrop', 'for', 'a', 'man', 'wearing', 'shorts', 'who', 'is', 'looking', 'at', 'his', 'phone', 'as', 'well', 'as', 'a', 'woman', 'who', 'is', 'wearing', 'a', 'black', 'skirt', 'and', 'white', 'shirt', 'as', 'she', 'walks', 'toward', 'the', 'man.']
['The', 'woman', 'is', 'looking', 'at', 'the', 'man.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [24]:
TEXT.vocab.stoi['null']

56690

In [4]:
# dimensions
input_size = TEXT.vocab.vectors.shape['word']
embed_size = TEXT.vocab.vectors.shape['embedding']
hidden_size1 = 200
hidden_size2 = hidden_size1 * 2
output_size = len(LABEL.vocab)
print('DIMENSIONS -- input: %d, embed: %d, hidden1: %d, hidden2: %d, output: %d'%(input_size, embed_size, hidden_size1, hidden_size2, output_size))

DIMENSIONS -- input: 62998, embed: 300, hidden1: 200, hidden2: 400, output: 4


In [5]:
# pre-trained embeddings
weights = TEXT.vocab.vectors.values.cuda()
weights.shape

torch.Size([62998, 300])

In [6]:
class EmbedProject(torch.nn.Module):
    def __init__(self, weights, embed_size, project_size):
        super(EmbedProject, self).__init__()
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True) # weights: input_size x embed_size
        self.linear = nn.Linear(embed_size, project_size)
        torch.nn.init.normal_(self.linear.weight, mean=0, std=0.01)
    def forward(self, inputs):
        embedding = self.embed(inputs)
        output = self.linear(embedding)
        return output

In [7]:
EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
EP1

EmbedProject(
  (embed): Embedding(62998, 300)
  (linear): Linear(in_features=300, out_features=200, bias=True)
)

In [11]:
batch = next(iter(train_iter))
raw_sent1 = batch.premise.values.transpose(0,1)
raw_sent2 = batch.hypothesis.values.transpose(0,1)
null_tkn = torch.tensor(TEXT.vocab.stoi['null'], device='cuda')
null_tkns = null_tkn.repeat(raw_sent1.shape[0],1)
sent1 = torch.cat((null_tkns, raw_sent1), 1)
sent2 = torch.cat((null_tkns, raw_sent2), 1)
raw_sent1.shape, raw_sent2.shape, sent1.shape, sent2.shape

(torch.Size([16, 36]),
 torch.Size([16, 18]),
 torch.Size([16, 37]),
 torch.Size([16, 19]))

In [12]:
proj1 = EP1(sent1)
proj2 = EP1(sent2)
proj1.shape, proj2.shape

(torch.Size([16, 37, 200]), torch.Size([16, 19, 200]))

In [13]:
class FeedForwardFIntra(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardFIntra, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden = self.m(self.linear1(self.d(inputs)))
        output = self.m(self.linear2(self.d(hidden)))
        return output

In [14]:
FI1 = FeedForwardFIntra(hidden_size1, hidden_size1, hidden_size1).cuda()
FI1

FeedForwardFIntra(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=200, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [15]:
fi1 = FI1(proj1)
fi2 = FI1(proj2)
fi1.shape, fi2.shape

(torch.Size([16, 37, 200]), torch.Size([16, 19, 200]))

In [16]:
# intra-sentence attention!
score1 = torch.bmm(fi1, fi1.transpose(1,2))
score2 = torch.bmm(fi2, fi2.transpose(1,2))
score1.shape, score2.shape

(torch.Size([16, 37, 37]), torch.Size([16, 19, 19]))

In [17]:
prob1 = F.softmax(score1, dim=2)
prob2 = F.softmax(score2, dim=2)
prob1.shape, prob2.shape

(torch.Size([16, 37, 37]), torch.Size([16, 19, 19]))

In [18]:
# intra-sentence attention!
proj1_soft = torch.bmm(prob1, proj1)
proj2_soft = torch.bmm(prob2, proj2)
proj1_soft.shape, proj2_soft.shape

(torch.Size([16, 37, 200]), torch.Size([16, 19, 200]))

In [21]:
# intra-sentence attention!
proj1_intra = torch.cat((proj1, proj1_soft), dim=2)
proj2_intra = torch.cat((proj2, proj2_soft), dim=2)
proj1_intra.shape, proj2_intra.shape

(torch.Size([16, 37, 400]), torch.Size([16, 19, 400]))

In [22]:
class FeedForwardF(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardF, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden = self.m(self.linear1(self.d(inputs)))
        output = self.m(self.linear2(self.d(hidden)))
        return output

In [23]:
F1 = FeedForwardF(hidden_size2, hidden_size1, hidden_size1).cuda()
F1

FeedForwardF(
  (d): Dropout(p=0.2)
  (m): ReLU()
  (linear1): Linear(in_features=400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=200, bias=True)
)

In [8]:
class FeedForwardG(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardG, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden = self.m(self.linear1(self.d(inputs)))
        output = self.m(self.linear2(self.d(hidden)))
        return output

In [9]:
class FeedForwardH(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super(FeedForwardH, self).__init__()
        self.d = nn.Dropout(dropout)
        self.m = nn.ReLU()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        for param in self.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.01)
    def forward(self, inputs):
        hidden1 = self.m(self.linear1(self.d(inputs)))
        hidden2 = self.m(self.linear2(self.d(hidden1)))
        output = self.linear3(hidden2)
        return output

In [10]:
class EmbedDist(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_dim): # num = 11, dim = 1
        super(EmbedDist, self).__init__()
        self.embed = nn.Embedding(num_embeddings, embedding_dim)
        torch.nn.init.normal_(self.embed.weight, mean=0, std=0.01)
    def forward(self, inputs):
        output = self.embed(inputs)
        return output

In [None]:
dist = 10
seqlen = score1.shape[2]
steps = torch.arange(0, seqlen)
mat_steps = steps.repeat(seqlen, 1)
flip_steps = torch.flip(steps, [0]).view(-1, 1)
idx = torch.min(torch.abs(mat_steps - flip_steps), torch.tensor(dist))

ED1 = EmbedDist(dist+1, 1).cuda()
ED1(idx).squeeze().shape

## Train

In [12]:
def get_dist_bias(seqlen, dist, ED1):
    steps = torch.arange(0, seqlen)
    mat_steps = steps.repeat(seqlen, 1)
    flip_steps = torch.flip(steps, [0]).view(-1, 1)
    idx = torch.min(torch.abs(mat_steps - flip_steps), torch.tensor(dist))
    return ED1(idx).squeeze().cuda()

In [13]:
def get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, FI1, ED1):
    proj1 = EP1(sent1)
    proj2 = EP1(sent2)
    if intra:
        fi1 = FI1(proj1)
        fi2 = FI1(proj2)
        score1 = torch.bmm(fi1, fi1.transpose(1,2))
        score2 = torch.bmm(fi2, fi2.transpose(1,2))
        score1 += get_dist_bias(score1.shape[2], dist, ED1)
        score2 += get_dist_bias(score2.shape[2], dist, ED1)
        prob1 = F.softmax(score1, dim=2)
        prob2 = F.softmax(score2, dim=2)
        proj1_soft = torch.bmm(prob1, proj1)
        proj2_soft = torch.bmm(prob2, proj2)
        proj1 = torch.cat((proj1, proj1_soft), dim=2)
        proj2 = torch.cat((proj2, proj2_soft), dim=2) 
    f1 = F1(proj1)
    f2 = F1(proj2)
    score1 = torch.bmm(f1, f2.transpose(1,2))
    score2 = score1.transpose(1,2)
    prob1 = F.softmax(score1, dim=2)
    prob2 = F.softmax(score2, dim=2)
    proj1_soft = torch.bmm(prob2, proj1)
    proj2_soft = torch.bmm(prob1, proj2)
    proj1_combined = torch.cat((proj1, proj2_soft), dim=2)
    proj2_combined = torch.cat((proj2, proj1_soft), dim=2)
    g1 = G1(proj1_combined)
    g2 = G1(proj2_combined)
    g1_sum = g1.sum(dim=1)
    g2_sum = g2.sum(dim=1)
    g_all = torch.cat((g1_sum, g2_sum), dim=1)
    h_all = H1(g_all)
    return h_all

In [14]:
def training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer, intra=False, dist=None, FI1=None, ED1=None):
    EP1.train()
    F1.train()
    G1.train()
    H1.train()
    if intra:
        FI1.train()
        ED1.train()
    
    for ix,batch in enumerate(train_iter):
        optimizer.zero_grad()
        sent1 = batch.premise.values.transpose(0,1)
        sent2 = batch.hypothesis.values.transpose(0,1)
        target = batch.label.values
        output = get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, FI1, ED1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        if ix % 1000 == 0:
            acc = torch.sum(torch.argmax(output, dim=1) == target).item() / target.shape[0]
            print('Epoch: {0}, Batch: {1}, Train NLL: {2:0.4f}, Train Acc:{3:0.4f}'.format(e, ix, loss.cpu().detach(), acc))

In [15]:
def validation_loop(e, val_iter, EP1, F1, G1, H1, criterion, intra=False, dist=None, FI1=None, ED1=None):
    EP1.eval()
    F1.eval()
    G1.eval()
    H1.eval()
    if intra:
        FI1.eval()
        ED1.eval()
    
    total_loss = 0
    total_sent = 0
    total_correct = 0
    
    for ix,batch in enumerate(val_iter):
        sent1 = batch.premise.values.transpose(0,1)
        sent2 = batch.hypothesis.values.transpose(0,1)
        target = batch.label.values
        output = get_output(sent1, sent2, EP1, F1, G1, H1, intra, dist, FI1, ED1)
        
        loss = criterion(output, target).item()
        sent = sent1.shape[0]
        correct = torch.sum(torch.argmax(output, dim=1) == target).item()
        
        total_loss += loss*sent
        total_sent += sent
        total_correct += correct
    
    print('Epoch: {0}, Val NLL: {1:0.4f}, Val Acc: {2:0.4f}'.format(e, total_loss/total_sent, total_correct/total_sent))
    return total_loss

In [16]:
best_loss = 1e8
intra = True

EP1 = EmbedProject(weights, embed_size, hidden_size1).cuda()
if intra:
    F1 = FeedForwardF(hidden_size2, hidden_size1, hidden_size1).cuda()
    dist = 10
    num_embeddings = dist + 1
    embedding_dim = 1
    ED1 = EmbedDist(num_embeddings, embedding_dim).cuda()
else:
    F1 = FeedForwardF(hidden_size1, hidden_size1, hidden_size1).cuda()
    dist = None
    num_embeddings = None
    embedding_dim = None
    ED1 = None
G1 = FeedForwardG(hidden_size2, hidden_size1, hidden_size1).cuda()
H1 = FeedForwardH(hidden_size2, hidden_size1, output_size).cuda()

parameters = [param for param in EP1.parameters()] # embed, lnr, bias
parameters.extend([param for param in F1.parameters()]) # lnr1, bias1, lnr2, bias2
parameters.extend([param for param in G1.parameters()]) # lnr1, bias1, lnr2, bias2
parameters.extend([param for param in H1.parameters()]) # lnr1, bias1, lnr2, bias2, lnr3, bias3
if intra:
    parameters.extend([param for param in ED1.parameters()]) # embed

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(parameters, lr=0.025, initial_accumulator_value=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=4)

for e in range(100):
    training_loop(e, train_iter, EP1, F1, G1, H1, criterion, optimizer, intra=intra, dist=dist, FI1=FI1, ED1=ED1)
    loss = validation_loop(e, val_iter, EP1, F1, G1, H1, criterion, intra=intra, dist=dist, FI1=FI1, ED1=ED1)
    scheduler.step(loss)
    if loss < best_loss:
        torch.save(EP1.state_dict(),'best_EP1_intra.pt')
        torch.save(F1.state_dict(),'best_F1_intra.pt')
        torch.save(G1.state_dict(),'best_G1_intra.pt')
        torch.save(H1.state_dict(),'best_H1_intra.pt')
        best_loss = loss
        print('WROTE MODEL')

Epoch: 0, Batch: 0, Train NLL: 1.3938, Train Acc:0.1875
Epoch: 0, Batch: 1000, Train NLL: 1.1063, Train Acc:0.2500
Epoch: 0, Batch: 2000, Train NLL: 1.1036, Train Acc:0.2500
Epoch: 0, Batch: 3000, Train NLL: 1.0975, Train Acc:0.3125
Epoch: 0, Batch: 4000, Train NLL: 1.1153, Train Acc:0.1875
Epoch: 0, Batch: 5000, Train NLL: 1.0915, Train Acc:0.5000
Epoch: 0, Batch: 6000, Train NLL: 1.0928, Train Acc:0.5000
Epoch: 0, Batch: 7000, Train NLL: 1.1031, Train Acc:0.3750
Epoch: 0, Batch: 8000, Train NLL: 1.0816, Train Acc:0.4375
Epoch: 0, Batch: 9000, Train NLL: 1.0973, Train Acc:0.4375
Epoch: 0, Batch: 10000, Train NLL: 1.1004, Train Acc:0.2500
Epoch: 0, Batch: 11000, Train NLL: 1.1075, Train Acc:0.2500
Epoch: 0, Batch: 12000, Train NLL: 1.1043, Train Acc:0.2500
Epoch: 0, Batch: 13000, Train NLL: 1.0968, Train Acc:0.3750
Epoch: 0, Batch: 14000, Train NLL: 1.1022, Train Acc:0.3750
Epoch: 0, Batch: 15000, Train NLL: 1.0986, Train Acc:0.3125
Epoch: 0, Batch: 16000, Train NLL: 1.0986, Train Acc:

Epoch: 3, Batch: 30000, Train NLL: 0.9155, Train Acc:0.5000
Epoch: 3, Batch: 31000, Train NLL: 0.8083, Train Acc:0.6875
Epoch: 3, Batch: 32000, Train NLL: 0.9403, Train Acc:0.5625
Epoch: 3, Batch: 33000, Train NLL: 1.1548, Train Acc:0.2500
Epoch: 3, Batch: 34000, Train NLL: 0.8532, Train Acc:0.6875
Epoch: 3, Val NLL: 0.8340, Val Acc: 0.6212
WROTE MODEL
Epoch: 4, Batch: 0, Train NLL: 0.8023, Train Acc:0.6250
Epoch: 4, Batch: 1000, Train NLL: 0.7721, Train Acc:0.6250
Epoch: 4, Batch: 2000, Train NLL: 0.6612, Train Acc:0.8750
Epoch: 4, Batch: 3000, Train NLL: 0.9726, Train Acc:0.5625
Epoch: 4, Batch: 4000, Train NLL: 1.0314, Train Acc:0.4375
Epoch: 4, Batch: 5000, Train NLL: 0.9285, Train Acc:0.4375
Epoch: 4, Batch: 6000, Train NLL: 0.8408, Train Acc:0.6250
Epoch: 4, Batch: 7000, Train NLL: 1.0021, Train Acc:0.5000
Epoch: 4, Batch: 8000, Train NLL: 0.7951, Train Acc:0.7500
Epoch: 4, Batch: 9000, Train NLL: 0.7730, Train Acc:0.6250
Epoch: 4, Batch: 10000, Train NLL: 0.8468, Train Acc:0.562

Epoch: 7, Batch: 24000, Train NLL: 0.8508, Train Acc:0.5625
Epoch: 7, Batch: 25000, Train NLL: 0.9408, Train Acc:0.5625
Epoch: 7, Batch: 26000, Train NLL: 0.7137, Train Acc:0.6250
Epoch: 7, Batch: 27000, Train NLL: 0.5748, Train Acc:0.8125
Epoch: 7, Batch: 28000, Train NLL: 0.9278, Train Acc:0.6250
Epoch: 7, Batch: 29000, Train NLL: 0.8465, Train Acc:0.6250
Epoch: 7, Batch: 30000, Train NLL: 1.0609, Train Acc:0.3750
Epoch: 7, Batch: 31000, Train NLL: 0.8002, Train Acc:0.7500
Epoch: 7, Batch: 32000, Train NLL: 0.6893, Train Acc:0.6250
Epoch: 7, Batch: 33000, Train NLL: 0.8114, Train Acc:0.6875
Epoch: 7, Batch: 34000, Train NLL: 0.7402, Train Acc:0.7500
Epoch: 7, Val NLL: 0.7529, Val Acc: 0.6706
WROTE MODEL
Epoch: 8, Batch: 0, Train NLL: 0.7039, Train Acc:0.7500
Epoch: 8, Batch: 1000, Train NLL: 1.0572, Train Acc:0.4375
Epoch: 8, Batch: 2000, Train NLL: 0.6645, Train Acc:0.8750
Epoch: 8, Batch: 3000, Train NLL: 0.6413, Train Acc:0.7500
Epoch: 8, Batch: 4000, Train NLL: 1.0493, Train Acc:

Epoch: 11, Batch: 29000, Train NLL: 1.2459, Train Acc:0.4375
Epoch: 11, Batch: 30000, Train NLL: 0.9010, Train Acc:0.5000
Epoch: 11, Batch: 31000, Train NLL: 0.5912, Train Acc:0.7500
Epoch: 11, Batch: 32000, Train NLL: 0.7385, Train Acc:0.7500
Epoch: 11, Batch: 33000, Train NLL: 1.1863, Train Acc:0.3750
Epoch: 11, Batch: 34000, Train NLL: 0.8138, Train Acc:0.6250
Epoch: 11, Val NLL: 0.7176, Val Acc: 0.6955
WROTE MODEL
Epoch: 12, Batch: 0, Train NLL: 0.7235, Train Acc:0.6875
Epoch: 12, Batch: 1000, Train NLL: 0.6787, Train Acc:0.6875
Epoch: 12, Batch: 2000, Train NLL: 0.7030, Train Acc:0.6875
Epoch: 12, Batch: 3000, Train NLL: 0.4986, Train Acc:0.8125
Epoch: 12, Batch: 4000, Train NLL: 1.0448, Train Acc:0.5625
Epoch: 12, Batch: 5000, Train NLL: 1.1040, Train Acc:0.5625
Epoch: 12, Batch: 6000, Train NLL: 0.8374, Train Acc:0.8125
Epoch: 12, Batch: 7000, Train NLL: 0.5821, Train Acc:0.8125
Epoch: 12, Batch: 8000, Train NLL: 0.7206, Train Acc:0.6875
Epoch: 12, Batch: 9000, Train NLL: 0.5925

Epoch: 15, Batch: 21000, Train NLL: 0.7236, Train Acc:0.6875
Epoch: 15, Batch: 22000, Train NLL: 0.7709, Train Acc:0.6250
Epoch: 15, Batch: 23000, Train NLL: 0.5795, Train Acc:0.8125
Epoch: 15, Batch: 24000, Train NLL: 0.6956, Train Acc:0.6875
Epoch: 15, Batch: 25000, Train NLL: 0.6602, Train Acc:0.8125
Epoch: 15, Batch: 26000, Train NLL: 0.5480, Train Acc:0.8125
Epoch: 15, Batch: 27000, Train NLL: 0.8168, Train Acc:0.6875
Epoch: 15, Batch: 28000, Train NLL: 0.6679, Train Acc:0.7500
Epoch: 15, Batch: 29000, Train NLL: 1.0730, Train Acc:0.4375
Epoch: 15, Batch: 30000, Train NLL: 0.6673, Train Acc:0.7500
Epoch: 15, Batch: 31000, Train NLL: 0.7367, Train Acc:0.6250
Epoch: 15, Batch: 32000, Train NLL: 0.6219, Train Acc:0.6875
Epoch: 15, Batch: 33000, Train NLL: 0.9014, Train Acc:0.6250
Epoch: 15, Batch: 34000, Train NLL: 0.9345, Train Acc:0.5625
Epoch: 15, Val NLL: 0.6982, Val Acc: 0.6991
WROTE MODEL
Epoch: 16, Batch: 0, Train NLL: 0.8165, Train Acc:0.7500
Epoch: 16, Batch: 1000, Train NLL

Epoch: 19, Batch: 13000, Train NLL: 0.7534, Train Acc:0.5625
Epoch: 19, Batch: 14000, Train NLL: 0.8133, Train Acc:0.5625
Epoch: 19, Batch: 15000, Train NLL: 0.4737, Train Acc:0.8750
Epoch: 19, Batch: 16000, Train NLL: 1.0264, Train Acc:0.4375
Epoch: 19, Batch: 17000, Train NLL: 0.8296, Train Acc:0.6875
Epoch: 19, Batch: 18000, Train NLL: 0.4902, Train Acc:0.6250
Epoch: 19, Batch: 19000, Train NLL: 0.6126, Train Acc:0.7500
Epoch: 19, Batch: 20000, Train NLL: 0.7928, Train Acc:0.6875
Epoch: 19, Batch: 21000, Train NLL: 0.7903, Train Acc:0.6250
Epoch: 19, Batch: 22000, Train NLL: 0.6054, Train Acc:0.6875
Epoch: 19, Batch: 23000, Train NLL: 0.8742, Train Acc:0.5625
Epoch: 19, Batch: 24000, Train NLL: 0.8283, Train Acc:0.5625
Epoch: 19, Batch: 25000, Train NLL: 0.5780, Train Acc:0.7500
Epoch: 19, Batch: 26000, Train NLL: 0.7600, Train Acc:0.6875
Epoch: 19, Batch: 27000, Train NLL: 0.6682, Train Acc:0.7500
Epoch: 19, Batch: 28000, Train NLL: 0.7129, Train Acc:0.6250
Epoch: 19, Batch: 29000,

Epoch: 23, Batch: 5000, Train NLL: 0.5129, Train Acc:0.8750
Epoch: 23, Batch: 6000, Train NLL: 0.8447, Train Acc:0.6875
Epoch: 23, Batch: 7000, Train NLL: 0.7937, Train Acc:0.6875
Epoch: 23, Batch: 8000, Train NLL: 0.4960, Train Acc:0.9375
Epoch: 23, Batch: 9000, Train NLL: 0.5928, Train Acc:0.7500
Epoch: 23, Batch: 10000, Train NLL: 0.7353, Train Acc:0.7500
Epoch: 23, Batch: 11000, Train NLL: 0.9277, Train Acc:0.5000
Epoch: 23, Batch: 12000, Train NLL: 0.7721, Train Acc:0.6250
Epoch: 23, Batch: 13000, Train NLL: 0.7031, Train Acc:0.7500
Epoch: 23, Batch: 14000, Train NLL: 0.8401, Train Acc:0.6250
Epoch: 23, Batch: 15000, Train NLL: 0.5564, Train Acc:0.8125
Epoch: 23, Batch: 16000, Train NLL: 0.8533, Train Acc:0.5625
Epoch: 23, Batch: 17000, Train NLL: 0.6983, Train Acc:0.6875
Epoch: 23, Batch: 18000, Train NLL: 1.0974, Train Acc:0.5000
Epoch: 23, Batch: 19000, Train NLL: 0.8324, Train Acc:0.5000
Epoch: 23, Batch: 20000, Train NLL: 1.0323, Train Acc:0.4375
Epoch: 23, Batch: 21000, Trai

Epoch: 26, Batch: 33000, Train NLL: 0.8354, Train Acc:0.6250
Epoch: 26, Batch: 34000, Train NLL: 0.7986, Train Acc:0.6875
Epoch: 26, Val NLL: 0.6718, Val Acc: 0.7106
WROTE MODEL
Epoch: 27, Batch: 0, Train NLL: 0.4493, Train Acc:0.8750
Epoch: 27, Batch: 1000, Train NLL: 0.4506, Train Acc:0.9375
Epoch: 27, Batch: 2000, Train NLL: 0.6436, Train Acc:0.6875
Epoch: 27, Batch: 3000, Train NLL: 0.6827, Train Acc:0.6875
Epoch: 27, Batch: 4000, Train NLL: 0.5959, Train Acc:0.8750
Epoch: 27, Batch: 5000, Train NLL: 0.7211, Train Acc:0.6875
Epoch: 27, Batch: 6000, Train NLL: 0.7955, Train Acc:0.6250
Epoch: 27, Batch: 7000, Train NLL: 0.7880, Train Acc:0.7500
Epoch: 27, Batch: 8000, Train NLL: 1.1189, Train Acc:0.6250
Epoch: 27, Batch: 9000, Train NLL: 0.5948, Train Acc:0.8125
Epoch: 27, Batch: 10000, Train NLL: 0.6651, Train Acc:0.7500
Epoch: 27, Batch: 11000, Train NLL: 0.9400, Train Acc:0.4375
Epoch: 27, Batch: 12000, Train NLL: 0.7599, Train Acc:0.5625
Epoch: 27, Batch: 13000, Train NLL: 0.6114

Epoch: 30, Batch: 25000, Train NLL: 0.7387, Train Acc:0.7500
Epoch: 30, Batch: 26000, Train NLL: 0.8721, Train Acc:0.6250
Epoch: 30, Batch: 27000, Train NLL: 0.7912, Train Acc:0.6875
Epoch: 30, Batch: 28000, Train NLL: 0.8985, Train Acc:0.5000
Epoch: 30, Batch: 29000, Train NLL: 0.7879, Train Acc:0.6875
Epoch: 30, Batch: 30000, Train NLL: 0.6118, Train Acc:0.6875
Epoch: 30, Batch: 31000, Train NLL: 0.5204, Train Acc:0.8750
Epoch: 30, Batch: 32000, Train NLL: 0.5276, Train Acc:0.6875
Epoch: 30, Batch: 33000, Train NLL: 0.4310, Train Acc:0.8750
Epoch: 30, Batch: 34000, Train NLL: 0.6352, Train Acc:0.7500
Epoch: 30, Val NLL: 0.6665, Val Acc: 0.7186
WROTE MODEL
Epoch: 31, Batch: 0, Train NLL: 0.4385, Train Acc:0.9375
Epoch: 31, Batch: 1000, Train NLL: 0.4097, Train Acc:0.8750
Epoch: 31, Batch: 2000, Train NLL: 0.6510, Train Acc:0.8125
Epoch: 31, Batch: 3000, Train NLL: 0.6972, Train Acc:0.6250
Epoch: 31, Batch: 4000, Train NLL: 0.4972, Train Acc:0.8750
Epoch: 31, Batch: 5000, Train NLL: 0.

Epoch: 34, Batch: 17000, Train NLL: 0.4486, Train Acc:0.8750
Epoch: 34, Batch: 18000, Train NLL: 0.4619, Train Acc:0.8750
Epoch: 34, Batch: 19000, Train NLL: 0.7016, Train Acc:0.6875
Epoch: 34, Batch: 20000, Train NLL: 0.8127, Train Acc:0.5625
Epoch: 34, Batch: 21000, Train NLL: 0.5673, Train Acc:0.7500
Epoch: 34, Batch: 22000, Train NLL: 0.7315, Train Acc:0.5625
Epoch: 34, Batch: 23000, Train NLL: 0.4442, Train Acc:0.8125
Epoch: 34, Batch: 24000, Train NLL: 0.4431, Train Acc:0.8750
Epoch: 34, Batch: 25000, Train NLL: 0.7750, Train Acc:0.7500
Epoch: 34, Batch: 26000, Train NLL: 0.7348, Train Acc:0.7500
Epoch: 34, Batch: 27000, Train NLL: 0.6815, Train Acc:0.6875
Epoch: 34, Batch: 28000, Train NLL: 0.5131, Train Acc:0.8750
Epoch: 34, Batch: 29000, Train NLL: 0.8411, Train Acc:0.5625
Epoch: 34, Batch: 30000, Train NLL: 0.6929, Train Acc:0.6875
Epoch: 34, Batch: 31000, Train NLL: 0.4369, Train Acc:0.8750
Epoch: 34, Batch: 32000, Train NLL: 0.5455, Train Acc:0.8125
Epoch: 34, Batch: 33000,

Epoch: 38, Batch: 9000, Train NLL: 0.8471, Train Acc:0.5625
Epoch: 38, Batch: 10000, Train NLL: 0.9796, Train Acc:0.5000
Epoch: 38, Batch: 11000, Train NLL: 0.5050, Train Acc:0.8750
Epoch: 38, Batch: 12000, Train NLL: 0.7822, Train Acc:0.6250
Epoch: 38, Batch: 13000, Train NLL: 0.6530, Train Acc:0.8125
Epoch: 38, Batch: 14000, Train NLL: 0.8776, Train Acc:0.5625
Epoch: 38, Batch: 15000, Train NLL: 0.7019, Train Acc:0.6875
Epoch: 38, Batch: 16000, Train NLL: 0.9871, Train Acc:0.5000
Epoch: 38, Batch: 17000, Train NLL: 0.6994, Train Acc:0.6875
Epoch: 38, Batch: 18000, Train NLL: 0.7576, Train Acc:0.7500
Epoch: 38, Batch: 19000, Train NLL: 0.4704, Train Acc:0.8750
Epoch: 38, Batch: 20000, Train NLL: 0.8374, Train Acc:0.5625
Epoch: 38, Batch: 21000, Train NLL: 1.0552, Train Acc:0.5625
Epoch: 38, Batch: 22000, Train NLL: 0.7151, Train Acc:0.5625
Epoch: 38, Batch: 23000, Train NLL: 0.7134, Train Acc:0.6250
Epoch: 38, Batch: 24000, Train NLL: 0.7040, Train Acc:0.6875
Epoch: 38, Batch: 25000, 

Epoch: 42, Batch: 1000, Train NLL: 0.9204, Train Acc:0.5000
Epoch: 42, Batch: 2000, Train NLL: 0.4491, Train Acc:0.9375
Epoch: 42, Batch: 3000, Train NLL: 0.6636, Train Acc:0.8125
Epoch: 42, Batch: 4000, Train NLL: 0.5384, Train Acc:0.7500
Epoch: 42, Batch: 5000, Train NLL: 0.7871, Train Acc:0.6875
Epoch: 42, Batch: 6000, Train NLL: 0.8280, Train Acc:0.6875
Epoch: 42, Batch: 7000, Train NLL: 0.4802, Train Acc:0.8125
Epoch: 42, Batch: 8000, Train NLL: 0.5379, Train Acc:0.7500
Epoch: 42, Batch: 9000, Train NLL: 0.6511, Train Acc:0.8125
Epoch: 42, Batch: 10000, Train NLL: 0.7645, Train Acc:0.6250
Epoch: 42, Batch: 11000, Train NLL: 0.5861, Train Acc:0.7500
Epoch: 42, Batch: 12000, Train NLL: 0.5169, Train Acc:0.8125
Epoch: 42, Batch: 13000, Train NLL: 0.6295, Train Acc:0.7500
Epoch: 42, Batch: 14000, Train NLL: 0.6256, Train Acc:0.6875
Epoch: 42, Batch: 15000, Train NLL: 0.6760, Train Acc:0.7500
Epoch: 42, Batch: 16000, Train NLL: 0.5085, Train Acc:0.8125
Epoch: 42, Batch: 17000, Train NL

Epoch: 45, Batch: 29000, Train NLL: 0.3983, Train Acc:0.8750
Epoch: 45, Batch: 30000, Train NLL: 0.7285, Train Acc:0.5625
Epoch: 45, Batch: 31000, Train NLL: 0.5682, Train Acc:0.7500
Epoch: 45, Batch: 32000, Train NLL: 0.8909, Train Acc:0.6875
Epoch: 45, Batch: 33000, Train NLL: 0.8867, Train Acc:0.6250
Epoch: 45, Batch: 34000, Train NLL: 0.9760, Train Acc:0.7500
Epoch: 45, Val NLL: 0.6529, Val Acc: 0.7233
Epoch: 46, Batch: 0, Train NLL: 0.8399, Train Acc:0.7500
Epoch: 46, Batch: 1000, Train NLL: 0.5656, Train Acc:0.8125
Epoch: 46, Batch: 2000, Train NLL: 0.7757, Train Acc:0.6875
Epoch: 46, Batch: 3000, Train NLL: 0.7324, Train Acc:0.6875
Epoch: 46, Batch: 4000, Train NLL: 0.7769, Train Acc:0.6250
Epoch: 46, Batch: 5000, Train NLL: 0.6206, Train Acc:0.7500
Epoch: 46, Batch: 6000, Train NLL: 0.6490, Train Acc:0.6875
Epoch: 46, Batch: 7000, Train NLL: 0.9514, Train Acc:0.5000
Epoch: 46, Batch: 8000, Train NLL: 0.5629, Train Acc:0.8125
Epoch: 46, Batch: 9000, Train NLL: 0.7037, Train Acc:

Epoch: 49, Batch: 21000, Train NLL: 0.8082, Train Acc:0.5625
Epoch: 49, Batch: 22000, Train NLL: 0.7326, Train Acc:0.6250
Epoch: 49, Batch: 23000, Train NLL: 0.7861, Train Acc:0.6875
Epoch: 49, Batch: 24000, Train NLL: 0.6216, Train Acc:0.7500
Epoch: 49, Batch: 25000, Train NLL: 1.0585, Train Acc:0.4375
Epoch: 49, Batch: 26000, Train NLL: 0.3937, Train Acc:0.8125
Epoch: 49, Batch: 27000, Train NLL: 0.6038, Train Acc:0.6875
Epoch: 49, Batch: 28000, Train NLL: 0.4392, Train Acc:0.8125
Epoch: 49, Batch: 29000, Train NLL: 0.5353, Train Acc:0.8125
Epoch: 49, Batch: 30000, Train NLL: 0.8297, Train Acc:0.6875
Epoch: 49, Batch: 31000, Train NLL: 1.0648, Train Acc:0.5000
Epoch: 49, Batch: 32000, Train NLL: 0.4479, Train Acc:0.9375
Epoch: 49, Batch: 33000, Train NLL: 0.4952, Train Acc:0.8125
Epoch: 49, Batch: 34000, Train NLL: 1.0512, Train Acc:0.5000
Epoch: 49, Val NLL: 0.6476, Val Acc: 0.7253
WROTE MODEL
Epoch: 50, Batch: 0, Train NLL: 0.7476, Train Acc:0.6250
Epoch: 50, Batch: 1000, Train NLL

Epoch: 53, Batch: 13000, Train NLL: 0.8162, Train Acc:0.4375
Epoch: 53, Batch: 14000, Train NLL: 0.6525, Train Acc:0.6250
Epoch: 53, Batch: 15000, Train NLL: 0.9778, Train Acc:0.5625
Epoch: 53, Batch: 16000, Train NLL: 0.7642, Train Acc:0.5625
Epoch: 53, Batch: 17000, Train NLL: 0.5453, Train Acc:0.8125
Epoch: 53, Batch: 18000, Train NLL: 0.7002, Train Acc:0.6875
Epoch: 53, Batch: 19000, Train NLL: 0.6730, Train Acc:0.6875
Epoch: 53, Batch: 20000, Train NLL: 0.4187, Train Acc:0.8750
Epoch: 53, Batch: 21000, Train NLL: 0.5451, Train Acc:0.6875
Epoch: 53, Batch: 22000, Train NLL: 0.6163, Train Acc:0.6250
Epoch: 53, Batch: 23000, Train NLL: 0.8494, Train Acc:0.5625
Epoch: 53, Batch: 24000, Train NLL: 0.5599, Train Acc:0.7500
Epoch: 53, Batch: 25000, Train NLL: 0.8519, Train Acc:0.6250
Epoch: 53, Batch: 26000, Train NLL: 0.8707, Train Acc:0.6250
Epoch: 53, Batch: 27000, Train NLL: 0.6608, Train Acc:0.7500
Epoch: 53, Batch: 28000, Train NLL: 0.4908, Train Acc:0.8125
Epoch: 53, Batch: 29000,

Epoch: 57, Batch: 5000, Train NLL: 0.7241, Train Acc:0.6875
Epoch: 57, Batch: 6000, Train NLL: 0.8385, Train Acc:0.6875
Epoch: 57, Batch: 7000, Train NLL: 0.6600, Train Acc:0.7500
Epoch: 57, Batch: 8000, Train NLL: 0.7345, Train Acc:0.7500
Epoch: 57, Batch: 9000, Train NLL: 0.7908, Train Acc:0.6250
Epoch: 57, Batch: 10000, Train NLL: 1.0356, Train Acc:0.6875
Epoch: 57, Batch: 11000, Train NLL: 0.5651, Train Acc:0.8125
Epoch: 57, Batch: 12000, Train NLL: 0.6295, Train Acc:0.7500
Epoch: 57, Batch: 13000, Train NLL: 0.9165, Train Acc:0.6250
Epoch: 57, Batch: 14000, Train NLL: 0.5466, Train Acc:0.8125
Epoch: 57, Batch: 15000, Train NLL: 0.6240, Train Acc:0.6875
Epoch: 57, Batch: 16000, Train NLL: 0.4265, Train Acc:0.8750
Epoch: 57, Batch: 17000, Train NLL: 0.4855, Train Acc:0.8125
Epoch: 57, Batch: 18000, Train NLL: 0.5456, Train Acc:0.8125
Epoch: 57, Batch: 19000, Train NLL: 0.7081, Train Acc:0.8125
Epoch: 57, Batch: 20000, Train NLL: 0.5212, Train Acc:0.7500
Epoch: 57, Batch: 21000, Trai

Epoch: 60, Batch: 33000, Train NLL: 0.4476, Train Acc:0.7500
Epoch: 60, Batch: 34000, Train NLL: 0.4471, Train Acc:0.7500
Epoch: 60, Val NLL: 0.6451, Val Acc: 0.7250
Epoch: 61, Batch: 0, Train NLL: 1.0016, Train Acc:0.4375
Epoch: 61, Batch: 1000, Train NLL: 0.9030, Train Acc:0.5625
Epoch: 61, Batch: 2000, Train NLL: 0.6360, Train Acc:0.7500
Epoch: 61, Batch: 3000, Train NLL: 0.5143, Train Acc:0.8125
Epoch: 61, Batch: 4000, Train NLL: 0.7470, Train Acc:0.6250
Epoch: 61, Batch: 5000, Train NLL: 0.9193, Train Acc:0.5000
Epoch: 61, Batch: 6000, Train NLL: 0.4039, Train Acc:0.8750
Epoch: 61, Batch: 7000, Train NLL: 0.3558, Train Acc:0.8125
Epoch: 61, Batch: 8000, Train NLL: 1.1681, Train Acc:0.4375
Epoch: 61, Batch: 9000, Train NLL: 0.3494, Train Acc:0.8750
Epoch: 61, Batch: 10000, Train NLL: 0.6752, Train Acc:0.7500
Epoch: 61, Batch: 11000, Train NLL: 0.7436, Train Acc:0.6250
Epoch: 61, Batch: 12000, Train NLL: 0.9594, Train Acc:0.5625
Epoch: 61, Batch: 13000, Train NLL: 0.5007, Train Acc:

Epoch: 64, Batch: 26000, Train NLL: 0.2755, Train Acc:1.0000
Epoch: 64, Batch: 27000, Train NLL: 0.6503, Train Acc:0.8750
Epoch: 64, Batch: 28000, Train NLL: 0.3523, Train Acc:0.8750
Epoch: 64, Batch: 29000, Train NLL: 0.4617, Train Acc:0.8750
Epoch: 64, Batch: 30000, Train NLL: 0.5397, Train Acc:0.7500
Epoch: 64, Batch: 31000, Train NLL: 1.0304, Train Acc:0.5000
Epoch: 64, Batch: 32000, Train NLL: 0.6811, Train Acc:0.6875
Epoch: 64, Batch: 33000, Train NLL: 0.5225, Train Acc:0.8125
Epoch: 64, Batch: 34000, Train NLL: 0.9447, Train Acc:0.5625
Epoch: 64, Val NLL: 0.6424, Val Acc: 0.7280
Epoch: 65, Batch: 0, Train NLL: 0.8625, Train Acc:0.6875
Epoch: 65, Batch: 1000, Train NLL: 0.9725, Train Acc:0.5000
Epoch: 65, Batch: 2000, Train NLL: 0.5001, Train Acc:0.8750
Epoch: 65, Batch: 3000, Train NLL: 0.3024, Train Acc:0.9375
Epoch: 65, Batch: 4000, Train NLL: 0.4600, Train Acc:0.8750
Epoch: 65, Batch: 5000, Train NLL: 0.6079, Train Acc:0.7500
Epoch: 65, Batch: 6000, Train NLL: 0.6072, Train A

KeyboardInterrupt: 