In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import pickle
import random
import io
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [152]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

np.random.seed(1)

cuda


Using the pretrained word embeddings: wiki-news-300d-1M.vec

In [40]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', 
                  newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = [*map(float, tokens[1:])]
    return data

word_embeds = load_vectors("wiki-news-300d-1M.vec")

In [41]:
snli_train = pd.read_table("snli_mnli_data/snli_train.tsv")
snli_val = pd.read_table("snli_mnli_data/snli_val.tsv")

In [42]:
snli_train.head(5)

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [43]:
snli_val.head(5)

Unnamed: 0,sentence1,sentence2,label
0,"Three women on a stage , one wearing red shoes...",There are two women standing on the stage,contradiction
1,"Four people sit on a subway two read books , o...","Multiple people are on a subway together , wit...",entailment
2,bicycles stationed while a group of people soc...,People get together near a stand of bicycles .,entailment
3,Man in overalls with two horses .,a man in overalls with two horses,entailment
4,Man observes a wavelength given off by an elec...,The man is examining what wavelength is given ...,entailment


In [44]:
snli_train["sentence1"] = snli_train["sentence1"].apply(lambda x: x.split(" "))
snli_train["sentence2"] = snli_train["sentence2"].apply(lambda x: x.split(" "))

snli_val["sentence1"] = snli_val["sentence1"].apply(lambda x: x.split(" "))
snli_val["sentence2"] = snli_val["sentence2"].apply(lambda x: x.split(" "))

label_dict = {"entailment":0,"neutral":1,"contradiction":2}

snli_train["label_encoding"] = snli_train["label"].apply(lambda x: label_dict[x])
snli_val["label_encoding"] = snli_val["label"].apply(lambda x: label_dict[x])

snli_train_labels = np.array(snli_train["label_encoding"])
snli_val_labels = np.array(snli_val["label_encoding"])

In [45]:
snli_train.head(3)

Unnamed: 0,sentence1,sentence2,label,label_encoding
0,"[A, young, girl, in, a, pink, shirt, sitting, ...","[A, young, girl, watching, the, sunset, over, ...",neutral,1
1,"[A, woman, is, smiling, while, the, man, next,...","[Two, people, are, next, to, each, other, .]",entailment,0
2,"[Across, the, river, ,, you, can, see, a, larg...","[The, large, building, is, full, of, apartment...",neutral,1


In [46]:
snli_val.head(3)

Unnamed: 0,sentence1,sentence2,label,label_encoding
0,"[Three, women, on, a, stage, ,, one, wearing, ...","[There, are, two, women, standing, on, the, st...",contradiction,2
1,"[Four, people, sit, on, a, subway, two, read, ...","[Multiple, people, are, on, a, subway, togethe...",entailment,0
2,"[bicycles, stationed, while, a, group, of, peo...","[People, get, together, near, a, stand, of, bi...",entailment,0


In [55]:
def build_vocabulary(train_data, vocab_size, vocab_embeddings):
    '''
    Returns:
    id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    token2id: dictionary where keys represent tokens and corresponding values represent indices
    '''

    all_tokens = []
    for row in (train_data['sentence1'] + train_data['sentence2']).iteritems():
        all_tokens += row[1]
        
    vocabulary, count = zip(*Counter(all_tokens).most_common(vocab_size))
    vectors = word_embeds
    vocabulary = [word for word in vocabulary if word in vectors]

    id2token = list(vocabulary)
    token2id = dict(zip(vocabulary, range(2, 2+len(vocabulary))))
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    
    return token2id, id2token, vectors

In [56]:
def preprocess_data(data, dataset, vocab_size=50000):
#     data = prepare_data(data)
    if dataset == 'train':
        token2id, id2token, vectors = build_vocabulary(data, 
                                                       vocab_size,
                                                       word_embeds)
        return data, token2id, id2token, vectors
    return data

In [87]:
token2id, id2token, vectors = build_vocabulary(snli_val, 
                                               vocab_size,
                                               word_embeds)

In [88]:
val_data = preprocess_data(snli_val, 
                          'val', 
                          VOCAB_SIZE)

In [59]:
train_data.shape

(100000, 4)

In [63]:
token2id["man"]

8

In [64]:
id2token[8]

'man'

In [92]:
embedding_size = len(vectors["man"])
embedding_size

300

In [74]:
max_sent1 = train_data['sentence1'].str.len().quantile(0.99)
max_sent2 = train_data['sentence2'].str.len().quantile(0.99)
max_sent_len = max(max_sent1,max_sent2)
max_sent_len = int(max_sent_len)
max_sent_len

34

In [75]:
PAD_IDX = 0
UNK_IDX = 1

In [223]:
class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data, token2id, max_sent_len):

        self.sent1 = data['sentence1'].values
        self.sent2 = data['sentence2'].values
#         self.y = data['label'].values
        self.y = data['label_encoding'].values
        self.max_sent_len = max_sent_len
        self.token2id = token2id

    def __len__(self):
        return len(self.y)

    def __getitem__(self, sent_no):

        sent_label = self.y[sent_no]
        sent1_word_idx =  []
        sent2_word_idx = []
        sent1_mask = []
        sent2_mask = []
        
        """
        Adds id to the list
        Adds UNK's id to the list if the word is out of vocab
        """
        for token in self.sent1[sent_no][:self.max_sent_len]:
            # if token in vocab condition
            if token in self.token2id.keys():
                sent1_word_idx.append(self.token2id[token])
                sent1_mask.append(0)
            else:
                # Adding the UNK
                sent1_word_idx.append(UNK_IDX)
                sent1_mask.append(1)
                
        for token in self.sent2[sent_no][:self.max_sent_len]:
            # if token in vocab condition
            if token in self.token2id.keys():
                sent2_word_idx.append(self.token2id[token])
                sent2_mask.append(0)
            else:
                # adding the UNK
                sent2_word_idx.append(UNK_IDX)
                sent2_mask.append(1)
        
        sent1_list = [sent1_word_idx, sent1_mask, len(sent1_word_idx)]
        sent2_list = [sent2_word_idx, sent2_mask, len(sent2_word_idx)]
        
        return sent1_list + sent2_list + [sent_label]
    
    

In [224]:
def snli_collate_func(batch, max_sent_len):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    sent1_data = []
    sent2_data = []
    sent1_mask = [] 
    sent2_mask = []
    sent1_lengths = []
    sent2_lengths = []
    labels = []

    for datum in batch:
        
        sent1_lengths.append(datum[2])
        sent2_lengths.append(datum[5])
        labels.append(datum[6])
        
        # Padding to achieve the same length for every input
        # max_sent_len x embedding_size
        sent1_data_padded = np.pad(np.array(datum[0]), 
                                   pad_width=((0, 
                                               max_sent_len-datum[2])),
                                mode="constant", 
                                   constant_values=0)
        sent1_data.append(sent1_data_padded)
        
        # doing the same thing for the mask of out of vocab words
        sent1_mask_padded = np.pad(np.array(datum[1]), 
                                pad_width=((0, 
                                            max_sent_len-datum[2])),
                                mode="constant", 
                                constant_values=0)
        sent1_mask.append(sent1_mask_padded)
        
        # Padding to achieve the same length for every input
        # max_sent_len x embedding_size
        sent2_data_padded = np.pad(np.array(datum[3]), 
                                pad_width=((0, 
                                            max_sent_len-datum[5])),
                                mode="constant", 
                                constant_values=0)
        sent2_data.append(sent2_data_padded)
        
        # doing the same thing for the mask of out of vocab words
        sent2_mask_padded = np.pad(np.array(datum[4]), 
                                pad_width=((0, 
                                            max_sent_len-datum[5])),
                               mode="constant", 
                                constant_values=0)
        sent2_mask.append(sent2_mask_padded)
        
    # sorting in reverse order    
    idx_desc = np.argsort(sent1_lengths)[::-1]
    
    # reverse sorting the data
    sent1_data = np.array(sent1_data)[idx_desc]
    sent2_data = np.array(sent2_data)[idx_desc]
    
    # reverse sorting the mask
    sent1_mask = np.array(sent1_mask)[idx_desc].reshape(len(batch), -1, 1)
    sent2_mask = np.array(sent2_mask)[idx_desc].reshape(len(batch), -1, 1)
    
    # reverse sorting the lengths
    sent1_lengths = np.array(sent1_lengths)[idx_desc]
    sent2_lengths = np.array(sent2_lengths)[idx_desc]
    
    # reverse sorting the labels
    labels = np.array(labels)[idx_desc]
    #print()
    
    # hypothesis output
    sent1_list = [torch.from_numpy(sent1_data), 
                  torch.from_numpy(sent1_mask).float(), 
                  sent1_lengths]
    # second sentence output
    sent2_list = [torch.from_numpy(sent2_data), 
                  torch.from_numpy(sent2_mask).float(), 
                  sent2_lengths]
    
    return_list = []
    return_list.extend(sent1_list)
    return_list.extend(sent2_list)
    return_list.extend([torch.from_numpy(labels)])
        
    return  return_list

In [225]:
train_dataset = SNLIDataset(train_data, 
                            token2id, 
                            max_sent_len)
train_loader = DataLoader(dataset = train_dataset,
                          batch_size = BATCH_SIZE,
                          collate_fn = lambda batch, max_sent_length = max_sent_len: snli_collate_func(batch, max_sent_length),
                          shuffle = True)

val_dataset = SNLIDataset(val_data, 
                          token2id, 
                          max_sent_len)
val_loader = DataLoader(dataset = val_dataset,
                        batch_size = BATCH_SIZE,
                        collate_fn = lambda batch, max_sent_length = max_sent_len: snli_collate_func(batch, max_sent_length),
                        shuffle = False)

In [237]:
class BiGRUNetwork(nn.Module):
    def __init__(self, 
                 hidden_size, 
                 num_layers, 
                 embedding_w, 
                 vocab_size, 
                 bidirectional=False):

        super(BiGRUNetwork, self).__init__()
        
        self.num_layers = num_layers 
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, 
                                      embedding_size, 
                                      padding_idx=PAD_IDX)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_w))
        
        self.gru = nn.GRU(embedding_size, 
                          hidden_size, 
                          num_layers, 
                          batch_first = True,
                          bidirectional = bidirectional)
        if bidirectional:
            self.num_directions = 2 
        else:
            self.num_directions = 1
            
        

    def init_hidden(self, batch_size):
        hidden = torch.randn(self.num_directions * self.num_layers, 
                             batch_size, 
                             self.hidden_size).to(device)
        return hidden
        
        
    def forward(self, x, mask, lengths):
        
        real2sort = sorted(range(len(lengths)), 
                             key = lambda x: -lengths[x])
        sort2real = sorted(range(len(lengths)), 
                             key = lambda x: real2sort[x])
#         print("real2sort[0:5] = " + str(real2sort[0:5]))
#         print("sort2real[0:5] = " + str(sort2real[0:5]))
        
        x = x[real2sort]
        batch_size, seq_len = x.size()
        freeze_mask = mask[sort2real]
        lengths = lengths[real2sort]
        
        self.hidden = self.init_hidden(batch_size)
#         print("hidden = " + str(hidden.size()))
        
        embeddings = self.embedding(x)
        embeddings = freeze_mask * embeddings + \
                        (1 - freeze_mask) * embeddings.clone().detach()
        embeddings = torch.nn.utils.rnn.pack_padded_sequence(embeddings, 
                                                        lengths, 
                                                        batch_first = True)
        
        gru_out, self.hidden = self.gru(embeddings, 
                                        self.hidden)
        gru_out, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out, 
                                                            batch_first = True)
        gru_out = gru_out.view(batch_size, 
                               -1, 
                               self.num_directions, 
                               self.hidden_size)
        gru_out = torch.sum(gru_out, 
                            dim=1)
        gru_out = torch.cat([gru_out[:,i,:] for i in range(self.num_directions)], 
                            dim=1)
        gru_out = gru_out[sort2real]
        
        return gru_out
    


In [238]:
id2token[0:5]

['<pad>', '<unk>', '.', 'a', 'A']

In [239]:
len(id2token)

22059

In [240]:
len(vectors)

999994

In [241]:
"""
weights_init = init_embedding_weights(vectors, 
                                 token2id, 
                                 id2token,
                                 embedding_size = embedding_size)
"""

def init_embedding_weights(vectors, 
                           token2id, 
                           id2token, 
                           embedding_size):
    
    weights = np.zeros((len(token2id), 
                        embedding_size))
    
    #print("len(vectors) = " + str(len(vectors)))
    for idx in range(2, len(id2token)):
        token = id2token[idx]
        weights[idx] = np.array(vectors[token])
        
    weights[1] = np.random.randn(embedding_size)
    
    return weights


In [335]:
class MulticlassClassificationNetwork(nn.Module):
    def __init__(self, 
                 input_size, 
                 hidden_size, 
                 num_classes=3, 
                 num_directions=1, 
                 aggregation='concatenate', 
                 dropout_p=0.5):
        
        super(MulticlassClassificationNetwork, self).__init__()
        
        self.aggregation = aggregation
        
        """
        First FC Layer
        """
        if self.aggregation == 'concatenate':
            self.fc1 = nn.Linear(2 * input_size * num_directions, 
                                 hidden_size)
        else:
            self.fc1 = nn.Linear(input_size * num_directions, 
                                 hidden_size)
            
        """
        Second FC Layer
        """
        self.fc2 = nn.Linear(hidden_size, 
                             num_classes)
        self.relu = nn.ReLU(inplace = True)
        self.dropout = nn.Dropout(p = dropout_p)

        self.init_weights()

    def forward(self, gru_output1, gru_output2):
        
        """
        Aggregation of outputs
        """
        if self.aggregation == 'concatenate':
            output = torch.cat([gru_output1, gru_output2], dim=1)
        elif self.aggregation == 'el-multiply':
            output = gru_output1 * gru_output2
        elif self.aggregation == 'subtract':
            output = gru_output1 - gru_output2
        else:
            output = gru_output1 + gru_output2
            
#         print("MC classifier output shape = " + str(output.shape))
            
            
        """
        The FC Network
        """
        output = output.view(output.size(0), -1) 
        output = self.fc1(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.fc2(output)
        
        return output

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.uniform_(m.bias)

In [336]:
def train(GRUNetwork, 
          Multiclassifier, 
          DataLoader, 
          criterion, 
          optimizer, 
          epoch):
    
    GRUNetwork.train()
    Multiclassifier.train()
    total_loss = 0.0
    
    for batch_idx, \
        (sent1, sent1_mask, sent1_lengths, \
         sent2, sent2_mask, sent2_lengths, 
         y) in enumerate(DataLoader):
            
        sent1, sent1_mask = sent1.to(device), sent1_mask.to(device),  
        sent2, sent2_mask = sent2.to(device), sent2_mask.to(device),
        y = y.to(device)
        
        GRUNetwork.train()
        Multiclassifier.train()
        optimizer.zero_grad()
        
        # Forward
        output1 = GRUNetwork(sent1, 
                              sent1_mask, 
                              sent1_lengths)
        # Reverse
        output2 = GRUNetwork(sent2, 
                              sent2_mask, 
                              sent2_lengths)
        c_output = Multiclassifier(output1, output2)
        
        loss = criterion(c_output, y)
        loss.backward()
        optimizer.step()

        # Accurately compute loss, because of different batch size
        total_loss += loss.item() * len(sent1) / len(DataLoader.dataset)
        
        if (batch_idx+1) % (len(DataLoader.dataset)//(10*y.shape[0])) == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx+1) * y.shape[0], len(DataLoader.dataset),
                100. * (batch_idx+1) / len(DataLoader), loss.item()))

    optimizer.zero_grad()
    return total_loss



In [337]:
def test(GRUNetwork, 
         Multiclassifier, 
         DataLoader, 
         criterion):
    
    # switching to eval mode
    GRUNetwork.eval()
    Multiclassifier.eval()
    
    test_loss = 0.0
    y_list = []
    output_list = []
    
    with torch.no_grad():
        for batch_idx, \
            (sent1, sent1_mask, sent1_lengths, \
             sent2, sent2_mask, sent2_lengths, 
             y) in enumerate(DataLoader):

            sent1, sent1_mask = sent1.to(device), sent1_mask.to(device),  
            sent2, sent2_mask = sent2.to(device), sent2_mask.to(device),
            y = y.to(device)
            
            # Forward
            output1 = GRUNetwork(sent1, 
                                  sent1_mask, 
                                  sent1_lengths)
            # Reverse
            output2 = GRUNetwork(sent2, 
                                  sent2_mask, 
                                  sent2_lengths)
            c_output = Multiclassifier(output1, output2)
        
            loss = criterion(c_output, y)

            test_loss += loss.item() / len(DataLoader.dataset)

            output_list.append(c_output)
            y_list.append(y)
            
    return test_loss, torch.cat(output_list, dim=0), torch.cat(y_list, dim=0)




In [338]:
def accuracy(GRUNetwork, 
             Multiclassifier, 
             DataLoader, 
             criterion):
    
    _, y_preds, y_true = test(GRUNetwork = GRUNetwork,
                                    Multiclassifier = Multiclassifier,
                                    DataLoader = DataLoader,
                                    criterion = criterion)

    y_preds = y_preds.max(1)[1]
    return 100 * y_preds.eq(y_true.data.view_as(y_preds)).float().mean().item()



In [339]:
vocab_size = 50000
num_classes = 3
num_layers = 1
bidirectional = True
if bidirectional:
    num_directions = 2
else:
    num_directions = 1
gru_hidden_size = 256
classifier_hidden_size = 512

BATCH_SIZE = 32
lr = 3e-4
n_epochs = 10

Hyperparameters to bu searched:

* The size of the hidden dimension of the CNN,
* The kernel size of the CNN,
* Experiment with different ways of interacting the two encoded sentences (concatenation, element-wise multiplication, outer multiplication etc)
* Regularization (e.g. weight decay, dropout).

In [343]:
import itertools

# BiGRUNetwork(hidden_size = gru_hidden_size, 
#                    num_layers = num_layers, 
#                    embedding_w = weights_init, 
#                    vocab_size = len(token2id), 
#                    bidirectional = bidirectional).to(device)
# MulticlassClassificationNetwork(gru_hidden_size, 
#                                   classifier_hidden_size, 
#                                   num_classes, 
#                                   num_directions).to(device)

"""
        if self.aggregation == 'concatenate':
            output = torch.cat([gru_output1, gru_output2], dim=1)
        elif self.aggregation == 'el_multiply':
            output = gru_output1 * gru_output2
        elif self.aggregation == 'subtract':
            output = gru_output1 - gru_output2
            
            
           
"""


params = [[1e-3],                    # Learning Rates
         [50,100,250],               # gru_hidden_size
         ["concatenate", "el-multiply", "subtract"], # Different Aggregations
         [150, 250] ,                # classifier_hidden_size
        [0.1,0.5]              # Dropout
         ]

num_epochs = 7

params = [*itertools.product(*params)]
len(params)

36

In [344]:
import pickle as pkl

In [345]:
param_losses = {}

for param_set in params[:]:
    print ("Parameter Set: "+str(param_set))
    ## INITIALIZE VALIDATION ACCURACY LIST
    param_losses[param_set] = []
    
    weights_init = init_embedding_weights(vectors, 
                                     token2id, 
                                     id2token,
                                     embedding_size = embedding_size)

    biGRU = BiGRUNetwork(hidden_size = param_set[1], 
                       num_layers = num_layers, 
                       embedding_w = weights_init, 
                       vocab_size = len(token2id), 
                       bidirectional = bidirectional).to(device)
    """
    input_size, 
     hidden_size, 
     num_classes=3, 
     num_directions=1, 
     aggregation='concatenate', 
     dropout_p=0.5
    """
    mClassifier = MulticlassClassificationNetwork(input_size = param_set[1], 
                                                   hidden_size = param_set[3],
                                                  aggregation = param_set[2],
                                                   num_classes = num_classes, 
                                                   num_directions = num_directions,
                                                  dropout_p = param_set[4]).to(device)

    
    loss_train_list = []
    acc_train_list = []
    loss_val_list = []
    acc_val_list = []
    stop_epoch = n_epochs

#     weights_init = init_embedding_weights(vectors, 
#                                      token2id, 
#                                      id2token,
#                                      embedding_size = embedding_size)

#     biGRU = BiGRUNetwork(hidden_size = gru_hidden_size, 
#                        num_layers = NUM_LAYERS, 
#                        embedding_w = weights_init, 
#                        vocab_size = len(token2id), 
#                        bidirectional = bidirectional).to(device)
#     mClassifier = MulticlassClassificationNetwork(gru_hidden_size, 
#                                                    classifier_hidden_size, 
#                                                    num_classes, 
#                                                    num_directions).to(device)

    for epoch in range(1, num_epochs+1):

        loss_train = train(GRUNetwork = biGRU, 
                          Multiclassifier = mClassifier, 
                          DataLoader = train_loader, 
                          criterion = nn.CrossEntropyLoss(), 
                          optimizer = torch.optim.Adam(list(biGRU.parameters()) + \
                                                       list(mClassifier.parameters()), 
                                                       lr=lr), 
                          epoch = epoch
        )

        loss_val, val_preds, val_true = test(
            GRUNetwork = biGRU,
            Multiclassifier = mClassifier,
            DataLoader = val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum')
        )

        acc_train = accuracy(biGRU, 
                              mClassifier, 
                              train_loader, 
                              nn.CrossEntropyLoss(reduction='sum'))
        acc_val = accuracy(biGRU, 
                            mClassifier, 
                            val_loader, 
                            nn.CrossEntropyLoss(reduction='sum'))

        loss_train_list.append(loss_train)
        acc_train_list.append(acc_train)
        loss_val_list.append(loss_val)
        acc_val_list.append(acc_val)
        
        print("loss_train = " + str(loss_train))
        print("acc_train = " + str(acc_train))
        print("loss_val = " + str(loss_val))
        print("acc_val = " + str(acc_val))
        
        
#         # Plotting every epoch
#         plt.figure(figsize=(16,8))
#         plt.plot(acc_train_list, color="blue", label = "train_acc")
#         plt.plot(acc_val_list, color="red", label = "val_acc")
#         plt.plot(loss_train_list, color="green", label = "loss train - (CE)")
#         plt.plot(loss_val_list, color="orange", label = "loss validation - (CE)")
#         plt.title("Training and Validation Accuracy")
#         plt.xlabel("Iterations")
#         plt.ylabel("Accuracy")
#         plt.legend()
#         plt.show()

    # saving to pickle
    param_losses[param_set] = [loss_train_list, acc_val_list, acc_train_list]
    pkl.dump(param_losses, open("param_losses_rnn.p", "wb"))



Parameter Set: (0.001, 50, 'concatenate', 150, 0.1)
loss_train = 1.054586710357665
acc_train = 55.15899658203125
loss_val = 0.9494785118103025
acc_val = 54.90000247955322
loss_train = 0.921186209735866
acc_train = 58.48999619483948
loss_val = 0.9048771533966066
acc_val = 57.40000009536743
loss_train = 0.8726335548591616
acc_train = 61.71000003814697
loss_val = 0.8644836411476134
acc_val = 60.00000238418579
loss_train = 0.8297434891319272
acc_train = 64.82499837875366
loss_val = 0.8299408059120177
acc_val = 62.800002098083496
loss_train = 0.7928592366790777
acc_train = 67.25099682807922
loss_val = 0.7920640697479248
acc_val = 65.00000357627869
loss_train = 0.7613294848537425
acc_train = 68.47899556159973
loss_val = 0.779049578666687
acc_val = 64.70000147819519
loss_train = 0.7376081550502784
acc_train = 69.7219967842102
loss_val = 0.7449611682891846
acc_val = 67.70000457763672
Parameter Set: (0.001, 50, 'concatenate', 150, 0.5)
loss_train = 1.103470438995361
acc_train = 48.3619987964630

KeyboardInterrupt: 