In [1]:
import numpy as np
import pandas as pd
import spacy
import string
import pickle as pkl
from sklearn import preprocessing
import os.path
from operator import add

In [2]:
#load in the data
train_df = pd.read_csv('snli_train.tsv', sep="\t")
val_df = pd.read_csv('snli_val.tsv',sep="\t")

#get data & convert sentences to lists
train_sentence1=train_df['sentence1'].values.tolist()
train_sentence2=train_df['sentence2'].values.tolist()
val_sentence1=val_df['sentence1'].values.tolist()
val_sentence2=val_df['sentence2'].values.tolist()

#convert the text labels to numeric
le = preprocessing.LabelEncoder()
le.fit(train_df['label'])
train_targets=le.transform(train_df['label']).tolist()
val_targets=le.transform(val_df['label']).tolist()

In [3]:
# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
    tokens = tokenizer(sent)
    return [token.text.lower() for token in tokens if (token.text not in punctuations)]

In [4]:
# This is the code cell that tokenizes train/val/test datasets
def tokenize_dataset(dataset):
    token_dataset = []
    # we are keeping track of all tokens in dataset 
    # in order to create vocabulary later
    all_tokens = []
    
    for sample in dataset:
        tokens = tokenize(sample)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

In [5]:
#train set tokens
if not os.path.exists('train_all_combined_sentence_tokens.p'):
    print("Tokenizing train data")
    train_sentence1_tokens, train_all_sentence1_tokens = tokenize_dataset(train_sentence1)
    train_sentence2_tokens, train_all_sentence2_tokens = tokenize_dataset(train_sentence2)
    train_all_combined_sentence_tokens = train_all_sentence1_tokens + train_all_sentence2_tokens
    pkl.dump(train_all_combined_sentence_tokens, open("train_all_combined_sentence_tokens.p", "wb"))

In [6]:
#if pickle files do not exist, collect them.  Otherwise, this if block will not run
if not os.path.exists('train_sentence1_tokens.p'):
    print ("Tokenizing train data")
    train_sentence1_tokens, train_all_sentence1_tokens = tokenize_dataset(train_sentence1)
    train_sentence2_tokens, train_all_sentence2_tokens = tokenize_dataset(train_sentence2)
    pkl.dump(train_sentence1_tokens, open("train_sentence1_tokens.p", "wb"))
    pkl.dump(train_sentence2_tokens, open("train_sentence2_tokens.p", "wb"))
    #pkl.dump(train_all_sentence1_tokens, open("train_all_sentence1_tokens.p", "wb"))
    #pkl.dump(train_all_sentence2_tokens, open("train_all_sentence2_tokens.p", "wb"))1

    #combine tokens from both sentences to create a shared dictionary
    train_all_combined_sentence_tokens = train_all_sentence1_tokens + train_all_sentence2_tokens
    pkl.dump(train_all_combined_sentence_tokens, open("train_all_combined_sentence_tokens.p", "wb"))

    #val set tokens
    print ("Tokenizing val data")
    val_sentence1_tokens, _ = tokenize_dataset(val_sentence1)
    val_sentence2_tokens, _ = tokenize_dataset(val_sentence2)
    pkl.dump(val_sentence1_tokens, open("val_sentence1_tokens.p", "wb"))
    pkl.dump(val_sentence2_tokens, open("val_sentence2_tokens.p", "wb"))

In [7]:
#If you have previously run the previous cell, run this cell instead to load preprocessed datasets
train_sentence1_tokens = pkl.load(open("train_sentence1_tokens.p", "rb"))
train_sentence2_tokens = pkl.load(open("train_sentence2_tokens.p", "rb"))
train_all_combined_sentence_tokens = pkl.load(open("train_all_combined_sentence_tokens.p", "rb"))
train_all_sentence1_tokens = pkl.load(open("train_all_sentence1_tokens.p", "rb"))
train_all_sentence2_tokens = pkl.load(open("train_all_sentence2_tokens.p", "rb"))
val_sentence1_tokens = pkl.load(open("val_sentence1_tokens.p", "rb"))
val_sentence2_tokens = pkl.load(open("val_sentence2_tokens.p", "rb"))

In [8]:
#print information about the token datasets
# double checking
print ("Train sentence1 dataset size is {}".format(len(train_sentence1_tokens)))
print ("Train sentence2 dataset size is {}".format(len(train_sentence2_tokens)))
print ("Val sentence1 dataset size is {}".format(len(val_sentence1_tokens)))
print ("Val sentence2 dataset size is {}".format(len(val_sentence2_tokens)))

print ("\nTotal number of tokens in sentence1 train dataset is {}".format(len(train_all_sentence1_tokens)))
print ("Total number of tokens in sentence2 train dataset is {}".format(len(train_all_sentence2_tokens)))
print ("Total number of tokens in combined sent1 & sent2 train dataset is {}".format(len(train_all_combined_sentence_tokens)))

print ("\nTotal number of *unique* tokens in sentence1 train dataset is {}".format(len(set(train_all_sentence1_tokens))))
print ("Total number of *unique* tokens in sentence2 train dataset is {}".format(len(set(train_all_sentence2_tokens))))
print ("Total number of *unique* tokens in sent1 & sent2 train dataset is {}".format(len(set(train_all_combined_sentence_tokens))))


Train sentence1 dataset size is 100000
Train sentence2 dataset size is 100000
Val sentence1 dataset size is 1000
Val sentence2 dataset size is 1000

Total number of tokens in sentence1 train dataset is 1294135
Total number of tokens in sentence2 train dataset is 743372
Total number of tokens in combined sent1 & sent2 train dataset is 2037507

Total number of *unique* tokens in sentence1 train dataset is 14131
Total number of *unique* tokens in sentence2 train dataset is 15225
Total number of *unique* tokens in sent1 & sent2 train dataset is 19642


In [191]:
#build vocabularies for sentence1 and sentence2
from collections import Counter

#max_vocab_size = 10000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

#try different vocab sizes
vocab1=10000
vocab2=15000
vocab3=19000
token2id_combined_sent, id2token_combined_sent = build_vocab(train_all_combined_sentence_tokens,vocab1)
token2id_combined_sent_voc2, id2token_combined_sentvoc2 = build_vocab(train_all_combined_sentence_tokens,vocab2)
token2id_combined_sent_voc3, id2token_combined_sentvoc3 = build_vocab(train_all_combined_sentence_tokens,vocab3)

In [192]:
# Lets check the dictionary by loading random token from it
import random 

random_token_id = random.randint(0, len(id2token_combined_sent)-1)
random_token = id2token_combined_sent[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token_combined_sent[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id_combined_sent[random_token]))

Token id 4062 ; token rainstorm
Token rainstorm; token id 4062


In [193]:
# convert token to id in the dataset.  After running this cell we will have converted the word tokens to indices
def token2index_dataset(tokens_data,token2id):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

#create train & val for vocab1 size
train_sentence1_data_indices = token2index_dataset(train_sentence1_tokens,token2id_combined_sent)
train_sentence2_data_indices = token2index_dataset(train_sentence2_tokens,token2id_combined_sent)
val_sentence1_data_indices = token2index_dataset(val_sentence1_tokens,token2id_combined_sent)
val_sentence2_data_indices = token2index_dataset(val_sentence2_tokens,token2id_combined_sent)

#create train & val for vocab2 size
train_sentence1_data_indices_voc2 = token2index_dataset(train_sentence1_tokens,token2id_combined_sent_voc2)
train_sentence2_data_indices_voc2 = token2index_dataset(train_sentence2_tokens,token2id_combined_sent_voc2)
val_sentence1_data_indices_voc2 = token2index_dataset(val_sentence1_tokens,token2id_combined_sent_voc2)
val_sentence2_data_indices_voc2 = token2index_dataset(val_sentence2_tokens,token2id_combined_sent_voc2)

#create train & val for vocab3 size
train_sentence1_data_indices_voc3 = token2index_dataset(train_sentence1_tokens,token2id_combined_sent_voc3)
train_sentence2_data_indices_voc3 = token2index_dataset(train_sentence2_tokens,token2id_combined_sent_voc3)
val_sentence1_data_indices_voc3 = token2index_dataset(val_sentence1_tokens,token2id_combined_sent_voc3)
val_sentence2_data_indices_voc3 = token2index_dataset(val_sentence2_tokens,token2id_combined_sent_voc3)

# double checking
print ("Train sentence1 dataset size is {}".format(len(train_sentence1_data_indices)))
print ("Train sentence2 dataset size is {}".format(len(train_sentence2_data_indices)))
print ("Val sentence1 dataset size is {}".format(len(val_sentence1_data_indices)))
print ("Val sentence2 dataset size is {}".format(len(val_sentence2_data_indices)))

Train sentence1 dataset size is 100000
Train sentence2 dataset size is 100000
Val sentence1 dataset size is 1000
Val sentence2 dataset size is 1000


In [194]:
#visualize a random sentence1 and sentence2 paired training example
rand_training_example = random.randint(0, len(train_sentence1) - 1)
print (train_sentence1_tokens[rand_training_example])
print(train_sentence1_data_indices[rand_training_example])

print (train_sentence2_tokens[rand_training_example])
print(train_sentence2_data_indices[rand_training_example])

['a', 'soccer', 'player', 'running', 'on', 'the', 'field']
[2, 129, 111, 64, 7, 3, 89]
['the', 'soccer', 'player', 'is', 'running']
[3, 129, 111, 5, 64]


In [195]:
#Check average, max, min sentence lengths to determine word padding
total_sent1_len=0
total_sent2_len=0
sent1_lens=[]
sent2_lens=[]
for i in range(0,len(train_sentence1_tokens)):
    total_sent1_len+=len(train_sentence1_tokens[i])
    total_sent2_len+=len(train_sentence2_tokens[i])
    sent1_lens.append(len(train_sentence1_tokens[i]))
    sent2_lens.append(len(train_sentence2_tokens[i]))

avg1=total_sent1_len/len(train_sentence1)
avg2=total_sent2_len/len(train_sentence2)
print("sentence1 average is: "+str(avg1)+", std dev is: "+str(np.std(sent1_lens))+", max is: "+str(max(sent1_lens))+", min is: "+str(min(sent1_lens)))
print("sentence2 average is: "+str(avg2)+", std dev is: "+str(np.std(sent2_lens))+", max is: "+str(max(sent2_lens))+", min is: "+str(min(sent2_lens)))


sentence1 average is: 12.94135, std dev is: 5.755700667816214, max is: 78, min is: 2
sentence2 average is: 7.43372, std dev is: 3.0907033118046123, max is: 38, min is: 1


In [196]:
import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list1, data_list2, target_list, MAX_SENTENCE_LENGTH):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        self.MAX_SENTENCE_LENGTH=MAX_SENTENCE_LENGTH
        self.target_list = target_list
        assert (len(self.data_list1) == len(self.target_list) == len(self.data_list2))

    def __len__(self):
        return len(self.data_list1)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        token_idx1 = self.data_list1[key][:self.MAX_SENTENCE_LENGTH]
        token_idx2 = self.data_list2[key][:self.MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        
        return [token_idx1, token_idx2, len(token_idx1), len(token_idx2), label]
    

In [197]:
train_dataset = NewsGroupDataset(train_sentence1_data_indices,train_sentence2_data_indices, train_targets, 35)
val_dataset = NewsGroupDataset(val_sentence1_data_indices, val_sentence2_data_indices, val_targets, 35)

train_dataset_voc2 = NewsGroupDataset(train_sentence1_data_indices_voc2,train_sentence2_data_indices_voc2, train_targets, 35)
val_dataset_voc2 = NewsGroupDataset(val_sentence1_data_indices_voc2, val_sentence2_data_indices_voc2, val_targets, 35)

train_dataset_voc3 = NewsGroupDataset(train_sentence1_data_indices_voc3,train_sentence2_data_indices_voc3, train_targets, 35)
val_dataset_voc3 = NewsGroupDataset(val_sentence1_data_indices_voc3, val_sentence2_data_indices_voc3, val_targets, 35)


In [198]:
MAX_SENTENCE_LENGTH=35

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list1 = []
    length_list2 = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[4])
        length_list1.append(datum[2])
        length_list2.append(datum[3])
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])), 
                                mode="constant", constant_values=0)
        data_list1.append(padded_vec1)
        
    for datum in batch:
        
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        data_list2.append(padded_vec2)
        
    return [torch.from_numpy(np.array(data_list1)), torch.from_numpy(np.array(data_list2)), torch.LongTensor(length_list1), torch.LongTensor(length_list2), torch.LongTensor(label_list)]



In [199]:
BATCH_SIZE = 32

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

train_loader_voc2 = torch.utils.data.DataLoader(dataset=train_dataset_voc2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_loader_voc2 = torch.utils.data.DataLoader(dataset=val_dataset_voc2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

train_loader_voc3 = torch.utils.data.DataLoader(dataset=train_dataset_voc3, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_loader_voc3 = torch.utils.data.DataLoader(dataset=val_dataset_voc3, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

dataloader_list=[[train_loader,val_loader,vocab1+2],[train_loader_voc2,val_loader_voc2,vocab2+2],[train_loader_voc3,val_loader_voc3,vocab3+2]]



In [200]:
#sent1 example
for i,(data1, data2, lengths1, lengths2, labels) in enumerate(train_loader):
    print(data1)
    print(data1.shape)
    print(data2)
    print(data2.shape)
    print(lengths1)
    print(lengths1.shape)
    print(labels)
    print(labels.shape)
    break

tensor([[   2,    6,   17,  ...,    0,    0,    0],
        [   2,  163,  272,  ...,    0,    0,    0],
        [   2,   12,    4,  ...,    0,    0,    0],
        ...,
        [  18,  125,   12,  ...,    0,    0,    0],
        [   6,  852,  104,  ...,    0,    0,    0],
        [  13, 1604,  184,  ...,    0,    0,    0]])
torch.Size([32, 35])
tensor([[   2,    6,    5,  ...,    0,    0,    0],
        [   2,    6, 1582,  ...,    0,    0,    0],
        [  49,    5,   12,  ...,    0,    0,    0],
        ...,
        [   2,   12,    4,  ...,    0,    0,    0],
        [   2,   12,  283,  ...,    0,    0,    0],
        [  13,  118,   43,  ...,    0,    0,    0]])
torch.Size([32, 35])
tensor([16,  9,  8, 10, 17,  9, 11, 12,  8,  9, 12, 14, 11, 24, 17, 17, 18, 12,
        18, 11,  7, 18, 11, 17, 15, 20,  6, 12,  7, 20,  5, 12])
torch.Size([32])
tensor([2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 0,
        1, 0, 1, 2, 0, 0, 0, 0])
torch.Size([32])


### Bag-of-Words model in PyTorch

Next, we will implement a Bag of Words in PyTorch -- as an `nn.Module`.

A `nn.Module` can really be any function, but it is often used to implement layers, functions and models. Note that you can also nest modules.

Importantly, modules need to have their `forward()` method overridden, and very often you will want to override the `__init__` method as well. 

The `__init__` method sets up the module. This is also often where the internal modules and parameters are initialized.

The `forward` method defines what happens when you *apply* the module.

In the background, PyTorch makes use of your code in the forward method and determines how to implement back-propagation with it - but all you need to do is to define the forward pass!

In [201]:
test1 = [[1,2,3,4],[5,6,7,8],[9,10,11,12]]
test2 = [[1,2,3,],[5,6,7],[9,10,11]]

def concat_matrices(M1,M2):
    new_matrix = []
    for i in range(len(M1)):
        new_matrix.append(M1[i] + M2[i])
        
    return new_matrix

concat_matrices(test1, test2)

test3 = [1,2,3,4,5]
test4 = [5,5,5,5,5]
list( map(add, test3, test4) )

[6, 7, 8, 9, 10]

In [202]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim, comb_method, model_type):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.model_type=model_type
        #Want 2 hidden layers.  Dims can change out of linear1 must = input of linear2
        if self.model_type=='NN':
            if comb_method=='concat':
                self.linear1 = nn.Linear(emb_dim*2,50)
                self.linear2 = nn.Linear(50,20)
                self.linear3 = nn.Linear(20,3)
            elif comb_method=='sum':
                self.linear1 = nn.Linear(emb_dim,50)
                self.linear2 = nn.Linear(50,20)
                self.linear3 = nn.Linear(20,3)
            elif comb_method=='product':
                self.linear1 = nn.Linear(emb_dim,50)
                self.linear2 = nn.Linear(50,20)
                self.linear3 = nn.Linear(20,3)
            else:
                raise Exception('Vect comb methods incl concat, sum, or mult. Comb used was: {}'.format(self.comb_method))
        elif self.model_type=='LG':
            self.linear1 = nn.Linear(emb_dim,20)
        else:
            raise Exception('Model types incl NN (neural network) or LG (logistic regression).  Model used was: {}'.format(self.model_type))
        
        self.comb_method=comb_method
    
    def forward(self, data1, data2, lengths1, lengths2):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        
        data1_vecrep=self.embed(data1)
        data2_vecrep=self.embed(data2)
        
        out1=torch.sum(data1_vecrep, dim=1)
        out2=torch.sum(data2_vecrep, dim=1)
        
        out1 /= lengths1.view(lengths1.size()[0],1).expand_as(out1).float()
        out2 /= lengths2.view(lengths2.size()[0],1).expand_as(out2).float()
        
        if self.comb_method=='concat':
            out=torch.cat((out1,out2), dim=1, out=None)
        elif self.comb_method=='sum':
            out=out1+out2
        elif self.comb_method=='product':
            out=out1*out2

        if self.model_type=='NN':
            out = self.linear1(out.float())
            out = F.relu(out)
            out = self.linear2(out.float())
            out = F.relu(out)
            out = self.linear3(out.float())
        elif self.model_type=='LG':
            out = self.linear1(out.float())

        return out

### Loss Function and Optimizer

Note that in our Bag of Words model we haven't applied softmax to the output of linear layer. Why?
We use `nn.CrossEntropyLoss()` to train. From pytorch documentation for `nn.CrossEntropyLoss()` ( https://pytorch.org/docs/stable/nn.html ) - this criterion combines `nn.LogSoftmax()` and `nn.NLLLoss()` in one single class. So, this is actually exactly the same as minimizing the log likelihood after applying softmax.

In [203]:
#create models with various embedding dimensions for hyperparameter tuning
model = BagOfWords(len(id2token_combined_sent), 100,'concat','NN')

In [204]:
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()

learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [205]:
for x in model.parameters():
    print(x.shape)

torch.Size([10002, 100])
torch.Size([50, 200])
torch.Size([50])
torch.Size([20, 50])
torch.Size([20])
torch.Size([3, 20])
torch.Size([3])


### Training the Model

In [207]:
num_epochs = 1 # number epoch to train

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, data2, lengths1, lengths2, labels in loader:
        data_batch1, data_batch2, length_batch1, length_batch2, label_batch = data1, data2, lengths1, lengths2, labels
        outputs = F.softmax(model(data_batch1, data_batch2, length_batch1, length_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [208]:
#create pandas DF to store results
results_df=pd.DataFrame(columns=['model_type','epochs','sent_comb_method','vocab_size','embed_dim','train_acc','val_acc','val_loss'])
results_df

Unnamed: 0,model_type,epochs,sent_comb_method,vocab_size,embed_dim,train_acc,val_acc,val_loss


In [None]:
#dim_testsizes=[100,200,300]
#comb_methods=['sum','concat','product']
#create short list versions for faster testing
dim_testsizes=[100]
comb_methods=['product']
model_types=['NN','LG']

#if not os.path.exists('train_all_combined_sentence_tokens.p'):

learning_rate = 0.01
criterion = torch.nn.CrossEntropyLoss()

#search for best results
for dimension in dim_testsizes:
    for combination_method in comb_methods:
        for model_type in model_types:
            for loader in dataloader_list:
                model=BagOfWords(loader[2], dimension, combination_method, model_type)
                optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate)
                save_str=(model_type+'_'+str(num_epochs)+'_'+combination_method+'_'+str(loader[2])+'_'+str(dimension))
                #check to see if we have already run this model/epoch/comb method/vocab size/embed combination
                #if we haven't run it yet, run it and then save
                if not os.path.exists(save_str):
                    for epoch in range(num_epochs):
                        for i, (data1, data2, lengths1, lengths2, labels) in enumerate(loader[0]):
                            model.train()
                            data_batch1, data_batch2, length_batch1, length_batch2, label_batch = data1, data2, lengths1, lengths2, labels
                            model.zero_grad()
                            outputs = model(data_batch1, data_batch2, length_batch1, length_batch2)
                            loss = criterion(outputs, label_batch)
                            loss.backward()
                            optimizer.step()
                            # validate every x iterations
                            if i > 0 and i % 3000 == 0:
                                # validate
                                val_acc = test_model(loader[1], model)
                                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                                    epoch+1, num_epochs, i+1, len(loader[0]), val_acc))
                    #now save the model so we don't have to rerun
                    torch.save(model.state_dict(),save_str)
                #if we have already run this model/epoch/comb method/vocab size/embed combination, load instead
                else:
                    model.load_state_dict(torch.load(save_str))

                #now we want to save results, but only if we don't yet have these results in the table
                if not ((results_df['model_type'] == model_type) & (results_df['epochs'] == num_epochs) & \
                    (results_df['sent_comb_method']==combination_method) & \
                    (results_df['vocab_size']==model.embed.num_embeddings-2) & \
                    (results_df['embed_dim']==model.embed.embedding_dim)).any():
                    results_df=results_df.append(pd.Series([model_type,num_epochs,combination_method,model.embed.num_embeddings-2,model.embed.embedding_dim,test_model(loader[0], model),test_model(loader[1], model),loss],index=results_df.columns),ignore_index=True)



Epoch: [1/1], Step: [3001/3125], Validation Acc: 61.3


In [160]:
results_df.sort_values(by=['val_acc'],axis=0,ascending=False)

Unnamed: 0,model_type,epochs,sent_comb_method,vocab_size,embed_dim,train_acc,val_acc,val_loss
1,NN,3,product,5002,100,85.482,64.4,"tensor(0.4344, grad_fn=<NllLossBackward>)"
0,NN,3,product,10002,100,54.822,48.0,"tensor(0.7524, grad_fn=<NllLossBackward>)"
2,NN,3,product,1002,100,51.618,47.9,"tensor(1.0346, grad_fn=<NllLossBackward>)"
