<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [1]:
%%javascript

$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

## import modules

In [2]:
import itertools
import os
import random
import pickle as pkl
from collections import Counter
import numpy as np
import torch
from torch.utils.data import Dataset
import spacy
import string
import torch
import torch.nn as nn
import torch.nn.functional as F

## Data Loading

The dataset was downloaded from: http://ai.stanford.edu/~amaas/data/sentiment/

In [3]:
data_loc = "data/imdb_reviews/"

In [4]:
def read_txt_files(folder_path):
    """Reads all .txt files in a folder to a list"""
    
    file_list = os.listdir(folder_path)
    # for debugging, printing out the folder path and some files in it
    print(folder_path)
    print(file_list[:10])
    
    all_reviews = []
    for file_path in file_list:
        f = open(folder_path + file_path,"r")
        all_reviews.append(f.readline())
        
    return all_reviews

In [5]:
train_pos = read_txt_files(folder_path=data_loc+"train/pos/")
print(len(train_pos))
train_neg = read_txt_files(folder_path=data_loc+"train/neg/")
print(len(train_neg))
test_pos = read_txt_files(folder_path=data_loc+"test/pos/")
print(len(test_pos))
test_neg = read_txt_files(folder_path=data_loc+"test/neg/")
print(len(test_neg))

data/imdb_reviews/train/pos/
['4715_9.txt', '12390_8.txt', '8329_7.txt', '9063_8.txt', '3092_10.txt', '9865_8.txt', '6639_10.txt', '10460_10.txt', '10331_10.txt', '11606_10.txt']
12500
data/imdb_reviews/train/neg/
['1821_4.txt', '10402_1.txt', '1062_4.txt', '9056_1.txt', '5392_3.txt', '2682_3.txt', '3351_4.txt', '399_2.txt', '10447_1.txt', '10096_1.txt']
12500
data/imdb_reviews/test/pos/
['4715_9.txt', '1930_9.txt', '3205_9.txt', '10186_10.txt', '147_10.txt', '7511_7.txt', '616_10.txt', '10460_10.txt', '3240_9.txt', '1975_9.txt']
12500
data/imdb_reviews/test/neg/
['1821_4.txt', '9487_1.txt', '4604_4.txt', '2828_2.txt', '10890_1.txt', '3351_4.txt', '8070_2.txt', '1027_4.txt', '8248_3.txt', '4290_4.txt']
12500


In [6]:
print("Train Positive examples = " + str(len(train_pos)))
print("Train Negative examples = " + str(len(train_neg)))
print("Test Positive examples = " + str(len(test_pos)))
print("Test Negative examples = " + str(len(test_neg)))

Train Positive examples = 12500
Train Negative examples = 12500
Test Positive examples = 12500
Test Negative examples = 12500


## Label Generation

In [7]:
train_pos_labels = np.ones((len(train_pos),), dtype=int)
train_pos_labels

train_neg_labels = np.zeros((len(train_neg),), dtype=int)
train_neg_labels

train_data_labels = np.concatenate((train_pos_labels,train_neg_labels))
print(len(train_data_labels))
print(train_data_labels)

test_pos_labels = np.ones((len(test_pos),), dtype=int)
test_neg_labels = np.zeros((len(test_neg),), dtype=int)
test_data_labels = np.concatenate((test_pos_labels,test_neg_labels))
print(len(test_data_labels))
print(test_data_labels)

25000
[1 1 1 ... 0 0 0]
25000
[1 1 1 ... 0 0 0]


In [8]:
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

In [9]:
train_pos_clean = [cleanhtml(x) for x in train_pos]
train_neg_clean = [cleanhtml(x) for x in train_neg]

test_pos_clean = [cleanhtml(x) for x in test_pos]
test_neg_clean = [cleanhtml(x) for x in test_neg]

In [10]:
train_all_clean = train_pos_clean + train_neg_clean
len(train_all_clean)

test_all_clean = test_pos_clean + test_neg_clean
len(test_all_clean)

25000

In [11]:
training_size = 20000

assert training_size < 25000

shuffled_index = np.random.permutation(len(train_all_clean))
print(len(shuffled_index))
print(shuffled_index)

training_all_clean = [train_all_clean[i] for i in shuffled_index[:training_size]]
training_labels = [train_data_labels[i] for i in shuffled_index[:training_size]]
print(len(training_all_clean))
print(len(training_labels))

validation_all_clean = [train_all_clean[i] for i in shuffled_index[training_size:]]
validation_labels = [train_data_labels[i] for i in shuffled_index[training_size:]]
print(len(validation_all_clean))
print(len(validation_labels))


25000
[16668  5151 19604 ...  5002 16380  2885]
20000
20000
5000
5000


## Tokenization

In [12]:
import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# This is word tokenizer
# # lowercase and remove punctuation
# def tokenize(sent):
#     tokens = tokenizer(sent)
#     return [token.text.lower() for token in tokens if (token.text not in punctuations)]
#     #return [token.text.lower() for token in tokens]
    
# Modified for n-grams
def tokenize(sent, n_gram = 0, lemmatize = False):
    
    tokens = tokenizer(sent)
    
    # unigrams
    if lemmatize == False:
        unigrams = [token.text.lower() for token in tokens if (token.text not in punctuations)]
    else:
        #LEMMATIZED
        unigrams = [token.lemma_.lower() for token in tokens if (token.text not in punctuations)]
    
    
    output = []
    output.extend(unigrams)
    
    n = 2
    while n <= n_gram:
        ngram_tokens = [" ".join(unigrams[x:x+n]) for x in range(len(unigrams)-n+1)]
        output.extend(ngram_tokens)
        n = n + 1
        
    return output

In [13]:
def lower_case_remove_punc(parsed):
    return [token.text.lower() for token in parsed if (token.text not in punctuations)]

def tokenize_dataset(dataset, n_gram, lemmatize = True):
    token_dataset = []
    # we are keeping track of all tokens in dataset
    # in order to create vocabulary later
    all_tokens = []

#     for sample in tqdm_notebook(tokenizer.pipe(dataset, 
#                                                disable=['parser', 'tagger', 'ner'], 
#                                                batch_size=512, 
#                                                n_threads=4)):

    itr = 0
    for sample in dataset:
        
        if itr % 50 == 0:
            print(str(itr) + " / " + str(len(dataset)))
        # unigram version
        #tokens = lower_case_remove_punc(sample)
        
        # n-gram version
        tokens = tokenize(sample,n_gram, lemmatize = lemmatize)
        
        token_dataset.append(tokens)
        all_tokens += tokens
        
        itr = itr + 1

    return token_dataset, all_tokens

In [14]:
# convert token to id in the dataset
def token2index_dataset(tokens_data, token2id):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

In [15]:
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = 10000):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [16]:
grams = [1,2,3]
lemmatize_list = [True,False]

In [17]:
for lemmatize_arg in lemmatize_list:
    for gram_no in grams:
        print(str(gram_no))

        train_data_tokens, all_train_tokens = tokenize_dataset(training_all_clean,
                                                               n_gram=gram_no, 
                                                               lemmatize = lemmatize_arg)

        # Tokenize Validation
        val_data_tokens, _ = tokenize_dataset(validation_all_clean,
                                              n_gram=gram_no, 
                                              lemmatize = lemmatize_arg)

        if lemmatize_arg == True:
            gram_no = str(gram_no) + "_lemma"
        else:
            gram_no = str(gram_no)
        print(gram_no)

        # val set tokens
        print ("Tokenizing val data")
        pkl.dump(val_data_tokens, open("val_data_tokens_"+str(gram_no)+".p", "wb"))

        # train set tokens
        print ("Tokenizing train data")
        pkl.dump(train_data_tokens, open("train_data_tokens_"+str(gram_no)+".p", "wb"))
        pkl.dump(all_train_tokens, open("all_train_tokens_"+str(gram_no)+".p", "wb"))


1
0 / 20000
50 / 20000
100 / 20000
150 / 20000
200 / 20000
250 / 20000
300 / 20000
350 / 20000
400 / 20000
450 / 20000
500 / 20000
550 / 20000
600 / 20000
650 / 20000
700 / 20000
750 / 20000
800 / 20000
850 / 20000
900 / 20000
950 / 20000
1000 / 20000
1050 / 20000
1100 / 20000
1150 / 20000
1200 / 20000
1250 / 20000
1300 / 20000
1350 / 20000
1400 / 20000
1450 / 20000
1500 / 20000
1550 / 20000
1600 / 20000
1650 / 20000
1700 / 20000
1750 / 20000
1800 / 20000
1850 / 20000
1900 / 20000
1950 / 20000
2000 / 20000
2050 / 20000
2100 / 20000
2150 / 20000
2200 / 20000
2250 / 20000
2300 / 20000
2350 / 20000
2400 / 20000
2450 / 20000
2500 / 20000
2550 / 20000
2600 / 20000
2650 / 20000
2700 / 20000
2750 / 20000
2800 / 20000
2850 / 20000
2900 / 20000
2950 / 20000
3000 / 20000
3050 / 20000
3100 / 20000
3150 / 20000
3200 / 20000
3250 / 20000
3300 / 20000
3350 / 20000
3400 / 20000
3450 / 20000
3500 / 20000
3550 / 20000
3600 / 20000
3650 / 20000
3700 / 20000
3750 / 20000
3800 / 20000
3850 / 20000
3900 / 

6200 / 20000
6250 / 20000
6300 / 20000
6350 / 20000
6400 / 20000
6450 / 20000
6500 / 20000
6550 / 20000
6600 / 20000
6650 / 20000
6700 / 20000
6750 / 20000
6800 / 20000
6850 / 20000
6900 / 20000
6950 / 20000
7000 / 20000
7050 / 20000
7100 / 20000
7150 / 20000
7200 / 20000
7250 / 20000
7300 / 20000
7350 / 20000
7400 / 20000
7450 / 20000
7500 / 20000
7550 / 20000
7600 / 20000
7650 / 20000
7700 / 20000
7750 / 20000
7800 / 20000
7850 / 20000
7900 / 20000
7950 / 20000
8000 / 20000
8050 / 20000
8100 / 20000
8150 / 20000
8200 / 20000
8250 / 20000
8300 / 20000
8350 / 20000
8400 / 20000
8450 / 20000
8500 / 20000
8550 / 20000
8600 / 20000
8650 / 20000
8700 / 20000
8750 / 20000
8800 / 20000
8850 / 20000
8900 / 20000
8950 / 20000
9000 / 20000
9050 / 20000
9100 / 20000
9150 / 20000
9200 / 20000
9250 / 20000
9300 / 20000
9350 / 20000
9400 / 20000
9450 / 20000
9500 / 20000
9550 / 20000
9600 / 20000
9650 / 20000
9700 / 20000
9750 / 20000
9800 / 20000
9850 / 20000
9900 / 20000
9950 / 20000
10000 / 2000

12150 / 20000
12200 / 20000
12250 / 20000
12300 / 20000
12350 / 20000
12400 / 20000
12450 / 20000
12500 / 20000
12550 / 20000
12600 / 20000
12650 / 20000
12700 / 20000
12750 / 20000
12800 / 20000
12850 / 20000
12900 / 20000
12950 / 20000
13000 / 20000
13050 / 20000
13100 / 20000
13150 / 20000
13200 / 20000
13250 / 20000
13300 / 20000
13350 / 20000
13400 / 20000
13450 / 20000
13500 / 20000
13550 / 20000
13600 / 20000
13650 / 20000
13700 / 20000
13750 / 20000
13800 / 20000
13850 / 20000
13900 / 20000
13950 / 20000
14000 / 20000
14050 / 20000
14100 / 20000
14150 / 20000
14200 / 20000
14250 / 20000
14300 / 20000
14350 / 20000
14400 / 20000
14450 / 20000
14500 / 20000
14550 / 20000
14600 / 20000
14650 / 20000
14700 / 20000
14750 / 20000
14800 / 20000
14850 / 20000
14900 / 20000
14950 / 20000
15000 / 20000
15050 / 20000
15100 / 20000
15150 / 20000
15200 / 20000
15250 / 20000
15300 / 20000
15350 / 20000
15400 / 20000
15450 / 20000
15500 / 20000
15550 / 20000
15600 / 20000
15650 / 20000
15700 

17850 / 20000
17900 / 20000
17950 / 20000
18000 / 20000
18050 / 20000
18100 / 20000
18150 / 20000
18200 / 20000
18250 / 20000
18300 / 20000
18350 / 20000
18400 / 20000
18450 / 20000
18500 / 20000
18550 / 20000
18600 / 20000
18650 / 20000
18700 / 20000
18750 / 20000
18800 / 20000
18850 / 20000
18900 / 20000
18950 / 20000
19000 / 20000
19050 / 20000
19100 / 20000
19150 / 20000
19200 / 20000
19250 / 20000
19300 / 20000
19350 / 20000
19400 / 20000
19450 / 20000
19500 / 20000
19550 / 20000
19600 / 20000
19650 / 20000
19700 / 20000
19750 / 20000
19800 / 20000
19850 / 20000
19900 / 20000
19950 / 20000
0 / 5000
50 / 5000
100 / 5000
150 / 5000
200 / 5000
250 / 5000
300 / 5000
350 / 5000
400 / 5000
450 / 5000
500 / 5000
550 / 5000
600 / 5000
650 / 5000
700 / 5000
750 / 5000
800 / 5000
850 / 5000
900 / 5000
950 / 5000
1000 / 5000
1050 / 5000
1100 / 5000
1150 / 5000
1200 / 5000
1250 / 5000
1300 / 5000
1350 / 5000
1400 / 5000
1450 / 5000
1500 / 5000
1550 / 5000
1600 / 5000
1650 / 5000
1700 / 5000
1

4250 / 5000
4300 / 5000
4350 / 5000
4400 / 5000
4450 / 5000
4500 / 5000
4550 / 5000
4600 / 5000
4650 / 5000
4700 / 5000
4750 / 5000
4800 / 5000
4850 / 5000
4900 / 5000
4950 / 5000
2
Tokenizing val data
Tokenizing train data
3
0 / 20000
50 / 20000
100 / 20000
150 / 20000
200 / 20000
250 / 20000
300 / 20000
350 / 20000
400 / 20000
450 / 20000
500 / 20000
550 / 20000
600 / 20000
650 / 20000
700 / 20000
750 / 20000
800 / 20000
850 / 20000
900 / 20000
950 / 20000
1000 / 20000
1050 / 20000
1100 / 20000
1150 / 20000
1200 / 20000
1250 / 20000
1300 / 20000
1350 / 20000
1400 / 20000
1450 / 20000
1500 / 20000
1550 / 20000
1600 / 20000
1650 / 20000
1700 / 20000
1750 / 20000
1800 / 20000
1850 / 20000
1900 / 20000
1950 / 20000
2000 / 20000
2050 / 20000
2100 / 20000
2150 / 20000
2200 / 20000
2250 / 20000
2300 / 20000
2350 / 20000
2400 / 20000
2450 / 20000
2500 / 20000
2550 / 20000
2600 / 20000
2650 / 20000
2700 / 20000
2750 / 20000
2800 / 20000
2850 / 20000
2900 / 20000
2950 / 20000
3000 / 20000
3050

In [18]:
MAX_SENTENCE_LENGTH = 200

In [19]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's 
    readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

In [20]:
def imdb_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), 
            torch.LongTensor(length_list), 
            torch.LongTensor(label_list)]

In [21]:
class BagOfNgrams(nn.Module):
    """
    BagOfNgrams classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNgrams, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

In [22]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [23]:
params = [[1e-2,1e-1,1,2], ## learning rates
          list(range(1,4)), ## ngrams
          [1e5,1e6], ## vocab size
          [100,200], ## embedding size
#          [100,200], ## max sentence length
          [64,128] ## batch size
         ]

# params = [[1e-1,1,2,5], ## learning rates
#           list(range(1,2)), ## ngrams
#           [1e5], ## vocab size
#           [100], ## embedding size
#           [100], ## max sentence length
#           [64] ## batch size
#          ]

print(len([*itertools.product(*params)]))
[*itertools.product(*params)]

96


[(0.01, 1, 100000.0, 100, 64),
 (0.01, 1, 100000.0, 100, 128),
 (0.01, 1, 100000.0, 200, 64),
 (0.01, 1, 100000.0, 200, 128),
 (0.01, 1, 1000000.0, 100, 64),
 (0.01, 1, 1000000.0, 100, 128),
 (0.01, 1, 1000000.0, 200, 64),
 (0.01, 1, 1000000.0, 200, 128),
 (0.01, 2, 100000.0, 100, 64),
 (0.01, 2, 100000.0, 100, 128),
 (0.01, 2, 100000.0, 200, 64),
 (0.01, 2, 100000.0, 200, 128),
 (0.01, 2, 1000000.0, 100, 64),
 (0.01, 2, 1000000.0, 100, 128),
 (0.01, 2, 1000000.0, 200, 64),
 (0.01, 2, 1000000.0, 200, 128),
 (0.01, 3, 100000.0, 100, 64),
 (0.01, 3, 100000.0, 100, 128),
 (0.01, 3, 100000.0, 200, 64),
 (0.01, 3, 100000.0, 200, 128),
 (0.01, 3, 1000000.0, 100, 64),
 (0.01, 3, 1000000.0, 100, 128),
 (0.01, 3, 1000000.0, 200, 64),
 (0.01, 3, 1000000.0, 200, 128),
 (0.1, 1, 100000.0, 100, 64),
 (0.1, 1, 100000.0, 100, 128),
 (0.1, 1, 100000.0, 200, 64),
 (0.1, 1, 100000.0, 200, 128),
 (0.1, 1, 1000000.0, 100, 64),
 (0.1, 1, 1000000.0, 100, 128),
 (0.1, 1, 1000000.0, 200, 64),
 (0.1, 1, 100000

In [None]:
def hyperparameter_search(hyperparameter_space=params,
                          epochs=5,
                          optimizer_name = "Adam",
                          lemmatize = False):

    # returns all the permutations of the parameter search space
    param_space = [*itertools.product(*params)]
    
    # validation loss dictionary
    val_losses = {}
    
    # counter for progress
    count = 0
    
    for param_comb in param_space:
        print("-----------------------------------------------------------")
        print("Parameter Combination = " + str(count+1) + " / " + str(len(param_space)))
        count = count + 1      
        
        NUM_EPOCHS = epochs
        lr_rate = param_comb[0]             # learning rate
        grams = param_comb[1]               # n-grams
        max_vocab_size = int(param_comb[2]) # vocabulary size
        embed_dimension = param_comb[3]     # embedding vector size
        #max_sentence_length = int(param_comb[4]) # max sentence length of data loader
        BATCH_SIZE = param_comb[4]
        
        print("Learning Rate = " + str(lr_rate))
        print("Ngram = " + str(grams))
        print("Vocab Size = " + str(max_vocab_size))
        print("Embedding Dimension = " + str(embed_dimension))
        #print("Max Sentence Length = " + str(max_sentence_length))
        print("Batch Size = " + str(BATCH_SIZE))

        # Tokenization
        # All tokens are created before the hyperparameter search loop
        # Load the tokens here
        if lemmatize == True:
            grams = "lemma_" + str(grams)
        
        train_data_tokens = pkl.load(open("train_data_tokens_"+str(grams)+".p", "rb"))
        all_train_tokens = pkl.load(open("all_train_tokens_"+str(grams)+".p", "rb"))

        val_data_tokens = pkl.load(open("val_data_tokens_"+str(grams)+".p", "rb"))
        
        print("Train dataset size is {}".format(len(train_data_tokens)))
        print("Val dataset size is {}".format(len(val_data_tokens)))
        print("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))
        
        # Building Vocabulary
        # implicitly gets the max_vocab_size parameter
        token2id, id2token = build_vocab(all_train_tokens,
                                         max_vocab_size=max_vocab_size)
        
        # Lets check the dictionary by loading random token from it
        random_token_id = random.randint(0, len(id2token)-1)
        random_token = id2token[random_token_id]
        print ("Token id {} -> token {}".format(random_token_id, id2token[random_token_id]))
        print ("Token {} -> token id {}".format(random_token, token2id[random_token]))
        
        train_data_indices = token2index_dataset(train_data_tokens, 
                                                 token2id = token2id)
        val_data_indices = token2index_dataset(val_data_tokens, 
                                               token2id = token2id)
        # double checking
        print ("Train dataset size is {}".format(len(train_data_indices)))
        print ("Val dataset size is {}".format(len(val_data_indices)))
        
        

        # Load training and validation data
        train_dataset = IMDBDataset(train_data_indices, 
                                    training_labels)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_func,
                                                   shuffle=True)

        val_dataset = IMDBDataset(val_data_indices, 
                                  validation_labels)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_func,
                                                   shuffle=True)  

        # Initialize the N-gram Model
        model = BagOfNgrams(len(id2token), embed_dimension)
        
        # Both Adam and SGD will be tried
        if optimizer_name == "Adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
        elif optimizer_name == "SGD":
            optimizer = torch.optim.SGD(model.parameters(), lr=lr_rate)
        else:
            print("this optimizer is not implemented yet")
        
        # Cross Entropy Loss will be used
        criterion = torch.nn.CrossEntropyLoss()  
        
        # Validation Losses will be stored in a list
        # Caution: Two different optimizers
        val_losses[param_comb] = []
        
    #for optimizer in optimizers:
        print("Optimization Start")
        print(optimizer)

        for epoch in range(NUM_EPOCHS):
            for i, (data, lengths, labels) in enumerate(train_loader):
                model.train()
                data_batch, length_batch, label_batch = data, lengths, labels
                optimizer.zero_grad()
                outputs = model(data_batch, length_batch)
                loss = criterion(outputs, label_batch)
                loss.backward()
                optimizer.step()
                # Validate every 100 iterations
                # Adjust it to accustom changing batch sizes
                if i > 0 and i % (50 * (64 / BATCH_SIZE)) == 0:

                    # Accuracy Calculations
                    train_acc = test_model(train_loader, model)
                    val_acc = test_model(val_loader, model)
                    val_losses[param_comb].append(val_acc)

                    # Logging
                    print('Epoch:[{}/{}],Step:[{}/{}],Training Acc:{},Validation Acc:{}'.format( 
                               epoch+1, NUM_EPOCHS, 
                                i+1, len(train_loader), 
                                train_acc, val_acc))
                      
    return val_losses


In [None]:
param_val_losses_adam_nolemma = hyperparameter_search(hyperparameter_space = params,
                                         epochs = 5,
                                         optimizer_name = "Adam",
                                          lemmatize = False)
pkl.dump(param_val_losses_adam_nolemma, 
         open("param_val_losses_adam_nolemma.p", "wb"))

-----------------------------------------------------------
Parameter Combination = 1 / 96
Learning Rate = 0.01
Ngram = 1
Vocab Size = 100000
Embedding Dimension = 100
Batch Size = 64
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 4760038
Token id 15430 -> token sandwich
Token sandwich -> token id 15430
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
Epoch:[1/5],Step:[51/313],Training Acc:74.225,Validation Acc:72.4
Epoch:[1/5],Step:[101/313],Training Acc:83.86,Validation Acc:81.1
Epoch:[1/5],Step:[151/313],Training Acc:84.935,Validation Acc:82.1
Epoch:[1/5],Step:[201/313],Training Acc:88.36,Validation Acc:84.12
Epoch:[1/5],Step:[251/313],Training Acc:92.58,Validation Acc:86.28
Epoch:[1/5],Step:[301/313],Training Acc:93.81,Validation Acc:86.84
Epoch:[2/5],Step:[51/313],Training Acc:94.72,Validation 

Epoch:[1/5],Step:[26/157],Training Acc:67.76,Validation Acc:66.32
Epoch:[1/5],Step:[51/157],Training Acc:78.605,Validation Acc:76.16
Epoch:[1/5],Step:[76/157],Training Acc:85.435,Validation Acc:82.56
Epoch:[1/5],Step:[101/157],Training Acc:88.655,Validation Acc:83.98
Epoch:[1/5],Step:[126/157],Training Acc:89.305,Validation Acc:83.74
Epoch:[1/5],Step:[151/157],Training Acc:92.62,Validation Acc:86.1
Epoch:[2/5],Step:[26/157],Training Acc:93.885,Validation Acc:86.44
Epoch:[2/5],Step:[51/157],Training Acc:94.565,Validation Acc:86.58
Epoch:[2/5],Step:[76/157],Training Acc:95.095,Validation Acc:86.66
Epoch:[2/5],Step:[101/157],Training Acc:95.98,Validation Acc:86.86
Epoch:[2/5],Step:[126/157],Training Acc:96.73,Validation Acc:87.32
Epoch:[2/5],Step:[151/157],Training Acc:97.73,Validation Acc:87.5
Epoch:[3/5],Step:[26/157],Training Acc:98.05,Validation Acc:87.54
Epoch:[3/5],Step:[51/157],Training Acc:98.125,Validation Acc:87.38
Epoch:[3/5],Step:[76/157],Training Acc:98.5,Validation Acc:87.42

Epoch:[2/5],Step:[151/313],Training Acc:96.665,Validation Acc:87.12
Epoch:[2/5],Step:[201/313],Training Acc:96.29,Validation Acc:86.3
Epoch:[2/5],Step:[251/313],Training Acc:98.06,Validation Acc:87.78
Epoch:[2/5],Step:[301/313],Training Acc:98.675,Validation Acc:87.32
Epoch:[3/5],Step:[51/313],Training Acc:98.755,Validation Acc:87.2
Epoch:[3/5],Step:[101/313],Training Acc:99.0,Validation Acc:87.04
Epoch:[3/5],Step:[151/313],Training Acc:98.7,Validation Acc:85.54
Epoch:[3/5],Step:[201/313],Training Acc:99.25,Validation Acc:86.7
Epoch:[3/5],Step:[251/313],Training Acc:99.34,Validation Acc:86.58
Epoch:[3/5],Step:[301/313],Training Acc:99.63,Validation Acc:86.58
Epoch:[4/5],Step:[51/313],Training Acc:99.695,Validation Acc:86.54
Epoch:[4/5],Step:[101/313],Training Acc:99.79,Validation Acc:86.48
Epoch:[4/5],Step:[151/313],Training Acc:99.84,Validation Acc:86.38
Epoch:[4/5],Step:[201/313],Training Acc:99.855,Validation Acc:86.22
Epoch:[4/5],Step:[251/313],Training Acc:99.865,Validation Acc:86

Epoch:[3/5],Step:[126/157],Training Acc:98.89,Validation Acc:87.82
Epoch:[3/5],Step:[151/157],Training Acc:99.265,Validation Acc:88.18
Epoch:[4/5],Step:[26/157],Training Acc:99.46,Validation Acc:87.88
Epoch:[4/5],Step:[51/157],Training Acc:99.535,Validation Acc:87.78
Epoch:[4/5],Step:[76/157],Training Acc:99.55,Validation Acc:87.74
Epoch:[4/5],Step:[101/157],Training Acc:99.67,Validation Acc:87.66
Epoch:[4/5],Step:[126/157],Training Acc:99.695,Validation Acc:87.52
Epoch:[4/5],Step:[151/157],Training Acc:99.83,Validation Acc:87.52
Epoch:[5/5],Step:[26/157],Training Acc:99.84,Validation Acc:87.26
Epoch:[5/5],Step:[51/157],Training Acc:99.905,Validation Acc:87.36
Epoch:[5/5],Step:[76/157],Training Acc:99.925,Validation Acc:87.44
Epoch:[5/5],Step:[101/157],Training Acc:99.95,Validation Acc:87.42
Epoch:[5/5],Step:[126/157],Training Acc:99.97,Validation Acc:87.52
Epoch:[5/5],Step:[151/157],Training Acc:99.975,Validation Acc:87.6
-----------------------------------------------------------
Par

Epoch:[5/5],Step:[51/313],Training Acc:99.995,Validation Acc:87.52
Epoch:[5/5],Step:[101/313],Training Acc:99.995,Validation Acc:87.54
Epoch:[5/5],Step:[151/313],Training Acc:99.995,Validation Acc:87.54
Epoch:[5/5],Step:[201/313],Training Acc:99.995,Validation Acc:87.54
Epoch:[5/5],Step:[251/313],Training Acc:99.995,Validation Acc:87.32
Epoch:[5/5],Step:[301/313],Training Acc:100.0,Validation Acc:87.4
-----------------------------------------------------------
Parameter Combination = 14 / 96
Learning Rate = 0.01
Ngram = 2
Vocab Size = 1000000
Embedding Dimension = 100
Batch Size = 128
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 9500076
Token id 245036 -> token a monetary
Token a monetary -> token id 245036
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
Epoch:[1/5],Step:[26/157],Training Acc:55.

Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 14220114
Token id 14824 -> token ... this is
Token ... this is -> token id 14824
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
Epoch:[1/5],Step:[51/313],Training Acc:67.185,Validation Acc:65.88
Epoch:[1/5],Step:[101/313],Training Acc:81.335,Validation Acc:78.62
Epoch:[1/5],Step:[151/313],Training Acc:86.24,Validation Acc:82.26
Epoch:[1/5],Step:[201/313],Training Acc:90.18,Validation Acc:85.38
Epoch:[1/5],Step:[251/313],Training Acc:92.45,Validation Acc:86.42
Epoch:[1/5],Step:[301/313],Training Acc:94.17,Validation Acc:87.0
Epoch:[2/5],Step:[51/313],Training Acc:94.78,Validation Acc:87.02
Epoch:[2/5],Step:[101/313],Training Acc:95.5,Validation Acc:86.6
Epoch:[2/5],Step:[151/313],Training Acc:96.11,Validation Acc:87.04
Epoch:[2/5],Step:[201/313],Traini

Epoch:[1/5],Step:[101/157],Training Acc:88.805,Validation Acc:83.94
Epoch:[1/5],Step:[126/157],Training Acc:91.515,Validation Acc:85.8
Epoch:[1/5],Step:[151/157],Training Acc:93.255,Validation Acc:86.44
Epoch:[2/5],Step:[26/157],Training Acc:94.115,Validation Acc:86.8
Epoch:[2/5],Step:[51/157],Training Acc:95.15,Validation Acc:86.92
Epoch:[2/5],Step:[76/157],Training Acc:95.72,Validation Acc:87.04
Epoch:[2/5],Step:[101/157],Training Acc:96.635,Validation Acc:87.2
Epoch:[2/5],Step:[126/157],Training Acc:97.44,Validation Acc:87.32
Epoch:[2/5],Step:[151/157],Training Acc:97.04,Validation Acc:85.92
Epoch:[3/5],Step:[26/157],Training Acc:98.45,Validation Acc:87.18
Epoch:[3/5],Step:[51/157],Training Acc:98.545,Validation Acc:87.04
Epoch:[3/5],Step:[76/157],Training Acc:98.77,Validation Acc:87.2
Epoch:[3/5],Step:[101/157],Training Acc:98.985,Validation Acc:86.92
Epoch:[3/5],Step:[126/157],Training Acc:99.23,Validation Acc:87.18
Epoch:[3/5],Step:[151/157],Training Acc:99.5,Validation Acc:86.88

Epoch:[2/5],Step:[301/313],Training Acc:99.365,Validation Acc:86.6
Epoch:[3/5],Step:[51/313],Training Acc:99.655,Validation Acc:86.82
Epoch:[3/5],Step:[101/313],Training Acc:99.66,Validation Acc:87.1
Epoch:[3/5],Step:[151/313],Training Acc:99.69,Validation Acc:87.3
Epoch:[3/5],Step:[201/313],Training Acc:99.72,Validation Acc:86.74
Epoch:[3/5],Step:[251/313],Training Acc:99.83,Validation Acc:86.9
Epoch:[3/5],Step:[301/313],Training Acc:99.94,Validation Acc:86.96
Epoch:[4/5],Step:[51/313],Training Acc:99.955,Validation Acc:86.34
Epoch:[4/5],Step:[101/313],Training Acc:99.985,Validation Acc:86.74
Epoch:[4/5],Step:[151/313],Training Acc:99.985,Validation Acc:86.54
Epoch:[4/5],Step:[201/313],Training Acc:99.995,Validation Acc:86.96
Epoch:[4/5],Step:[251/313],Training Acc:100.0,Validation Acc:86.9
Epoch:[4/5],Step:[301/313],Training Acc:100.0,Validation Acc:86.82
Epoch:[5/5],Step:[51/313],Training Acc:99.995,Validation Acc:87.02
Epoch:[5/5],Step:[101/313],Training Acc:100.0,Validation Acc:86

Epoch:[4/5],Step:[51/157],Training Acc:99.475,Validation Acc:84.7
Epoch:[4/5],Step:[76/157],Training Acc:99.51,Validation Acc:84.44
Epoch:[4/5],Step:[101/157],Training Acc:99.245,Validation Acc:84.34
Epoch:[4/5],Step:[126/157],Training Acc:99.04,Validation Acc:83.7
Epoch:[4/5],Step:[151/157],Training Acc:99.45,Validation Acc:84.4
Epoch:[5/5],Step:[26/157],Training Acc:99.435,Validation Acc:84.06
Epoch:[5/5],Step:[51/157],Training Acc:99.46,Validation Acc:84.0
Epoch:[5/5],Step:[76/157],Training Acc:99.46,Validation Acc:83.92
Epoch:[5/5],Step:[101/157],Training Acc:99.59,Validation Acc:84.14
Epoch:[5/5],Step:[126/157],Training Acc:99.71,Validation Acc:84.74
Epoch:[5/5],Step:[151/157],Training Acc:99.495,Validation Acc:83.96
-----------------------------------------------------------
Parameter Combination = 27 / 96
Learning Rate = 0.1
Ngram = 1
Vocab Size = 100000
Embedding Dimension = 200
Batch Size = 64
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train

Epoch:[5/5],Step:[251/313],Training Acc:98.395,Validation Acc:82.96
Epoch:[5/5],Step:[301/313],Training Acc:98.885,Validation Acc:83.78
-----------------------------------------------------------
Parameter Combination = 30 / 96
Learning Rate = 0.1
Ngram = 1
Vocab Size = 1000000
Embedding Dimension = 100
Batch Size = 128
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 4760038
Token id 42577 -> token reclaimed
Token reclaimed -> token id 42577
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.1
    weight_decay: 0
)
Epoch:[1/5],Step:[26/157],Training Acc:85.715,Validation Acc:83.68
Epoch:[1/5],Step:[51/157],Training Acc:88.36,Validation Acc:83.6
Epoch:[1/5],Step:[76/157],Training Acc:90.59,Validation Acc:84.26
Epoch:[1/5],Step:[101/157],Training Acc:93.755,Validation Acc:86.42
Epoch:[1/5],Step:[126/157],Training Acc:95.275,Vali

Epoch:[1/5],Step:[51/313],Training Acc:85.595,Validation Acc:82.66
Epoch:[1/5],Step:[101/313],Training Acc:88.05,Validation Acc:83.28
Epoch:[1/5],Step:[151/313],Training Acc:89.87,Validation Acc:83.2
Epoch:[1/5],Step:[201/313],Training Acc:93.905,Validation Acc:86.48
Epoch:[1/5],Step:[251/313],Training Acc:95.565,Validation Acc:86.68
Epoch:[1/5],Step:[301/313],Training Acc:97.135,Validation Acc:86.72
Epoch:[2/5],Step:[51/313],Training Acc:97.71,Validation Acc:85.84
Epoch:[2/5],Step:[101/313],Training Acc:97.8,Validation Acc:86.02
Epoch:[2/5],Step:[151/313],Training Acc:98.05,Validation Acc:85.96
Epoch:[2/5],Step:[201/313],Training Acc:98.22,Validation Acc:86.16
Epoch:[2/5],Step:[251/313],Training Acc:97.915,Validation Acc:84.12
Epoch:[2/5],Step:[301/313],Training Acc:98.065,Validation Acc:84.7
Epoch:[3/5],Step:[51/313],Training Acc:99.245,Validation Acc:85.62
Epoch:[3/5],Step:[101/313],Training Acc:99.275,Validation Acc:85.42
Epoch:[3/5],Step:[151/313],Training Acc:98.645,Validation Ac

Epoch:[2/5],Step:[101/157],Training Acc:97.675,Validation Acc:84.6
Epoch:[2/5],Step:[126/157],Training Acc:98.85,Validation Acc:85.6
Epoch:[2/5],Step:[151/157],Training Acc:99.285,Validation Acc:85.42
Epoch:[3/5],Step:[26/157],Training Acc:99.41,Validation Acc:85.22
Epoch:[3/5],Step:[51/157],Training Acc:99.365,Validation Acc:85.5
Epoch:[3/5],Step:[76/157],Training Acc:99.385,Validation Acc:84.92
Epoch:[3/5],Step:[101/157],Training Acc:98.875,Validation Acc:84.74
Epoch:[3/5],Step:[126/157],Training Acc:99.39,Validation Acc:85.22
Epoch:[3/5],Step:[151/157],Training Acc:99.18,Validation Acc:84.52
Epoch:[4/5],Step:[26/157],Training Acc:99.49,Validation Acc:85.56
Epoch:[4/5],Step:[51/157],Training Acc:99.625,Validation Acc:85.26
Epoch:[4/5],Step:[76/157],Training Acc:99.56,Validation Acc:85.0
Epoch:[4/5],Step:[101/157],Training Acc:99.56,Validation Acc:84.76
Epoch:[4/5],Step:[126/157],Training Acc:99.525,Validation Acc:84.28
Epoch:[4/5],Step:[151/157],Training Acc:98.995,Validation Acc:84.

Epoch:[3/5],Step:[301/313],Training Acc:99.195,Validation Acc:84.24
Epoch:[4/5],Step:[51/313],Training Acc:99.095,Validation Acc:83.42
Epoch:[4/5],Step:[101/313],Training Acc:97.745,Validation Acc:81.66
Epoch:[4/5],Step:[151/313],Training Acc:98.525,Validation Acc:83.48
Epoch:[4/5],Step:[201/313],Training Acc:98.185,Validation Acc:82.96
Epoch:[4/5],Step:[251/313],Training Acc:98.545,Validation Acc:83.66
Epoch:[4/5],Step:[301/313],Training Acc:99.19,Validation Acc:84.62
Epoch:[5/5],Step:[51/313],Training Acc:99.38,Validation Acc:84.28
Epoch:[5/5],Step:[101/313],Training Acc:99.155,Validation Acc:83.1
Epoch:[5/5],Step:[151/313],Training Acc:99.325,Validation Acc:84.18
Epoch:[5/5],Step:[201/313],Training Acc:99.375,Validation Acc:84.14
Epoch:[5/5],Step:[251/313],Training Acc:98.57,Validation Acc:83.14
Epoch:[5/5],Step:[301/313],Training Acc:98.91,Validation Acc:83.92
-----------------------------------------------------------
Parameter Combination = 40 / 96
Learning Rate = 0.1
Ngram = 2
V

In [None]:
param_val_losses_adam_lemma = hyperparameter_search(hyperparameter_space = params,
                                         epochs = 5,
                                         optimizer_name = "Adam",
                                          lemmatize = True)
pkl.dump(param_val_losses_adam_lemma, 
         open("param_val_losses_adam_lemma.p", "wb"))

In [None]:
param_val_losses_sgd_nolemma = hyperparameter_search(hyperparameter_space = params,
                                         epochs = 5,
                                         optimizer_name = "SGD",
                                          lemmatize = False)
pkl.dump(param_val_losses_sgd_nolemma,
         open("param_val_losses_sgd_nolemma.p", "wb"))

In [None]:
param_val_losses_sgd_lemma = hyperparameter_search(hyperparameter_space = params,
                                         epochs = 5,
                                         optimizer_name = "SGD",
                                          lemmatize = True)
pkl.dump(param_val_losses_sgd_lemma,
         open("param_val_losses_sgd_lemma.p", "wb"))