In [113]:
import pandas as pd
import numpy as np
import os
import spacy
import string
import pickle as pkl
from nltk import ngrams
from collections import Counter
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
%matplotlib inline

## Data Loading

In [2]:
def load_data(folder_path, label): 
    scores = []
    data_list = []
    for file in os.listdir(folder_path):
        if file.endswith(".txt"):
            scores.append(int(file[file.find("_")+1:file.find(".")]))
            with open(folder_path+file) as f:
                data_list.append(f.read())
    
    labels = label*np.ones(len(scores))
    return data_list, labels, scores

In [3]:
def merge_data(data1, data2, label1, label2, score1, score2, split, shuffle, train_size=20000):
    data = data1+data2
    labels = np.concatenate([label1, label2]).tolist()
    scores = score1+score2
    
    if shuffle:
        np.random.seed(0)
        index = np.random.permutation(len(data))
        data = np.array(data)[index].tolist()
        labels = np.array(labels)[index].tolist()
        scores = np.array(scores)[index].tolist()
    
    if split:
        train_data = data[:train_size]
        val_data = data[train_size:]
        train_labels = labels[:train_size]
        val_labels = labels[train_size:]
        train_scores = scores[:train_size]
        val_scores = scores[train_size]
        return train_data, train_labels, train_scores, val_data, val_labels, val_scores
    
    return data, labels, scores

In [4]:
train_pos_path = os.getcwd()+'/aclImdb/train/pos/'
train_neg_path = os.getcwd()+'/aclImdb/train/neg/'
test_pos_path = os.getcwd()+'/aclImdb/test/pos/'
test_neg_path = os.getcwd()+'/aclImdb/test/neg/'

In [5]:
train_pos_data, train_pos_label, train_pos_scores = load_data(train_pos_path, 1)
train_neg_data, train_neg_label, train_neg_scores = load_data(train_neg_path, 0)
test_pos_data, test_pos_label, test_pos_scores = load_data(test_pos_path, 1)
test_neg_data, test_neg_label, test_neg_scores = load_data(test_neg_path, 0)

In [6]:
# train_data, train_labels, train_scores, \
# val_data, val_labels, val_scores = merge_data(train_pos_data, train_neg_data, train_pos_label, train_neg_label,
#                                               train_pos_scores, train_neg_scores, True, True)

In [7]:
test_data, test_labels, test_scores = \
merge_data(test_pos_data, test_neg_data, test_pos_label, test_neg_label,
                                              test_pos_scores, test_neg_scores, False, False)

In [8]:
# pkl.dump(train_data, open("train_data.p", "wb"))
# pkl.dump(train_labels, open("train_labels.p", "wb"))
# pkl.dump(train_scores, open("train_scores.p", "wb"))
# pkl.dump(val_data, open("val_data.p", "wb"))
# pkl.dump(val_labels, open("val_labels.p", "wb"))
# pkl.dump(val_scores, open("val_scores.p", "wb"))
train_data = pkl.load(open("train_data.p", "rb"))
train_labels = pkl.load(open("train_labels.p", "rb"))
train_scores = pkl.load(open("train_scores.p", "rb"))
val_data = pkl.load(open("val_data.p", "rb"))
val_labels = pkl.load(open("val_labels.p", "rb"))
val_scores = pkl.load(open("val_scores.p", "rb"))

## Data preprocessing (Tokenization)

In [9]:
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

def tokenize(sent, tokenization):
    tokens = tokenizer(sent)
    if tokenization:
        return [token.text.lower() for token in tokens if (token.text not in punctuations)]
    else:
        return [token.text for token in tokens]

### 1-gram (word)

In [10]:
def tokenize_dataset(dataset, tokenization):
    token_dataset = []
    all_tokens = []
    
    for sample in dataset:
        tokens = tokenize(sample, tokenization)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

In [11]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens, _ = tokenize_dataset(val_data, True)
# pkl.dump(val_data_tokens, open("val_data_tokens.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens, _ = tokenize_dataset(test_data, True)
# pkl.dump(test_data_tokens, open("test_data_tokens.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens, all_train_tokens = tokenize_dataset(train_data, True)
# pkl.dump(train_data_tokens, open("train_data_tokens.p", "wb"))
# pkl.dump(all_train_tokens, open("all_train_tokens.p", "wb"))

In [12]:
train_data_tokens = pkl.load(open("train_data_tokens.p", "rb"))
all_train_tokens = pkl.load(open("all_train_tokens.p", "rb"))
val_data_tokens = pkl.load(open("val_data_tokens.p", "rb"))
test_data_tokens = pkl.load(open("test_data_tokens.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4809135


### 1-gram (word) no tokenization (lowercase & remove punctuations)

In [13]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_ntk, _ = tokenize_dataset(val_data, False)
# pkl.dump(val_data_tokens_ntk, open("val_data_tokens_ntk.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_ntk, _ = tokenize_dataset(test_data, False)
# pkl.dump(test_data_tokens_ntk, open("test_data_tokens_ntk.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_ntk, all_train_tokens_ntk = tokenize_dataset(train_data, False)
# pkl.dump(train_data_tokens_ntk, open("train_data_tokens_ntk.p", "wb"))
# pkl.dump(all_train_tokens_ntk, open("all_train_tokens_ntk.p", "wb"))

In [14]:
train_data_tokens_ntk = pkl.load(open("train_data_tokens_ntk.p", "rb"))
all_train_tokens_ntk = pkl.load(open("all_train_tokens_ntk.p", "rb"))
val_data_tokens_ntk = pkl.load(open("val_data_tokens_ntk.p", "rb"))
test_data_tokens_ntk = pkl.load(open("test_data_tokens_ntk.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_ntk)))
print ("Val dataset size is {}".format(len(val_data_tokens_ntk)))
print ("Test dataset size is {}".format(len(test_data_tokens_ntk)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 5439707


### 2-grams

In [15]:
def tokenize_dataset_ngram(dataset, n, tokenization):
    token_dataset = []
    all_tokens = []
    
    for sample in dataset:
        tokens = tokenize(sample, tokenization)
        n_grams = list(ngrams(tokens, n))
        token_dataset.append(n_grams)
        all_tokens += n_grams

    return token_dataset, all_tokens

In [16]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_n2, _ = tokenize_dataset_ngram(val_data, 2, True)
# pkl.dump(val_data_tokens_n2, open("val_data_tokens_n2.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_n2, _ = tokenize_dataset_ngram(test_data, 2, True)
# pkl.dump(test_data_tokens_n2, open("test_data_tokens_n2.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_n2, all_train_tokens_n2 = tokenize_dataset_ngram(train_data, 2, True)
# pkl.dump(train_data_tokens_n2, open("train_data_tokens_n2.p", "wb"))
# pkl.dump(all_train_tokens_n2, open("all_train_tokens_n2.p", "wb"))

In [17]:
train_data_tokens_n2 = pkl.load(open("train_data_tokens_n2.p", "rb"))
all_train_tokens_n2 = pkl.load(open("all_train_tokens_n2.p", "rb"))
val_data_tokens_n2 = pkl.load(open("val_data_tokens_n2.p", "rb"))
test_data_tokens_n2 = pkl.load(open("test_data_tokens_n2.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_n2)))
print ("Val dataset size is {}".format(len(val_data_tokens_n2)))
print ("Test dataset size is {}".format(len(test_data_tokens_n2)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_n2)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4789135


### 2-grams no tokenization (lowercase & remove punctuation)

In [18]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_n2_ntk, _ = tokenize_dataset_ngram(val_data, 2, False)
# pkl.dump(val_data_tokens_n2_ntk, open("val_data_tokens_n2_ntk.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_n2_ntk, _ = tokenize_dataset_ngram(test_data, 2, False)
# pkl.dump(test_data_tokens_n2_ntk, open("test_data_tokens_n2_ntk.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_n2_ntk, all_train_tokens_n2_ntk = tokenize_dataset_ngram(train_data, 2, False)
# pkl.dump(train_data_tokens_n2_ntk, open("train_data_tokens_n2_ntk.p", "wb"))
# pkl.dump(all_train_tokens_n2_ntk, open("all_train_tokens_n2_ntk.p", "wb"))

In [19]:
train_data_tokens_n2_ntk = pkl.load(open("train_data_tokens_n2_ntk.p", "rb"))
all_train_tokens_n2_ntk = pkl.load(open("all_train_tokens_n2_ntk.p", "rb"))
val_data_tokens_n2_ntk = pkl.load(open("val_data_tokens_n2_ntk.p", "rb"))
test_data_tokens_n2_ntk = pkl.load(open("test_data_tokens_n2_ntk.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_n2_ntk)))
print ("Val dataset size is {}".format(len(val_data_tokens_n2_ntk)))
print ("Test dataset size is {}".format(len(test_data_tokens_n2_ntk)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_n2_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 5419707


### 3-grams

In [20]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_n3, _ = tokenize_dataset_ngram(val_data, 3, True)
# pkl.dump(val_data_tokens_n3, open("val_data_tokens_n3.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_n3, _ = tokenize_dataset_ngram(test_data, 3, True)
# pkl.dump(test_data_tokens_n3, open("test_data_tokens_n3.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_n3, all_train_tokens_n3 = tokenize_dataset_ngram(train_data, 3, True)
# pkl.dump(train_data_tokens_n3, open("train_data_tokens_n3.p", "wb"))
# pkl.dump(all_train_tokens_n3, open("all_train_tokens_n3.p", "wb"))

In [21]:
train_data_tokens_n3 = pkl.load(open("train_data_tokens_n3.p", "rb"))
all_train_tokens_n3 = pkl.load(open("all_train_tokens_n3.p", "rb"))
val_data_tokens_n3 = pkl.load(open("val_data_tokens_n3.p", "rb"))
test_data_tokens_n3 = pkl.load(open("test_data_tokens_n3.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_n3)))
print ("Val dataset size is {}".format(len(val_data_tokens_n3)))
print ("Test dataset size is {}".format(len(test_data_tokens_n3)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_n3)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4769135


### 3-grams no tokenization (lowercase & remove punctuation)

In [22]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_n3_ntk, _ = tokenize_dataset_ngram(val_data, 3, False)
# pkl.dump(val_data_tokens_n3_ntk, open("val_data_tokens_n3_ntk.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_n3_ntk, _ = tokenize_dataset_ngram(test_data, 3, False)
# pkl.dump(test_data_tokens_n3_ntk, open("test_data_tokens_n3_ntk.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_n3_ntk, all_train_tokens_n3_ntk = tokenize_dataset_ngram(train_data, 3, False)
# pkl.dump(train_data_tokens_n3_ntk, open("train_data_tokens_n3_ntk.p", "wb"))
# pkl.dump(all_train_tokens_n3_ntk, open("all_train_tokens_n3_ntk.p", "wb"))

In [23]:
train_data_tokens_n3_ntk = pkl.load(open("train_data_tokens_n3_ntk.p", "rb"))
all_train_tokens_n3_ntk = pkl.load(open("all_train_tokens_n3_ntk.p", "rb"))
val_data_tokens_n3_ntk = pkl.load(open("val_data_tokens_n3_ntk.p", "rb"))
test_data_tokens_n3_ntk = pkl.load(open("test_data_tokens_n3_ntk.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_n3_ntk)))
print ("Val dataset size is {}".format(len(val_data_tokens_n3_ntk)))
print ("Test dataset size is {}".format(len(test_data_tokens_n3_ntk)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_n3_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 5399707


### 4-grams

In [24]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_n4, _ = tokenize_dataset_ngram(val_data, 4, True)
# pkl.dump(val_data_tokens_n4, open("val_data_tokens_n4.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_n4, _ = tokenize_dataset_ngram(test_data, 4, True)
# pkl.dump(test_data_tokens_n4, open("test_data_tokens_n4.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_n4, all_train_tokens_n4 = tokenize_dataset_ngram(train_data, 4, True)
# pkl.dump(train_data_tokens_n4, open("train_data_tokens_n4.p", "wb"))
# pkl.dump(all_train_tokens_n4, open("all_train_tokens_n4.p", "wb"))

In [25]:
train_data_tokens_n4 = pkl.load(open("train_data_tokens_n4.p", "rb"))
all_train_tokens_n4 = pkl.load(open("all_train_tokens_n4.p", "rb"))
val_data_tokens_n4 = pkl.load(open("val_data_tokens_n4.p", "rb"))
test_data_tokens_n4 = pkl.load(open("test_data_tokens_n4.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_n4)))
print ("Val dataset size is {}".format(len(val_data_tokens_n4)))
print ("Test dataset size is {}".format(len(test_data_tokens_n4)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_n4)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4749135


### 4-grams no tokenization (lowercase & remove punctuation)

In [26]:
# val set tokens
# print ("Tokenizing val data")
# val_data_tokens_n4_ntk, _ = tokenize_dataset_ngram(val_data, 4, False)
# pkl.dump(val_data_tokens_n4_ntk, open("val_data_tokens_n4_ntk.p", "wb"))

# test set tokens
# print ("Tokenizing test data")
# test_data_tokens_n4_ntk, _ = tokenize_dataset_ngram(test_data, 4, False)
# pkl.dump(test_data_tokens_n4_ntk, open("test_data_tokens_n4_ntk.p", "wb"))

# train set tokens
# print ("Tokenizing train data")
# train_data_tokens_n4_ntk, all_train_tokens_n4_ntk = tokenize_dataset_ngram(train_data, 4, False)
# pkl.dump(train_data_tokens_n4_ntk, open("train_data_tokens_n4_ntk.p", "wb"))
# pkl.dump(all_train_tokens_n4_ntk, open("all_train_tokens_n4_ntk.p", "wb"))

In [27]:
train_data_tokens_n4_ntk = pkl.load(open("train_data_tokens_n4_ntk.p", "rb"))
all_train_tokens_n4_ntk = pkl.load(open("all_train_tokens_n4_ntk.p", "rb"))
val_data_tokens_n4_ntk = pkl.load(open("val_data_tokens_n4_ntk.p", "rb"))
test_data_tokens_n4_ntk = pkl.load(open("test_data_tokens_n4_ntk.p", "rb"))

print ("Train dataset size is {}".format(len(train_data_tokens_n4_ntk)))
print ("Val dataset size is {}".format(len(val_data_tokens_n4_ntk)))
print ("Test dataset size is {}".format(len(test_data_tokens_n4_ntk)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens_n4_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 5379707


## Build vocab lists and transform data into indices lists

In [28]:
PAD_IDX = 0
UNK_IDX = 1

In [29]:
def build_vocab(all_tokens, max_vocab_size):
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [30]:
def token2index_dataset(tokens_data, token2id):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

### 1-gram (word)

In [31]:
max_vocab_size = 10000
token2id_n1, id2token_n1 = build_vocab(all_train_tokens, max_vocab_size)

In [32]:
train_data_indices_n1 = token2index_dataset(train_data_tokens, token2id_n1)
val_data_indices_n1 = token2index_dataset(val_data_tokens, token2id_n1)
test_data_indices_n1 = token2index_dataset(test_data_tokens, token2id_n1)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n1)))
print ("Val dataset size is {}".format(len(val_data_indices_n1)))
print ("Test dataset size is {}".format(len(test_data_indices_n1)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 1-gram (word) no tokenization

In [33]:
max_vocab_size = 10000
token2id_n1_ntk, id2token_n1_ntk = build_vocab(all_train_tokens_ntk, max_vocab_size = 10000)

In [34]:
train_data_indices_n1_ntk = token2index_dataset(train_data_tokens_ntk, token2id_n1_ntk)
val_data_indices_n1_ntk = token2index_dataset(val_data_tokens_ntk, token2id_n1_ntk)
test_data_indices_n1_ntk = token2index_dataset(test_data_tokens_ntk, token2id_n1_ntk)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n1_ntk)))
print ("Val dataset size is {}".format(len(val_data_indices_n1_ntk)))
print ("Test dataset size is {}".format(len(test_data_indices_n1_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 2-grams

In [35]:
max_vocab_size = 10000
token2id_n2, id2token_n2 = build_vocab(all_train_tokens_n2, max_vocab_size)

In [36]:
train_data_indices_n2 = token2index_dataset(train_data_tokens_n2, token2id_n2)
val_data_indices_n2 = token2index_dataset(val_data_tokens_n2, token2id_n2)
test_data_indices_n2 = token2index_dataset(test_data_tokens_n2, token2id_n2)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n2)))
print ("Val dataset size is {}".format(len(val_data_indices_n2)))
print ("Test dataset size is {}".format(len(test_data_indices_n2)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 2-grams no tokenization

In [37]:
max_vocab_size = 10000
token2id_n2_ntk, id2token_n2_ntk = build_vocab(all_train_tokens_n2_ntk, max_vocab_size)

In [38]:
train_data_indices_n2_ntk = token2index_dataset(train_data_tokens_n2_ntk, token2id_n2_ntk)
val_data_indices_n2_ntk = token2index_dataset(val_data_tokens_n2_ntk, token2id_n2_ntk)
test_data_indices_n2_ntk = token2index_dataset(test_data_tokens_n2_ntk, token2id_n2_ntk)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n2_ntk)))
print ("Val dataset size is {}".format(len(val_data_indices_n2_ntk)))
print ("Test dataset size is {}".format(len(test_data_indices_n2_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 3-grams

In [39]:
max_vocab_size = 10000
token2id_n3, id2token_n3 = build_vocab(all_train_tokens_n3, max_vocab_size)

In [40]:
train_data_indices_n3 = token2index_dataset(train_data_tokens_n3, token2id_n3)
val_data_indices_n3 = token2index_dataset(val_data_tokens_n3, token2id_n3)
test_data_indices_n3 = token2index_dataset(test_data_tokens_n3, token2id_n3)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n3)))
print ("Val dataset size is {}".format(len(val_data_indices_n3)))
print ("Test dataset size is {}".format(len(test_data_indices_n3)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 3-grams no tokenization

In [41]:
max_vocab_size = 10000
token2id_n3_ntk, id2token_n3_ntk = build_vocab(all_train_tokens_n3_ntk, max_vocab_size)

In [42]:
train_data_indices_n3_ntk = token2index_dataset(train_data_tokens_n3_ntk, token2id_n3_ntk)
val_data_indices_n3_ntk = token2index_dataset(val_data_tokens_n3_ntk, token2id_n3_ntk)
test_data_indices_n3_ntk = token2index_dataset(test_data_tokens_n3_ntk, token2id_n3_ntk)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n3_ntk)))
print ("Val dataset size is {}".format(len(val_data_indices_n3_ntk)))
print ("Test dataset size is {}".format(len(test_data_indices_n3_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 4-grams

In [43]:
max_vocab_size = 10000
token2id_n4, id2token_n4 = build_vocab(all_train_tokens_n4, max_vocab_size)

In [44]:
train_data_indices_n4 = token2index_dataset(train_data_tokens_n4, token2id_n4)
val_data_indices_n4 = token2index_dataset(val_data_tokens_n4, token2id_n4)
test_data_indices_n4 = token2index_dataset(test_data_tokens_n4, token2id_n4)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n4)))
print ("Val dataset size is {}".format(len(val_data_indices_n4)))
print ("Test dataset size is {}".format(len(test_data_indices_n4)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


### 4-grams no tokenization

In [45]:
max_vocab_size = 10000
token2id_n4_ntk, id2token_n4_ntk = build_vocab(all_train_tokens_n4_ntk, max_vocab_size)

In [46]:
train_data_indices_n4_ntk = token2index_dataset(train_data_tokens_n4_ntk, token2id_n4_ntk)
val_data_indices_n4_ntk = token2index_dataset(val_data_tokens_n4_ntk, token2id_n4_ntk)
test_data_indices_n4_ntk = token2index_dataset(test_data_tokens_n4_ntk, token2id_n4_ntk)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices_n4_ntk)))
print ("Val dataset size is {}".format(len(val_data_indices_n4_ntk)))
print ("Test dataset size is {}".format(len(test_data_indices_n4_ntk)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


## DataLoader

In [47]:
MAX_SENTENCE_LENGTH = 200

In [48]:
class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when yo-u call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

### 1-gram (word)

In [49]:
BATCH_SIZE = 32
train_dataset_n1 = NewsGroupDataset(train_data_indices_n1, train_labels)
train_loader_n1 = torch.utils.data.DataLoader(dataset=train_dataset_n1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n1 = NewsGroupDataset(val_data_indices_n1, val_labels)
val_loader_n1 = torch.utils.data.DataLoader(dataset=val_dataset_n1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n1 = NewsGroupDataset(test_data_indices_n1, test_labels)
test_loader_n1 = torch.utils.data.DataLoader(dataset=test_dataset_n1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 1-gram (word) no tokenization

In [50]:
BATCH_SIZE = 32
train_dataset_n1_ntk = NewsGroupDataset(train_data_indices_n1_ntk, train_labels)
train_loader_n1_ntk = torch.utils.data.DataLoader(dataset=train_dataset_n1_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n1_ntk = NewsGroupDataset(val_data_indices_n1_ntk, val_labels)
val_loader_n1_ntk = torch.utils.data.DataLoader(dataset=val_dataset_n1_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n1_ntk = NewsGroupDataset(test_data_indices_n1_ntk, test_labels)
test_loader_n1_ntk = torch.utils.data.DataLoader(dataset=test_dataset_n1_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 2-grams

In [51]:
BATCH_SIZE = 32
train_dataset_n2 = NewsGroupDataset(train_data_indices_n2, train_labels)
train_loader_n2 = torch.utils.data.DataLoader(dataset=train_dataset_n2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n2 = NewsGroupDataset(val_data_indices_n2, val_labels)
val_loader_n2 = torch.utils.data.DataLoader(dataset=val_dataset_n2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n2 = NewsGroupDataset(test_data_indices_n2, test_labels)
test_loader_n2 = torch.utils.data.DataLoader(dataset=test_dataset_n2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 2-grams no tokenization

In [52]:
BATCH_SIZE = 32
train_dataset_n2_ntk = NewsGroupDataset(train_data_indices_n2_ntk, train_labels)
train_loader_n2_ntk = torch.utils.data.DataLoader(dataset=train_dataset_n2_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n2_ntk = NewsGroupDataset(val_data_indices_n2_ntk, val_labels)
val_loader_n2_ntk = torch.utils.data.DataLoader(dataset=val_dataset_n2_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n2_ntk = NewsGroupDataset(test_data_indices_n2_ntk, test_labels)
test_loader_n2_ntk = torch.utils.data.DataLoader(dataset=test_dataset_n2_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 3-grams

In [53]:
BATCH_SIZE = 32
train_dataset_n3 = NewsGroupDataset(train_data_indices_n3, train_labels)
train_loader_n3 = torch.utils.data.DataLoader(dataset=train_dataset_n3, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n3 = NewsGroupDataset(val_data_indices_n3, val_labels)
val_loader_n3 = torch.utils.data.DataLoader(dataset=val_dataset_n3, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n3 = NewsGroupDataset(test_data_indices_n3, test_labels)
test_loader_n3 = torch.utils.data.DataLoader(dataset=test_dataset_n3, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 3-grams no tokenization

In [54]:
BATCH_SIZE = 32
train_dataset_n3_ntk = NewsGroupDataset(train_data_indices_n3_ntk, train_labels)
train_loader_n3_ntk = torch.utils.data.DataLoader(dataset=train_dataset_n3_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n3_ntk = NewsGroupDataset(val_data_indices_n3_ntk, val_labels)
val_loader_n3_ntk = torch.utils.data.DataLoader(dataset=val_dataset_n3_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n3_ntk = NewsGroupDataset(test_data_indices_n3_ntk, test_labels)
test_loader_n3_ntk = torch.utils.data.DataLoader(dataset=test_dataset_n3_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 4-grams

In [55]:
BATCH_SIZE = 32
train_dataset_n4 = NewsGroupDataset(train_data_indices_n4, train_labels)
train_loader_n4 = torch.utils.data.DataLoader(dataset=train_dataset_n4, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n4 = NewsGroupDataset(val_data_indices_n4, val_labels)
val_loader_n4 = torch.utils.data.DataLoader(dataset=val_dataset_n4, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n4 = NewsGroupDataset(test_data_indices_n4, test_labels)
test_loader_n4 = torch.utils.data.DataLoader(dataset=test_dataset_n4, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

### 4-grams no tokenization

In [56]:
BATCH_SIZE = 32
train_dataset_n4_ntk = NewsGroupDataset(train_data_indices_n4_ntk, train_labels)
train_loader_n4_ntk = torch.utils.data.DataLoader(dataset=train_dataset_n4_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_n4_ntk = NewsGroupDataset(val_data_indices_n4_ntk, val_labels)
val_loader_n4_ntk = torch.utils.data.DataLoader(dataset=val_dataset_n4_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

test_dataset_n4_ntk = NewsGroupDataset(test_data_indices_n4_ntk, test_labels)
test_loader_n4_ntk = torch.utils.data.DataLoader(dataset=test_dataset_n4_ntk, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

## Bag of N-gram Models & Training

In [154]:
class BagOfNgram(nn.Module):
    """
    BagOfNgram classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNgram, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, 2)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

In [225]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    
    return (100 * correct / total)

In [227]:
def train_proc(model, train_loader, val_loader, lr, adj, ep, optim, lr_decay=0, plt=False):
    criterion = torch.nn.CrossEntropyLoss()
    if optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    
    if adj:
        scheduler = StepLR(optimizer, step_size=1, gamma=lr_decay)
    
    train_ls = []
    for epoch in range(ep):
        if adj:
            scheduler.step()
        for i, (data, lengths, labels) in enumerate(train_loader):
            model.train()
            data_batch, length_batch, label_batch = data, lengths, labels
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
            train_ls.append(loss)
            
#             if i > 0 and i % 300 == 0:
#                 val_acc, val_loss = test_model(val_loader, model)
#                 val_ls += val_loss
#                 print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
#                     epoch+1, ep, i+1, len(train_loader), val_acc))
    
    val_acc = test_model(val_loader, model)
    print('Val Accuracy: {}'.format(val_acc))
    
    if plt:
        plt.plot(train_ls)
        plt.xlabel("n")
        plt.ylabel("Train Loss")
    
    return val_acc

### 1-gram (word)

In [228]:
emb_size = 100

In [229]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, False, 3, 'Adam')

Val Accuracy: 85.8


In [230]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.001, False, 3, 'Adam')

Val Accuracy: 86.26


In [231]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, False, 5, 'Adam')

Val Accuracy: 83.84


In [232]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.001, False, 5, 'Adam')

Val Accuracy: 87.3


In [233]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 86.82


In [234]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 87.72


In [235]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, False, 3, 'SGD')

Val Accuracy: 66.1


In [236]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.001, False, 3, 'SGD')

Val Accuracy: 61.14


In [237]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, False, 5, 'SGD')

Val Accuracy: 67.28


In [238]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.001, False, 5, 'SGD')

Val Accuracy: 61.08


In [239]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 64.4


In [240]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 66.32


In [241]:
emb_size = 200

In [242]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, False, 3, 'Adam')

Val Accuracy: 85.62


In [243]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.001, False, 3, 'Adam')

Val Accuracy: 87.12


In [244]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, False, 5, 'Adam')

Val Accuracy: 84.02


In [245]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.001, False, 5, 'Adam')

Val Accuracy: 88.04


In [246]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 86.62


In [247]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 87.8


In [248]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, False, 3, 'SGD')

Val Accuracy: 69.72


In [249]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.001, False, 3, 'SGD')

Val Accuracy: 63.36


In [250]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, False, 5, 'SGD')

Val Accuracy: 72.9


In [251]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.001, False, 5, 'SGD')

Val Accuracy: 65.44


In [252]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 69.16


In [253]:
model_n1 = BagOfNgram(len(id2token_n1), emb_size)
val_acc = train_proc(model_n1, train_loader_n1, val_loader_n1,  0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 66.9


### 1-gram (word) no tokenizaton

In [254]:
emb_size = 100

In [255]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 85.48


In [256]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 85.66


In [257]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 84.52


In [258]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.001, False, 5, 'Adam')

Val Accuracy: 86.86


In [259]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 86.86


In [260]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 87.44


In [261]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.01, False, 3, 'SGD')

Val Accuracy: 66.76


In [262]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.001, False, 3, 'SGD')

Val Accuracy: 60.68


In [263]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 68.2


In [264]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 59.56


In [265]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 66.04


In [266]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 65.0


In [267]:
emb_size = 200

In [268]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.01, False, 3, 'Adam')

Val Accuracy: 85.36


In [269]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 86.28


In [270]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 84.72


In [271]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 87.4


In [272]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 86.58


In [273]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 87.26


In [274]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.01, False, 3, 'SGD')

Val Accuracy: 69.56


In [275]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.001, False, 3, 'SGD')

Val Accuracy: 61.74


In [276]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 71.04


In [277]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.001, False, 5, 'SGD')

Val Accuracy: 64.42


In [278]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk,  0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 67.94


In [279]:
model_n1_ntk = BagOfNgram(len(id2token_n1_ntk), emb_size)
val_acc = train_proc(model_n1_ntk, train_loader_n1_ntk, val_loader_n1_ntk, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 67.54


### 2-grams

In [280]:
emb_size = 100

In [281]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 3, 'Adam')

Val Accuracy: 82.0


In [282]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 3, 'Adam')

Val Accuracy: 82.04


In [283]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 5, 'Adam')

Val Accuracy: 80.48


In [284]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 5, 'Adam')

Val Accuracy: 83.84


In [285]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 83.38


In [286]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 83.78


In [287]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 3, 'SGD')

Val Accuracy: 56.0


In [288]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 3, 'SGD')

Val Accuracy: 53.64


In [289]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 5, 'SGD')

Val Accuracy: 56.16


In [290]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 5, 'SGD')

Val Accuracy: 54.7


In [291]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 58.82


In [292]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 58.44


In [293]:
emb_size = 200

In [294]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 3, 'Adam')

Val Accuracy: 80.18


In [295]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 3, 'Adam')

Val Accuracy: 82.72


In [296]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 5, 'Adam')

Val Accuracy: 79.9


In [297]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 5, 'Adam')

Val Accuracy: 83.68


In [298]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 83.04


In [299]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 83.76


In [300]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 3, 'SGD')

Val Accuracy: 53.22


In [301]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 3, 'SGD')

Val Accuracy: 53.58


In [302]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, False, 5, 'SGD')

Val Accuracy: 61.96


In [303]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.001, False, 5, 'SGD')

Val Accuracy: 58.62


In [304]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 55.06


In [305]:
model_n2 = BagOfNgram(len(id2token_n2), emb_size)
val_acc = train_proc(model_n2, train_loader_n2, val_loader_n2, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 61.92


### 2-grams no tokenization

In [306]:
emb_size = 100

In [307]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 81.04


In [308]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 81.86


In [309]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 81.04


In [310]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 83.92


In [311]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 83.16


In [312]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 83.2


In [313]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 3, 'SGD')

Val Accuracy: 54.38


In [314]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 3, 'SGD')

Val Accuracy: 53.66


In [315]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 50.14


In [316]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 50.18


In [317]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 51.22


In [318]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 5, 'SGD', 0.1)

Val Accuracy: 57.34


In [319]:
emb_size = 200

In [320]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 81.2


In [321]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 82.78


In [322]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 80.24


In [323]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 83.72


In [324]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 82.48


In [325]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 83.54


In [326]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 3, 'SGD')

Val Accuracy: 50.72


In [327]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 3, 'SGD')

Val Accuracy: 57.7


In [328]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 57.64


In [329]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 53.88


In [330]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 54.4


In [331]:
model_n2_ntk = BagOfNgram(len(id2token_n2_ntk), emb_size)
val_acc = train_proc(model_n2_ntk, train_loader_n2_ntk, val_loader_n2_ntk, 0.01, True, 5, 'SGD', 0.1)

Val Accuracy: 58.46


### 3-grams

In [332]:
emb_size = 100

In [333]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 3, 'Adam')

Val Accuracy: 76.94


In [334]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 3, 'Adam')

Val Accuracy: 74.66


In [335]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 5, 'Adam')

Val Accuracy: 76.22


In [336]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 5, 'Adam')

Val Accuracy: 76.98


In [337]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 77.74


In [338]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 78.52


In [339]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 3, 'SGD')

Val Accuracy: 52.5


In [340]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 3, 'SGD')

Val Accuracy: 49.86


In [341]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 5, 'SGD')

Val Accuracy: 50.26


In [342]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 5, 'SGD')

Val Accuracy: 50.46


In [343]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 49.86


In [344]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 51.22


In [345]:
emb_size = 200

In [346]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 3, 'Adam')

Val Accuracy: 76.72


In [347]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 3, 'Adam')

Val Accuracy: 74.58


In [348]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 5, 'Adam')

Val Accuracy: 75.66


In [349]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 5, 'Adam')

Val Accuracy: 77.82


In [350]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 77.9


In [351]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 78.24


In [352]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 3, 'SGD')

Val Accuracy: 58.22


In [353]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 3, 'SGD')

Val Accuracy: 52.4


In [354]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, False, 5, 'SGD')

Val Accuracy: 56.98


In [355]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.001, False, 5, 'SGD')

Val Accuracy: 50.44


In [356]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 50.16


In [357]:
model_n3 = BagOfNgram(len(id2token_n3), emb_size)
val_acc = train_proc(model_n3, train_loader_n3, val_loader_n3, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 57.56


### 3-grams no tokenization

In [358]:
emb_size = 100

In [359]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 75.86


In [360]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 69.24


In [361]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 74.0


In [362]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 76.48


In [363]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 77.22


In [364]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 77.92


In [365]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 3, 'SGD')

Val Accuracy: 50.16


In [366]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 3, 'SGD')

Val Accuracy: 50.04


In [367]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 50.66


In [368]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 53.46


In [369]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 54.5


In [370]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 54.04


In [371]:
emb_size = 200

In [372]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 75.46


In [373]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 74.96


In [374]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 74.68


In [375]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 77.16


In [376]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 76.44


In [377]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 77.66


In [378]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 3, 'SGD')

Val Accuracy: 50.18


In [379]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 3, 'SGD')

Val Accuracy: 49.82


In [380]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 50.2


In [381]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 50.16


In [382]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 56.7


In [383]:
model_n3_ntk = BagOfNgram(len(id2token_n3_ntk), emb_size)
val_acc = train_proc(model_n3_ntk, train_loader_n3_ntk, val_loader_n3_ntk, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 53.08


### 4-grams

In [384]:
emb_size = 100

In [385]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 3, 'Adam')

Val Accuracy: 71.64


In [386]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 3, 'Adam')

Val Accuracy: 67.22


In [387]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 5, 'Adam')

Val Accuracy: 69.62


In [388]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 5, 'Adam')

Val Accuracy: 66.42


In [389]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 70.48


In [390]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 72.04


In [391]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 3, 'SGD')

Val Accuracy: 50.16


In [392]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 3, 'SGD')

Val Accuracy: 50.18


In [393]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 5, 'SGD')

Val Accuracy: 49.88


In [394]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 5, 'SGD')

Val Accuracy: 49.84


In [395]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 53.56


In [396]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 50.18


In [397]:
emb_size = 200

In [398]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 3, 'Adam')

Val Accuracy: 71.38


In [399]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 3, 'Adam')

Val Accuracy: 68.54


In [400]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 5, 'Adam')

Val Accuracy: 70.92


In [401]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 5, 'Adam')

Val Accuracy: 71.32


In [402]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 72.24


In [403]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 71.74


In [404]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 3, 'SGD')

Val Accuracy: 50.42


In [405]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 3, 'SGD')

Val Accuracy: 49.88


In [406]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, False, 5, 'SGD')

Val Accuracy: 50.18


In [407]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.001, False, 5, 'SGD')

Val Accuracy: 50.16


In [408]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 50.16


In [409]:
model_n4 = BagOfNgram(len(id2token_n4), emb_size)
val_acc = train_proc(model_n4, train_loader_n4, val_loader_n4, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 51.66


### 4-grams no tokenization

In [410]:
emb_size = 100

In [411]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 70.5


In [412]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 58.5


In [413]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 69.88


In [414]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 65.74


In [415]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 70.46


In [416]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 70.64


In [417]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 3, 'SGD')

Val Accuracy: 50.16


In [418]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 3, 'SGD')

Val Accuracy: 49.86


In [419]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 49.88


In [420]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 50.22


In [421]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 49.84


In [422]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 49.92


In [423]:
emb_size = 200

In [424]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 3, 'Adam')

Val Accuracy: 70.4


In [425]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 3, 'Adam')

Val Accuracy: 56.0


In [426]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 5, 'Adam')

Val Accuracy: 69.48


In [427]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 5, 'Adam')

Val Accuracy: 65.86


In [428]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'Adam', 0.5)

Val Accuracy: 70.64


In [429]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 71.04


In [430]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 3, 'SGD')

Val Accuracy: 50.18


In [431]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 3, 'SGD')

Val Accuracy: 49.82


In [432]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, False, 5, 'SGD')

Val Accuracy: 49.84


In [433]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.001, False, 5, 'SGD')

Val Accuracy: 49.86


In [434]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'SGD', 0.5)

Val Accuracy: 49.86


In [435]:
model_n4_ntk = BagOfNgram(len(id2token_n4_ntk), emb_size)
val_acc = train_proc(model_n4_ntk, train_loader_n4_ntk, val_loader_n4_ntk, 0.01, True, 3, 'SGD', 0.1)

Val Accuracy: 52.16
