In [1]:
import itertools
import os
import random
import pickle as pkl
from collections import Counter
import numpy as np
import torch
from torch.utils.data import Dataset
import spacy
import string
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the pre-computed artifacts this notebook depends on.
# NOTE(security): pickle can execute arbitrary code while loading, so these
# files must come from this project's own preprocessing step, never from an
# untrusted source.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been read (the bare `open(...)` calls leaked five handles).
with open("shuffled_index.p", "rb") as f:
    shuffled_index = pkl.load(f)
with open("training_all_clean.p", "rb") as f:
    training_all_clean = pkl.load(f)
with open("training_labels.p", "rb") as f:
    # passed to IMDBDataset as the training targets in hyperparameter_search
    training_labels = pkl.load(f)
with open("validation_all_clean.p", "rb") as f:
    validation_all_clean = pkl.load(f)
with open("validation_labels.p", "rb") as f:
    # passed to IMDBDataset as the validation targets in hyperparameter_search
    validation_labels = pkl.load(f)

In [3]:
# Translate tokenized examples into vocabulary ids
def token2index_dataset(tokens_data, token2id):
    """
    Map every token of every example to its integer id.

    @param tokens_data: iterable of examples, each an iterable of tokens
    @param token2id: dictionary from token to integer id
    @return: list with one list of ids per example; tokens missing from
        token2id are replaced by the module-level UNK_IDX
    """
    def lookup(token):
        # UNK_IDX is only read when a token is out of vocabulary
        if token in token2id:
            return token2id[token]
        return UNK_IDX

    return [[lookup(token) for token in example] for example in tokens_data]

In [4]:
# Reserve index 0 for padding and index 1 for unknown (out-of-vocabulary) tokens
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = 10000):
    """
    Build the vocabulary from the most frequent tokens.

    @param all_tokens: iterable of tokens (with repetitions) to count
    @param max_vocab_size: maximum number of distinct tokens to keep, not
        counting the two reserved entries '<pad>' and '<unk>'
    @return: tuple (token2id, id2token)
        token2id: dictionary mapping each kept token to its index
        id2token: list where id2token[i] is the token with index i;
            id2token[PAD_IDX] is '<pad>' and id2token[UNK_IDX] is '<unk>'
    """
    token_counter = Counter(all_tokens)
    # A comprehension (instead of unpacking zip(*...)) also works when there is
    # nothing to count or max_vocab_size is 0; the unpacking form raised
    # ValueError in those cases.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # real tokens start at 2, right after the two reserved indices
    token2id = {token: index for index, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [5]:
# Candidate n-gram orders and lemmatization switches.
# NOTE(review): neither name is read by the cells visible in this notebook
# (hyperparameter_search assigns its own local `grams`); presumably they
# drive the tokenization step that writes the *_tokens_<n>.p files - confirm.
grams = [1,2,3]
lemmatize_list = [True,False]

In [6]:
# Maximum number of token ids kept per example: IMDBDataset.__getitem__
# truncates to this length and imdb_func pads every example up to it.
MAX_SENTENCE_LENGTH = 200

In [7]:
class IMDBDataset(Dataset):
    """
    Map-style PyTorch dataset pairing each example's token ids with its target.
    Inherits from torch.utils.data.Dataset so it can be fed to a DataLoader.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: sequence of examples, each a sequence of token ids
        @param target_list: sequence of targets, aligned with data_list
        """
        self.data_list, self.target_list = data_list, target_list
        # every example needs exactly one target
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Called for dataset[key].

        @return: [token ids cut to at most MAX_SENTENCE_LENGTH entries,
                  number of ids kept, target]
        """
        clipped_ids = self.data_list[key][:MAX_SENTENCE_LENGTH]
        target = self.target_list[key]
        return [clipped_ids, len(clipped_ids), target]

In [8]:
def imdb_func(batch):
    """
    Collate function for DataLoader: pads every example in the batch to
    MAX_SENTENCE_LENGTH so the batch can be stacked into a single tensor.

    @param batch: list of [token_ids, length, label] items, as produced by
        IMDBDataset.__getitem__ (token_ids already truncated to at most
        MAX_SENTENCE_LENGTH entries)
    @return: [data, lengths, labels] where
        data: int64 tensor of size (batch_size, MAX_SENTENCE_LENGTH)
        lengths: LongTensor of size (batch_size) with the unpadded lengths
        labels: LongTensor of size (batch_size)
    """
    length_list = [datum[1] for datum in batch]
    label_list = [datum[2] for datum in batch]

    data_list = []
    for datum in batch:
        # Force int64: np.array([]) defaults to float64, so a single empty
        # example used to promote the whole batch to a float tensor, which
        # nn.Embedding cannot index with.
        token_ids = np.asarray(datum[0], dtype=np.int64)
        # right-pad with the padding index shared with the vocabulary
        padded_vec = np.pad(token_ids,
                            pad_width=(0, MAX_SENTENCE_LENGTH - datum[1]),
                            mode="constant", constant_values=PAD_IDX)
        data_list.append(padded_vec)

    return [torch.from_numpy(np.array(data_list, dtype=np.int64)),
            torch.LongTensor(length_list),
            torch.LongTensor(label_list)]

In [9]:
class BagOfNgrams(nn.Module):
    """
    BagOfNgrams classification model: averages the embeddings of the n-gram
    ids in each example and applies one linear layer to obtain class logits.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param num_classes: number of output logits; defaults to 20, the value
            that was previously hard-coded
        """
        super(BagOfNgrams, self).__init__()
        # padding_idx=0 keeps the embedding of index 0 at zero, so padded
        # positions add nothing to the sum in forward()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Average over the true length. Clamping to at least 1 keeps an empty
        # example at the zero vector instead of producing 0/0 = NaN.
        denom = length.view(-1, 1).float().clamp(min=1)
        out = out / denom

        # return logits
        out = self.linear(out.float())
        return out

In [10]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [11]:
# Search grid: one list of candidate values per hyperparameter, in the
# positional order hyperparameter_search unpacks them. Maximum sentence length
# is not part of the grid; it is fixed by MAX_SENTENCE_LENGTH.
params = [
    [1e-2, 1e-1],  # learning rates
    [1, 2],        # ngrams
    [25000],       # vocab size
    [100],         # embedding size
    [64],          # batch size
]

In [12]:
def _load_pickle(path):
    """Load one pickled object from `path`, closing the file afterwards."""
    # NOTE(security): pickle can run arbitrary code; only load files written
    # by this project's own preprocessing.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def hyperparameter_search(hyperparameter_space=params,
                          epochs=5,
                          optimizer_name = "Adam",
                          lemmatize = False):
    """
    Grid search over the bag-of-n-grams model.

    For every combination in the cartesian product of `hyperparameter_space`
    a fresh BagOfNgrams model is trained for `epochs` epochs, and train and
    validation accuracy are measured periodically during training.

    @param hyperparameter_space: list of five lists, in this order:
        learning rates, n-gram orders, vocabulary sizes, embedding sizes,
        batch sizes
    @param epochs: number of passes over the training data per combination
    @param optimizer_name: "Adam" or "SGD"
    @param lemmatize: when True, the "<n>_lemma" token files are loaded
        instead of the "<n>" ones
    @return: dictionary mapping each parameter combination (tuple) to the list
        of validation accuracies (in percent) recorded during its training
    @raise ValueError: if optimizer_name is not supported
    """
    # Fail fast on an unsupported optimizer. Previously this only printed a
    # message and then crashed later on an unbound `optimizer`.
    optimizer_classes = {"Adam": torch.optim.Adam, "SGD": torch.optim.SGD}
    if optimizer_name not in optimizer_classes:
        raise ValueError("this optimizer is not implemented yet: "
                         + str(optimizer_name))

    # All combinations of the search space. Uses the argument that was passed
    # in; the global `params` was used here before, silently ignoring it.
    param_space = [*itertools.product(*hyperparameter_space)]

    # validation accuracies per parameter combination
    val_losses = {}

    for count, param_comb in enumerate(param_space, start=1):
        print("-----------------------------------------------------------")
        print("Parameter Combination = " + str(count) + " / " + str(len(param_space)))

        NUM_EPOCHS = epochs
        lr_rate = param_comb[0]             # learning rate
        grams = param_comb[1]               # n-grams
        max_vocab_size = int(param_comb[2]) # vocabulary size
        embed_dimension = param_comb[3]     # embedding vector size
        BATCH_SIZE = param_comb[4]          # batch size

        print("Learning Rate = " + str(lr_rate))
        print("Ngram = " + str(grams))
        print("Vocab Size = " + str(max_vocab_size))
        print("Embedding Dimension = " + str(embed_dimension))
        print("Batch Size = " + str(BATCH_SIZE))

        # Tokenization
        # All tokens are created before the hyperparameter search loop
        # Load the tokens here
        if lemmatize == True:
            grams = str(grams) +"_lemma"

        train_data_tokens = _load_pickle("train_data_tokens_"+str(grams)+".p")
        all_train_tokens = _load_pickle("all_train_tokens_"+str(grams)+".p")
        val_data_tokens = _load_pickle("val_data_tokens_"+str(grams)+".p")

        print("Train dataset size is {}".format(len(train_data_tokens)))
        print("Val dataset size is {}".format(len(val_data_tokens)))
        print("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

        # Building Vocabulary (from the training tokens only)
        token2id, id2token = build_vocab(all_train_tokens,
                                         max_vocab_size=max_vocab_size)

        # Sanity check: a random entry must round-trip through both mappings
        random_token_id = random.randint(0, len(id2token)-1)
        random_token = id2token[random_token_id]
        print ("Token id {} -> token {}".format(random_token_id, id2token[random_token_id]))
        print ("Token {} -> token id {}".format(random_token, token2id[random_token]))

        train_data_indices = token2index_dataset(train_data_tokens,
                                                 token2id = token2id)
        val_data_indices = token2index_dataset(val_data_tokens,
                                               token2id = token2id)
        # double checking
        print ("Train dataset size is {}".format(len(train_data_indices)))
        print ("Val dataset size is {}".format(len(val_data_indices)))

        # Load training and validation data
        train_dataset = IMDBDataset(train_data_indices,
                                    training_labels)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_func,
                                                   shuffle=True)

        val_dataset = IMDBDataset(val_data_indices,
                                  validation_labels)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=BATCH_SIZE,
                                                 collate_fn=imdb_func,
                                                 shuffle=True)

        # Initialize the N-gram Model; len(id2token) includes <pad> and <unk>
        model = BagOfNgrams(len(id2token), embed_dimension)

        optimizer = optimizer_classes[optimizer_name](model.parameters(), lr=lr_rate)

        # Cross Entropy Loss will be used
        criterion = torch.nn.CrossEntropyLoss()

        # Validation accuracies of this combination, in evaluation order
        val_losses[param_comb] = []

        # Evaluate every 50 steps at batch size 64, scaled inversely with the
        # batch size. Rounded to a whole number of steps: the float interval
        # used before (e.g. 66.67 for batch size 48) almost never divided the
        # step index evenly, so validation was silently skipped.
        eval_every = max(1, round(50 * 64 / BATCH_SIZE))

        print("Optimization Start")
        print(optimizer)

        for epoch in range(NUM_EPOCHS):
            for i, (data_batch, length_batch, label_batch) in enumerate(train_loader):
                # test_model() leaves the model in eval mode, so switch back
                # to train mode on every step
                model.train()
                optimizer.zero_grad()
                outputs = model(data_batch, length_batch)
                loss = criterion(outputs, label_batch)
                loss.backward()
                optimizer.step()

                if i > 0 and i % eval_every == 0:

                    # Accuracy Calculations
                    train_acc = test_model(train_loader, model)
                    val_acc = test_model(val_loader, model)
                    val_losses[param_comb].append(val_acc)

                    # Logging
                    print('Epoch:[{}/{}],Step:[{}/{}],Training Acc:{},Validation Acc:{}'.format(
                               epoch+1, NUM_EPOCHS,
                                i+1, len(train_loader),
                                train_acc, val_acc))

    return val_losses


In [None]:
# Run the grid search with SGD on the non-lemmatized tokens; the result maps
# each parameter combination to its recorded validation accuracies.
test_results = hyperparameter_search(
    hyperparameter_space=params,
    epochs=5,
    optimizer_name="SGD",
    lemmatize=False,
)