In [1]:
import itertools
import os
import random
import pickle as pkl
from collections import Counter
import numpy as np
import torch
from torch.utils.data import Dataset
import spacy
import string
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
shuffled_index = pkl.load(open("shuffled_index.p", "rb"))
training_all_clean = pkl.load(open("training_all_clean.p", "rb"))
training_labels = pkl.load(open("training_labels.p", "rb"))
validation_all_clean = pkl.load(open("validation_all_clean.p", "rb"))
validation_labels = pkl.load(open("validation_labels.p", "rb"))

In [14]:
def read_txt_files(folder_path):
    """Reads all .txt files in a folder to a list"""
    
    file_list = os.listdir(folder_path)
    # for debugging, printing out the folder path and some files in it
    print(folder_path)
    print(file_list[:10])
    
    all_reviews = []
    for file_path in file_list:
        f = open(folder_path + file_path,"r")
        all_reviews.append(f.readline())
        
    return all_reviews

data_loc = "data/imdb_reviews/"

test_pos = read_txt_files(folder_path=data_loc+"test/pos/")
print(len(test_pos))
test_neg = read_txt_files(folder_path=data_loc+"test/neg/")
print(len(test_neg))

print("Test Positive examples = " + str(len(test_pos)))
print("Test Negative examples = " + str(len(test_neg)))

data/imdb_reviews/test/pos/
['4715_9.txt', '1930_9.txt', '3205_9.txt', '10186_10.txt', '147_10.txt', '7511_7.txt', '616_10.txt', '10460_10.txt', '3240_9.txt', '1975_9.txt']
12500
data/imdb_reviews/test/neg/
['1821_4.txt', '9487_1.txt', '4604_4.txt', '2828_2.txt', '10890_1.txt', '3351_4.txt', '8070_2.txt', '1027_4.txt', '8248_3.txt', '4290_4.txt']
12500
Test Positive examples = 12500
Test Negative examples = 12500


In [15]:
test_pos_labels = np.ones((len(test_pos),), dtype=int)
test_neg_labels = np.zeros((len(test_neg),), dtype=int)
test_data_labels = np.concatenate((test_pos_labels,test_neg_labels))
print(len(test_data_labels))
test_data_labels

25000


array([1, 1, 1, ..., 0, 0, 0])

In [16]:
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

test_pos_clean = [cleanhtml(x) for x in test_pos]
test_neg_clean = [cleanhtml(x) for x in test_neg]

In [17]:
test_all_clean = test_pos_clean + test_neg_clean
len(test_all_clean)

25000

In [19]:
import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# This is word tokenizer
# # lowercase and remove punctuation
# def tokenize(sent):
#     tokens = tokenizer(sent)
#     return [token.text.lower() for token in tokens if (token.text not in punctuations)]
#     #return [token.text.lower() for token in tokens]
    
# Modified for n-grams
def tokenize(sent, n_gram = 0, lemmatize = False):
    
    tokens = tokenizer(sent)
    
    # unigrams
    if lemmatize == False:
        unigrams = [token.text.lower() for token in tokens if (token.text not in punctuations)]
    else:
        #LEMMATIZED
        unigrams = [token.lemma_.lower() for token in tokens if (token.text not in punctuations)]
    
    
    output = []
    output.extend(unigrams)
    
    n = 2
    while n <= n_gram:
        ngram_tokens = [" ".join(unigrams[x:x+n]) for x in range(len(unigrams)-n+1)]
        output.extend(ngram_tokens)
        n = n + 1
        
    return output

In [20]:
def lower_case_remove_punc(parsed):
    return [token.text.lower() for token in parsed if (token.text not in punctuations)]

def tokenize_dataset(dataset, n_gram, lemmatize = True):
    token_dataset = []
    # we are keeping track of all tokens in dataset
    # in order to create vocabulary later
    all_tokens = []

#     for sample in tqdm_notebook(tokenizer.pipe(dataset, 
#                                                disable=['parser', 'tagger', 'ner'], 
#                                                batch_size=512, 
#                                                n_threads=4)):

    itr = 0
    for sample in dataset:
        
        if itr % 50 == 0:
            print(str(itr) + " / " + str(len(dataset)))
        # unigram version
        #tokens = lower_case_remove_punc(sample)
        
        # n-gram version
        tokens = tokenize(sample,n_gram, lemmatize = lemmatize)
        
        token_dataset.append(tokens)
        all_tokens += tokens
        
        itr = itr + 1

    return token_dataset, all_tokens

In [3]:
# convert token to id in the dataset
def token2index_dataset(tokens_data, token2id):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

In [4]:
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = 10000):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [56]:
grams = [3]
lemmatize_list = [True]

In [57]:
for lemmatize_arg in lemmatize_list:
    for gram_no in grams:
        print(str(gram_no))

        print ("Tokenizing test data")
        test_data_tokens, _ = tokenize_dataset(test_all_clean,
                                              n_gram = gram_no,
                                               lemmatize = lemmatize_arg)
        pkl.dump(test_data_tokens, open("test_data_tokens_lemma_3.p", "wb"))

3
Tokenizing test data
0 / 25000
50 / 25000
100 / 25000
150 / 25000
200 / 25000
250 / 25000
300 / 25000
350 / 25000
400 / 25000
450 / 25000
500 / 25000
550 / 25000
600 / 25000
650 / 25000
700 / 25000
750 / 25000
800 / 25000
850 / 25000
900 / 25000
950 / 25000
1000 / 25000
1050 / 25000
1100 / 25000
1150 / 25000
1200 / 25000
1250 / 25000
1300 / 25000
1350 / 25000
1400 / 25000
1450 / 25000
1500 / 25000
1550 / 25000
1600 / 25000
1650 / 25000
1700 / 25000
1750 / 25000
1800 / 25000
1850 / 25000
1900 / 25000
1950 / 25000
2000 / 25000
2050 / 25000
2100 / 25000
2150 / 25000
2200 / 25000
2250 / 25000
2300 / 25000
2350 / 25000
2400 / 25000
2450 / 25000
2500 / 25000
2550 / 25000
2600 / 25000
2650 / 25000
2700 / 25000
2750 / 25000
2800 / 25000
2850 / 25000
2900 / 25000
2950 / 25000
3000 / 25000
3050 / 25000
3100 / 25000
3150 / 25000
3200 / 25000
3250 / 25000
3300 / 25000
3350 / 25000
3400 / 25000
3450 / 25000
3500 / 25000
3550 / 25000
3600 / 25000
3650 / 25000
3700 / 25000
3750 / 25000
3800 / 25000

In [25]:
test_data_tokens = pkl.load(open("test_data_tokens.p", "rb"))

In [26]:
MAX_SENTENCE_LENGTH = 200

In [27]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's 
    readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

In [28]:
def imdb_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), 
            torch.LongTensor(length_list), 
            torch.LongTensor(label_list)]

In [29]:
class BagOfNgrams(nn.Module):
    """
    BagOfNgrams classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNgrams, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

In [162]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    correct_print = 0
    incorrect_print = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
#         print(labels)
#         print("-----")
#         print(predicted)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    
        #print(predicted.eq(labels.view_as(predicted)).sum())
    
        
#        print(predicted.eq(labels.view_as(predicted)).sum().item())
        
#         if predicted.eq(labels.view_as(predicted)).sum().item() == 1:
#             if correct_print <= 3:
#                 print("Correct prediction")
#                 correct_print += 1
                
#         else:
#             if incorrect_print <= 3:
#                 print("Incorrect Prediction")
#                 incorrect_print += 1

    prediction_list = [i.item() for i in labels.eq(predicted.view_as(labels))]
    
    for pred_no in range(len(prediction_list)):
        if prediction_list[pred_no] == 1:
            if correct_print <= 3:
                print("Correct Sentiment Classification")
                print(" ".join([id2token[j] for j in batch_data[pred_no].numpy()]))
                correct_print = correct_print + 1
                print("----------------------------------")
        if prediction_list[pred_no] == 0:
            if incorrect_print <= 3:
                print("Incorrect Sentiment Classification")
                print(" ".join([id2token[j] for j in batch_data[pred_no].numpy()]))
                incorrect_print = incorrect_print + 1
                print("----------------------------------")  
                
        
    return (100 * correct / total)

In [163]:
params = [[1e-2], ## learning rates
          [3], ## ngrams
          [25000], ## vocab size
          [100], ## embedding size
#          [100,200], ## max sentence length
          [128] ## batch size
         ]

In [166]:
def hyperparameter_search_test(hyperparameter_space=params,
                          epochs=5,
                          optimizer_name = "Adam",
                          lemmatize = False):

    # returns all the permutations of the parameter search space
    param_space = [*itertools.product(*params)]
    
    # validation loss dictionary
    val_losses = {}
    test_losses = {}
    
    # counter for progress
    count = 0
    
    for param_comb in param_space:
        print("-----------------------------------------------------------")
        print("Parameter Combination = " + str(count+1) + " / " + str(len(param_space)))
        count = count + 1      
        
        NUM_EPOCHS = epochs
        lr_rate = param_comb[0]             # learning rate
        grams = param_comb[1]               # n-grams
        max_vocab_size = int(param_comb[2]) # vocabulary size
        embed_dimension = param_comb[3]     # embedding vector size
        #max_sentence_length = int(param_comb[4]) # max sentence length of data loader
        BATCH_SIZE = param_comb[4]
        
        print("Learning Rate = " + str(lr_rate))
        print("Ngram = " + str(grams))
        print("Vocab Size = " + str(max_vocab_size))
        print("Embedding Dimension = " + str(embed_dimension))
        #print("Max Sentence Length = " + str(max_sentence_length))
        print("Batch Size = " + str(BATCH_SIZE))

        # Tokenization
        # All tokens are created before the hyperparameter search loop
        # Load the tokens here
        if lemmatize == True:
            grams = str(grams) +"_lemma"
        
        train_data_tokens = pkl.load(open("train_data_tokens_"+str(grams)+".p", "rb"))
        all_train_tokens = pkl.load(open("all_train_tokens_"+str(grams)+".p", "rb"))

        val_data_tokens = pkl.load(open("val_data_tokens_"+str(grams)+".p", "rb"))
        
        print("Train dataset size is {}".format(len(train_data_tokens)))
        print("Val dataset size is {}".format(len(val_data_tokens)))
        print("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))
        
        # Building Vocabulary
        # implicitly gets the max_vocab_size parameter
        token2id, id2token = build_vocab(all_train_tokens,
                                         max_vocab_size=max_vocab_size)
        
        # Lets check the dictionary by loading random token from it
        random_token_id = random.randint(0, len(id2token)-1)
        random_token = id2token[random_token_id]
        print ("Token id {} -> token {}".format(random_token_id, id2token[random_token_id]))
        print ("Token {} -> token id {}".format(random_token, token2id[random_token]))
        
        train_data_indices = token2index_dataset(train_data_tokens, 
                                                 token2id = token2id)
        val_data_indices = token2index_dataset(val_data_tokens, 
                                               token2id = token2id)
        # double checking
        print ("Train dataset size is {}".format(len(train_data_indices)))
        print ("Val dataset size is {}".format(len(val_data_indices)))
        
        

        # Load training and validation data
        train_dataset = IMDBDataset(train_data_indices, 
                                    training_labels)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_func,
                                                   shuffle=True)

        val_dataset = IMDBDataset(val_data_indices, 
                                  validation_labels)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_func,
                                                   shuffle=True)  

        # Initialize the N-gram Model
        model = BagOfNgrams(len(id2token), embed_dimension)
        
        # Both Adam and SGD will be tried
        if optimizer_name == "Adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
        elif optimizer_name == "SGD":
            optimizer = torch.optim.SGD(model.parameters(), lr=lr_rate)
        else:
            print("this optimizer is not implemented yet")
        
        # Cross Entropy Loss will be used
        criterion = torch.nn.CrossEntropyLoss()  
        
        # Validation Losses will be stored in a list
        # Caution: Two different optimizers
        val_losses[param_comb] = []
        test_losses[param_comb] = []
        
    #for optimizer in optimizers:
        print("Optimization Start")
        print(optimizer)

        for epoch in range(NUM_EPOCHS):
            for i, (data, lengths, labels) in enumerate(train_loader):
                model.train()
                data_batch, length_batch, label_batch = data, lengths, labels
                optimizer.zero_grad()
                outputs = model(data_batch, length_batch)
                loss = criterion(outputs, label_batch)
                loss.backward()
                optimizer.step()
                # Validate every 100 iterations
                # Adjust it to accustom changing batch sizes
                if i > 0 and i % (50 * (64 / BATCH_SIZE)) == 0:

                    # Accuracy Calculations
                    train_acc = test_model(train_loader, model)
                    val_acc = test_model(val_loader, model)
                    val_losses[param_comb].append(val_acc)

                    # Logging
                    print('Epoch:[{}/{}],Step:[{}/{}],Training Acc:{},Validation Acc:{}'.format( 
                               epoch+1, NUM_EPOCHS, 
                                i+1, len(train_loader), 
                                train_acc, val_acc))
                      
        # Testing
        print("Starting Testing -----------")
        #test_data_tokens = pkl.load(open("test_data_tokens.p", "rb"))
        test_data_tokens = pkl.load(open("test_data_tokens_lemma_3.p", "rb"))
        
        test_data_indices = token2index_dataset(test_data_tokens, 
                                       token2id = token2id)
        test_dataset = IMDBDataset(test_data_indices, 
                          test_data_labels)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)  
        test_acc = test_model(test_loader, model)
        test_losses[param_comb].append(test_acc)
        
        
        
    return val_losses, test_losses, model


In [167]:
validation_results, test_results, optim_model = hyperparameter_search_test(hyperparameter_space = params,
                                                                         epochs = 2,
                                                                         optimizer_name = "Adam",
                                                                          lemmatize = True)
test_results

-----------------------------------------------------------
Parameter Combination = 1 / 1
Learning Rate = 0.01
Ngram = 3
Vocab Size = 25000
Embedding Dimension = 100
Batch Size = 128
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 14233317
Token id 22248 -> token see this in
Token see this in -> token id 22248
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
Incorrect Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big 

Incorrect Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug 

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

Correct Sentiment Classification
imagine an <unk> remake of the <unk> <unk> with a black chick and a white chick attach to each other set the story on some <unk> island where the drug dealer rule and the revolution have arrive and have the black woman be from <unk> bear 's <unk> of lady and the white woman be a water down <unk> <unk> and -pron- have get black mama white mama in those wan day of the drive in theater this item must have be a big old hit all the hot button of the <unk> be push in this one even though -pron- both fill out -pron- clothe better and will get a few <unk> from the male in the audience no one be ever go to mistake <unk> <unk> and margaret <unk> for sidney <unk> and tony curtis all right <unk> <unk> and jamie lee curtis margaret and <unk> be prisoner where the guard and the warden look <unk> at the new fish arrive margaret be a rich girl from the state who take up the revolution whilst <unk> 's -pron- basic high price call girl who be be service the local drug <u

another well do moral ambiguity piece where the anti hero make -pron- hard to decide who to root for if nothing else the <unk> silence anyone who say there be no good part for actress in movie at least in 1971 there be four excellent part for actress in this film and all be well cast and well execute <unk> <unk> do a fine job as amy and would go on to play <unk> june this must have be the first time an adult male box office star share an extended kiss with a twelve year old girl on camera wonder if there be much controversy about this at the time -pron- be probably polanski 's favorite scene give the fate of amy 's turtle randolph -pron- be no surprise that <unk> grow up to be a hardcore animal right <unk> <unk> page be likewise excellent play a complex character with just the right amount of restraint -pron- be interesting that -pron- die just three day after elizabeth <unk> commit suicide throw -pron- through a fifth floor window as -pron- have also work together in -pron- be a big b

{(0.01, 3, 25000, 100, 128): [85.716]}

## Testing the Optimized Model

In [76]:
params

[[0.01], [3], [25000], [100], [128]]

In [168]:
## Testing
NUM_EPOCHS = 5
lr_rate = params[0][0]             # learning rate
grams = params[1][0]               # n-grams
max_vocab_size = params[2][0] # vocabulary size
embed_dimension = params[3][0]    # embedding vector size
#max_sentence_length = int(param_comb[4]) # max sentence length of data loader
BATCH_SIZE = params[4][0]

all_train_tokens = pkl.load(open("all_train_tokens_3_lemma.p", "rb"))
token2id, id2token = build_vocab(all_train_tokens,
                                 max_vocab_size=max_vocab_size)

test_data_tokens = pkl.load(open("test_data_tokens.p", "rb"))
test_data_indices = token2index_dataset(test_data_tokens, 
                               token2id = token2id)
test_dataset = IMDBDataset(test_data_indices, 
                  test_data_labels)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                   batch_size=BATCH_SIZE,
                                   collate_fn=imdb_func,
                                   shuffle=True)  

def test_model_testing(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    correct_print = 1
    incorrect_print = 1
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
#         print(labels)
#         print("-----")
#         print(predicted)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    
    prediction_list = [i.item() for i in labels.eq(predicted.view_as(labels))]
    
#     for pred_no in range(len(prediction_list)):
#         if prediction_list[pred_no] == 1:
#             if correct_print <= 3:
#                 print("Correct Sentiment Classification")
#                 print(" ".join([id2token[j] for j in batch_data[pred_no].numpy()]))
#                 correct_print = correct_print + 1
#                 print("----------------------------------")
#         if prediction_list[pred_no] == 0:
#             if incorrect_print <= 3:
#                 print("Incorrect Sentiment Classification")
#                 print(" ".join([id2token[j] for j in batch_data[pred_no].numpy()]))
#                 incorrect_print = incorrect_print + 1
#                 print("----------------------------------")    
    
    
    
    return (100 * correct / total) , predicted, labels, data_batch

test_acc, predictions, true_labels, batch_data = test_model_testing(test_loader, optim_model)
print(test_acc)

85.776


In [106]:
predictions.view_as(true_labels)

tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1])

In [107]:
true_labels

tensor([0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1])

In [108]:
true_labels.eq(predictions.view_as(true_labels))

tensor([1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=torch.uint8)

In [109]:
prediction_list = [i.item() for i in true_labels.eq(predictions.view_as(true_labels))]
print(prediction_list)

[1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]


In [120]:
for review in batch_data:
    print(review.numpy())

[   10     4     6  3079   139     6   224  7945   156     8  2531     6
  9103    60     2     4    13  3235     5     4    13 15355     8  3199
     2  4162   619    10   355     8     1    39     3  4162   619     5
  4280     2   120     6  1130   216  2162     8  3349     2     8   572
     3   572     4    13     7   411  1409     2     4   111   122  1007
     9     6   111    12   859   859     4  1007     8   246  1260     3
  7945    14  1905     5  2531     3  2545    21    20    13    20    50
     2  1905     5    43    35  4123     1     3 10171  6361    20    10
     5    20    12    19    73  1817     5   130    54    59     7    10
  1140 24967     2  2004    20    50     2     4   239     5   708     2
 17849   443    10  1051     9     6     1   235     9     3   572   412
     2   823    14     4   112     6 24967     7     2   131    68   592
    12   235    81    47     3 14665    15     5     2    71    14     6
    28   235     7  2552     7   110    95  1695   

In [127]:
print(id2token[0:50])

['<pad>', '<unk>', '-pron-', 'the', 'be', 'and', 'a', 'of', 'to', 'in', 'this', '-pron- be', 'that', 'not', 'have', 'movie', 'film', 'as', 'for', 'with', 'do', 'but', 'of the', 'on', "'s", 'be a', 'one', 'in the', 'good', 'see', 'all', 'at', 'make', 'by', 'like', 'an', 'who', '-pron- have', 'do not', 'from', 'so', 'there', 'and -pron-', 'get', 'just', 'or', 'be not', 'about', 'out', 'if']


In [158]:
" ".join([id2token[j] for j in batch_data[0].numpy()])

'this be a stereotype plot a young fighter try to enter a competition when -pron- be not ready and be not select to represent -pron- fighting school this lead to <unk> from the fighting school and naturally -pron- find a strange new master to teach -pron- to fight the fight be not of high standard -pron- be way too simple in a way that 1 1 be simple to every adult the fighter have train and enter the ring but do not do what -pron- train and get an ass <unk> the coach yell do this and do that with no success and after some more of this ridiculous beating -pron- suddenly do what -pron- be tell and hit -pron- opponent once this result in a <unk> point in the fight although -pron- hero have be take a beating of -pron- life up until that point think about the rocky movie and -pron- will have a good point of reference of how much beat -pron- really take the fight be also shoot poorly there final thing that screw this film up be the stupid romance cheesy music and awkward moment be not what -