<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [139]:
%%javascript

$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

This notebook uses a bag-of-n-grams approach for sentiment classification on the IMDB review dataset.

# PyTorch

## Data Loading

The dataset was downloaded from: http://ai.stanford.edu/~amaas/data/sentiment/

In [4]:
import os

In [5]:
data_loc = "data/imdb_reviews/"

In [6]:
def read_txt_files(folder_path):
    """Read the first line of every ``.txt`` file in a folder into a list.

    Each file is read with ``readline()``, so only its first line (including
    a trailing newline, if any) is kept.

    @param folder_path: directory to scan; a trailing separator is optional.
    @return: list of strings, one per ``.txt`` file, in ``os.listdir`` order.
    """

    file_list = os.listdir(folder_path)
    # for debugging, printing out the folder path and some files in it
    print(folder_path)
    print(file_list[:10])

    all_reviews = []
    for file_name in file_list:
        # Skip anything that is not a review file (e.g. a notebook
        # checkpoint directory), which would make open() fail.
        if not file_name.endswith(".txt"):
            continue
        # os.path.join works with or without a trailing slash on folder_path;
        # the `with` block closes the handle instead of leaking one per file.
        with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
            all_reviews.append(f.readline())

    return all_reviews

In [7]:
import numpy as np

In [8]:
# Load the four labelled splits (train/test x pos/neg) from sub-folders of
# data_loc and print how many reviews each one contains.
train_pos = read_txt_files(folder_path=data_loc+"train/pos/")
print(len(train_pos))
train_neg = read_txt_files(folder_path=data_loc+"train/neg/")
print(len(train_neg))
test_pos = read_txt_files(folder_path=data_loc+"test/pos/")
print(len(test_pos))
test_neg = read_txt_files(folder_path=data_loc+"test/neg/")
print(len(test_neg))

data/imdb_reviews/train/pos/
['4715_9.txt', '12390_8.txt', '8329_7.txt', '9063_8.txt', '3092_10.txt', '9865_8.txt', '6639_10.txt', '10460_10.txt', '10331_10.txt', '11606_10.txt']
12500
data/imdb_reviews/train/neg/
['1821_4.txt', '10402_1.txt', '1062_4.txt', '9056_1.txt', '5392_3.txt', '2682_3.txt', '3351_4.txt', '399_2.txt', '10447_1.txt', '10096_1.txt']
12500
data/imdb_reviews/test/pos/
['4715_9.txt', '1930_9.txt', '3205_9.txt', '10186_10.txt', '147_10.txt', '7511_7.txt', '616_10.txt', '10460_10.txt', '3240_9.txt', '1975_9.txt']
12500
data/imdb_reviews/test/neg/
['1821_4.txt', '9487_1.txt', '4604_4.txt', '2828_2.txt', '10890_1.txt', '3351_4.txt', '8070_2.txt', '1027_4.txt', '8248_3.txt', '4290_4.txt']
12500


In [9]:
# Pick a random review index to inspect. randint's `high` bound is exclusive,
# so (0, len(train_pos)) covers every valid index, including the first and
# the last review.
random_text = np.random.randint(0, high=len(train_pos))
print(random_text)
train_pos[random_text]

12297


"Sudden Impact is the best of the five Dirty Harry movies. They don't come any leaner and meaner than this as Harry romps through a series of violent clashes, with the bad guys getting their just desserts. Which is just the way I like it. Great story too and ably directed by Clint himself. Excellent entertainment."

In [10]:
print("Train Positive examples = " + str(len(train_pos)))
print("Train Negative examples = " + str(len(train_neg)))
print("Test Positive examples = " + str(len(test_pos)))
print("Test Negative examples = " + str(len(test_neg)))

Train Positive examples = 12500
Train Negative examples = 12500
Test Positive examples = 12500
Test Negative examples = 12500


## Data Preparation

### Labeling the training dataset

In [11]:
train_pos_labels = np.ones((len(train_pos),), dtype=int)
train_pos_labels

array([1, 1, 1, ..., 1, 1, 1])

In [12]:
train_neg_labels = np.zeros((len(train_neg),), dtype=int)
train_neg_labels

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
train_data_labels = np.concatenate((train_pos_labels,train_neg_labels))
train_data_labels

array([1, 1, 1, ..., 0, 0, 0])

### Storing the labels of the test set for Test Error Measuring

In [14]:
test_pos_labels = np.ones((len(test_pos),), dtype=int)
test_neg_labels = np.zeros((len(test_neg),), dtype=int)
test_data_labels = np.concatenate((test_pos_labels,test_neg_labels))
print(len(test_data_labels))
test_data_labels

25000


array([1, 1, 1, ..., 0, 0, 0])

## Data Cleaning

### Removing HTML tags

In [15]:
import re

def cleanhtml(raw_html):
    """Strip every ``<...>`` span from ``raw_html`` and return the result.

    The match is non-greedy, so each tag is removed individually; the
    removed text is replaced by the empty string (no space is inserted).
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', raw_html)

In [16]:
train_pos[random_text]

"Sudden Impact is the best of the five Dirty Harry movies. They don't come any leaner and meaner than this as Harry romps through a series of violent clashes, with the bad guys getting their just desserts. Which is just the way I like it. Great story too and ably directed by Clint himself. Excellent entertainment."

In [17]:
train_pos_clean = [cleanhtml(x) for x in train_pos]
train_neg_clean = [cleanhtml(x) for x in train_neg]

test_pos_clean = [cleanhtml(x) for x in test_pos]
test_neg_clean = [cleanhtml(x) for x in test_neg]

In [18]:
train_pos_clean[random_text]

"Sudden Impact is the best of the five Dirty Harry movies. They don't come any leaner and meaner than this as Harry romps through a series of violent clashes, with the bad guys getting their just desserts. Which is just the way I like it. Great story too and ably directed by Clint himself. Excellent entertainment."

### Replacing dots & question marks & parentheses with space

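It seems that punctuation marks such as dots, question marks and parentheses are not always separated from the neighbouring words, so replacing them with spaces was tried here. The cells below are currently commented out.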

In [19]:
#"asdasdasds.asdasda".replace("."," ")

In [None]:
# def remove_dqmp(review):
    
#     review = review.replace("."," ")
#     review = review.replace("?"," ")
#     review = review.replace(")"," ")
#     review = review.replace("("," ")
    
#     return review

In [None]:
# remove_dqmp(train_pos_clean[random_text])

In [None]:
# train_pos_clean = [remove_dqmp(x) for x in train_pos_clean]
# train_neg_clean = [remove_dqmp(x) for x in train_neg_clean]

## Tokenization

In [78]:
import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# This is word tokenizer
# # lowercase and remove punctuation
# def tokenize(sent):
#     tokens = tokenizer(sent)
#     return [token.text.lower() for token in tokens if (token.text not in punctuations)]
#     #return [token.text.lower() for token in tokens]
    
# Modified for n-grams
def tokenize(sent, n_gram = 0):
    """Tokenize a review into lowercase unigrams plus word n-grams up to ``n_gram``.

    @param sent: raw review text.
    @param n_gram: largest n-gram order to emit. Values below 2 (including the
        default 0) yield unigrams only.
    @return: list of strings: all unigrams first, then all bigrams, then all
        trigrams, and so on. N-grams are the space-joined unigrams.
    """

    # Only the token boundaries are used below, so build the Doc with the
    # tokenizer alone instead of running every pipeline component on each review.
    tokens = tokenizer.make_doc(sent)

    # unigrams
    # NOTE: `punctuations` is a string, so `in` is a substring test; multi-character
    # tokens such as '...' are not filtered out.
    unigrams = [token.text.lower() for token in tokens if (token.text not in punctuations)]
    output = list(unigrams)

    # n-grams are built over the punctuation-filtered unigram sequence
    for n in range(2, int(n_gram) + 1):
        output.extend(" ".join(unigrams[x:x+n])
                      for x in range(len(unigrams)-n+1))

    return output


In [73]:
# randint's `high` bound is exclusive, so (0, len(train_pos)) covers every valid index.
random_text = np.random.randint(0, high=len(train_pos))
print(random_text)

9262


In [74]:
train_pos_clean[random_text]

'maybe i identify with this film cause i live in nyc and suffer from bad insomnia but whatever it is, i must praise the filmmaker on a most amazing job. to do what she did with no budget...wow, thats all i can say. really, really good. like no money was spent on this film and it still blew me away. i definitley suggest checking it out if you can. great directing, fantastic score and of course a script that will knock you on your arse. see it.'

In [82]:
# Example
tokens = tokenize(train_pos_clean[random_text], n_gram = 4)
#tokens = tokenize(train_pos_clean[random_text])
print(tokens)

['maybe', 'i', 'identify', 'with', 'this', 'film', 'cause', 'i', 'live', 'in', 'nyc', 'and', 'suffer', 'from', 'bad', 'insomnia', 'but', 'whatever', 'it', 'is', 'i', 'must', 'praise', 'the', 'filmmaker', 'on', 'a', 'most', 'amazing', 'job', 'to', 'do', 'what', 'she', 'did', 'with', 'no', 'budget', '...', 'wow', 'that', 's', 'all', 'i', 'can', 'say', 'really', 'really', 'good', 'like', 'no', 'money', 'was', 'spent', 'on', 'this', 'film', 'and', 'it', 'still', 'blew', 'me', 'away', 'i', 'definitley', 'suggest', 'checking', 'it', 'out', 'if', 'you', 'can', 'great', 'directing', 'fantastic', 'score', 'and', 'of', 'course', 'a', 'script', 'that', 'will', 'knock', 'you', 'on', 'your', 'arse', 'see', 'it', 'maybe i', 'i identify', 'identify with', 'with this', 'this film', 'film cause', 'cause i', 'i live', 'live in', 'in nyc', 'nyc and', 'and suffer', 'suffer from', 'from bad', 'bad insomnia', 'insomnia but', 'but whatever', 'whatever it', 'it is', 'is i', 'i must', 'must praise', 'praise th

### Merging neg and pos examples - Training

In [83]:
# to check the order of concatenation
train_data_labels

array([1, 1, 1, ..., 0, 0, 0])

In [84]:
train_all_clean = train_pos_clean + train_neg_clean
len(train_all_clean)

25000

### Merging neg and pos examples - Test

In [85]:
# to check the order of concatenation
test_data_labels

array([1, 1, 1, ..., 0, 0, 0])

In [86]:
test_all_clean = test_pos_clean + test_neg_clean
len(test_all_clean)

25000

### Training -> Training + Validation

In [87]:
# must be smaller than the number of labelled training reviews so that
# some examples are left over for the validation split
training_size = 20000

assert training_size < len(train_all_clean)

In [88]:
shuffled_index = np.random.permutation(len(train_all_clean))
print(len(shuffled_index))
print(shuffled_index)

25000
[15821 15685  4147 ... 18888 20316 23805]


In [89]:
shuffled_index[:training_size]

array([15821, 15685,  4147, ..., 17207, 22378, 14852])

In [90]:
training_all_clean = [train_all_clean[i] for i in shuffled_index[:training_size]]
training_labels = [train_data_labels[i] for i in shuffled_index[:training_size]]
print(len(training_all_clean))
print(len(training_labels))

20000
20000


In [91]:
validation_all_clean = [train_all_clean[i] for i in shuffled_index[training_size:]]
validation_labels = [train_data_labels[i] for i in shuffled_index[training_size:]]
print(len(validation_all_clean))
print(len(validation_labels))

5000
5000


### Tokenizing the whole dataset

In [112]:
def lower_case_remove_punc(parsed):
    """Return the lowercased text of every token in ``parsed`` whose text is
    not found in ``punctuations``."""
    kept = []
    for token in parsed:
        if token.text in punctuations:
            continue
        kept.append(token.text.lower())
    return kept

def tokenize_dataset(dataset, n_gram):
    """Tokenize every sample of ``dataset`` with ``tokenize``.

    @param dataset: sequence of raw review strings.
    @param n_gram: maximum n-gram order, forwarded to ``tokenize``.
    @return: ``(token_dataset, all_tokens)`` where ``token_dataset`` holds one
        token list per sample and ``all_tokens`` is the flat concatenation of
        all of them (used later to build the vocabulary).
    """
    token_dataset = []
    all_tokens = []

    for position, sample in enumerate(dataset):
        # progress report every 50 samples
        if position % 50 == 0:
            print(str(position) + " / " + str(len(dataset)))

        # n-gram version
        sample_tokens = tokenize(sample, n_gram)

        token_dataset.append(sample_tokens)
        all_tokens.extend(sample_tokens)

    return token_dataset, all_tokens

In [113]:
from tqdm import tqdm_notebook

In [114]:
import pickle as pkl

In [115]:
# train set tokens
print ("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(training_all_clean,
                                                       n_gram = 2)
# Write each pickle inside a `with` block so the file is flushed and closed
# instead of leaving the handle open.
with open("train_data_tokens.p", "wb") as pkl_file:
    pkl.dump(train_data_tokens, pkl_file)
with open("all_train_tokens.p", "wb") as pkl_file:
    pkl.dump(all_train_tokens, pkl_file)

Tokenizing train data
0 / 20000
50 / 20000
100 / 20000
150 / 20000
200 / 20000
250 / 20000
300 / 20000
350 / 20000
400 / 20000
450 / 20000
500 / 20000
550 / 20000
600 / 20000
650 / 20000
700 / 20000
750 / 20000
800 / 20000
850 / 20000
900 / 20000
950 / 20000
1000 / 20000
1050 / 20000
1100 / 20000
1150 / 20000
1200 / 20000
1250 / 20000
1300 / 20000
1350 / 20000
1400 / 20000
1450 / 20000
1500 / 20000
1550 / 20000
1600 / 20000
1650 / 20000
1700 / 20000
1750 / 20000
1800 / 20000
1850 / 20000
1900 / 20000
1950 / 20000
2000 / 20000
2050 / 20000
2100 / 20000
2150 / 20000
2200 / 20000
2250 / 20000
2300 / 20000
2350 / 20000
2400 / 20000
2450 / 20000
2500 / 20000
2550 / 20000
2600 / 20000
2650 / 20000
2700 / 20000
2750 / 20000
2800 / 20000
2850 / 20000
2900 / 20000
2950 / 20000
3000 / 20000
3050 / 20000
3100 / 20000
3150 / 20000
3200 / 20000
3250 / 20000
3300 / 20000
3350 / 20000
3400 / 20000
3450 / 20000
3500 / 20000
3550 / 20000
3600 / 20000
3650 / 20000
3700 / 20000
3750 / 20000
3800 / 20000


In [116]:
# val set tokens
print ("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(validation_all_clean,
                                     n_gram = 2)
# `with` closes the pickle file once it has been written.
with open("val_data_tokens.p", "wb") as pkl_file:
    pkl.dump(val_data_tokens, pkl_file)

Tokenizing val data
0 / 5000
50 / 5000
100 / 5000
150 / 5000
200 / 5000
250 / 5000
300 / 5000
350 / 5000
400 / 5000
450 / 5000
500 / 5000
550 / 5000
600 / 5000
650 / 5000
700 / 5000
750 / 5000
800 / 5000
850 / 5000
900 / 5000
950 / 5000
1000 / 5000
1050 / 5000
1100 / 5000
1150 / 5000
1200 / 5000
1250 / 5000
1300 / 5000
1350 / 5000
1400 / 5000
1450 / 5000
1500 / 5000
1550 / 5000
1600 / 5000
1650 / 5000
1700 / 5000
1750 / 5000
1800 / 5000
1850 / 5000
1900 / 5000
1950 / 5000
2000 / 5000
2050 / 5000
2100 / 5000
2150 / 5000
2200 / 5000
2250 / 5000
2300 / 5000
2350 / 5000
2400 / 5000
2450 / 5000
2500 / 5000
2550 / 5000
2600 / 5000
2650 / 5000
2700 / 5000
2750 / 5000
2800 / 5000
2850 / 5000
2900 / 5000
2950 / 5000
3000 / 5000
3050 / 5000
3100 / 5000
3150 / 5000
3200 / 5000
3250 / 5000
3300 / 5000
3350 / 5000
3400 / 5000
3450 / 5000
3500 / 5000
3550 / 5000
3600 / 5000
3650 / 5000
3700 / 5000
3750 / 5000
3800 / 5000
3850 / 5000
3900 / 5000
3950 / 5000
4000 / 5000
4050 / 5000
4100 / 5000
4150 / 

In [117]:
# test set tokens
print ("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_all_clean,
                                      n_gram = 2)
# `with` closes the pickle file once it has been written.
with open("test_data_tokens.p", "wb") as pkl_file:
    pkl.dump(test_data_tokens, pkl_file)

Tokenizing test data
0 / 25000
50 / 25000
100 / 25000
150 / 25000
200 / 25000
250 / 25000
300 / 25000
350 / 25000
400 / 25000
450 / 25000
500 / 25000
550 / 25000
600 / 25000
650 / 25000
700 / 25000
750 / 25000
800 / 25000
850 / 25000
900 / 25000
950 / 25000
1000 / 25000
1050 / 25000
1100 / 25000
1150 / 25000
1200 / 25000
1250 / 25000
1300 / 25000
1350 / 25000
1400 / 25000
1450 / 25000
1500 / 25000
1550 / 25000
1600 / 25000
1650 / 25000
1700 / 25000
1750 / 25000
1800 / 25000
1850 / 25000
1900 / 25000
1950 / 25000
2000 / 25000
2050 / 25000
2100 / 25000
2150 / 25000
2200 / 25000
2250 / 25000
2300 / 25000
2350 / 25000
2400 / 25000
2450 / 25000
2500 / 25000
2550 / 25000
2600 / 25000
2650 / 25000
2700 / 25000
2750 / 25000
2800 / 25000
2850 / 25000
2900 / 25000
2950 / 25000
3000 / 25000
3050 / 25000
3100 / 25000
3150 / 25000
3200 / 25000
3250 / 25000
3300 / 25000
3350 / 25000
3400 / 25000
3450 / 25000
3500 / 25000
3550 / 25000
3600 / 25000
3650 / 25000
3700 / 25000
3750 / 25000
3800 / 25000
3

In [118]:
print(train_data_tokens[:2])

[['this', 'is', 'surely', 'one', 'of', 'the', 'worst', 'films', 'ever', 'made', 'and', 'released', 'by', 'a', 'major', 'hollywood', 'studio', 'the', 'plot', 'is', 'simply', 'stupid', 'the', 'dialog', 'is', 'written', 'in', 'clichés', 'you', 'can', 'complete', 'a', 'great', 'many', 'sentences', 'in', 'the', 'script', 'because', 'of', 'this', 'the', 'acting', 'is', 'ridiculously', 'bad', 'especially', 'that', 'of', 'rod', 'cameron', 'the', 'choreography', 'is', 'silly', 'and', 'wholly', 'unerotic', 'one', 'can', 'only', 'pity', 'the', 'reviewer', 'who', 'saw', '23-year', 'old', 'yvonne', "'s", 'dance', 'as', 'sexual', 'it', "'s", 'merely', 'very', 'bad', 'choreography', 'the', 'ballet', 'scene', 'in', 'the', 'film', "'s", 'beginning', 'is', 'especially', 'ludicrous', 'if', 'you', 'are', 'into', 'bad', 'movies', 'and', 'enjoy', 'laughing', 'at', 'some', 'of', 'hollywood', "'s", 'turkeys', 'this', 'is', 'for', 'you', 'i', 'bought', 'the', 'colorized', 'version', 'on', 'vhs', 'making', 'the

In [119]:
print(all_train_tokens[0:5])

['this', 'is', 'surely', 'one', 'of']


### Remove blank space tokens

In the above tokenization, some blank-space strings were observed, so this section addresses that by deleting them from the token list.

In [None]:
# blankspaces = [" ","  ","   "]

In [None]:
# def remove_blankspaces(review):
    
#     review = [x for x in review if x not in blankspaces] 
    
#     return review

In [None]:
# print(remove_blankspaces(tokens))

In [None]:
# train_data_tokens_clean = [remove_blankspaces(token) for token in train_data_tokens]
# len(train_data_tokens_clean)

In [None]:
# all_train_tokens_clean = remove_blankspaces(all_train_tokens)

## Building Vocabulary

In [120]:
len(all_train_tokens)

9538806

In [121]:
len(list(set(all_train_tokens)))

1289344

We are going to create the vocabulary from the 10,000 most common tokens in the training set.

In [122]:
import random

In [177]:
from collections import Counter

max_vocab_size = 10000
# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens,vocab_size=max_vocab_size):
    """Build token<->index lookups from the most frequent tokens.

    @param all_tokens: iterable of tokens (may be empty).
    @param vocab_size: number of most frequent tokens to keep, not counting
        the two special tokens.
    @return: ``(token2id, id2token)`` where
        id2token is a list with ``id2token[i]`` being the token for index i
        ('<pad>' at PAD_IDX, '<unk>' at UNK_IDX, real tokens from index 2 on),
        and token2id is the dictionary mapping each token to its index.
    """
    token_counter = Counter(all_tokens)
    # most_common() returns (token, count) pairs by descending count; a
    # comprehension (rather than zip(*...)) also works for an empty counter.
    vocab = [token for token, _ in token_counter.most_common(vocab_size)]
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    id2token = ['<pad>', '<unk>'] + vocab
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens,vocab_size=10000)

In [126]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 8255 ; token my money
Token my money; token id 8255


In [127]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    """Map every token of every sample to its vocabulary index.

    Tokens missing from ``token2id`` are mapped to ``UNK_IDX``.

    @param tokens_data: list of token lists.
    @return: list of index lists with the same nesting as ``tokens_data``.
    """
    return [
        [token2id.get(token, UNK_IDX) for token in tokens]
        for tokens in tokens_data
    ]

train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


## Dataset

In [128]:
MAX_SENTENCE_LENGTH = 200

In [129]:
import numpy as np
import torch
from torch.utils.data import Dataset

In [130]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's 
    readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list, max_sentence_length=None):
        """
        @param data_list: list of reviews, each a list of token indices
        @param target_list: list of labels, aligned with data_list
        @param max_sentence_length: optional truncation length for a review.
            When None (default), the module-level MAX_SENTENCE_LENGTH is
            looked up at access time, as before.

        """
        self.data_list = data_list
        self.target_list = target_list
        self.max_sentence_length = max_sentence_length
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [token_idx, length, label] where token_idx is the sample
            truncated to the maximum length and length is len(token_idx).
        """
        limit = self.max_sentence_length
        if limit is None:
            # fall back to the notebook-wide setting
            limit = MAX_SENTENCE_LENGTH

        token_idx = self.data_list[key][:limit]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

In [131]:
def imdb_func(batch, max_length=None):
    """
    Customized collate function for DataLoader that right-pads every sample
    of the batch with zeros to a fixed length so that all data have the
    same length.

    @param batch: list of [token_idx, length, label] items, as produced by
        IMDBDataset.__getitem__.
    @param max_length: width to pad to. When None (default), the module-level
        MAX_SENTENCE_LENGTH is used, as before. Use functools.partial to bind
        another value when passing this as `collate_fn`.
    @return: [data, lengths, labels] where data is an int64 tensor of shape
        (len(batch), max_length), and lengths / labels are LongTensors.
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH

    length_list = [datum[1] for datum in batch]
    label_list = [datum[2] for datum in batch]

    # padding: start from an all-zero (0 = padding index) int64 matrix and copy
    # each sample into the front of its row. Fixing the dtype keeps the result
    # an integer tensor even when a sample is an empty list.
    padded = np.zeros((len(batch), max_length), dtype=np.int64)
    for row, datum in enumerate(batch):
        padded[row, :datum[1]] = datum[0]

    return [torch.from_numpy(padded), 
            torch.LongTensor(length_list), 
            torch.LongTensor(label_list)]


In [132]:
BATCH_SIZE = 32
# Training loader: batches are collated (padded) by imdb_func and shuffled.
train_dataset = IMDBDataset(train_data_indices, training_labels)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

# Validation loader over the 5000 held-out training reviews.
val_dataset = IMDBDataset(val_data_indices, validation_labels)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

# Test loader: kept in the original order (positives first, then negatives).
test_dataset = IMDBDataset(test_data_indices, test_data_labels)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=False)

## Bag of N-grams

### Training

In [133]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [134]:
class BagOfNgrams(nn.Module):
    """
    BagOfNgrams classification model: averages the embeddings of the n-gram
    indices of a review and feeds the mean vector to a linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        @param num_classes: number of output logits. The default of 20 keeps
            the layer size this notebook has always used.
            NOTE(review): the labels built in this notebook are only 0/1, so
            num_classes=2 looks like the intended size - confirm before
            changing the default.
        """
        super(BagOfNgrams, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Divide by the true length to get the mean embedding. Clamping to 1
        # avoids 0/0 = NaN for a review with no tokens; its summed embedding
        # is all zeros (padding rows embed to zero), so the mean stays zero.
        denom = length.clamp(min=1).view(-1, 1).float()
        out = out / denom
     
        # return logits
        return self.linear(out)

In [135]:
emb_dim = 100
model = BagOfNgrams(len(id2token), emb_dim)

In [136]:
learning_rate = 0.01
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
## try both sgd and adam
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [137]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [138]:
# Train for num_epochs passes over train_loader, reporting accuracy periodically.
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model() below switches the model to eval mode, so training
        # mode is re-enabled at the start of every step.
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # check training and validation accuracy every 50 iterations
        # (each check runs a full pass over both loaders)
        if i > 0 and i % 50 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            train_acc = test_model(train_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Training Acc: {},Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, 
                len(train_loader), train_acc, val_acc))

Epoch: [1/10], Step: [51/625], Training Acc: 51.965,Validation Acc: 51.7
Epoch: [1/10], Step: [101/625], Training Acc: 69.47,Validation Acc: 67.8
Epoch: [1/10], Step: [151/625], Training Acc: 81.755,Validation Acc: 80.14
Epoch: [1/10], Step: [201/625], Training Acc: 84.655,Validation Acc: 82.76
Epoch: [1/10], Step: [251/625], Training Acc: 86.205,Validation Acc: 84.12
Epoch: [1/10], Step: [301/625], Training Acc: 87.52,Validation Acc: 84.44
Epoch: [1/10], Step: [351/625], Training Acc: 88.47,Validation Acc: 85.52
Epoch: [1/10], Step: [401/625], Training Acc: 88.695,Validation Acc: 85.42
Epoch: [1/10], Step: [451/625], Training Acc: 88.65,Validation Acc: 85.02
Epoch: [1/10], Step: [501/625], Training Acc: 90.575,Validation Acc: 86.12
Epoch: [1/10], Step: [551/625], Training Acc: 90.955,Validation Acc: 86.12
Epoch: [1/10], Step: [601/625], Training Acc: 91.545,Validation Acc: 86.42
Epoch: [2/10], Step: [51/625], Training Acc: 91.66,Validation Acc: 86.42
Epoch: [2/10], Step: [101/625], Tr

Epoch: [10/10], Step: [201/625], Training Acc: 98.11,Validation Acc: 83.42
Epoch: [10/10], Step: [251/625], Training Acc: 96.115,Validation Acc: 82.44
Epoch: [10/10], Step: [301/625], Training Acc: 97.575,Validation Acc: 83.18
Epoch: [10/10], Step: [351/625], Training Acc: 98.245,Validation Acc: 83.76
Epoch: [10/10], Step: [401/625], Training Acc: 98.13,Validation Acc: 83.66
Epoch: [10/10], Step: [451/625], Training Acc: 98.185,Validation Acc: 83.58
Epoch: [10/10], Step: [501/625], Training Acc: 98.445,Validation Acc: 83.36
Epoch: [10/10], Step: [551/625], Training Acc: 97.95,Validation Acc: 82.54
Epoch: [10/10], Step: [601/625], Training Acc: 97.66,Validation Acc: 83.54


## Hyperparameter Search

The hyperparameters we are going to try to optimize are the following:

* n-gram max length
* optimizer choice
* embedding size
* vocab size
* learning rate of the optimizer

And maybe increase the batch size to speed up the optimization process.

In [245]:
import itertools

In [191]:
optimizers = [torch.optim.Adam(model.parameters(), 
                               lr=learning_rate),             
              torch.optim.SGD(model.parameters(), 
                              lr=learning_rate)]


In [192]:
shuffled_index = np.random.permutation(len(train_all_clean))
print(len(shuffled_index))
print(shuffled_index)

shuffled_index[:training_size]

training_all_clean = [train_all_clean[i] for i in shuffled_index[:training_size]]
training_labels = [train_data_labels[i] for i in shuffled_index[:training_size]]
print(len(training_all_clean))
print(len(training_labels))

validation_all_clean = [train_all_clean[i] for i in shuffled_index[training_size:]]
validation_labels = [train_data_labels[i] for i in shuffled_index[training_size:]]
print(len(validation_all_clean))
print(len(validation_labels))

25000
[14269 16316  4649 ...  8312 14965 23410]
20000
20000
5000
5000


In [273]:
from collections import Counter

# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = 10000):
    """Build token<->index lookups from the most frequent tokens.

    @param all_tokens: iterable of tokens (may be empty).
    @param max_vocab_size: number of most frequent tokens to keep, not
        counting the two special tokens.
    @return: ``(token2id, id2token)`` where
        id2token is a list with ``id2token[i]`` being the token for index i
        ('<pad>' at PAD_IDX, '<unk>' at UNK_IDX, real tokens from index 2 on),
        and token2id is the dictionary mapping each token to its index.
    """
    token_counter = Counter(all_tokens)
    # most_common() returns (token, count) pairs by descending count; a
    # comprehension (rather than zip(*...)) also works for an empty counter.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    id2token = ['<pad>', '<unk>'] + vocab
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

#### Save all ngram tokens for easy use

In [204]:
grams = params[1]
grams

[1, 2, 3, 4]

In [220]:
grams = 4

train_data_tokens, all_train_tokens = tokenize_dataset(training_all_clean,
                                                       n_gram=grams)

# Tokenize Validation
val_data_tokens, _ = tokenize_dataset(validation_all_clean,
                                      n_gram=grams)

0 / 20000
50 / 20000
100 / 20000
150 / 20000
200 / 20000
250 / 20000
300 / 20000
350 / 20000
400 / 20000
450 / 20000
500 / 20000
550 / 20000
600 / 20000
650 / 20000
700 / 20000
750 / 20000
800 / 20000
850 / 20000
900 / 20000
950 / 20000
1000 / 20000
1050 / 20000
1100 / 20000
1150 / 20000
1200 / 20000
1250 / 20000
1300 / 20000
1350 / 20000
1400 / 20000
1450 / 20000
1500 / 20000
1550 / 20000
1600 / 20000
1650 / 20000
1700 / 20000
1750 / 20000
1800 / 20000
1850 / 20000
1900 / 20000
1950 / 20000
2000 / 20000
2050 / 20000
2100 / 20000
2150 / 20000
2200 / 20000
2250 / 20000
2300 / 20000
2350 / 20000
2400 / 20000
2450 / 20000
2500 / 20000
2550 / 20000
2600 / 20000
2650 / 20000
2700 / 20000
2750 / 20000
2800 / 20000
2850 / 20000
2900 / 20000
2950 / 20000
3000 / 20000
3050 / 20000
3100 / 20000
3150 / 20000
3200 / 20000
3250 / 20000
3300 / 20000
3350 / 20000
3400 / 20000
3450 / 20000
3500 / 20000
3550 / 20000
3600 / 20000
3650 / 20000
3700 / 20000
3750 / 20000
3800 / 20000
3850 / 20000
3900 / 20

In [225]:
# grams = 1
print(grams)

# Save the already-tokenized splits under file names suffixed with the n-gram
# order. Each file is written inside a `with` block so the handle is flushed
# and closed. (The printed messages say "Tokenizing", but this cell only saves.)

# val set tokens
print ("Tokenizing val data")
with open("val_data_tokens_"+str(grams)+".p", "wb") as pkl_file:
    pkl.dump(val_data_tokens, pkl_file)

# train set tokens
print ("Tokenizing train data")
with open("train_data_tokens_"+str(grams)+".p", "wb") as pkl_file:
    pkl.dump(train_data_tokens, pkl_file)
with open("all_train_tokens_"+str(grams)+".p", "wb") as pkl_file:
    pkl.dump(all_train_tokens, pkl_file)

4
Tokenizing val data
Tokenizing train data


In [226]:
# print(train_data_tokens[0:2])

In [227]:
# all_train_tokens[:2]

In [228]:
# all_train_tokens[-3:]

In [229]:
# print(val_data_tokens[:2])

In [319]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load token
    # files that this notebook wrote itself.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def hyperparameter_search(hyperparameter_space=None,
                          epochs=5,
                          optimizer_name = "Adam"):
    """Grid-search the bag-of-n-grams model and record validation accuracy.

    A fresh model is trained for every combination in the Cartesian product
    of ``hyperparameter_space``; validation accuracy is sampled periodically
    during training.

    Parameters
    ----------
    hyperparameter_space : list of lists, optional
        Six axes, in this order: learning rate, n-gram order, maximum
        vocabulary size, embedding dimension, maximum sentence length,
        batch size. When omitted, the module-level ``params`` is looked up
        at call time.
    epochs : int
        Number of passes over the training loader per combination.
    optimizer_name : str
        ``"Adam"`` or ``"SGD"``.

    Returns
    -------
    dict
        Maps each parameter tuple to the list of validation accuracies
        (as returned by ``test_model``) collected while training it.

    Raises
    ------
    ValueError
        If ``optimizer_name`` is not a supported optimizer.
    """
    # Fail fast: previously an unknown name only printed a message and the
    # run crashed later on an unbound `optimizer`.
    if optimizer_name not in ("Adam", "SGD"):
        raise ValueError(
            "this optimizer is not implemented yet: " + repr(optimizer_name)
        )

    # Resolve the default lazily so defining this function does not require
    # `params` to exist yet.
    if hyperparameter_space is None:
        hyperparameter_space = params

    # All combinations of the search space that was actually passed in.
    param_space = [*itertools.product(*hyperparameter_space)]

    # parameter tuple -> validation accuracies recorded during training
    val_accuracies = {}

    for count, param_comb in enumerate(param_space, start=1):
        print("-----------------------------------------------------------")
        print("Parameter Combination = " + str(count) + " / " + str(len(param_space)))

        NUM_EPOCHS = epochs
        lr_rate = param_comb[0]             # learning rate
        grams = param_comb[1]               # n-grams
        max_vocab_size = int(param_comb[2]) # vocabulary size
        embed_dimension = param_comb[3]     # embedding vector size
        # NOTE(review): this is a function-local name. If `imdb_func` reads a
        # module-level MAX_SENTENCE_LENGTH, the value chosen here never
        # reaches the data loader — confirm against its definition.
        MAX_SENTENCE_LENGTH = param_comb[4] # max sentence length of data loader
        BATCH_SIZE = param_comb[5]

        print("Learning Rate = " + str(lr_rate))
        print("Ngram = " + str(grams))
        print("Vocab Size = " + str(max_vocab_size))
        print("Embedding Dimension = " + str(embed_dimension))
        print("Max Sentence Length = " + str(MAX_SENTENCE_LENGTH))
        print("Batch Size = " + str(BATCH_SIZE))

        # Tokens were pickled per n-gram order beforehand; load them here.
        token_suffix = str(grams) + ".p"
        train_data_tokens = _load_pickle("train_data_tokens_" + token_suffix)
        all_train_tokens = _load_pickle("all_train_tokens_" + token_suffix)
        val_data_tokens = _load_pickle("val_data_tokens_" + token_suffix)

        print ("Train dataset size is {}".format(len(train_data_tokens)))
        print ("Val dataset size is {}".format(len(val_data_tokens)))
        print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

        # Building Vocabulary
        token2id, id2token = build_vocab(all_train_tokens,
                                         max_vocab_size=max_vocab_size)

        # Sanity-check the vocabulary with a round trip on a random token.
        random_token_id = random.randint(0, len(id2token)-1)
        random_token = id2token[random_token_id]
        print ("Token id {} -> token {}".format(random_token_id, id2token[random_token_id]))
        print ("Token {} -> token id {}".format(random_token, token2id[random_token]))

        # NOTE(review): `token2id` / `id2token` are locals and are not passed
        # to token2index_dataset. If that helper reads a module-level
        # `token2id`, the vocabulary built above is not the one used for
        # indexing — confirm against its definition.
        train_data_indices = token2index_dataset(train_data_tokens)
        val_data_indices = token2index_dataset(val_data_tokens)
        # double checking
        print ("Train dataset size is {}".format(len(train_data_indices)))
        print ("Val dataset size is {}".format(len(val_data_indices)))

        # Load training and validation data
        train_dataset = IMDBDataset(train_data_indices,
                                    training_labels)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_func,
                                                   shuffle=True)

        val_dataset = IMDBDataset(val_data_indices,
                                  validation_labels)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=BATCH_SIZE,
                                                 collate_fn=imdb_func,
                                                 shuffle=True)

        # Initialize the N-gram Model
        model = BagOfNgrams(len(id2token), embed_dimension)

        if optimizer_name == "Adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
        else:  # "SGD" — the name was validated at function entry
            optimizer = torch.optim.SGD(model.parameters(), lr=lr_rate)

        # Cross Entropy Loss will be used
        criterion = torch.nn.CrossEntropyLoss()

        # Caution: keys do not include the optimizer, so results for Adam and
        # SGD must be kept in separate dictionaries by the caller.
        val_accuracies[param_comb] = []

        # Validate every 50 iterations at batch size 64, scaled inversely
        # with the batch size. Rounded to an int (and at least 1) so the
        # modulus check also works for batch sizes that do not divide evenly.
        validate_every = max(1, round(50 * 64 / BATCH_SIZE))

        print("Optimization Start")
        print(optimizer)

        for epoch in range(NUM_EPOCHS):
            for i, (data_batch, length_batch, label_batch) in enumerate(train_loader):
                # Re-enter training mode every step; presumably test_model
                # switches the model to eval mode — TODO confirm.
                model.train()
                optimizer.zero_grad()
                outputs = model(data_batch, length_batch)
                loss = criterion(outputs, label_batch)
                loss.backward()
                optimizer.step()

                if i > 0 and i % validate_every == 0:

                    # Accuracy Calculations
                    train_acc = test_model(train_loader, model)
                    val_acc = test_model(val_loader, model)
                    val_accuracies[param_comb].append(val_acc)

                    # Logging
                    print('Epoch:[{}/{}],Step:[{}/{}],Training Acc:{},Validation Acc:{}'.format(
                               epoch+1, NUM_EPOCHS,
                                i+1, len(train_loader),
                                train_acc, val_acc))

    return val_accuracies


### Setting the Search Space

In [None]:
# Grid-search axes. hyperparameter_search reads each axis by position, so the
# order of the six lists must not change.
params = [
    [1e-2, 1e-1, 5e-1, 1, 2],  # learning rates
    list(range(1, 5)),         # ngrams
    [1e5, 1e6],                # vocab size
    [100, 150, 200],           # embedding size
    [100, 200],                # max sentence length
    [64, 128],                 # batch size
]

# Smaller alternative grid, left disabled (presumably for quick trial runs).
# params = [[1e-1,1,2,5], ## learning rates
#           list(range(1,2)), ## ngrams
#           [1e5], ## vocab size
#           [100], ## embedding size
#           [100], ## max sentence length
#           [64] ## batch size
#          ]

# Print the number of combinations, then display the combinations themselves.
print(len(list(itertools.product(*params))))
list(itertools.product(*params))

480


[(0.01, 1, 100000.0, 100, 100, 64),
 (0.01, 1, 100000.0, 100, 100, 128),
 (0.01, 1, 100000.0, 100, 200, 64),
 (0.01, 1, 100000.0, 100, 200, 128),
 (0.01, 1, 100000.0, 150, 100, 64),
 (0.01, 1, 100000.0, 150, 100, 128),
 (0.01, 1, 100000.0, 150, 200, 64),
 (0.01, 1, 100000.0, 150, 200, 128),
 (0.01, 1, 100000.0, 200, 100, 64),
 (0.01, 1, 100000.0, 200, 100, 128),
 (0.01, 1, 100000.0, 200, 200, 64),
 (0.01, 1, 100000.0, 200, 200, 128),
 (0.01, 1, 1000000.0, 100, 100, 64),
 (0.01, 1, 1000000.0, 100, 100, 128),
 (0.01, 1, 1000000.0, 100, 200, 64),
 (0.01, 1, 1000000.0, 100, 200, 128),
 (0.01, 1, 1000000.0, 150, 100, 64),
 (0.01, 1, 1000000.0, 150, 100, 128),
 (0.01, 1, 1000000.0, 150, 200, 64),
 (0.01, 1, 1000000.0, 150, 200, 128),
 (0.01, 1, 1000000.0, 200, 100, 64),
 (0.01, 1, 1000000.0, 200, 100, 128),
 (0.01, 1, 1000000.0, 200, 200, 64),
 (0.01, 1, 1000000.0, 200, 200, 128),
 (0.01, 2, 100000.0, 100, 100, 64),
 (0.01, 2, 100000.0, 100, 100, 128),
 (0.01, 2, 100000.0, 100, 200, 64),
 (0

### Running the Grid Search

#### Adam

In [None]:
# Run the full grid with Adam for two epochs per combination. The returned
# dict maps each parameter tuple to its recorded validation accuracies.
param_val_losses_adam = hyperparameter_search(
    hyperparameter_space=params,
    epochs=2,
    optimizer_name="Adam",
)

-----------------------------------------------------------
Parameter Combination = 1 / 480
Learning Rate = 0.01
Ngram = 1
Vocab Size = 100000
Embedding Dimension = 100
Max Sentence Length = 100
Batch Size = 64
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 4763856
Token id 1948 -> token fate
Token fate -> token id 1948
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
Epoch:[1/2],Step:[51/313],Training Acc:70.62,Validation Acc:69.36
Epoch:[1/2],Step:[101/313],Training Acc:81.99,Validation Acc:80.88
Epoch:[1/2],Step:[151/313],Training Acc:85.16,Validation Acc:83.86
Epoch:[1/2],Step:[201/313],Training Acc:86.715,Validation Acc:84.4
Epoch:[1/2],Step:[251/313],Training Acc:88.31,Validation Acc:85.72
Epoch:[1/2],Step:[301/313],Training Acc:89.01,Validation Acc:85.8
Epoch:[2/2],Step:[51/313],Training Acc:

-----------------------------------------------------------
Parameter Combination = 7 / 480
Learning Rate = 0.01
Ngram = 1
Vocab Size = 100000
Embedding Dimension = 150
Max Sentence Length = 200
Batch Size = 64
Train dataset size is 20000
Val dataset size is 5000
Total number of tokens in train dataset is 4763856
Token id 68072 -> token astaire).it
Token astaire).it -> token id 68072
Train dataset size is 20000
Val dataset size is 5000
Optimization Start
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
Epoch:[1/2],Step:[51/313],Training Acc:74.67,Validation Acc:73.32
Epoch:[1/2],Step:[101/313],Training Acc:83.455,Validation Acc:81.96
Epoch:[1/2],Step:[151/313],Training Acc:84.85,Validation Acc:82.68
Epoch:[1/2],Step:[201/313],Training Acc:87.645,Validation Acc:84.82
Epoch:[1/2],Step:[251/313],Training Acc:88.325,Validation Acc:84.92
Epoch:[1/2],Step:[301/313],Training Acc:89.78,Validation Acc:86.24
Epoch:[2/2],Step:[5

#### SGD

In [None]:
# Run the same grid with SGD, using ten epochs per combination.
param_val_losses_sgd = hyperparameter_search(
    hyperparameter_space=params,
    epochs=10,
    optimizer_name="SGD",
)

### Analyzing the Results

In [329]:
# One line per Adam run: the parameter tuple followed by the validation
# accuracies recorded for it.
for key, value in param_val_losses_adam.items():
    print(key, value)

(0.1, 1, 100000.0, 100, 100, 64) [82.26, 83.48, 84.34, 85.2, 83.78, 85.28, 84.88, 82.8, 84.84, 84.4, 83.16, 84.8]
(0.1, 1, 100000.0, 100, 100, 128) [83.24, 78.62, 84.92, 85.38, 85.7, 85.4, 85.28, 84.44, 85.2, 84.86, 85.4, 85.96]
(1, 1, 100000.0, 100, 100, 64) [78.52, 79.34, 64.98, 82.32, 79.98, 79.36, 75.4, 79.26, 80.8, 75.32, 80.42, 81.56]
(1, 1, 100000.0, 100, 100, 128) [51.9, 52.48, 75.2, 82.9, 75.08, 73.48, 84.02, 84.46, 80.56, 83.08, 83.2, 80.08]
(2, 1, 100000.0, 100, 100, 64) [78.58, 52.7, 79.3, 77.6, 69.44, 78.26, 81.92, 81.8, 73.88, 74.9, 79.58, 79.26]
(2, 1, 100000.0, 100, 100, 128) [68.06, 67.5, 80.56, 68.62, 82.84, 83.54, 80.02, 82.04, 82.1, 81.46, 83.42, 83.66]
(5, 1, 100000.0, 100, 100, 64) [81.5, 76.28, 70.52, 56.0, 74.78, 72.3, 71.08, 79.52, 82.86, 81.08, 74.76, 77.86]
(5, 1, 100000.0, 100, 100, 128) [77.3, 71.52, 82.48, 81.4, 75.44, 82.62, 78.06, 79.76, 81.08, 83.5, 74.34, 81.64]


In [None]:
# One line per SGD run: the parameter tuple followed by the validation
# accuracies recorded for it.
for key, value in param_val_losses_sgd.items():
    print(key, value)

### Validation Accuracy Plots