<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [1]:
%%javascript

$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

This script uses bag-of-ngrams approach to sentiment classification using the IMDB review dataset.

# PyTorch

## Data Loading

The dataset was downloaded from: http://ai.stanford.edu/~amaas/data/sentiment/

In [2]:
import os

In [3]:
data_loc = "data/imdb_reviews/"

In [4]:
def read_txt_files(folder_path):
    """Reads all .txt files in a folder to a list"""
    
    file_list = os.listdir(folder_path)
    # for debugging, printing out the folder path and some files in it
    print(folder_path)
    print(file_list[:10])
    
    all_reviews = []
    for file_path in file_list:
        f = open(folder_path + file_path,"r")
        all_reviews.append(f.readline())
        
    return all_reviews

In [5]:
import numpy as np

In [6]:
train_pos = read_txt_files(folder_path=data_loc+"train/pos/")
print(len(train_pos))
train_neg = read_txt_files(folder_path=data_loc+"train/neg/")
print(len(train_neg))
test_pos = read_txt_files(folder_path=data_loc+"test/pos/")
print(len(test_pos))
test_neg = read_txt_files(folder_path=data_loc+"test/neg/")
print(len(test_neg))

data/imdb_reviews/train/pos/
['4715_9.txt', '12390_8.txt', '8329_7.txt', '9063_8.txt', '3092_10.txt', '9865_8.txt', '6639_10.txt', '10460_10.txt', '10331_10.txt', '11606_10.txt']
12500
data/imdb_reviews/train/neg/
['1821_4.txt', '10402_1.txt', '1062_4.txt', '9056_1.txt', '5392_3.txt', '2682_3.txt', '3351_4.txt', '399_2.txt', '10447_1.txt', '10096_1.txt']
12500
data/imdb_reviews/test/pos/
['4715_9.txt', '1930_9.txt', '3205_9.txt', '10186_10.txt', '147_10.txt', '7511_7.txt', '616_10.txt', '10460_10.txt', '3240_9.txt', '1975_9.txt']
12500
data/imdb_reviews/test/neg/
['1821_4.txt', '9487_1.txt', '4604_4.txt', '2828_2.txt', '10890_1.txt', '3351_4.txt', '8070_2.txt', '1027_4.txt', '8248_3.txt', '4290_4.txt']
12500


In [7]:
random_text = np.random.randint(1, high=len(train_pos)-1)
print(random_text)
train_pos[random_text]

7752


"It is hard to describe Bug in words, it is one of those films that truly has to be seen to be understood. It follows a narrative that is more fluid and interesting than anything I have seen lately in a Hollywood release. As its characters react to the chain of events in different ways, and as the events dictate different paths for the characters to follow, the audience is merely an observer. The almost Proustian narrative flow of thought to thought, the very spontaneity in the script will have you glued to the screen, waiting anxiously to see how it all works out in the end. And as far as the thematic elements...there is a particular sequence in the film that goes from melancholy, to bright and beautiful, and then to tragic, all within the span of about a minute. And it works.<br /><br />This movie is pure magic. It reminds one why independent film is perhaps the brightest star the film industry currently has. Perhaps with more movies of Bug's quality, people will start to take notice

In [8]:
print("Train Positive examples = " + str(len(train_pos)))
print("Train Negative examples = " + str(len(train_neg)))
print("Test Positive examples = " + str(len(test_pos)))
print("Test Negative examples = " + str(len(test_neg)))

Train Positive examples = 12500
Train Negative examples = 12500
Test Positive examples = 12500
Test Negative examples = 12500


## Data Preparation

### Labeling the training dataset

In [9]:
train_pos_labels = np.ones((len(train_pos),), dtype=int)
train_pos_labels

array([1, 1, 1, ..., 1, 1, 1])

In [10]:
train_neg_labels = np.zeros((len(train_neg),), dtype=int)
train_neg_labels

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
train_data_labels = np.concatenate((train_pos_labels,train_neg_labels))
train_data_labels

array([1, 1, 1, ..., 0, 0, 0])

### Storing the labels of the test set for Test Error Measuring

In [14]:
test_pos_labels = np.ones((len(test_pos),), dtype=int)
test_neg_labels = np.zeros((len(test_neg),), dtype=int)
test_data_labels = np.concatenate((test_pos_labels,test_neg_labels))
print(len(test_data_labels))
test_data_labels

25000


array([1, 1, 1, ..., 0, 0, 0])

## Data Cleaning

### Removing HTML tags

In [15]:
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

In [16]:
train_pos[random_text]

"It is hard to describe Bug in words, it is one of those films that truly has to be seen to be understood. It follows a narrative that is more fluid and interesting than anything I have seen lately in a Hollywood release. As its characters react to the chain of events in different ways, and as the events dictate different paths for the characters to follow, the audience is merely an observer. The almost Proustian narrative flow of thought to thought, the very spontaneity in the script will have you glued to the screen, waiting anxiously to see how it all works out in the end. And as far as the thematic elements...there is a particular sequence in the film that goes from melancholy, to bright and beautiful, and then to tragic, all within the span of about a minute. And it works.<br /><br />This movie is pure magic. It reminds one why independent film is perhaps the brightest star the film industry currently has. Perhaps with more movies of Bug's quality, people will start to take notice

In [19]:
train_pos_clean = [cleanhtml(x) for x in train_pos]
train_neg_clean = [cleanhtml(x) for x in train_neg]

test_pos_clean = [cleanhtml(x) for x in test_pos]
test_neg_clean = [cleanhtml(x) for x in test_neg]

In [20]:
train_pos_clean[random_text]

"It is hard to describe Bug in words, it is one of those films that truly has to be seen to be understood. It follows a narrative that is more fluid and interesting than anything I have seen lately in a Hollywood release. As its characters react to the chain of events in different ways, and as the events dictate different paths for the characters to follow, the audience is merely an observer. The almost Proustian narrative flow of thought to thought, the very spontaneity in the script will have you glued to the screen, waiting anxiously to see how it all works out in the end. And as far as the thematic elements...there is a particular sequence in the film that goes from melancholy, to bright and beautiful, and then to tragic, all within the span of about a minute. And it works.This movie is pure magic. It reminds one why independent film is perhaps the brightest star the film industry currently has. Perhaps with more movies of Bug's quality, people will start to take notice."

### Replacing dots & question marks & paranthesis with space

It seems that punctuations 

In [54]:
#"asdasdasds.asdasda".replace("."," ")

In [55]:
# def remove_dqmp(review):
    
#     review = review.replace("."," ")
#     review = review.replace("?"," ")
#     review = review.replace(")"," ")
#     review = review.replace("("," ")
    
#     return review

In [56]:
# remove_dqmp(train_pos_clean[random_text])

In [57]:
# train_pos_clean = [remove_dqmp(x) for x in train_pos_clean]
# train_neg_clean = [remove_dqmp(x) for x in train_neg_clean]

## Tokenization

In [22]:
import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
    tokens = tokenizer(sent)
    return [token.text.lower() for token in tokens if (token.text not in punctuations)]
    #return [token.text.lower() for token in tokens]

In [51]:
random_text = np.random.randint(1, high=len(train_pos)-1)
print(random_text)

11271


In [52]:
# Example
tokens = tokenize(train_pos_clean[random_text])
print(tokens)

['this', 'movie', 'is', "n't", 'as', 'bad', 'as', 'i', 'heard', 'it', 'was', 'enjoyable', 'funny', 'and', 'i', 'love', 'that', 'is', 'revolves', 'around', 'the', 'holiday', 'season', 'it', 'totally', 'has', 'me', 'in', 'the', 'mood', 'to', 'christmas', 'shop', 'and', 'listen', 'to', 'holiday', 'music', 'when', 'this', 'movie', 'comes', 'out', 'on', 'dvd', 'it', 'will', 'take', 'the', 'place', 'of', 'christmas', 'vacation', 'in', 'my', 'collection', 'it', 'will', 'be', 'a', 'movie', 'to', 'watch', 'every', 'year', 'after', 'thanksgiving', 'to', 'get', 'me', 'in', 'the', 'mood', 'for', 'the', 'best', 'time', 'of', 'the', 'year', 'i', 'heard', 'that', 'ben', "'s", 'character', 'was', 'a', 'bit', 'crazy', 'but', 'i', 'think', 'it', 'just', 'adds', 'to', 'the', 'movie', 'and', 'why', 'be', 'so', 'serious', 'all', 'the', 'time', 'take', 'it', 'for', 'what', 'is', 'it', 'a', 'christmas', 'comedy', 'with', 'a', 'love', 'twist', 'i', 'enjoyed', 'it', 'no', 'it', 'is', "n't", 'titanic', 'and', '

### Merging neg and pos examples - Training

In [63]:
# to check the order of concatenation
train_data_labels

array([1, 1, 1, ..., 0, 0, 0])

In [67]:
train_all_clean = train_pos_clean + train_neg_clean
len(train_all_clean)

25000

### Merging neg and pos examples - Test

In [68]:
# to check the order of concatenation
test_data_labels

array([1, 1, 1, ..., 0, 0, 0])

In [69]:
test_all_clean = test_pos_clean + test_neg_clean
len(test_all_clean)

25000

### Training -> Training + Validation

In [70]:
# should be smaller than 25000
training_size = 20000

assert training_size < 25000

In [73]:
shuffled_index = np.random.permutation(len(train_all_clean))
print(len(shuffled_index))
print(shuffled_index)

25000
[10029  2819   807 ...  2673 15023 18442]


In [80]:
shuffled_index[:training_size]

array([10029,  2819,   807, ...,  2573,  5749,  3890])

In [84]:
training_all_clean = [train_all_clean[i] for i in shuffled_index[:training_size]]
training_labels = [train_data_labels[i] for i in shuffled_index[:training_size]]
print(len(training_all_clean))
print(len(training_labels))

20000
20000


In [86]:
validation_all_clean = [train_all_clean[i] for i in shuffled_index[training_size:]]
validation_labels = [train_data_labels[i] for i in shuffled_index[training_size:]]
print(len(validation_all_clean))
print(len(validation_labels))

5000
5000


### Tokenizing the whole dataset

In [83]:
def lower_case_remove_punc(parsed):
    return [token.text.lower() for token in parsed if (token.text not in punctuations)]

def tokenize_dataset(dataset):
    token_dataset = []
    # we are keeping track of all tokens in dataset
    # in order to create vocabulary later
    all_tokens = []

    for sample in tqdm_notebook(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=1)):
        tokens = lower_case_remove_punc(sample)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

In [60]:
from tqdm import tqdm_notebook

In [88]:
import pickle as pkl

In [89]:
# train set tokens
print ("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(training_all_clean)
pkl.dump(train_data_tokens, open("train_data_tokens.p", "wb"))
pkl.dump(all_train_tokens, open("all_train_tokens.p", "wb"))

Tokenizing train data





In [90]:
# val set tokens
print ("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(validation_all_clean)
pkl.dump(val_data_tokens, open("val_data_tokens.p", "wb"))

Tokenizing val data





In [91]:
# test set tokens
print ("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_all_clean)
pkl.dump(test_data_tokens, open("test_data_tokens.p", "wb"))

Tokenizing test data





In [92]:
print(train_data_tokens[:2])

[['and', 'i', 'really', 'mean', 'that', 'i', 'caught', 'it', 'last', 'night', 'on', 'vh1', 'and', 'i', 'was', 'not', 'expecting', 'it', 'to', 'be', 'so', 'good', 'this', 'is', 'now', 'one', 'of', 'my', 'favorites', 'i', 'must', 'add', 'that', 'it', 'has', 'a', 'killer', 'soundtrack'], ['emilio', 'miraglia', "'s", 'first', 'giallo', 'feature', 'the', 'night', 'evelyn', 'came', 'out', 'of', 'the', 'grave', 'was', 'a', 'great', 'combination', 'of', 'giallo', 'and', 'gothic', 'horror', 'and', 'this', 'second', 'film', 'is', 'even', 'better', 'we', "'ve", 'got', 'more', 'of', 'the', 'giallo', 'side', 'of', 'the', 'equation', 'this', 'time', 'around', 'although', 'miraglia', 'does', "n't", 'lose', 'the', 'gothic', 'horror', 'stylings', 'that', 'made', 'the', 'earlier', 'film', 'such', 'a', 'delight', 'miraglia', 'puts', 'more', 'emphasis', 'on', 'the', 'finer', 'details', 'of', 'the', 'plot', 'this', 'time', 'around', 'and', 'as', 'a', 'result', 'it', "'s", 'the', 'typical', 'giallo', 'labyr

In [93]:
print(all_train_tokens[0:5])

['and', 'i', 'really', 'mean', 'that']


### Remove blank space tokens

In the above tokenization, some blankspace strings were observed, thus this section adresses that by deleting them from the token list.

In [None]:
# blankspaces = [" ","  ","   "]

In [None]:
# def remove_blankspaces(review):
    
#     review = [x for x in review if x not in blankspaces] 
    
#     return review

In [None]:
# print(remove_blankspaces(tokens))

In [None]:
# train_data_tokens_clean = [remove_blankspaces(token) for token in train_data_tokens]
# len(train_data_tokens_clean)

In [None]:
# all_train_tokens_clean = remove_blankspaces(all_train_tokens)

## Building Vocabulary

In [94]:
len(all_train_tokens)

4756705

In [95]:
len(list(set(all_train_tokens)))

82991

we are going to create the vocabulary of most common 10,000 tokens in the training set.

In [96]:
import random

In [98]:
from collections import Counter

max_vocab_size = 10000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens)

In [100]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 434 ; token friend
Token friend; token id 434


In [102]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


## Dataset

In [103]:
MAX_SENTENCE_LENGTH = 200

In [104]:
import numpy as np
import torch
from torch.utils.data import Dataset

In [105]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's 
    readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

In [106]:
def imdb_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), 
            torch.LongTensor(length_list), 
            torch.LongTensor(label_list)]


In [111]:
BATCH_SIZE = 32
train_dataset = IMDBDataset(train_data_indices, training_labels)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

val_dataset = IMDBDataset(val_data_indices, validation_labels)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

test_dataset = IMDBDataset(test_data_indices, test_data_labels)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=False)

## Bag of N-grams

### Training

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class BagOfNgrams(nn.Module):
    """
    BagOfNgrams classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNgrams, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

In [None]:
emb_dim = 100
model = BagOfNgrams(len(id2token), emb_dim)

In [None]:
learning_rate = 0.01
num_epochs = 5 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
## try both sgd and adam
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [None]:
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # check training score every 100 iterations
        ## validate every 100 iterations
        if i > 0 and i % 50 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            train_acc = test_model(train_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Training Acc: {}, \
                        Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, 
                len(train_loader), train_acc, val_acc))