<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [3]:
%%javascript

$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

This script uses bag-of-ngrams approach to sentiment classification using the IMDB review dataset.

# PyTorch

## Data Loading

The dataset was downloaded from: http://ai.stanford.edu/~amaas/data/sentiment/

In [4]:
import os

In [5]:
data_loc = "data/imdb_reviews/"

In [6]:
def read_txt_files(folder_path):
    """Reads all .txt files in a folder to a list"""
    
    file_list = os.listdir(folder_path)
    # for debugging, printing out the folder path and some files in it
    print(folder_path)
    print(file_list[:10])
    
    all_reviews = []
    for file_path in file_list:
        f = open(folder_path + file_path,"r")
        all_reviews.append(f.readline())
        
    return all_reviews

In [7]:
import numpy as np

In [8]:
train_pos = read_txt_files(folder_path=data_loc+"train/pos/")
print(len(train_pos))
train_neg = read_txt_files(folder_path=data_loc+"train/neg/")
print(len(train_neg))
test_pos = read_txt_files(folder_path=data_loc+"test/pos/")
print(len(test_pos))
test_neg = read_txt_files(folder_path=data_loc+"test/neg/")
print(len(test_neg))

data/imdb_reviews/train/pos/
['4715_9.txt', '12390_8.txt', '8329_7.txt', '9063_8.txt', '3092_10.txt', '9865_8.txt', '6639_10.txt', '10460_10.txt', '10331_10.txt', '11606_10.txt']
12500
data/imdb_reviews/train/neg/
['1821_4.txt', '10402_1.txt', '1062_4.txt', '9056_1.txt', '5392_3.txt', '2682_3.txt', '3351_4.txt', '399_2.txt', '10447_1.txt', '10096_1.txt']
12500
data/imdb_reviews/test/pos/
['4715_9.txt', '1930_9.txt', '3205_9.txt', '10186_10.txt', '147_10.txt', '7511_7.txt', '616_10.txt', '10460_10.txt', '3240_9.txt', '1975_9.txt']
12500
data/imdb_reviews/test/neg/
['1821_4.txt', '9487_1.txt', '4604_4.txt', '2828_2.txt', '10890_1.txt', '3351_4.txt', '8070_2.txt', '1027_4.txt', '8248_3.txt', '4290_4.txt']
12500


In [9]:
random_text = np.random.randint(1, high=len(train_pos)-1)
print(random_text)
train_pos[random_text]

12297


"Sudden Impact is the best of the five Dirty Harry movies. They don't come any leaner and meaner than this as Harry romps through a series of violent clashes, with the bad guys getting their just desserts. Which is just the way I like it. Great story too and ably directed by Clint himself. Excellent entertainment."

In [10]:
print("Train Positive examples = " + str(len(train_pos)))
print("Train Negative examples = " + str(len(train_neg)))
print("Test Positive examples = " + str(len(test_pos)))
print("Test Negative examples = " + str(len(test_neg)))

Train Positive examples = 12500
Train Negative examples = 12500
Test Positive examples = 12500
Test Negative examples = 12500


## Data Preparation

### Labeling the training dataset

In [11]:
train_pos_labels = np.ones((len(train_pos),), dtype=int)
train_pos_labels

array([1, 1, 1, ..., 1, 1, 1])

In [12]:
train_neg_labels = np.zeros((len(train_neg),), dtype=int)
train_neg_labels

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
train_data_labels = np.concatenate((train_pos_labels,train_neg_labels))
train_data_labels

array([1, 1, 1, ..., 0, 0, 0])

### Storing the labels of the test set for Test Error Measuring

In [14]:
test_pos_labels = np.ones((len(test_pos),), dtype=int)
test_neg_labels = np.zeros((len(test_neg),), dtype=int)
test_data_labels = np.concatenate((test_pos_labels,test_neg_labels))
print(len(test_data_labels))
test_data_labels

25000


array([1, 1, 1, ..., 0, 0, 0])

## Data Cleaning

### Removing HTML tags

In [15]:
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

In [16]:
train_pos[random_text]

"Sudden Impact is the best of the five Dirty Harry movies. They don't come any leaner and meaner than this as Harry romps through a series of violent clashes, with the bad guys getting their just desserts. Which is just the way I like it. Great story too and ably directed by Clint himself. Excellent entertainment."

In [17]:
train_pos_clean = [cleanhtml(x) for x in train_pos]
train_neg_clean = [cleanhtml(x) for x in train_neg]

test_pos_clean = [cleanhtml(x) for x in test_pos]
test_neg_clean = [cleanhtml(x) for x in test_neg]

In [18]:
train_pos_clean[random_text]

"Sudden Impact is the best of the five Dirty Harry movies. They don't come any leaner and meaner than this as Harry romps through a series of violent clashes, with the bad guys getting their just desserts. Which is just the way I like it. Great story too and ably directed by Clint himself. Excellent entertainment."

### Replacing dots & question marks & paranthesis with space

It seems that punctuations 

In [19]:
#"asdasdasds.asdasda".replace("."," ")

In [None]:
# def remove_dqmp(review):
    
#     review = review.replace("."," ")
#     review = review.replace("?"," ")
#     review = review.replace(")"," ")
#     review = review.replace("("," ")
    
#     return review

In [None]:
# remove_dqmp(train_pos_clean[random_text])

In [None]:
# train_pos_clean = [remove_dqmp(x) for x in train_pos_clean]
# train_neg_clean = [remove_dqmp(x) for x in train_neg_clean]

## Tokenization

In [35]:
import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# This is word tokenizer
# # lowercase and remove punctuation
# def tokenize(sent):
#     tokens = tokenizer(sent)
#     return [token.text.lower() for token in tokens if (token.text not in punctuations)]
#     #return [token.text.lower() for token in tokens]
    
# Modified for n-grams
def tokenize(sent, n_gram = 0):
    
    tokens = tokenizer(sent)
    
    if n_gram >= 1:
        unigrams = [token.text.lower() for token in tokens if (token.text not in punctuations)]
        output = []
        for k in range(n_gram):
            for i in range(len(unigrams)-(k+1)+1):
                output.append(unigrams[i:i+(k+1)])
        return output
    #return [token.text.lower() for token in tokens]

In [36]:
random_text = np.random.randint(1, high=len(train_pos)-1)
print(random_text)

11800


In [37]:
train_pos_clean[random_text]

"Partially from the perceived need, one feels, to include a conventional love story in the plot to make the film more marketable to a 1950's movie-going public. The film starts with some wickedly funny characterizations of the upper-class bureaucrats running the Foreign Office --- the British are pilloried in the way that only the British can pillory themselves. But after that, the film loses its way in a conventional farcical plot. Terry-Thomas watchable as always, but the great talent in the cast (Peter Sellers, et al) is largely wasted.A diverting, but not great film."

In [38]:
# Example
tokens = tokenize(train_pos_clean[random_text], n_gram = 3)
print(tokens)

[['partially'], ['from'], ['the'], ['perceived'], ['need'], ['one'], ['feels'], ['to'], ['include'], ['a'], ['conventional'], ['love'], ['story'], ['in'], ['the'], ['plot'], ['to'], ['make'], ['the'], ['film'], ['more'], ['marketable'], ['to'], ['a'], ['1950'], ["'s"], ['movie'], ['going'], ['public'], ['the'], ['film'], ['starts'], ['with'], ['some'], ['wickedly'], ['funny'], ['characterizations'], ['of'], ['the'], ['upper'], ['class'], ['bureaucrats'], ['running'], ['the'], ['foreign'], ['office'], ['---'], ['the'], ['british'], ['are'], ['pilloried'], ['in'], ['the'], ['way'], ['that'], ['only'], ['the'], ['british'], ['can'], ['pillory'], ['themselves'], ['but'], ['after'], ['that'], ['the'], ['film'], ['loses'], ['its'], ['way'], ['in'], ['a'], ['conventional'], ['farcical'], ['plot'], ['terry'], ['thomas'], ['watchable'], ['as'], ['always'], ['but'], ['the'], ['great'], ['talent'], ['in'], ['the'], ['cast'], ['peter'], ['sellers'], ['et'], ['al'], ['is'], ['largely'], ['wasted'],

### Merging neg and pos examples - Training

In [None]:
# to check the order of concatenation
train_data_labels

In [None]:
train_all_clean = train_pos_clean + train_neg_clean
len(train_all_clean)

### Merging neg and pos examples - Test

In [None]:
# to check the order of concatenation
test_data_labels

In [None]:
test_all_clean = test_pos_clean + test_neg_clean
len(test_all_clean)

### Training -> Training + Validation

In [None]:
# should be smaller than 25000
training_size = 20000

assert training_size < 25000

In [None]:
shuffled_index = np.random.permutation(len(train_all_clean))
print(len(shuffled_index))
print(shuffled_index)

In [None]:
shuffled_index[:training_size]

In [None]:
training_all_clean = [train_all_clean[i] for i in shuffled_index[:training_size]]
training_labels = [train_data_labels[i] for i in shuffled_index[:training_size]]
print(len(training_all_clean))
print(len(training_labels))

In [None]:
validation_all_clean = [train_all_clean[i] for i in shuffled_index[training_size:]]
validation_labels = [train_data_labels[i] for i in shuffled_index[training_size:]]
print(len(validation_all_clean))
print(len(validation_labels))

### Tokenizing the whole dataset

In [None]:
def lower_case_remove_punc(parsed):
    return [token.text.lower() for token in parsed if (token.text not in punctuations)]

def tokenize_dataset(dataset):
    token_dataset = []
    # we are keeping track of all tokens in dataset
    # in order to create vocabulary later
    all_tokens = []

    for sample in tqdm_notebook(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=1)):
        tokens = lower_case_remove_punc(sample)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

In [None]:
from tqdm import tqdm_notebook

In [None]:
import pickle as pkl

In [None]:
# train set tokens
print ("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(training_all_clean)
pkl.dump(train_data_tokens, open("train_data_tokens.p", "wb"))
pkl.dump(all_train_tokens, open("all_train_tokens.p", "wb"))

In [None]:
# val set tokens
print ("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(validation_all_clean)
pkl.dump(val_data_tokens, open("val_data_tokens.p", "wb"))

In [None]:
# test set tokens
print ("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_all_clean)
pkl.dump(test_data_tokens, open("test_data_tokens.p", "wb"))

In [None]:
print(train_data_tokens[:2])

In [None]:
print(all_train_tokens[0:5])

### Remove blank space tokens

In the above tokenization, some blankspace strings were observed, thus this section adresses that by deleting them from the token list.

In [None]:
# blankspaces = [" ","  ","   "]

In [None]:
# def remove_blankspaces(review):
    
#     review = [x for x in review if x not in blankspaces] 
    
#     return review

In [None]:
# print(remove_blankspaces(tokens))

In [None]:
# train_data_tokens_clean = [remove_blankspaces(token) for token in train_data_tokens]
# len(train_data_tokens_clean)

In [None]:
# all_train_tokens_clean = remove_blankspaces(all_train_tokens)

## Building Vocabulary

In [None]:
len(all_train_tokens)

In [None]:
len(list(set(all_train_tokens)))

we are going to create the vocabulary of most common 10,000 tokens in the training set.

In [None]:
import random

In [None]:
from collections import Counter

max_vocab_size = 10000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens)

In [None]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

In [None]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

## Dataset

In [None]:
MAX_SENTENCE_LENGTH = 200

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset

In [None]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's 
    readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

In [None]:
def imdb_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), 
            torch.LongTensor(length_list), 
            torch.LongTensor(label_list)]


In [None]:
BATCH_SIZE = 32
train_dataset = IMDBDataset(train_data_indices, training_labels)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

val_dataset = IMDBDataset(val_data_indices, validation_labels)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

test_dataset = IMDBDataset(test_data_indices, test_data_labels)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=False)

## Bag of N-grams

### Training

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class BagOfWords(nn.Module):
    """
    BagOfNgrams classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfNgrams, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,20)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out

In [None]:
emb_dim = 100
model = BagOfNgrams(len(id2token), emb_dim)

In [None]:
learning_rate = 0.01
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
## try both sgd and adam
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [None]:
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # check training score every 100 iterations
        ## validate every 100 iterations
        if i > 0 and i % 50 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            train_acc = test_model(train_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Training Acc: {},Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, 
                len(train_loader), train_acc, val_acc))