<h3 id="tocheading">Table of Contents</h3>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import pickle
import random
import random
import spacy
import string
import os
import torch
import numpy as np
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

### Preprocessing

#### Load Data

In [3]:
def read_review(path):
    """Read every review file in a directory and return the raw texts.

    Each file found in ``path`` is read in full and appended to the result
    as plain text (not tokens, yet).

    arg: path -- directory where the examples of one category are kept,
    for example ``data/aclImdb/train/neg/`` (the trailing slash is optional).

    returns: list of review strings, in ``os.listdir`` order.
    """
    all_reviews = []
    for file_name in os.listdir(path):
        # os.path.join copes with a missing trailing separator on `path`;
        # the context manager closes each handle instead of leaking it
        with open(os.path.join(path, file_name)) as f:
            all_reviews.append(f.read())
    return all_reviews

In [4]:
## training split
trn_pos = read_review("data/aclImdb/train/pos/")
trn_neg = read_review("data/aclImdb/train/neg/")

## test split -- negatives have to come from test/neg; reading test/pos
## twice would label positive reviews as negative further down
test_pos = read_review("data/aclImdb/test/pos/")
test_neg = read_review("data/aclImdb/test/neg/")

#### Label & Shuffle

In [5]:
## label convention: positive -> 1, negative -> 0
## the label travels with the text as a trailing " <digit>" suffix, so the
## pairs stay together while shuffling; note that re-running this cell
## appends the suffix again
trn_pos = [review + " 1" for review in trn_pos]
trn_neg = [review + " 0" for review in trn_neg]

test_pos = [review + " 1" for review in test_pos]
test_neg = [review + " 0" for review in test_neg]

In [6]:
data_train = trn_pos + trn_neg
test_data = test_pos + test_neg

## shuffle train data to make sure both pos and neg
## examples are represented in val data.
## Permute the indices rather than the strings themselves: handing a list of
## str to np.random.permutation first converts it to a fixed-width unicode
## array (every slot as wide as the longest review) and returns np.str_
## items instead of plain str.
data_train = [data_train[idx] for idx in np.random.permutation(len(data_train))]

#### Train - Val Split

In [7]:
train_split = 20000

## the first `train_split` shuffled examples train the model,
## everything after them is held out for validation
train_data, val_data = data_train[:train_split], data_train[train_split:]

print ("Train dataset size is {}".format(len(train_data)))
print ("Val dataset size is {}".format(len(val_data)))
print ("Test dataset size is {}".format(len(test_data)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [8]:
## the label is the last character of every labelled review: save it ...
train_data_labels = [int(labelled[-1]) for labelled in train_data]
val_data_labels = [int(labelled[-1]) for labelled in val_data]
test_data_labels = [int(labelled[-1]) for labelled in test_data]

## ... then strip the two-character " <label>" suffix (space + digit)
train_data = [labelled[:-2] for labelled in train_data]
val_data = [labelled[:-2] for labelled in val_data]
test_data = [labelled[:-2] for labelled in test_data]

In [9]:
## sanity check: every label list should be as long as its data split
print ("Train labels size is {}".format(len(train_data_labels)))
print ("Val labels size is {}".format(len(val_data_labels)))
print ("Test labels size is {}".format(len(test_data_labels)))

Train labels size is 20000
Val labels size is 5000
Test labels size is 25000


#### Tokenize & Lowercase

__Note__: Tokenizer modified to return ngrams

In [12]:
# Load spaCy's small English pipeline; below it is only called on raw text
# and iterated to obtain the tokens
tokenizer = spacy.load('en_core_web_sm')
# single string holding the ASCII punctuation characters; used by tokenize()
# in a substring test to filter tokens
punctuations = string.punctuation

## tokenizer modified for ngrams

# lowercase and remove punctuation
def tokenize(sent,n=None):
    """Lowercase ``sent``, drop punctuation tokens and return its 1..n-grams.

    sent: raw text of one example.
    n: largest n-gram order to emit (n = n-gram's n). ``None``, the declared
       default, is treated as 1.

    Returns one flat list: all unigrams in sentence order, followed by all
    bigrams, then trigrams, ... up to n-grams. The words of an n-gram are
    joined with a single space. Orders longer than the sentence add nothing.

    Raises AssertionError if n < 1.
    """
    if n is None:
        # the default used to fail with a TypeError in the comparison below
        n = 1
    # validate before paying for the spaCy pass
    assert n >= 1, "n should be greater than or equal to 1"

    # `not in punctuations` is a substring test, so only tokens that occur
    # verbatim inside string.punctuation are dropped
    unigrams = [token.text.lower()
                for token in tokenizer(sent) if (token.text not in punctuations)]

    # start from the unigrams and append each higher order in turn
    ngrams = list(unigrams)
    for order in range(2, n + 1):
        ngrams.extend(" ".join(unigrams[start:start + order])
                      for start in range(len(unigrams) - order + 1))
    return ngrams

def tokenize_dataset(dataset,n=None):
    """Tokenize every sample of ``dataset`` into n-grams via tokenize().

    Returns (token_dataset, all_tokens): one token list per sample, plus the
    flat concatenation of all those lists, which is what the vocabulary is
    built from later.
    """
    token_dataset = [tokenize(sample, n) for sample in dataset]

    # flatten, keeping sample order
    all_tokens = []
    for sample_tokens in token_dataset:
        all_tokens.extend(sample_tokens)

    return token_dataset, all_tokens

In [13]:
# unigram demo: the comma is dropped and the text is lowercased
tokenize("hi, my name is Asena",n=1)

['hi', 'my', 'name', 'is', 'asena']

In [14]:
# n=4 demo: unigrams first, then bigrams, trigrams and 4-grams
tokenize("hi, my name is Asena",n=4)

['hi',
 'my',
 'name',
 'is',
 'asena',
 'hi my',
 'my name',
 'name is',
 'is asena',
 'hi my name',
 'my name is',
 'name is asena',
 'hi my name is',
 'my name is asena']

In [15]:
## Tokenize all three splits so that every review holds its 1..grams n-grams
grams = 2
## only the training split's flat token list is kept (it feeds the
## vocabulary); the flat lists of the other two splits are discarded
train_data_tokens, all_train_tokens = tokenize_dataset(train_data,n=grams)
val_data_tokens, _ = tokenize_dataset(val_data,n=grams)
test_data_tokens, _ = tokenize_dataset(test_data,n=grams)

#### Build Vocab

In [19]:
from collections import Counter

max_vocab_size = 10000
# index 0 is reserved for padding and index 1 for unknown tokens
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens,size=max_vocab_size):
    """Build the token <-> id lookup tables from a flat token list.

    all_tokens: iterable of tokens (with repetitions) to count.
    size: keep at most this many of the most frequent tokens.

    Returns (token2id, id2token):
      token2id: dictionary where keys are tokens and values their indices
      id2token: list where id2token[i] is the token with index i
    Indices 0 and 1 belong to '<pad>' and '<unk>'; corpus tokens follow,
    most frequent first. An empty ``all_tokens`` yields just the two
    reserved entries.
    """
    token_counter = Counter(all_tokens)
    # collect tokens directly: unpacking zip(*most_common) into two names
    # raises ValueError when there is nothing to count
    vocab = [token for token, _ in token_counter.most_common(size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # corpus tokens start right after the two reserved indices
    token2id = dict(zip(vocab, range(2, 2 + len(vocab))))
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

# the vocabulary is built from the training tokens only
token2id, id2token = build_vocab(all_train_tokens,size=max_vocab_size)

In [20]:
# Spot-check on one random entry that the two lookup tables agree:
# id -> token through id2token, then token -> id through token2id

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, random_token))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 5270 ; token as some
Token as some; token id 5270


In [21]:
# token2id

In [22]:
## convert token to id in the dataset
## copied from lab3 notebook
def token2index_dataset(tokens_data, vocab=None, unk_idx=None):
    """Convert token lists to index lists.

    tokens_data: list of token lists, one per example.
    vocab: token -> index dict; when None the module-level ``token2id`` is
        used (the original behaviour).
    unk_idx: index for tokens missing from ``vocab``; when None the
        module-level ``UNK_IDX`` is used.

    Returns a list of index lists with the same shape as ``tokens_data``.
    """
    # explicit arguments let callers index with a freshly built vocabulary
    # instead of silently depending on whichever global one is current
    if vocab is None:
        vocab = token2id
    if unk_idx is None:
        unk_idx = UNK_IDX
    return [[vocab.get(token, unk_idx) for token in tokens]
            for tokens in tokens_data]

# map every split to index lists using the vocabulary built above
train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

# double checking: sizes should match the splits printed earlier
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


#### Pytorch Data Loader

In [23]:
# number of token ids kept per example by the dataset below, and the fixed
# width every example is padded to by the collate function
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset

    Despite its name, in this notebook it is instantiated with the review
    index lists and their 0/1 labels.
    """
    
    def __init__(self, data_list, target_list, max_len=None):
        """
        @param data_list: list of token-index lists, one per example
        @param target_list: list of targets, aligned with data_list
        @param max_len: optional truncation length for this dataset; when
            None the module-level MAX_SENTENCE_LENGTH is looked up on every
            access (the original behaviour)

        """
        self.data_list = data_list
        self.target_list = target_list
        self.max_len = max_len
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples."""
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return [token_idx, length, label]: the (truncated) index list, its
            length after truncation, and the target
        """
        limit = MAX_SENTENCE_LENGTH if self.max_len is None else self.max_len
        token_idx = self.data_list[key][:limit]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def imdb_func(batch, max_len=None):
    """
    Customized collate function for DataLoader that pads every example of the
    batch with zeros to one fixed width so that all data have the same length.

    @param batch: list of [token_idx, length, label] items as returned by the
        dataset's __getitem__
    @param max_len: width to pad to; when None the module-level
        MAX_SENTENCE_LENGTH is used (the original behaviour)
    @return [data, lengths, labels]: int64 tensor of size (batch, max_len),
        LongTensor of lengths, LongTensor of labels
    """
    if max_len is None:
        max_len = MAX_SENTENCE_LENGTH

    data_list = []
    label_list = []
    length_list = []
    for token_idx, length, label in batch:
        # truncate defensively: a negative pad width makes np.pad raise
        # explicit int64: an empty example would otherwise become a float64
        # array and promote the whole stacked batch to floats
        ids = np.asarray(token_idx[:max_len], dtype=np.int64)
        # padding
        padded_vec = np.pad(ids,
                            pad_width=(0, max_len - len(ids)),
                            mode="constant", constant_values=0)
        data_list.append(padded_vec)
        length_list.append(min(length, max_len))
        label_list.append(label)
    return [torch.from_numpy(np.array(data_list, dtype=np.int64)),
            torch.LongTensor(length_list),
            torch.LongTensor(label_list)]

# create pytorch dataloader
#train_loader = NewsGroupDataset(train_data_indices, train_targets)
#val_loader = NewsGroupDataset(val_data_indices, val_targets)
#test_loader = NewsGroupDataset(test_data_indices, test_targets)

# number of examples per batch for all three loaders
BATCH_SIZE = 32
# each split: wrap indices + labels in a dataset, then batch it with the
# padding collate function
train_dataset = NewsGroupDataset(train_data_indices, train_data_labels)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

# NOTE(review): the validation loader is shuffled as well; only the test
# loader keeps the original order
val_dataset = NewsGroupDataset(val_data_indices, val_data_labels)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, test_data_labels)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_func,
                                           shuffle=False)

#for i, (data, lengths, labels) in enumerate(train_loader):
#    print (data)
#    print (labels)
#    break

### Model

#### Bag of N-grams

In [24]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfNgrams(nn.Module):
    """
    BagOfNgrams classification model: averages the embeddings of an example's
    n-gram indices and feeds that mean through a single linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        @param num_classes: number of output logits. The default of 20 keeps
            the original layer width. NOTE(review): the labels built in this
            notebook are only 0 and 1, so 2 would be enough.
        """
        super(BagOfNgrams, self).__init__()
        # pay attention to padding_idx: index 0 is the padding entry
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # clamp to 1 so an example with length 0 divides by 1 instead of
        # producing NaN (0 / 0)
        denom = length.clamp(min=1).view(-1, 1).float()
        out = out / denom
     
        # return logits
        out = self.linear(out.float())
        return out

# embedding width of the baseline model
emb_dim = 100
# vocabulary size is taken from id2token, which already contains the
# '<pad>' and '<unk>' entries
model = BagOfNgrams(len(id2token), emb_dim)

In [25]:
# optimisation settings for the baseline run below
learning_rate = 0.01
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
# cross entropy is applied to the raw logits returned by the model
criterion = torch.nn.CrossEntropyLoss()  
## try both sgd and adam
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

# baseline training run: one optimiser step per batch, with a full pass over
# the validation loader every 100 batches
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model() leaves the model in eval mode, so train mode is
        # re-enabled on every iteration
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

Epoch: [1/10], Step: [101/625], Validation Acc: 75.7
Epoch: [1/10], Step: [201/625], Validation Acc: 81.14
Epoch: [1/10], Step: [301/625], Validation Acc: 83.9
Epoch: [1/10], Step: [401/625], Validation Acc: 84.8
Epoch: [1/10], Step: [501/625], Validation Acc: 85.62
Epoch: [1/10], Step: [601/625], Validation Acc: 86.6
Epoch: [2/10], Step: [101/625], Validation Acc: 86.24
Epoch: [2/10], Step: [201/625], Validation Acc: 86.12
Epoch: [2/10], Step: [301/625], Validation Acc: 85.72
Epoch: [2/10], Step: [401/625], Validation Acc: 85.38
Epoch: [2/10], Step: [501/625], Validation Acc: 85.74
Epoch: [2/10], Step: [601/625], Validation Acc: 85.34
Epoch: [3/10], Step: [101/625], Validation Acc: 85.74
Epoch: [3/10], Step: [201/625], Validation Acc: 85.56
Epoch: [3/10], Step: [301/625], Validation Acc: 85.28
Epoch: [3/10], Step: [401/625], Validation Acc: 85.32
Epoch: [3/10], Step: [501/625], Validation Acc: 85.14
Epoch: [3/10], Step: [601/625], Validation Acc: 85.26
Epoch: [4/10], Step: [101/625], 

### Hyperparameter Search

We recommend you try different preprocessing and hyperparameters for the
model, including but not limited to:

- Tokenization schemes of the dataset.
- Model hyperparameters: Vary n for n-gram (n=1; 2; 3; 4), vocabulary size
and embedding size.
- Optimization hyperparameters: Optimizer itself (SGD vs Adam), learning
rate and whether or not you use linear annealing of learning rate (learning
rate is reduced linearly over the course of training).

In [26]:
## ngrams
## learning rate
## vocab size
## embedding size
## optimizer sgd vs adam

In [27]:
import itertools

# search space: one list per hyperparameter; the position in `params` is the
# position of the value in every tuple produced by itertools.product
params = [[1e-3,1e-2,1e-1,5e-1,1,10,1e2], ## learning rates
          [*range(1,5)], ## ngrams
          [10000,25000,50000,75000], ## vocab size
          [50,100,150], ## embedding size
          [100,200,300], ## max sentence length
          [16,32,64,128] ## batch size
         ]

### ADD OPTIMIZERS AT THE END
### -- THIS MEANS, WE'LL DO THE SEARCH TWICE 
### -- FOR ONCE FOR ADAM AND ONCE FOR SGD

# NOTE(review): both optimisers here are bound to the baseline `model` and
# its learning rate; the search function shown further down builds its own
# optimiser list, so this one looks unused -- confirm
optimizers = [torch.optim.Adam(model.parameters(), lr=learning_rate),\
             torch.optim.SGD(model.parameters(), lr=learning_rate)]

# preview the first five combinations of the grid
[*itertools.product(*params)][:5]

[(0.001, 1, 10000, 50, 100, 16),
 (0.001, 1, 10000, 50, 100, 32),
 (0.001, 1, 10000, 50, 100, 64),
 (0.001, 1, 10000, 50, 100, 128),
 (0.001, 1, 10000, 50, 200, 16)]

In [28]:
## CONSTRUCT THE NGRAM DATASETS

# n -> [train tokens, flat train tokens, val tokens, test tokens]
NGRAM_DATASETS = {}

# tokenize every split once per n-gram order so the search can reuse them;
# note that this rebinds `grams` and the *_tokens globals, which end up
# holding the n=4 versions after the loop
for n in [*range(1,5)]:
    print (n)
    grams = n
    train_data_tokens, all_train_tokens = tokenize_dataset(train_data,n=grams)
    val_data_tokens, _ = tokenize_dataset(val_data,n=grams)
    test_data_tokens, _ = tokenize_dataset(test_data,n=grams)
    
    NGRAM_DATASETS[n] = [train_data_tokens,
                        all_train_tokens,
                        val_data_tokens,
                        test_data_tokens]

1
2
3
4


In [42]:
import pandas as pd
# wrap the dict in a one-element list so the DataFrame has a single row,
# with one column per n and the whole dataset list for that n in the cell
TRICK_TO_READ_DICT = []
TRICK_TO_READ_DICT.append(NGRAM_DATASETS)
ngram_df = pd.DataFrame(TRICK_TO_READ_DICT)
ngram_df.head(1)

Unnamed: 0,1,2,3,4
0,"[[[baseketball, is, indeed, a, really, funny, ...","[[[baseketball, is, indeed, a, really, funny, ...","[[[baseketball, is, indeed, a, really, funny, ...","[[[baseketball, is, indeed, a, really, funny, ..."


In [76]:
# pd.DataFrame(NGRAM_DATASETS)[1][3]
# persist the tokenized datasets to CSV.
# NOTE(review): the cells hold nested Python lists, so they presumably end
# up in the file as their string representation -- a later cell parses them
# back with ast.literal_eval
pd.DataFrame(NGRAM_DATASETS).to_csv("NGRAM_DICT_DATA.csv")

In [77]:
# read the saved datasets back; header=None means the file's first line is
# treated as data, so columns are addressed by integer position
ngram_df_read = pd.DataFrame(pd.read_csv("NGRAM_DICT_DATA.csv",header=None))

In [None]:
import ast
# parse one stored cell back from text into Python lists and count its
# entries; the saved output below shows this run was interrupted
# (KeyboardInterrupt)
len(ast.literal_eval(ngram_df_read[4][1]))

KeyboardInterrupt: 

In [None]:
## HYPERPARAMETER SEARCH ON VALIDATION SET

def _index_with_vocab(tokens_data, vocab):
    """Map every token to its id in ``vocab``, falling back to UNK_IDX."""
    return [[vocab.get(token, UNK_IDX) for token in tokens]
            for tokens in tokens_data]


def _build_loader(indices, labels, max_len, batch_size):
    """Return a shuffled DataLoader whose batches are truncated/padded to ``max_len``."""
    # a plain list of (token_idx, length, label) works as a map-style dataset
    examples = []
    for token_idx, label in zip(indices, labels):
        token_idx = token_idx[:max_len]
        examples.append((token_idx, len(token_idx), label))

    def collate(batch):
        # start from an all-padding matrix and copy each example into its row
        data = np.full((len(batch), max_len), PAD_IDX, dtype=np.int64)
        for row, (token_idx, length, _) in enumerate(batch):
            data[row, :length] = token_idx
        lengths = torch.LongTensor([length for _, length, _ in batch])
        targets = torch.LongTensor([label for _, _, label in batch])
        return [torch.from_numpy(data), lengths, targets]

    return torch.utils.data.DataLoader(dataset=examples,
                                       batch_size=batch_size,
                                       collate_fn=collate,
                                       shuffle=True)


def hyperparameter_search(hyperparameter_space=params):
    """Grid-search the validation accuracy over a hyperparameter space.

    hyperparameter_space: list of six lists, in this order: learning rates,
        n-gram orders, vocab sizes, embedding sizes, max sentence lengths,
        batch sizes. Every combination is trained for 10 epochs with Adam
        and cross-entropy loss.

    Returns a dict mapping each parameter tuple to the list of validation
    accuracies recorded every 100 training iterations.

    The tokenized data is taken from NGRAM_DATASETS; vocabulary, index
    datasets, loaders and model are rebuilt from each combination's own
    values so that every searched parameter really takes effect.
    """
    ## CRITERION: ONLY CROSS ENTROPY LOSS FOR NOW
    criterion = torch.nn.CrossEntropyLoss()
    NUM_EPOCHS = 10

    ## val accuracy dict
    param_losses = {}

    # iterate over the argument, not the global `params`
    for config in itertools.product(*hyperparameter_space):
        print (config)

        ## will append validation accuracies here
        param_losses[config] = []

        (step,                 ## learning rate
         grams,                ## n-grams
         vocab_size,           ## vocab size
         embed_dimension,      ## embedding size
         max_sentence_length,  ## max sentence length of data loader
         batch_size) = config

        ## pre-tokenized training and validation data for this n
        train_data_tokens, all_train_tokens, val_data_tokens = NGRAM_DATASETS[grams][:3]

        ## build vocab for the specified vocab size and index with *that*
        ## vocab (not with the notebook-level token2id)
        token2id, id2token = build_vocab(all_train_tokens, size=vocab_size)
        train_data_indices = _index_with_vocab(train_data_tokens, token2id)
        val_data_indices = _index_with_vocab(val_data_tokens, token2id)

        ## loaders honour this combination's sentence length and batch size
        train_loader = _build_loader(train_data_indices, train_data_labels,
                                     max_sentence_length, batch_size)
        val_loader = _build_loader(val_data_indices, val_data_labels,
                                   max_sentence_length, batch_size)

        ## fresh model with the searched embedding size
        model = BagOfNgrams(len(id2token), embed_dimension)
        optimizer = torch.optim.Adam(model.parameters(), lr=step)

        for epoch in range(NUM_EPOCHS):
            for x, (data_batch, length_batch, label_batch) in enumerate(train_loader):
                model.train()
                optimizer.zero_grad()
                outputs = model(data_batch, length_batch)
                loss = criterion(outputs, label_batch)
                loss.backward()
                optimizer.step()
                # validate every 100 iterations
                if x > 0 and x % 100 == 0:
                    val_acc = test_model(val_loader, model)
                    param_losses[config].append(val_acc)
                    # report the batch counter `x`; the parameter tuple
                    # cannot be incremented
                    print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                        epoch+1, NUM_EPOCHS, x+1, len(train_loader), val_acc))

        print (param_losses[config])
    print ("Hyperparameter search done!")
    return param_losses

In [None]:
# reduced search space (a single combination); this rebinds the
# notebook-level `params`
params = [[1e-3], ## learning rates
          [*range(1,2)], ## ngrams
          [25000], ## vocab size
          [100], ## embedding size
          [200], ## max sentence length
          [32] ## batch size
         ]

# maps each parameter tuple to its recorded validation accuracies
param_val_losses = hyperparameter_search(params)

### Ablation Study 

### Correct and Incorrect Prediction Examples

### Test Performance

### Bonus: Rating Between 1-10

### Bonus: Other Hyperparameters