In [1]:
import pandas as pd
import numpy as np
import os
import spacy
import string
import pickle as pkl
from nltk import ngrams
from collections import Counter
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
%matplotlib inline

## Data Loading

In [2]:
def load_data(folder_path, label):
    """
    Read every ``.txt`` file in a folder and return its text, label and score.

    The score is parsed from the file name: it is the integer between the
    first "_" and the first "." (e.g. ``123_7.txt`` -> 7).

    @param folder_path: directory containing the text files; a trailing
        path separator is optional.
    @param label: numeric label assigned to every file in this folder.
    @return: tuple ``(data_list, labels, scores)`` where ``data_list`` is a
        list of file contents, ``labels`` is a float numpy array filled with
        ``label`` and ``scores`` is a list of ints. All three follow the
        order returned by ``os.listdir``, which is not guaranteed to be stable
        across machines.
    """
    scores = []
    data_list = []
    for file in os.listdir(folder_path):
        if file.endswith(".txt"):
            scores.append(int(file[file.find("_")+1:file.find(".")]))
            # os.path.join works with or without a trailing separator on
            # folder_path; the explicit encoding avoids depending on the
            # platform's default locale.
            with open(os.path.join(folder_path, file), encoding="utf-8") as f:
                data_list.append(f.read())

    labels = label*np.ones(len(scores))
    return data_list, labels, scores

def merge_data(data1, data2, label1, label2, score1, score2, split, shuffle, train_size=20000):
    """
    Concatenate two (data, label, score) collections and optionally shuffle/split them.

    @param data1, data2: lists of samples; concatenated in this order.
    @param label1, label2: array-likes of labels matching data1 / data2.
    @param score1, score2: lists of scores matching data1 / data2.
    @param split: if truthy, return a train/validation split at ``train_size``.
    @param shuffle: if truthy, apply one fixed (seed 0) permutation to data,
        labels and scores before splitting.
    @param train_size: number of leading samples that form the training part.
    @return: ``(train_data, train_labels, train_scores, val_data, val_labels,
        val_scores)`` when ``split`` is truthy, otherwise
        ``(data, labels, scores)``. All returned values are Python lists.
    """
    data = data1+data2
    labels = np.concatenate([label1, label2]).tolist()
    scores = score1+score2

    if shuffle:
        # A private generator seeded with 0 yields the same permutation as
        # np.random.seed(0) + np.random.permutation, but leaves the global
        # NumPy random state untouched.
        index = np.random.RandomState(0).permutation(len(data))
        # Index the Python lists directly: converting a list of long strings
        # to a NumPy array allocates a fixed-width buffer sized by the
        # longest string for every element.
        data = [data[i] for i in index]
        labels = [labels[i] for i in index]
        scores = [scores[i] for i in index]

    if split:
        train_data = data[:train_size]
        val_data = data[train_size:]
        train_labels = labels[:train_size]
        val_labels = labels[train_size:]
        train_scores = scores[:train_size]
        val_scores = scores[train_size:]
        return train_data, train_labels, train_scores, val_data, val_labels, val_scores

    return data, labels, scores

In [3]:
# The aclImdb folders are resolved against the directory the notebook runs in.
_cwd = os.getcwd()
train_pos_path = _cwd + '/aclImdb/train/pos/'
train_neg_path = _cwd + '/aclImdb/train/neg/'
test_pos_path = _cwd + '/aclImdb/test/pos/'
test_neg_path = _cwd + '/aclImdb/test/neg/'

In [4]:
# Read the four folders; files under pos/ get label 1 and files under neg/ get label 0.
# Each call returns (texts, labels array, scores parsed from the file names).
train_pos_data, train_pos_label, train_pos_scores = load_data(train_pos_path, 1)
train_neg_data, train_neg_label, train_neg_scores = load_data(train_neg_path, 0)
test_pos_data, test_pos_label, test_pos_scores = load_data(test_pos_path, 1)
test_neg_data, test_neg_label, test_neg_scores = load_data(test_neg_path, 0)

In [5]:
# Training folders: shuffle with merge_data's fixed seed, then keep the first
# `train_size` samples (merge_data's default) for training and the rest for validation.
(train_data, train_labels, train_scores,
 val_data, val_labels, val_scores) = merge_data(
    train_pos_data, train_neg_data,
    train_pos_label, train_neg_label,
    train_pos_scores, train_neg_scores,
    split=True, shuffle=True)

# Test folders: plain concatenation (positives first), no shuffle and no split.
test_data, test_labels, test_scores = merge_data(
    test_pos_data, test_neg_data,
    test_pos_label, test_neg_label,
    test_pos_scores, test_neg_scores,
    split=False, shuffle=False)

## Data preprocessing (Tokenization)

In [6]:
# spaCy pipeline used as the tokenizer; requires the 'en_core_web_sm' model to be installed.
tokenizer = spacy.load('en_core_web_sm')
# A single string of ASCII punctuation characters; tokenize() tests tokens against it with `in`.
punctuations = string.punctuation

def tokenize(sent, tokenization):
    """
    Split a string into token texts with the module-level spaCy `tokenizer`.

    @param sent: the text to tokenize.
    @param tokenization: if truthy, lowercase the tokens and drop those found
        in `punctuations`; otherwise return the raw token texts.
    @return: list of token strings.
    """
    doc = tokenizer(sent)
    if not tokenization:
        return [tok.text for tok in doc]
    # `punctuations` is a str, so `not in` is a substring test rather than a
    # per-character set lookup.
    return [tok.text.lower() for tok in doc if tok.text not in punctuations]

In [7]:
def tokenize_dataset(dataset, tokenization):
    """
    Tokenize every sample of a dataset.

    @param dataset: iterable of text samples.
    @param tokenization: flag forwarded unchanged to tokenize().
    @return: tuple ``(token_dataset, all_tokens)``: one token list per sample,
        and a single flat list of all tokens in sample order.
    """
    token_dataset = [tokenize(sample, tokenization) for sample in dataset]
    all_tokens = [token for sample_tokens in token_dataset for token in sample_tokens]
    return token_dataset, all_tokens

In [8]:
def _load_pickle(path):
    """Unpickle one cached object and close the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by a trusted run of this notebook.
    with open(path, "rb") as f:
        return pkl.load(f)

# Load the cached tokenization results instead of re-running the tokenizer.
train_data_tokens = _load_pickle("train_data_tokens.p")
all_train_tokens = _load_pickle("all_train_tokens.p")
val_data_tokens = _load_pickle("val_data_tokens.p")
test_data_tokens = _load_pickle("test_data_tokens.p")

print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4809135


## Build vocab lists and transform data into indices lists

In [9]:
# Reserved vocabulary ids: padding and out-of-vocabulary tokens.
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size):
    """
    Build token<->id lookups from the most frequent tokens.

    @param all_tokens: iterable of tokens (may be empty).
    @param max_vocab_size: maximum number of distinct tokens to keep, not
        counting the two reserved entries.
    @return: tuple ``(token2id, id2token)``. ``id2token`` is a list starting
        with '<pad>' and '<unk>' followed by the kept tokens in descending
        frequency; ``token2id`` maps each kept token to its position in that
        list, and maps '<pad>' / '<unk>' to PAD_IDX / UNK_IDX.
    """
    # Iterating over the pairs (instead of unpacking zip(*pairs)) also works
    # when there are no tokens at all.
    vocab = [token for token, _ in Counter(all_tokens).most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # Real tokens start right after the two reserved ids.
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    # Assigned last so the reserved strings always map to the reserved ids.
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

def token2index_dataset(tokens_data, token2id):
    """
    Convert tokenized samples into lists of vocabulary ids.

    @param tokens_data: iterable of token lists, one per sample.
    @param token2id: mapping from token to id.
    @return: list of id lists; tokens missing from `token2id` become UNK_IDX.
    """
    def _lookup(token):
        if token in token2id:
            return token2id[token]
        return UNK_IDX

    return [[_lookup(token) for token in sample_tokens] for sample_tokens in tokens_data]

### 1-gram (word)

In [10]:
# Keep at most this many distinct tokens (build_vocab adds '<pad>' and '<unk>' on top).
max_vocab_size = 25000
# The vocabulary is built from the training tokens only.
token2id_n1, id2token_n1 = build_vocab(all_train_tokens, max_vocab_size)

In [11]:
# Map every split through the training vocabulary; unknown tokens become UNK_IDX.
train_data_indices_n1 = token2index_dataset(train_data_tokens, token2id_n1)
val_data_indices_n1 = token2index_dataset(val_data_tokens, token2id_n1)
test_data_indices_n1 = token2index_dataset(test_data_tokens, token2id_n1)

# double checking: the number of samples per split must be unchanged by the conversion
print ("Train dataset size is {}".format(len(train_data_indices_n1)))
print ("Val dataset size is {}".format(len(val_data_indices_n1)))
print ("Test dataset size is {}".format(len(test_data_indices_n1)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


## DataLoader

In [12]:
# Samples are truncated to this many tokens by the dataset class and padded up to it by the collate function.
MAX_SENTENCE_LENGTH = 200

In [13]:
class NewsGroupDataset(Dataset):
    """
    Map-style PyTorch dataset (subclass of torch.utils.data.Dataset) that
    pairs each sample's list of token indices with its target value.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of samples, each a list of token indices
        @param target_list: list of targets, one per sample

        Both lists must have the same length.
        """
        assert (len(data_list) == len(target_list))
        self.data_list = data_list
        self.target_list = target_list

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: ``[token_idx, length, label]`` where ``token_idx`` is the
            sample cut to at most MAX_SENTENCE_LENGTH entries and ``length``
            is the size of that truncated list.
        """
        truncated = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [truncated, len(truncated), self.target_list[key]]

def newsgroup_collate_func(batch):
    """
    Customized collate function for DataLoader that pads every sample in the
    batch to MAX_SENTENCE_LENGTH so they can be stacked into one tensor.

    @param batch: list of ``[token_idx, length, label]`` items as produced by
        NewsGroupDataset.__getitem__ (so ``length <= MAX_SENTENCE_LENGTH``).
    @return: ``[data, lengths, labels]`` where ``data`` is an int64 tensor of
        shape (len(batch), MAX_SENTENCE_LENGTH) padded with PAD_IDX, and
        ``lengths`` / ``labels`` are LongTensors with one entry per sample.
    """
    data_list = []
    length_list = []
    label_list = []
    for token_idx, length, label in batch:
        length_list.append(length)
        label_list.append(label)
        # Force int64: an empty sample would otherwise become a float64 array
        # and turn the whole batch into a float tensor, which nn.Embedding
        # cannot index with.
        padded_vec = np.pad(np.asarray(token_idx, dtype=np.int64),
                            pad_width=(0, MAX_SENTENCE_LENGTH - length),
                            mode="constant", constant_values=PAD_IDX)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [15]:
# Shift the scores parsed from the file names down by one so they can be used
# as 0-based class indices for the 10-output model trained with CrossEntropyLoss.
# NOTE: this cell rebinds its own inputs, so running it more than once
# subtracts 1 again each time; re-run the data loading cells before re-running it.
train_scores = [i-1 for i in train_scores]
val_scores = [i-1 for i in val_scores]
test_scores = [i-1 for i in test_scores]

In [16]:
# Number of samples per mini-batch for all three loaders.
BATCH_SIZE = 32
# The targets are the shifted scores (not the binary pos/neg labels).
train_dataset_n1 = NewsGroupDataset(train_data_indices_n1, train_scores)
train_loader_n1 = torch.utils.data.DataLoader(dataset=train_dataset_n1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# NOTE(review): shuffle=True is not needed to compute validation accuracy; it only changes batch order.
val_dataset_n1 = NewsGroupDataset(val_data_indices_n1, val_scores)
val_loader_n1 = torch.utils.data.DataLoader(dataset=val_dataset_n1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# The test loader keeps the dataset order (shuffle=False).
test_dataset_n1 = NewsGroupDataset(test_data_indices_n1, test_scores)
test_loader_n1 = torch.utils.data.DataLoader(dataset=test_dataset_n1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

## Bag of N-gram score classification model

In [25]:
class BagOfNgram(nn.Module):
    """
    BagOfNgram classification model: averages the embeddings of a sample's
    tokens and maps the average to class logits with one linear layer.
    """
    def __init__(self, vocab_size, emb_dim, train_scores=None, num_classes=10):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param train_scores: unused; accepted only so existing three-argument
            calls keep working.
        @param num_classes: number of output logits (defaults to 10).
        """
        super(BagOfNgram, self).__init__()
        # padding_idx=0 keeps the embedding of the padding id fixed at zero,
        # so padded positions add nothing to the sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.predict = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes).
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Clamp the divisor to 1 so a sample with length 0 yields a zero
        # average instead of 0/0 = NaN; non-empty samples are unaffected.
        out = out / length.clamp(min=1).view(-1, 1).float()

        # return logits
        return self.predict(out)

In [26]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, scores in loader:
        data_batch, length_batch, label_batch = data, lengths, scores
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += scores.size(0)
        correct += predicted.eq(scores.view_as(predicted)).sum().item()
    
    return (100 * correct / total)

In [27]:
def train_proc(model, train_loader, val_loader, lr, adj, ep, optim, lr_decay=0, plt=False):
    """
    Train `model` with cross-entropy loss and report its validation accuracy.

    @param model: module called as ``model(data, lengths)`` returning logits.
    @param train_loader: loader yielding (data, lengths, scores) batches.
    @param val_loader: loader passed to test_model() after training.
    @param lr: initial learning rate.
    @param adj: if truthy, multiply the learning rate by `lr_decay` after
        every epoch (StepLR with step_size=1).
    @param ep: number of epochs.
    @param optim: 'Adam' selects Adam; any other value selects SGD with
        momentum 0.9.
    @param lr_decay: StepLR gamma. With the default of 0 and `adj` set, the
        learning rate becomes 0 after the first epoch.
    @param plt: if truthy, plot the per-batch training loss.
    @return: tuple ``(val_acc, model)``; `model` is the same object passed in.
    """
    criterion = torch.nn.CrossEntropyLoss()
    if optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    scheduler = StepLR(optimizer, step_size=1, gamma=lr_decay) if adj else None

    train_ls = []
    for _ in range(ep):
        model.train()
        for data_batch, length_batch, score_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, score_batch)
            loss.backward()
            optimizer.step()
            # Store a plain float: keeping the tensor would keep every
            # batch's autograd graph alive and cannot be plotted directly.
            train_ls.append(loss.item())
        # Decay after the epoch's optimizer steps so the first epoch runs at
        # the initial learning rate; stepping the scheduler first skips it.
        if scheduler is not None:
            scheduler.step()

    val_acc = test_model(val_loader, model)
    print('Val Accuracy: {}'.format(val_acc))

    if plt:
        # The `plt` flag shadows the module-level matplotlib alias inside
        # this function, so import pyplot under a different local name.
        import matplotlib.pyplot as pyplot
        pyplot.plot(train_ls)
        pyplot.xlabel("n")
        pyplot.ylabel("Train Loss")

    return val_acc, model

## Training

In [32]:
# Dimension of the token embeddings.
emb_size = 100
# The third argument (train_scores) is accepted by BagOfNgram.__init__ but not used there.
model_n1 = BagOfNgram(len(id2token_n1), emb_size, train_scores)
# Positional arguments: lr=0.01, adj=True (decay the learning rate), ep=3 epochs, optim='Adam', lr_decay=0.1.
# train_proc returns the model object it was given, so `model` and `model_n1` are the same object.
# NOTE(review): no torch seed is set in the visible cells, so the accuracy below may vary between runs.
val_acc, model = train_proc(model_n1, train_loader_n1, val_loader_n1, 0.01, True, 3, 'Adam', 0.1)

Val Accuracy: 42.14


In [33]:
# Reuse the evaluation helper instead of repeating its loop: test_model()
# switches the model to eval mode and returns the accuracy in percent.
test_acc = test_model(test_loader_n1, model)

print("Test Accuracy:{}".format(test_acc))

Test Accuracy:41.276
