# Using Mixup on TREC Dataset

## Import libraries

In [1]:
!pip install datasets
!python -m spacy download en_core_web_md

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Utility imports
import spacy
import re
import string
import time
import random

# Extras
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

from datasets import load_dataset # using 🤗 HugggingFace datasets library


## Set Random Seed

In [3]:
# Single seed shared by every random number generator used in this notebook
SEED = 420

random.seed(SEED)  # Python's built-in `random` module
np.random.seed(SEED)  # NumPy's global generator
torch.manual_seed(SEED)  # PyTorch generators
torch.backends.cudnn.deterministic = True  # ask cuDNN to pick deterministic algorithms

## Load dataset

In [4]:
# Load the TREC dataset through the Hugging Face `datasets` library
dataset = load_dataset("trec")

## Exploring the dataset

In [5]:
print(dataset)

In [6]:
dataset.keys()

In [7]:
# Print the metadata attached to the training split: size, description and column schema
print(dataset["train"].dataset_size)
print(dataset["train"].description)
print(dataset["train"].features)


In [8]:
# Sample data point in TREC

dataset["train"][0]

In [9]:
# Materialize the training split as a pandas DataFrame and preview the first rows
train_df = pd.DataFrame(dataset["train"])
train_df.head()

In [10]:
# Materialize the test split as a pandas DataFrame and preview the first rows
test_df = pd.DataFrame(dataset["test"])
test_df.head()

In [11]:
train_df.info()

## Tokenize Data

In [12]:
nlp = spacy.load("en_core_web_md")

def tokenize(text):
    """Normalize *text* and split it into a list of token strings.

    The text is lower-cased, every character other than ``a``-``z`` and the
    space character is removed, runs of spaces are collapsed to a single space
    and leading/trailing spaces are dropped. The cleaned string is then split
    with the tokenizer of the module-level spaCy pipeline ``nlp``.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        A list with the text of each token, in order.
    """
    # Remove all non-alphabetic, non-space characters.
    cleaned = re.sub(r'[^a-z ]+', '', text.lower())
    # Collapse multiple spaces into one and trim the ends, so the tokenizer is
    # given single-space-separated words only.
    cleaned = re.sub(r' +', ' ', cleaned).strip()

    return [token.text for token in nlp.tokenizer(cleaned)]

In [13]:
# Remove leading/trailing whitespace from every training sentence, then preview the result
train_df["text"] = train_df["text"].str.strip()
train_df.head()

In [14]:
# Count how often each token occurs across all training sentences

counts = Counter()
for sentence in train_df["text"]:
  counts.update(tokenize(sentence))

In [15]:
print(f"# of words before: {len(counts.keys())}")

# Remove words that occur only once, and tokens that contain whitespace.
# Iterate over a snapshot of the keys because entries are deleted inside the loop.
for word in list(counts):
  if counts[word] < 2 or ' ' in word:
    del counts[word]

print(f"# of words after: {len(counts.keys())}")

In [16]:
# Build the vocabulary from the surviving words. Position 0 holds the empty
# string (the value used to pad encoded sentences) and position 1 holds "UNK",
# which stands in for every word that was removed or never seen.

words = ["", "UNK"]
words.extend(counts)
vocab2index = {token: position for position, token in enumerate(words)}

In [17]:
# Number of whitespace-separated words in each training sentence
words_per_sentence = train_df['text'].str.split("\\s+").str.len()
max_len = words_per_sentence.max()
min_len = words_per_sentence.min()

print(f"Max # of words: {max_len}\tMin # of words: {min_len}") 

In [18]:
def encode_sentence(text, vocab2index, N=80):
    """Encode *text* as a fixed-length array of vocabulary indices.

    The sentence is tokenized with ``tokenize``; each token is replaced by its
    index in *vocab2index* (tokens missing from the mapping get the ``"UNK"``
    index). The result is truncated to *N* entries or right-padded with zeros
    up to *N* entries.

    Returns:
        A tuple ``(encoded, length)`` where ``encoded`` is an integer NumPy
        array of size *N* and ``length`` is the number of real (non-padding)
        entries, at most *N*.
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    length = min(N, len(token_ids))

    padded = np.zeros(N, dtype=int)
    padded[:length] = token_ids[:length]
    return padded, length

In [19]:
# Apply encoding to train and test DataFrames.
# Each cell becomes a 2-element object array: [padded index array, true length].
# dtype=object is given explicitly because the two elements have different
# shapes; recent NumPy releases raise ValueError instead of building such a
# ragged array implicitly.
train_df["text"] = train_df["text"].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))
test_df["text"] = test_df["text"].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))

In [20]:
train_df.head()

### Load data into `Dataset` and then into `DataLoader`

In [21]:
# Custom Dataset class

class TRECDataset(Dataset):
  """Map-style dataset pairing encoded sentences with their labels.

  Each entry of ``text`` is expected to be indexable, with the NumPy array of
  token indices stored at position 0; ``label`` holds one label per entry.
  """

  def __init__(self, text, label):
    self.x, self.y = text, label

  def __len__(self):
    # The number of samples is defined by the number of labels.
    return len(self.y)

  def __getitem__(self, idx):
    # Only the index array (position 0) is used; it is converted to a 32-bit
    # integer tensor and returned together with the label.
    token_ids = self.x[idx][0]
    features = torch.from_numpy(token_ids.astype(np.int32))
    return features, self.y[idx]

In [22]:
# Create Dataset from only text and coarse label columns
# (the "text" column holds the encoded sentences at this point)

train_ds = TRECDataset(train_df["text"], train_df["label-coarse"])
test_ds = TRECDataset(test_df["text"], test_df["label-coarse"])

In [23]:
# Create dataloaders.
# Training batches are reshuffled every epoch. The test loader keeps a fixed
# order so that every evaluation pass sees exactly the same batches.

train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=256, shuffle=False)

### Shifting data to GPU

In [24]:
# Functions to transfer data to GPU

def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
def to_device(data, device):
    """Move *data* to *device*.

    Lists and tuples are processed element by element (recursively) and always
    come back as a list; anything else is moved with a non-blocking ``.to``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

class DeviceDataLoader:
  """Thin wrapper that moves every batch of a dataloader to a device."""

  def __init__(self, dataloader, device):
    self.dataloader, self.device = dataloader, device

  def __iter__(self):
    """Lazily yield each batch of the wrapped loader after moving it to the device."""
    yield from (to_device(batch, self.device) for batch in self.dataloader)

  def __len__(self):
    """Number of batches in the wrapped loader."""
    return len(self.dataloader)

In [25]:
device = get_default_device()
device

In [26]:
# Wrap both dataloaders so that every batch they yield is moved to `device`
# (the GPU when available, otherwise the CPU)

train_dl = DeviceDataLoader(train_dl, device)
test_dl = DeviceDataLoader(test_dl, device)

In [27]:
# Check final data shape: print the shapes of the first training batch only

for batch in train_dl:
    x, y = batch
    print(x.shape, y.shape)
    break  # one batch is enough for the sanity check

In [28]:
# Vocabulary size, including the padding ("") and "UNK" entries; used to size the embedding tables
vocab_size = len(words)
vocab_size

## Building the model

In [38]:
class LSTMModel(nn.Module):
    """Baseline classifier: embedding -> 2-layer bidirectional LSTM -> MLP head.

    The head consists of five linear layers (512 -> 1024 -> 256 -> 128 -> 64 -> 6).
    ReLU and dropout follow each of the first four; the last layer produces the
    raw class scores, which are turned into log-probabilities by ``LogSoftmax``
    (suitable for ``nn.NLLLoss``).

    Args:
        num_embeddings: Number of rows of the embedding table. When omitted,
            the module-level ``vocab_size`` is used.
    """

    def __init__(self, num_embeddings=None):
        super().__init__()
        if num_embeddings is None:
            num_embeddings = vocab_size
        # Index 0 is the padding index: its embedding is not updated by training.
        self.embeddings = nn.Embedding(num_embeddings, 512, padding_idx=0)
        self.lstm = nn.LSTM(512, 512, batch_first=True, num_layers=2, bidirectional=True)
        self.linear1 = nn.Linear(512, 1024)
        self.linear2 = nn.Linear(1024, 256)
        self.linear3 = nn.Linear(256, 128)
        self.linear4 = nn.Linear(128, 64)
        self.linear5 = nn.Linear(64, 6)
        self.dropout = nn.Dropout(0.3)
        self.out = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, 6)``.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``
                (the LSTM is configured with ``batch_first=True``).
        """
        embedded = self.embeddings(x)
        # Only the final hidden states are needed, not the per-step outputs.
        _, (hidden, _) = self.lstm(embedded)

        # hidden[-1] is the last entry of h_n: the final state of the top
        # layer's reverse direction. Its size (512) matches linear1's input.
        x = hidden[-1]

        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            x = self.dropout(F.relu(layer(x)))

        # No ReLU on the output layer: clamping the class scores at zero would
        # discard every negative score before the softmax.
        logits = self.linear5(x)
        return self.out(logits)

In [39]:
# Instantiate the baseline network and move its parameters to the selected device
baseline_model = LSTMModel()
baseline_model = to_device(baseline_model, device)

In [40]:
# Negative log-likelihood loss: the model already outputs log-probabilities (LogSoftmax)
criterion = nn.NLLLoss().to(device)
# Only parameters that require gradients are handed to the optimizer
learnable_parameters = filter(lambda p: p.requires_grad, baseline_model.parameters())
optimizer = torch.optim.Adam(learnable_parameters, lr=0.0015)

### Initialize helper functions

In [29]:
def cat_accuracy(pred, y):
  """Return the fraction of rows of *pred* whose arg-max (over dim 1) equals *y*.

  The result is a float tensor of shape ``(1,)`` living on the CPU.
  """
  predicted_class = pred.argmax(dim=1)
  hits = predicted_class.eq(y).to("cpu")
  batch_size = torch.FloatTensor([y.shape[0]])
  return hits.sum() / batch_size

In [30]:
def evaluate(model, iterator, criterion):
  """Compute the average loss and accuracy of *model* over *iterator*.

  The model is put in evaluation mode and no gradients are tracked. Per-batch
  values are weighted by the batch size, so a smaller final batch does not
  count as much as a full one and the result is a true per-sample average.
  This assumes *criterion* returns the mean loss of the batch (the default
  reduction of the PyTorch loss modules).

  Args:
    model: Callable mapping a batch of inputs to predictions.
    iterator: Iterable of ``(x, y)`` batches.
    criterion: Loss function called as ``criterion(pred, y)``.

  Returns:
    A tuple ``(mean_loss, mean_accuracy)`` of Python floats.

  Raises:
    ZeroDivisionError: If *iterator* yields no samples.
  """
  total_loss = 0.0
  total_acc = 0.0
  n_samples = 0
  model.eval()

  with torch.no_grad():
    for x, y in iterator:
      pred = model(x)
      batch_size = y.shape[0]

      total_loss += criterion(pred, y).item() * batch_size
      total_acc += cat_accuracy(pred, y).item() * batch_size
      n_samples += batch_size

  return total_loss / n_samples, total_acc / n_samples

In [41]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return the average loss and accuracy.

  For every batch the gradients are reset, the loss is back-propagated and the
  optimizer takes one step. Per-batch values are weighted by the batch size so
  the returned numbers are per-sample averages (this assumes *criterion*
  returns the mean loss of the batch, the default reduction).

  Args:
    model: The network to optimize; it is switched to training mode.
    iterator: Iterable of ``(x, y)`` batches.
    optimizer: Optimizer holding the model parameters.
    criterion: Loss function called as ``criterion(pred, y)``.

  Returns:
    A tuple ``(mean_loss, mean_accuracy)`` of Python floats.

  Raises:
    ZeroDivisionError: If *iterator* yields no samples.
  """
  total_loss = 0.0
  total_acc = 0.0
  n_samples = 0

  model.train()

  for x, y in iterator:
    optimizer.zero_grad()

    pred = model(x)
    loss = criterion(pred, y)
    acc = cat_accuracy(pred, y)

    loss.backward()
    optimizer.step()

    batch_size = y.shape[0]
    total_loss += loss.item() * batch_size
    total_acc += acc.item() * batch_size
    n_samples += batch_size

  return total_loss / n_samples, total_acc / n_samples

In [31]:
def epoch_time(start, end):
  """Split the interval between *start* and *end* (seconds) into whole minutes and seconds."""
  duration = end - start
  minutes = int(duration / 60)
  seconds = int(duration - (minutes * 60))
  return minutes, seconds

## Training the model

In [42]:
n_epochs = 30

# Per-epoch history, plotted after training
train_losses = []
test_losses = []
train_accs = []
test_accs = []

# Lowest evaluation loss seen so far; decides when a checkpoint is written
best_loss = float("inf")

for epoch in range(n_epochs):
  start_time = time.time()

  train_loss, train_acc = train(baseline_model, train_dl, optimizer, criterion)
  # NOTE(review): the evaluation below runs on the test dataloader, and its loss
  # is also what selects the saved checkpoint, so the "Val." numbers printed
  # further down are test-set numbers.
  test_loss, test_acc = evaluate(baseline_model, test_dl, criterion)
  
  train_losses.append(train_loss)
  test_losses.append(test_loss)
  train_accs.append(train_acc)
  test_accs.append(test_acc)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights of the epoch with the lowest evaluation loss
  if test_loss < best_loss:
    best_loss = test_loss
    torch.save(baseline_model.state_dict(), 'baseline-bilstm-model.pt')
    print(f"Found best model yet! at epoch {epoch + 1: 02}")
  
  print(f'Epoch: {epoch + 1: 02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss: .3f} | Train Acc: {train_acc * 100: .2f}%')
  print(f'\t Val. Loss: {test_loss: .3f} |  Val. Acc: {test_acc * 100: .2f}%')

In [43]:
# Restore the best checkpoint saved during training. map_location places the
# stored tensors on the device currently in use, so the checkpoint also loads
# on a machine without the device it was saved from.
baseline_model.load_state_dict(torch.load('baseline-bilstm-model.pt', map_location=device))

test_loss, test_acc = evaluate(baseline_model, test_dl, criterion)

print(f'Test Loss: {test_loss: .3f} |  Test Acc: {test_acc * 100: .2f}%')

In [44]:
# Loss per epoch: training vs. testing
for history, caption in ((train_losses, "Training loss"), (test_losses, "Testing loss")):
  plt.plot(history, label=caption)
plt.legend()
plt.show()

# Accuracy per epoch: training vs. testing
for history, caption in ((train_accs, "Training accuracy"), (test_accs, "Testing accuracy")):
  plt.plot(history, label=caption)
plt.legend()
plt.show()

## Defining MixUp strategy

In [32]:
# Implement Mixup (https://arxiv.org/pdf/2004.12239.pdf)
def mixup_data(x, y, alpha=0.2, use_cuda=True):
  """Return mixup values for the whole batch.

  Every sample is blended with another sample of the same batch, chosen by a
  random permutation, using a single coefficient ``lam`` drawn from
  ``Beta(alpha, alpha)``::

      mixed = lam * sample + (1 - lam) * partner

  Args:
    x: Tensor of inputs whose first dimension is the batch. Blending is an
      arithmetic interpolation, so continuous values are expected.
    y: Tensor of targets with the same batch dimension as *x* (for example
      one-hot or soft labels).
    alpha: Concentration of the Beta distribution. A value ``<= 0`` disables
      mixing (``lam = 1``).
    use_cuda: Kept for backward compatibility. The permutation is always
      created on the device of *x*, so this flag has no effect.

  Returns:
    A tuple ``(mixed_x, mixed_y)`` with the same shapes as *x* and *y*.
  """
  batch_size = x.shape[0]

  lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0

  # The index tensor must live on the same device as the data it indexes.
  index = torch.randperm(batch_size, device=x.device)

  mixed_x = lam * x + (1.0 - lam) * x[index]
  mixed_y = lam * y + (1.0 - lam) * y[index]

  return mixed_x, mixed_y

In [33]:
class MixupModel(nn.Module):
  """BiLSTM classifier that can interpolate two inputs inside its MLP head.

  The architecture is embedding -> 2-layer bidirectional LSTM -> five linear
  layers (512 -> 1024 -> 256 -> 128 -> 64 -> 6) -> LogSoftmax. ReLU and dropout
  follow every linear layer except the last one, which produces the raw class
  scores.

  Args:
    num_embeddings: Number of rows of the embedding table. When omitted, the
      module-level ``vocab_size`` is used.
  """

  def __init__(self, num_embeddings=None):
    super().__init__()
    if num_embeddings is None:
      num_embeddings = vocab_size
    # Index 0 is the padding index: its embedding is not updated by training.
    self.embeddings = nn.Embedding(num_embeddings, 512, padding_idx=0)
    self.lstm = nn.LSTM(512, 512, batch_first=True, num_layers=2, bidirectional=True)

    self.linear_layers = nn.ModuleList(
        [
         nn.Linear(512, 1024),
         nn.Linear(1024, 256),
         nn.Linear(256, 128),
         nn.Linear(128, 64),
         nn.Linear(64, 6)
        ]
    )
    self.dropout = nn.Dropout(0.3)
    self.out = nn.LogSoftmax(dim=-1)

  def _encode(self, tokens):
    """Embed *tokens* and return the last entry of the LSTM's final hidden states."""
    embedded = self.embeddings(tokens)
    _, (hidden, _) = self.lstm(embedded)
    return hidden[-1]

  def _apply_layer(self, index, layer, features):
    """Apply *layer*; add ReLU and dropout unless it is the output layer."""
    features = layer(features)
    if index < len(self.linear_layers) - 1:
      features = self.dropout(F.relu(features))
    return features

  def forward(self, x_one, x_two=None, mixup_layer=-1, mixup_lambda=0.2):
    """Return log-probabilities of shape ``(batch, 6)``.

    Args:
      x_one: Integer tensor of token indices, shape ``(batch, seq_len)``.
      x_two: Optional second batch with the same batch size as *x_one*;
        required when *mixup_layer* is not ``-1``.
      mixup_layer: Index into the linear layers after which the two hidden
        representations are interpolated; ``-1`` disables mixup.
      mixup_lambda: Weight of *x_one* in the interpolation; *x_two* gets
        ``1 - mixup_lambda``.

    Raises:
      ValueError: If mixup is requested but *x_two* is missing.
    """
    mixing = mixup_layer != -1
    if mixing and x_two is None:
      raise ValueError("x_two is required when mixup_layer is not -1")

    x_one = self._encode(x_one)
    if mixing:
      x_two = self._encode(x_two)

    for i, layer in enumerate(self.linear_layers):
      x_one = self._apply_layer(i, layer, x_one)

      # The second input only has to be propagated up to the mixing layer;
      # from there on the interpolated representation continues alone.
      if mixing and i <= mixup_layer:
        x_two = self._apply_layer(i, layer, x_two)
        if i == mixup_layer:
          x_one = mixup_lambda * x_one + (1.0 - mixup_lambda) * x_two

    return self.out(x_one)


In [46]:
# Instantiate the mixup network and move its parameters to the selected device
mixup_model = MixupModel()
mixup_model = to_device(mixup_model, device)

# `criterion` and `optimizer` are rebound here: from this point on they belong to the mixup model
criterion = nn.NLLLoss().to(device)
# Only parameters that require gradients are handed to the optimizer
learnable_parameters = filter(lambda p: p.requires_grad, mixup_model.parameters())
optimizer = torch.optim.Adam(learnable_parameters, lr=0.0015)

In [47]:
def split_batch(x, y):
  """Split a batch into two halves of equal size along the first dimension.

  When the batch holds an odd number of samples the last one is left out, so
  both halves always contain ``x.shape[0] // 2`` samples. A batch with fewer
  than two samples therefore yields empty halves.

  Args:
    x: Tensor of inputs, batch first.
    y: Tensor of targets with the same batch dimension as *x*.

  Returns:
    A tuple ``(x_left, y_left, x_right, y_right)``.
  """
  half_size = x.shape[0] // 2
  end = 2 * half_size

  x_left, x_right = x[:half_size], x[half_size:end]
  y_left, y_right = y[:half_size], y[half_size:end]

  return x_left, y_left, x_right, y_right

In [48]:
def train_mixup(model, iterator, optimizer, criterion):
  """Run one training epoch with hidden-state mixup and return the mean loss.

  Every batch is split into two halves. The model mixes the hidden
  representations of the two halves at a randomly chosen linear layer (index
  0, 1 or 2) with a coefficient drawn from ``Beta(0.2, 0.2)``, and the loss is
  the same convex combination of the losses against the two halves' labels.

  Args:
    model: Network accepting ``x_one``, ``x_two``, ``mixup_layer`` and
      ``mixup_lambda`` keyword arguments; it is switched to training mode.
    iterator: Iterable of ``(x, y)`` batches.
    optimizer: Optimizer holding the model parameters.
    criterion: Loss function called as ``criterion(pred, y)``; assumed to
      return the mean loss of the batch.

  Returns:
    The loss averaged over all mixed pairs, as a Python float.

  Raises:
    ZeroDivisionError: If no batch contained at least two samples.
  """
  epoch_loss = 0.0
  n_pairs = 0

  model.train()

  for x, y in iterator:
    mix_layer = random.choice([0, 1, 2])

    mix_lambda = np.random.beta(0.2, 0.2)
    # Keep the first half dominant in every mixed pair.
    mix_lambda = max(mix_lambda, 1 - mix_lambda)

    # Pairs need an even number of samples: drop a trailing odd sample and
    # skip batches that cannot form a single pair.
    usable = (x.shape[0] // 2) * 2
    if usable < 2:
      continue
    x_l, y_l, x_r, y_r = split_batch(x[:usable], y[:usable])

    # Flatten the labels to 1-D without modifying the batch in place; unlike
    # squeeze, this keeps a half of size one as a 1-element vector.
    y_l = y_l.reshape(-1)
    y_r = y_r.reshape(-1)

    optimizer.zero_grad()

    preds = model(x_one=x_l, x_two=x_r, mixup_layer=mix_layer, mixup_lambda=mix_lambda)
    loss = criterion(preds, y_l) * mix_lambda + criterion(preds, y_r) * (1.0 - mix_lambda)

    loss.backward()
    optimizer.step()

    pairs = y_l.shape[0]
    epoch_loss += loss.item() * pairs
    n_pairs += pairs

  return epoch_loss / n_pairs

In [49]:
n_epochs = 30

# Per-epoch history, plotted after training (no training accuracy is recorded
# for mixup, because train_mixup returns only a loss)
train_losses = []
test_losses = []
# train_accs = []
test_accs = []

# Lowest evaluation loss seen so far; decides when a checkpoint is written
best_loss = float("inf")

for epoch in range(n_epochs):
  start_time = time.time()

  train_loss = train_mixup(mixup_model, train_dl, optimizer, criterion)
  # NOTE(review): the evaluation below runs on the test dataloader, and its loss
  # is also what selects the saved checkpoint, so the "Val." numbers printed
  # further down are test-set numbers.
  test_loss, test_acc = evaluate(mixup_model, test_dl, criterion)
  
  train_losses.append(train_loss)
  test_losses.append(test_loss)
  # train_accs.append(train_acc)
  test_accs.append(test_acc)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights of the epoch with the lowest evaluation loss
  if test_loss < best_loss:
    best_loss = test_loss
    torch.save(mixup_model.state_dict(), 'mixup-bilstm-model.pt')
    print(f"Found best model yet! at epoch {epoch + 1: 02}")
  
  print(f'Epoch: {epoch + 1: 02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss: .3f}')
  print(f'\t Val. Loss: {test_loss: .3f} |  Val. Acc: {test_acc * 100: .2f}%')

In [50]:
# Restore the best checkpoint saved during training. map_location places the
# stored tensors on the device currently in use, so the checkpoint also loads
# on a machine without the device it was saved from.
mixup_model.load_state_dict(torch.load('mixup-bilstm-model.pt', map_location=device))

test_loss, test_acc = evaluate(mixup_model, test_dl, criterion)

print(f'Test Loss: {test_loss: .3f} |  Test Acc: {test_acc * 100: .2f}%')

In [51]:
# Loss per epoch: training vs. testing
for history, caption in ((train_losses, "Training loss"), (test_losses, "Testing loss")):
  plt.plot(history, label=caption)
plt.legend()
plt.show()

# Only the testing accuracy is available for the mixup run
# plt.plot(train_accs, label = "Training accuracy")
plt.plot(test_accs, label="Testing accuracy")
plt.legend()
plt.show()