## Import packages

In [3]:
!pip install sacremoses

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
import pandas as pd
import sacremoses
from torch.utils.data import dataloader, Dataset
from tqdm.auto import tqdm



# Create DataLoaders
Create PyTorch DataLoaders for our train, validation, and test data.

In [4]:
import numpy as np
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """
    Train/validation/test dataset of token-index sequences, readable by PyTorch.

    Inherits torch.utils.data.Dataset. Each item is a ``[token_indices, label]``
    pair; ``spam_collate_func`` turns a list of such items into padded tensors.
    """

    # Index used to right-pad sequences that are shorter than the batch length.
    PAD_IDX = 0

    def __init__(self, data_list, target_list, max_sent_length=128):
        """
        @param data_list: list of token-index sequences, one per example
        @param target_list: list of targets, aligned with data_list
        @param max_sent_length: sequences are truncated to at most this length
        """
        self.data_list = data_list
        self.target_list = target_list
        self.max_sent_length = max_sent_length
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key, max_sent_length=None):
        """
        Triggered when calling dataset[i].

        @param key: index of the example
        @param max_sent_length: optional override of self.max_sent_length
        @return: [token_idx, label], with token_idx truncated to the max length
        """
        if max_sent_length is None:
            max_sent_length = self.max_sent_length
        token_idx = self.data_list[key][:max_sent_length]
        label = self.target_list[key]
        return [token_idx, label]

    def spam_collate_func(self, batch):
        """
        Collate function for DataLoader that dynamically pads the batch so that
        all sequences have the same length.

        The batch length is the length of the longest sequence in the batch,
        capped at self.max_sent_length. Shorter sequences are right-padded with
        PAD_IDX; longer ones are truncated.

        @param batch: list of [token_idx, label] items (as returned by __getitem__)
        @return: [data tensor of shape (batch, batch_len), label tensor of shape (batch,)]
        """
        longest_in_batch = max(len(tokens) for tokens, _ in batch)
        max_batch_seq_len = min(longest_in_batch, self.max_sent_length)

        data_list = []
        for tokens, _ in batch:
            # list(...) makes a copy, so dataset contents are never mutated and
            # non-list sequences (tuples, arrays) are handled too.
            padded = list(tokens[:max_batch_seq_len])
            padded.extend([self.PAD_IDX] * (max_batch_seq_len - len(padded)))
            data_list.append(padded)
        label_list = [label for _, label in batch]

        return [torch.tensor(data_list), torch.tensor(label_list)]

BATCH_SIZE = 64
max_sent_length = 128

# NOTE(review): train/val/test `*_data_indices` and `*_labels` are not defined in
# the visible cells; they must be produced by an earlier data-preparation cell
# before this cell can run.

# Each loader uses the collate function bound to its OWN dataset, so padding and
# truncation always follow that dataset's max_sent_length.
train_dataset = SpamDataset(train_data_indices, train_labels, max_sent_length)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=train_dataset.spam_collate_func,
                                           shuffle=True)

val_dataset = SpamDataset(val_data_indices, val_labels, max_sent_length)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=BATCH_SIZE,
                                         collate_fn=val_dataset.spam_collate_func,
                                         shuffle=False)

test_dataset = SpamDataset(test_data_indices, test_labels, max_sent_length)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=BATCH_SIZE,
                                          collate_fn=test_dataset.spam_collate_func,
                                          shuffle=False)



NameError: ignored

Let's print out one batch from `train_loader`.


In [None]:
# Pull a single batch from train_loader and print its size and contents
# as a sanity check of the collate function's output.
data_batch, labels = next(iter(train_loader))
print("data batch dimension: ", data_batch.size())
print("data_batch: ", data_batch)
print("labels: ", labels)

In [None]:
# Length of the first row of the batch tensor fetched above.
print(len(data_batch[0]))

# Build BiLSTM Classifier

In [None]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    """
    (Bi)LSTM sequence classifier.

    Pipeline: pretrained embeddings -> dropout -> LSTM -> mean over time steps
    -> non-linearity -> linear classifier producing one logit per class.
    """
    def __init__(self, embeddings, hidden_size, num_layers, num_classes, bidirectional, dropout_prob=0.3):
        """
        @param embeddings: 2-D array/tensor of pretrained vectors, (vocab_size, emb_dim)
        @param hidden_size: hidden size of the LSTM (per direction)
        @param num_layers: number of stacked LSTM layers
        @param num_classes: number of output classes (size of the logit vector)
        @param bidirectional: if True, run the LSTM in both directions
        @param dropout_prob: dropout probability on embeddings and between LSTM layers
        """
        super().__init__()
        self.embedding_layer = self.load_pretrained_embeddings(embeddings)
        self.dropout = nn.Dropout(p=dropout_prob)
        # nn.LSTM applies dropout only between stacked layers; passing a non-zero
        # value with a single layer has no effect and triggers a warning.
        lstm_dropout = dropout_prob if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=embeddings.shape[1], hidden_size=hidden_size,
            num_layers=num_layers, dropout=lstm_dropout,
            batch_first=True, bidirectional=bidirectional)
        self.non_linearity = nn.ReLU()
        # The LSTM output has hidden_size features per direction.
        num_directions = 2 if bidirectional else 1
        self.clf = nn.Linear(hidden_size * num_directions, num_classes)

    def load_pretrained_embeddings(self, embeddings):
        """
        Build a trainable embedding layer initialised from `embeddings`.

        Index 0 is registered as the padding index. The values are copied, so
        training never modifies the caller's array.
        """
        embedding_layer = nn.Embedding(embeddings.shape[0], embeddings.shape[1], padding_idx=0)
        with torch.no_grad():
            embedding_layer.weight.copy_(torch.as_tensor(embeddings, dtype=torch.float))
        return embedding_layer

    def forward(self, inputs):
        """
        @param inputs: integer tensor of token indices, (batch, seq_len)
        @return: logits of shape (batch, num_classes)
        """
        v_embedded = self.embedding_layer(inputs)
        v_dropout = self.dropout(v_embedded)
        v_lstm, _ = self.lstm(v_dropout)
        # NOTE: the mean is taken over every time step, padded positions included.
        v_avg_pool = torch.mean(v_lstm, 1)
        v_nonlinear = self.non_linearity(v_avg_pool)
        return self.clf(v_nonlinear)

# Initialize the BiLSTM classifier model, criterion and optimizer


In [None]:
# Hyperparameters of the BiLSTM model
hidden_size, num_layers, num_classes = 32, 1, 2
bidirectional = True

# Seed PyTorch's RNG before the model is built.
torch.manual_seed(1234)

# Run on the first GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model, loss function and optimizer.
model = LSTMClassifier(embeddings, hidden_size, num_layers, num_classes, bidirectional)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train model with early stopping (10 pts)

Train the model for `NUM_EPOCHS`. 
Keep track of training loss.  
Compute the validation accuracy after each epoch. Keep track of the best validation accuracy and save the model with the best validation accuracy.  

If the validation accuracy does not improve for more than `early_stop_patience` number of epochs in a row, stop training. 


In [None]:
def evaluate(model, dataloader, device):
    """
    Compute the classification accuracy of `model` over `dataloader`.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while predicting.

    @param model: module mapping a batch of inputs to per-class scores (batch, classes)
    @param dataloader: iterable of (data_batch, batch_labels) tensor pairs
    @param device: device the input batches are moved to before the forward pass
    @return: fraction of correctly classified examples as a float;
             0.0 if the dataloader yields no examples
    """
    n_correct = n_total = 0
    model.eval()
    with torch.no_grad():
        for data_batch, batch_labels in dataloader:
            out = model(data_batch.to(device))
            _, preds = out.max(dim=1)
            # Compare on CPU so labels never need to live on the model's device.
            n_correct += (preds.cpu() == batch_labels.cpu()).sum().item()
            n_total += out.shape[0]
    if n_total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return n_correct / n_total

In [None]:
train_loss_history = []      # training loss of every optimisation step
val_accuracy_history = []    # validation accuracy after every epoch
# Start below any reachable accuracy so the first epoch always produces a checkpoint.
best_val_accuracy = float("-inf")
n_no_improve = 0             # consecutive epochs without a new best validation accuracy
early_stop_patience = 2
NUM_EPOCHS = 10

for epoch in tqdm(range(NUM_EPOCHS)):
    model.train()  # this enables dropout/regularization
    for data_batch, batch_labels in train_loader:
        optimizer.zero_grad()
        preds = model(data_batch.to(device))
        loss = criterion(preds, batch_labels.to(device))
        loss.backward()
        optimizer.step()
        train_loss_history.append(loss.item())

    # Validation accuracy after each training epoch.
    accuracy = evaluate(model, val_loader, device)
    val_accuracy_history.append(accuracy)

    if accuracy > best_val_accuracy:
        # New best: checkpoint only now, so 'best_model.pt' really holds the best
        # model, and restart the count of non-improving epochs.
        best_val_accuracy = accuracy
        n_no_improve = 0
        torch.save(model, 'best_model.pt')
    else:
        n_no_improve += 1
        # Early stopping: stop once validation accuracy has failed to improve for
        # more than `early_stop_patience` epochs in a row.
        if n_no_improve > early_stop_patience:
            break

print("Best validation accuracy is: ", best_val_accuracy)

To avoid overfitting, we use early stopping. Particularly when training a large model, early stopping lets us halt training at the point where the model stops learning patterns that generalize and begins fitting statistical noise in the training data. Training past that point causes the model to overfit, which makes it less useful and lowers its performance on new data.

# Draw training curve 
X-axis: training steps, Y-axis: training loss

Make sure to draw your own curves. 

In [None]:
# Training loss per optimisation step; the title and axis labels let the figure stand alone.
pd.Series(train_loss_history).plot(title="Training loss").set(xlabel="Training step", ylabel="Training loss");

# Validation accuracy curve
X-axis: Epochs, Y-axis: validation accuracy

In [None]:
# Validation accuracy per epoch; the title and axis labels let the figure stand alone.
pd.Series(val_accuracy_history).plot(title="Validation accuracy").set(xlabel="Epoch", ylabel="Validation accuracy");

## You should expect to get test accuracy > 0.95.

In [None]:
# Reload the best model from the saved checkpoint and compute test accuracy.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only runtime
# (and vice versa) and places the model on the device `evaluate` feeds data to.
# NOTE(review): the checkpoint is a fully pickled model, so only load files you
# trust. On PyTorch >= 2.6, torch.load defaults to weights_only=True and rejects
# such files; pass weights_only=False there.
model = torch.load('best_model.pt', map_location=device)
test_accuracy = evaluate(model, test_loader, device)
print(test_accuracy)