In [None]:
import torch
from torch import nn
import numpy as np
import pandas as pd

In [None]:
import typing
from typing import List, Tuple, Dict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Colab-Specific Settings

In [None]:
# Mount Google Drive at /content/drive/ (only works inside a Google Colab runtime).
# The data/model paths in the config cell are relative ("drive/MyDrive/..."),
# so they presumably rely on the working directory being /content -- TODO confirm.
from google.colab import drive
drive.mount("/content/drive/")

# Config

In [None]:
# Seed the PyTorch and NumPy global RNGs. `seed` is also reused as the
# random_state of the train/validation split further down.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# Root of the project folder inside the mounted Google Drive (relative path).
colab_prefix = "drive/MyDrive/CMPUT651_DL4NLP/"

# `source_embedding_dim`, `version` and `model_type` are only used to build
# `experiment_name` below.
source_embedding_dim = 300
version = 1
model_type = "supervised_distilled"
# Input size of both LSTMs; has to equal the width of the vectors stored in
# the file at `embedding_path`.
embedding_dim = 100
# Hidden size of each LSTM direction.
nn_hidden_size = 50

# The input embedding file and the output checkpoint are both keyed by the
# experiment name.
experiment_name = f"{model_type}_{source_embedding_dim}to{embedding_dim}_v{version}"
embedding_path = colab_prefix + f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt"
model_output_path = colab_prefix + f"models/classifier_glove_{experiment_name}.pt"
# Alternative configuration kept for reference (base GloVe 300d embeddings):
# embedding_path = colab_prefix + f"data/embeddings/base/glove.6B.300d.txt"
# model_output_path = colab_prefix + f"models/classifier_hidden10_glove_clipped_300d.pt"

# Training settings. `freeze_embeddings` is forwarded as `freeze` to
# nn.Embedding.from_pretrained.
freeze_embeddings = True
epochs = 5
batch_size = 32
learning_rate = 1e-3

# AG News CSV files.
train_datapath = colab_prefix + "data/datasets/ag_news/train.csv"
test_datapath = colab_prefix + "data/datasets/ag_news/test.csv"

# Special tokens added to the vocabulary after the pretrained words.
pad_tag = "<PAD>"
unk_tag = "<UNK>"

In [None]:
# Use the GPU when one is available. `gpu` is also used to switch on
# pin_memory in the DataLoaders.
gpu = torch.cuda.is_available()
device = torch.device("cuda" if gpu else "cpu")

In [None]:
# Log which experiment is being run and on which device.
print(experiment_name)
print(gpu, device)

# Load Embeddings

In [None]:
# Parse the embedding text file: each line holds a token followed by its
# whitespace-separated vector components. `words[i]` corresponds to row i of
# `vectors`.
words = []
vectors = []
with open(embedding_path, "r", encoding="utf-8") as fp:
    for line in fp:
        line = line.split()
        if not line:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # raising IndexError on line[0].
            continue
        word = line[0]
        vector = np.asarray(line[1:], dtype='float32')
        words.append(word)
        vectors.append(vector)
# Stack the per-word vectors into a single (num_words, dim) float32 array.
vectors = np.asarray(vectors)

Create embeddings for the two special tokens: \<PAD> (all zeros) and \<UNK> (the mean of all pretrained embeddings).

In [None]:
# UNK is the mean of all pretrained vectors; PAD is an all-zero row.
# NOTE: np.zeros defaults to float64, so concatenating this row with the
# float32 vectors yields a float64 embedding matrix.
unk_embedding = np.mean(vectors, axis=0).reshape(1, -1)
pad_embedding = np.zeros((1, vectors.shape[1]))

In [None]:
# Append the PAD row and then the UNK row after all word rows; the token
# indices assigned to the two special tokens must follow this same order.
vectors = torch.as_tensor(np.concatenate((vectors, pad_embedding, unk_embedding)))

Set up dictionaries for converting tags to indices, tokens to indices and vice-versa.

In [None]:
# Row i of the embedding matrix belongs to words[i]; the PAD and UNK rows sit
# directly after the last word row, in that order.
token2index = {word: i for i, word in enumerate(words)}
# Derive the special indices from len(words) (the number of embedding rows),
# not len(token2index): if the embedding file ever contains a duplicated token
# the dict is shorter than the matrix and the indices would point at word rows.
pad_token_index = len(words)
unk_token_index = len(words) + 1
token2index[pad_tag] = pad_token_index
token2index[unk_tag] = unk_token_index

# Reverse lookup: index -> token.
index2token = {i: word for word, i in token2index.items()}

In [None]:
# 0-based class id -> human-readable category name.
index2tag = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))

# Load Data & Preprocess

In [None]:
def sentence_to_indices(sentence, token2index, unk_token_index):
    """Convert a tokenised sentence into a list of vocabulary indices.

    Args:
        sentence: iterable of tokens.
        token2index: mapping from token to index.
        unk_token_index: index substituted for tokens missing from `token2index`.

    Returns:
        A list with one index per token, in the original order.
    """
    indices = []
    for token in sentence:
        indices.append(token2index.get(token, unk_token_index))
    return indices

In [None]:
def indices_to_sentence(sentence, index2token):
    """Map a sequence of indices back to their tokens (or tags).

    Each index is passed through int() before the lookup, so integer-like
    scalars (e.g. NumPy integers) are accepted.

    Args:
        sentence: iterable of integer-like indices.
        index2token: mapping from int index to token.

    Returns:
        A list with the looked-up value for each index, in order.

    Raises:
        KeyError: if an index is not present in `index2token`.
    """
    tokens = []
    for index in sentence:
        key = int(index)
        tokens.append(index2token[key])
    return tokens

In [None]:
def pad_to_max_length(sentence, pad_token_index, max_length):
    """Return `sentence` as a list of exactly `max_length` indices.

    Shorter sentences are right-padded with `pad_token_index`. Longer ones are
    truncated: without this, validation/test sentences longer than the maximum
    measured on the training split would produce rows of different lengths,
    which cannot be stacked into a single tensor.

    Args:
        sentence: list of token indices (not modified).
        pad_token_index: index used for padding positions.
        max_length: target length of the returned list.

    Returns:
        A new list of length `max_length`.
    """
    clipped = sentence[:max_length]
    padding = [pad_token_index] * (max_length - len(clipped))
    return clipped + padding

In [None]:
def preprocess_data(df, token2index, pad_token_index, unk_token_index, title_max_length=0, desc_max_length=0):
    """Turn a dataframe of news items into padded index tensors.

    The input frame is NOT modified, so the calling cell can be re-run and
    slices returned by train_test_split can be passed in without pandas
    chained-assignment warnings.

    Args:
        df: DataFrame with string columns 'Title' and 'Description' and an
            integer column 'Class Index' holding 1-based class ids.
        token2index: mapping from token to vocabulary index.
        pad_token_index: index used to pad sequences.
        unk_token_index: index used for out-of-vocabulary tokens.
        title_max_length: padded title length; 0 means "compute from `df`".
        desc_max_length: padded description length; 0 means "compute from `df`".

    Returns:
        (features, labels, title_max_length, desc_max_length) where `features`
        is [title_tensor, description_tensor] and `labels` holds 0-based
        class ids.
    """
    print("Splitting & Lowercasing.")
    titles = df['Title'].str.lower().str.split()
    descriptions = df['Description'].str.lower().str.split()

    # Any max length that is not specified (0) is computed from the data.
    # Each one is handled independently so passing only one of them works too.
    if title_max_length == 0 or desc_max_length == 0:
        print("Computing Max Lengths.")
        if title_max_length == 0:
            title_max_length = titles.apply(len).max()
        if desc_max_length == 0:
            desc_max_length = descriptions.apply(len).max()

    # Convert tokens to indices
    print("Transforming tokens into indices.")
    titles = titles.apply(sentence_to_indices, args=(token2index, unk_token_index))
    descriptions = descriptions.apply(sentence_to_indices, args=(token2index, unk_token_index))

    # Pad data
    print("Padding data.")
    titles = titles.apply(pad_to_max_length, args=(pad_token_index, title_max_length))
    descriptions = descriptions.apply(pad_to_max_length, args=(pad_token_index, desc_max_length))

    # Convert to tensor
    print("Splitting features & labels and converting to tensors.")
    features = [torch.as_tensor(titles.tolist()), torch.as_tensor(descriptions.tolist())]
    labels = torch.as_tensor(df['Class Index'].tolist()) - 1 # We want 0-3 not 1-4
    return features, labels, title_max_length, desc_max_length

In [None]:
# Load the raw CSV files; preprocess_data reads the 'Title', 'Description'
# and 'Class Index' columns from them.
train = pd.read_csv(train_datapath)
test = pd.read_csv(test_datapath)

In [None]:
# Hold out 4000 training rows for validation, stratified by class so the
# label distribution is preserved. Note that `train` is rebound to the
# remaining rows.
train, val = train_test_split(
    train, test_size=4000, stratify=train['Class Index'], random_state=seed
)

In [None]:
# Sanity check: number of rows in each split.
len(train), len(val), len(test)

In [None]:
# No max lengths are passed here, so they are computed from the training
# split and then reused for the validation and test splits.
train_features, train_labels, title_max_length, desc_max_length = preprocess_data(
    train, token2index, pad_token_index, unk_token_index
)

In [None]:
# Pad the validation split to the lengths measured on the training split.
# The returned lengths equal the ones passed in, so rebinding them is a no-op
# (previously a misspelled `desc_max_lenght` global was created instead).
val_features, val_labels, title_max_length, desc_max_length = preprocess_data(
    val, token2index, pad_token_index, unk_token_index, title_max_length, desc_max_length
)

In [None]:
# Pad the test split to the lengths measured on the training split.
# The returned lengths equal the ones passed in, so rebinding them is a no-op
# (previously a misspelled `desc_max_lenght` global was created instead).
test_features, test_labels, title_max_length, desc_max_length = preprocess_data(
    test, token2index, pad_token_index, unk_token_index, title_max_length, desc_max_length
)

In [None]:
# Padded sequence lengths used for titles and descriptions.
title_max_length, desc_max_length

In [None]:
# Number of distinct labels seen in the training split; sizes the classifier output.
num_classes = len(train_labels.unique())

In [None]:
# Each dataset item is a (title_indices, description_indices, label) triple.
train_dataset = torch.utils.data.TensorDataset(train_features[0], train_features[1], train_labels)
val_dataset = torch.utils.data.TensorDataset(val_features[0], val_features[1], val_labels)
test_dataset = torch.utils.data.TensorDataset(test_features[0], test_features[1], test_labels)

In [None]:
# Only the training data is shuffled. Evaluation does not benefit from a
# random order, and keeping the validation/test loaders sequential makes
# evaluation deterministic and leaves the global RNG state untouched.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=gpu,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=gpu,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=gpu,
)

# Model Time

In [None]:
# https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
class BiLSTM(nn.Module):
    """Classifier with one bidirectional LSTM per input (title, description).

    Both inputs share a single pretrained embedding table. The final hidden
    states of both directions of both LSTMs are concatenated and fed to a
    linear layer that outputs raw class logits.
    """

    def __init__(self, embeddings, embedding_dim, hidden_dim, num_classes, freeze_embeddings):
        """
        Args:
            embeddings: 2-D tensor of pretrained vectors, one row per token index.
            embedding_dim: input size of the LSTMs (width of an embedding row).
            hidden_dim: hidden size of each LSTM direction.
            num_classes: number of output logits.
            freeze_embeddings: if True the embedding table is not trained.
        """
        super().__init__()
        self.word_embeddings = nn.Embedding.from_pretrained(embeddings, freeze=freeze_embeddings)
        lstm_options = dict(bidirectional=True, batch_first=True)
        self.title_lstm = nn.LSTM(embedding_dim, hidden_dim, **lstm_options)
        self.desc_lstm = nn.LSTM(embedding_dim, hidden_dim, **lstm_options)
        # 2 directions x 2 inputs -> 4 * hidden_dim features
        self.classifier = nn.Linear(4 * hidden_dim, num_classes)

    def forward(self, title, description):
        """Return logits of shape (batch_size, num_classes).

        Args:
            title: (batch_size, title_len) tensor of token indices.
            description: (batch_size, desc_len) tensor of token indices.
        """
        final_states = []
        for tokens, lstm in ((title, self.title_lstm), (description, self.desc_lstm)):
            # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
            embedded = self.word_embeddings(tokens)
            # h_n is (directions, batch_size, hidden_dim); batch_first only
            # affects the input/output sequences, not the hidden state.
            _, (h_n, _) = lstm(embedded)
            final_states.append(h_n)
        # 2 x (directions, batch_size, hidden_dim) -> (directions, batch_size, 2*hidden_dim)
        merged = torch.cat(final_states, dim=2)
        # -> (batch_size, directions, 2*hidden_dim) -> (batch_size, directions*2*hidden_dim)
        flat = merged.transpose(0, 1).flatten(start_dim=1)
        # Raw logits are returned on purpose: nn.CrossEntropyLoss applies
        # LogSoftmax itself, and argmax over logits equals argmax over softmax,
        # so no softmax is needed for inference either.
        return self.classifier(flat)

In [None]:
model = BiLSTM(vectors, embedding_dim, nn_hidden_size, num_classes, freeze_embeddings)
# Cast all parameters to float64 so the LSTM/linear weights match the dtype of
# the pretrained embedding matrix (which is float64 as built in this notebook),
# then move the model to the selected device.
model = model.double().to(device)
# CrossEntropyLoss expects raw logits and integer class ids.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Training

In [None]:
for epoch in range(epochs):
    train_loss = 0
    val_loss = 0

    # Training Loop
    # Explicitly switch to training mode every epoch: validation below (and the
    # evaluation cell) put the model in eval mode, and re-running this cell in
    # that state would otherwise train in eval mode (cuDNN RNNs refuse to run
    # backward in eval mode).
    model.train()
    for titles, descriptions, labels in tqdm(train_dataloader):
        # Move data to device
        titles = titles.to(device)
        descriptions = descriptions.to(device)
        labels = labels.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(titles, descriptions)

        # Calculate loss
        batch_loss = loss_function(predictions, labels)

        # Backward pass
        batch_loss.backward()
        optimizer.step()

        # Update train loss
        train_loss += batch_loss.item()

    # Validation Loop (inference mode, no gradient tracking)
    model.eval()
    with torch.no_grad():
        for titles, descriptions, labels in tqdm(val_dataloader):
            # Move data to device
            titles = titles.to(device)
            descriptions = descriptions.to(device)
            labels = labels.to(device)

            # Forward pass
            predictions = model(titles, descriptions)

            # Calculate loss
            batch_loss = loss_function(predictions, labels)

            # Update validation loss
            val_loss += batch_loss.item()

    # Compute the average per-batch losses for this epoch
    train_loss = train_loss / len(train_dataloader)
    val_loss = val_loss / len(val_dataloader)

    # Print Metrics
    # Two adjacent f-strings instead of a backslash continuation, which would
    # embed the next line's indentation into the printed message.
    print(
        f"Epoch: {epoch+1}/{epochs}, Train Loss = {train_loss}, "
        f"Validation Loss = {val_loss}"
    )

# Evaluation

In [None]:
# Put the model in evaluation (inference) mode before running the test loop.
model.eval()

In [None]:
# Test Loop
# Collect gold and predicted class ids over the whole test set, then map both
# to their category names for the classification report.
y_true, y_pred = [], []
with torch.no_grad():
    for titles, descriptions, labels in tqdm(test_dataloader):
        # Forward pass on the device; the predicted class is the arg-max logit
        logits = model(titles.to(device), descriptions.to(device))
        y_pred.extend(logits.cpu().numpy().argmax(axis=1))

        # Labels are only needed on the CPU for the report
        y_true.extend(labels.numpy())
y_true = indices_to_sentence(y_true, index2tag)
y_pred = indices_to_sentence(y_pred, index2tag)

In [None]:
# Per-class precision/recall/F1 on the test set, reported with 4 decimals.
print(classification_report(y_true, y_pred, digits=4))

# Save

In [None]:
# Persist only the parameters (state_dict), not the whole module object.
# Because the state_dict includes the embedding table, the checkpoint also
# contains the embeddings.
torch.save(model.state_dict(), model_output_path)