In [None]:
import torch
from torch import nn
import numpy as np

In [None]:
import typing
from typing import List, Tuple, Dict
from tqdm import tqdm

Colab-Specific Settings

In [None]:
from google.colab import drive
!pip install seqeval
drive.mount("/content/drive/")

`seqeval` is imported in its own cell because it isn't installed by default in Colab and has to be pip-installed (previous cell) first.

In [None]:
from seqeval import metrics

# Config

In [None]:
# Project root inside the mounted Google Drive. This is a relative path, so it
# presumably relies on the notebook's working directory — TODO confirm.
colab_prefix = "drive/MyDrive/CMPUT651_DL4NLP/"

# Experiment identity. source_embedding_dim, version and model_type only feed
# experiment_name below; embedding_dim is also the LSTM input size.
source_embedding_dim = 300
version = 1
model_type = "supervised_distilled"
embedding_dim = 50  # must match the width of the vectors stored at embedding_path
nn_hidden_size = 50  # hidden size of each LSTM direction

# Input embeddings and output checkpoint are both named after the experiment.
experiment_name = f"{model_type}_{source_embedding_dim}to{embedding_dim}_v{version}"
embedding_path = colab_prefix + f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt"
model_output_path = colab_prefix + f"models/classifier_glove_{experiment_name}.pt"
# Alternative paths (kept for reference, currently disabled):
# embedding_path = colab_prefix + f"data/embeddings/base/glove.6B.300d.txt"
# model_output_path = colab_prefix + f"models/classifier_hidden10_glove_clipped_300d.pt"

# Training hyper-parameters.
freeze_embeddings = True  # passed to nn.Embedding.from_pretrained(freeze=...)
epochs = 5
batch_size = 32
learning_rate = 1e-3

# Dataset splits.
train_datapath = colab_prefix + "data/datasets/conll_2003/train.txt"
validation_datapath = colab_prefix + "data/datasets/conll_2003/valid.txt"
test_datapath = colab_prefix + "data/datasets/conll_2003/test.txt"

# Special tokens added to the token vocabulary for padding and unknown words.
pad_tag = "<PAD>"
unk_tag = "<UNK>"

In [None]:
# Run on the GPU when one is available; `gpu` is also reused as the
# DataLoaders' pin_memory flag.
gpu = torch.cuda.is_available()
device = torch.device("cuda" if gpu else "cpu")

In [None]:
print(experiment_name)
print(gpu, device)

# Load Embeddings

In [None]:
# Parse the embedding file: each line holds a token followed by its vector
# components, separated by whitespace.
words = []
vectors = []
with open(embedding_path, "r", encoding="utf-8") as fp:
    for raw_line in fp:
        fields = raw_line.split()
        if not fields:
            # Skip blank lines (e.g. a stray empty line at the end of the file)
            # instead of failing with an IndexError on fields[0].
            continue
        words.append(fields[0])
        vectors.append(np.asarray(fields[1:], dtype="float32"))
# Stack the per-word arrays into one float32 matrix; row i belongs to words[i].
vectors = np.asarray(vectors)

Create embeddings for the two special tokens: \<PAD> (all zeros) and \<UNK> (the mean of all loaded embeddings).

In [None]:
# <UNK>: mean over all rows of `vectors`, reshaped to a single row.
unk_embedding = np.mean(vectors, axis=0).reshape(1, -1)
# <PAD>: a single all-zero row of the same width. np.zeros defaults to float64,
# so this row is float64 even though the loaded vectors are float32.
pad_embedding = np.zeros((1, vectors.shape[1]))

In [None]:
# Append the two special rows after the vocabulary rows: <PAD> first, then <UNK>.
# NOTE(review): this order has to agree with pad_token_index / unk_token_index.
# NOTE(review): re-running this cell appends the two rows again — run it once per kernel.
vectors = torch.as_tensor(np.concatenate((vectors, pad_embedding, unk_embedding)))

# Load Data & Preprocess

In [None]:
def load_data_from_file(file_path: str) -> List[List[List[str]]]:
    """Read a CoNLL-style file into sentences of whitespace-split rows.

    Every non-blank line is split on whitespace and becomes one row (a list of
    column strings) of the current sentence. A blank or whitespace-only line,
    or any line containing "-DOCSTART-", closes the current sentence.

    Args:
        file_path: Path to the UTF-8 encoded data file.

    Returns:
        A list of sentences; each sentence is a list of rows; each row is the
        list of columns found on that line. Empty sentences are never emitted.
    """
    sentences = []
    sentence = []
    with open(file_path, "r", encoding="utf-8") as fp:
        for line in fp:
            fields = line.split()
            # Treat whitespace-only lines as separators too; comparing the raw
            # line length to 1 would turn them into empty rows.
            if not fields or "-DOCSTART-" in line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                sentence.append(fields)
    # Flush the last sentence when the file does not end with a separator.
    if sentence:
        sentences.append(sentence)
    return sentences

In [None]:
def get_words_and_tags(
    data: List[List[List[str]]], tag_index: int, lowercase: bool=True
) -> Tuple[List[List[str]], List[List[str]]]:
    """Split parsed sentences into parallel word and tag sequences.

    Column 0 of every row is the word; the tag is read from column
    ``tag_index + 1``.

    tag_index: [(0,POS), (1,Syntactic Chunks), (2, NER)]

    Args:
        data: Sentences as returned by ``load_data_from_file``.
        tag_index: Which tag column to extract (see above).
        lowercase: When True, the returned words are lowercased.

    Returns:
        ``(sentences, tags)``: two lists with one entry per input sentence,
        holding that sentence's words and tags respectively.

    The input ``data`` is left untouched: lowercasing is applied to the
    returned words only, never written back into the caller's rows.
    """
    sentences = []
    tags = []
    for sentence in data:
        sentences.append(
            [item[0].lower() if lowercase else item[0] for item in sentence]
        )
        tags.append([item[tag_index + 1] for item in sentence])
    return sentences, tags

Load Data & Split into Words + Tags

In [None]:
# Each split becomes a list of sentences; every sentence is a list of
# whitespace-split rows (one row per token line in the file).
train = load_data_from_file(train_datapath)
val = load_data_from_file(validation_datapath)
test = load_data_from_file(test_datapath)

In [None]:
len(train), len(val), len(test)

In [None]:
# tag_index=2 selects the NER column (see the get_words_and_tags docstring);
# words are lowercased because `lowercase` defaults to True.
x_train, y_train = get_words_and_tags(train, tag_index=2)
x_val, y_val = get_words_and_tags(val, tag_index=2)
x_test, y_test = get_words_and_tags(test, tag_index=2)

Compute Max Length (for padding; we compute this only from training data)

In [None]:
# Pad to 1.3x the longest training sentence, leaving headroom for potentially
# longer samples in val/test.
# NOTE(review): sequence_to_tensor pads but never truncates, so a val/test
# sentence longer than max_length would come out longer than the other rows —
# worth an explicit assertion.
max_length = round(max([len(sample) for sample in x_train]) * 1.3)

In [None]:
max_length

Set up dictionaries for converting tags to indices, tokens to indices and vice-versa.

In [None]:
# Row i of the embedding matrix belongs to words[i].
token2index = {word: i for i, word in enumerate(words)}
# The <PAD> and <UNK> rows are appended right after the len(words) vocabulary
# rows of the embedding matrix, so their indices are derived from len(words).
# len(token2index) would be too small if the embedding file repeats a token.
pad_token_index = len(words)
unk_token_index = len(words) + 1
token2index[pad_tag] = pad_token_index
token2index[unk_tag] = unk_token_index

# Reverse lookup: index -> token.
index2token = {i: word for word, i in token2index.items()}

In [None]:
# Distinct tags seen in the training data. They are sorted so that every tag
# gets the same index on every run: the iteration order of a set of strings
# can change between interpreter sessions, which would silently change the
# meaning of the classifier's output units (and of any saved weights).
tags = sorted({tag for sentence_tags in y_train for tag in sentence_tags})
tag2index = {tag: i for i, tag in enumerate(tags)}
index2tag = {i: tag for tag, i in tag2index.items()}

# Unknown tags and padding positions are both labelled with the "O" tag.
unk_tag_index = tag2index["O"]
pad_tag_index = tag2index["O"]

Prepare sentences for the model

In [None]:
def sequence_to_tensor(
    sequence: List[str], sequence_to_index: Dict[str, int], vocab: List[str], unk: int, pad_to: int, padding_token: int
) -> List[int]:
    """Encode a sequence as a list of indices, right-padded to ``pad_to``.

    Args:
        sequence: Items (words or tags) to encode.
        sequence_to_index: Mapping from item to index.
        vocab: Kept for interface compatibility. Membership is decided by
            ``sequence_to_index`` itself: a hash lookup instead of a linear
            scan over ``vocab`` for every item, which also avoids a KeyError
            when ``vocab`` lists an item that the mapping does not contain.
        unk: Index used for items missing from ``sequence_to_index``.
        pad_to: Target length of the returned list.
        padding_token: Index appended until the list reaches ``pad_to``.

    Returns:
        A plain Python list of ints (not a tensor). Sequences already longer
        than ``pad_to`` are returned unpadded and untruncated.
    """
    indices = [sequence_to_index.get(item, unk) for item in sequence]
    # A negative multiplier yields an empty list, so long inputs get no padding.
    padding = [padding_token] * (pad_to - len(indices))
    return indices + padding

In [None]:
def tensor_to_sequence(
    tensor: torch.Tensor, index_to_sequence: Dict[int, str]
) -> List[str]:
    """Decode a 1-D sequence of indices back into labels.

    Each element is cast with ``int()`` before the lookup, so tensor or numpy
    scalars work as well as plain ints. Raises KeyError for an index that is
    not a key of ``index_to_sequence``.
    """
    decoded = []
    for raw_index in tensor:
        decoded.append(index_to_sequence[int(raw_index)])
    return decoded

In [None]:
# Encode the training sentences as rows of token indices, padded to max_length.
x_train = torch.as_tensor(
    [
        sequence_to_tensor(
            sequence=sentence,
            sequence_to_index=token2index,
            vocab=words,
            unk=unk_token_index,
            pad_to=max_length,
            padding_token=pad_token_index,
        )
        for sentence in x_train
    ]
)

# Encode the matching tag sequences the same way, padded with pad_tag_index.
y_train = torch.as_tensor(
    [
        sequence_to_tensor(
            sequence=sentence_tags,
            sequence_to_index=tag2index,
            vocab=tags,
            unk=unk_tag_index,
            pad_to=max_length,
            padding_token=pad_tag_index,
        )
        for sentence_tags in y_train
    ]
)

In [None]:
# Encode the validation sentences as rows of token indices, padded to max_length.
x_val = torch.as_tensor(
    [
        sequence_to_tensor(
            sequence=sentence,
            sequence_to_index=token2index,
            vocab=words,
            unk=unk_token_index,
            pad_to=max_length,
            padding_token=pad_token_index,
        )
        for sentence in x_val
    ]
)

# The loop variable must not be called `tags`: inside the comprehension that
# name would shadow the global tag vocabulary, so `vocab` would become the
# sentence's own tags and the unknown-tag fallback would never trigger.
y_val = torch.as_tensor(
    [
        sequence_to_tensor(
            sequence=sentence_tags,
            sequence_to_index=tag2index,
            vocab=tags,
            unk=unk_tag_index,
            pad_to=max_length,
            padding_token=pad_tag_index,
        )
        for sentence_tags in y_val
    ]
)

In [None]:
# Encode the test sentences as rows of token indices, padded to max_length.
x_test = torch.as_tensor(
    [
        sequence_to_tensor(
            sequence=sentence,
            sequence_to_index=token2index,
            vocab=words,
            unk=unk_token_index,
            pad_to=max_length,
            padding_token=pad_token_index,
        )
        for sentence in x_test
    ]
)

# The loop variable must not be called `tags`: inside the comprehension that
# name would shadow the global tag vocabulary, so `vocab` would become the
# sentence's own tags and the unknown-tag fallback would never trigger.
y_test = torch.as_tensor(
    [
        sequence_to_tensor(
            sequence=sentence_tags,
            sequence_to_index=tag2index,
            vocab=tags,
            unk=unk_tag_index,
            pad_to=max_length,
            padding_token=pad_tag_index,
        )
        for sentence_tags in y_test
    ]
)

In [None]:
# Pair each encoded sentence with its encoded tag sequence, one dataset per split.
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

In [None]:
# Only the training batches are shuffled. Validation and test data are read in
# a fixed order so that evaluation runs are repeatable; shuffling them brings
# no benefit because no gradient step depends on their order.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=gpu,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=gpu,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=gpu,
)

# Model Time

In [None]:
# Reference: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
class BiLSTM(nn.Module):
    """Bidirectional LSTM tagger on top of pretrained word embeddings.

    Args:
        embeddings: Pretrained embedding matrix handed to
            ``nn.Embedding.from_pretrained``.
        embedding_dim: Input size of the LSTM (width of one embedding row).
        hidden_dim: Hidden size of each LSTM direction.
        tagset_size: Number of output classes per token.
        freeze_embeddings: Whether the embedding weights stay fixed in training.
    """

    def __init__(self, embeddings, embedding_dim, hidden_dim, tagset_size, freeze_embeddings):
        super().__init__()
        self.word_embeddings = nn.Embedding.from_pretrained(
            embeddings, freeze=freeze_embeddings
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence 2 * hidden_dim input features.
        self.tag_classifier = nn.Linear(2 * hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return raw per-token logits of shape (batch_size, seq_len, tagset_size).

        No softmax is applied: CrossEntropyLoss already combines LogSoftmax
        with NLLLoss, and at inference time the argmax of the logits equals the
        argmax of the softmax, so confidences are not needed.
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        token_vectors = self.word_embeddings(sentence)
        # -> (batch_size, seq_len, 2 * hidden_dim); final hidden/cell states are unused
        contextual, _ = self.lstm(token_vectors)
        # -> (batch_size, seq_len, tagset_size)
        return self.tag_classifier(contextual)

In [None]:
model = BiLSTM(vectors, embedding_dim, nn_hidden_size, len(tag2index), freeze_embeddings)
# Cast every parameter to float64 so the LSTM and classifier weights share one
# dtype with the embedding matrix.
# NOTE(review): `vectors` appears to be float64 at this point because the float32
# rows were concatenated with a float64 np.zeros row — confirm. Casting
# everything to float32 instead would likely be cheaper.
model.double()
model.to(device)
loss_function = nn.CrossEntropyLoss()  # takes the model's raw logits (no softmax layer)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training

In [None]:
for epoch in range(epochs):
    train_loss = 0
    val_loss = 0

    # Training pass. The mode is set explicitly on every epoch so that this
    # cell still works when re-run after the model was switched to eval mode
    # (cuDNN RNN backward is only available in training mode).
    model.train()
    for sentences, batch_tags in tqdm(train_dataloader):
        # Move data to device. The targets are named `batch_tags` so the
        # global `tags` vocabulary used by the preprocessing cells survives.
        sentences = sentences.to(device)
        batch_tags = batch_tags.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass: (batch_size, seq_len, tagset_size) logits
        predictions = model(sentences)

        # The loss expects the class dimension second, hence the transpose to
        # (batch_size, tagset_size, seq_len). Padded positions are included.
        batch_loss = loss_function(predictions.transpose(1, 2), batch_tags)

        # Backward pass
        batch_loss.backward()
        optimizer.step()

        # Update train loss
        train_loss += batch_loss.item()

    # Validation pass: inference mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        for sentences, batch_tags in tqdm(val_dataloader):
            # Move data to device
            sentences = sentences.to(device)
            batch_tags = batch_tags.to(device)

            # Forward pass
            predictions = model(sentences)

            # Calculate loss
            batch_loss = loss_function(predictions.transpose(1, 2), batch_tags)

            # Update validation loss
            val_loss += batch_loss.item()

    # Average the per-batch losses for this epoch
    train_loss = train_loss / len(train_dataloader)
    val_loss = val_loss / len(val_dataloader)

    # Print metrics. Adjacent f-strings are used instead of a backslash
    # continuation, which would embed the next line's indentation in the output.
    print(
        f"Epoch: {epoch+1}/{epochs}, Train Loss = {train_loss}, "
        f"Validation Loss = {val_loss}"
    )

# Evaluation

In [None]:
# Switch the model to evaluation mode before scoring the test set.
model.eval()

In [None]:
# Test loop: collect gold and predicted tag sequences for seqeval.
model.eval()  # idempotent; keeps this cell correct even when run on its own
y_true = []
y_pred = []
with torch.no_grad():
    for sentences, gold_tags in tqdm(test_dataloader):
        # Number of real tokens per sentence. Padding is only ever appended at
        # the end of a row, so counting non-pad positions gives the length.
        lengths = (sentences != pad_token_index).sum(dim=1).tolist()

        # Forward pass, then pick the highest-scoring tag for every position
        predicted_tags = model(sentences.to(device)).argmax(dim=2).cpu()

        # Drop the padded tail of every row so that padding positions are not
        # scored as if they were real tokens.
        for gold_row, predicted_row, length in zip(gold_tags, predicted_tags, lengths):
            y_true.append(tensor_to_sequence(gold_row[:length], index2tag))
            y_pred.append(tensor_to_sequence(predicted_row[:length], index2tag))

In [None]:
print(metrics.classification_report(y_true, y_pred, digits=2))

# Save

In [None]:
# Persist only the weights (state_dict). The tag/index and token/index mappings
# are not stored alongside them, so they must be rebuilt identically to reuse
# this checkpoint.
torch.save(model.state_dict(), model_output_path)