In [1]:
import torch
from torch import nn
import numpy as np

In [2]:
import typing
from typing import List, Tuple, Dict

# Config

In [44]:
# --- Reproducibility ---
# Seed NumPy and PyTorch up front so that shuffling and weight initialisation
# are repeatable across "Restart Kernel -> Run All".
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# --- Embeddings / model ---
embedding_path = "data/embeddings/base/glove.6B.300d.txt"
embedding_dim = 300  # must match the vector size stored in embedding_path
nn_hidden_size = 50  # hidden size handed to the LSTM

# --- Optimisation ---
epochs = 5
batch_size = 32
learning_rate = 1e-3

# --- Dataset locations ---
train_datapath = "data/datasets/conll_2003/train.txt"
validation_datapath = "data/datasets/conll_2003/valid.txt"
test_datapath = "data/datasets/conll_2003/test.txt"

# --- Special vocabulary entries ---
# Despite the "_tag" suffix these are the padding / unknown *token* strings
# that get added to the token vocabulary.
pad_tag = "<PAD>"
unk_tag = "<UNK>"

In [40]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
gpu = torch.cuda.is_available()
if gpu:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Embeddings

In [4]:
# Parse the GloVe text file: every line is "<token> <v1> ... <v_embedding_dim>".
# `words[i]` is the token whose vector is row i of `vectors`.
words = []
vectors = []
with open(embedding_path, "r", encoding="utf-8") as fp:
    for raw_line in fp:
        fields = raw_line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at the end of the file).
            continue
        # The vector is always the LAST `embedding_dim` fields; whatever comes
        # before it is the token. This keeps tokens that themselves contain
        # whitespace from leaking into the float conversion below.
        word = " ".join(fields[:-embedding_dim])
        vector = np.asarray(fields[-embedding_dim:], dtype='float32')
        words.append(word)
        vectors.append(vector)
# Stack the per-token vectors into a single (num_tokens, embedding_dim) float32 matrix.
vectors = np.asarray(vectors)

# Load Data & Preprocess

In [5]:
def load_data_from_file(file_path: str) -> List[List[List[str]]]:
    """
    Read a CoNLL-style file and group its rows into sentences.

    Each non-blank line is split on whitespace into a list of columns. Blank
    (or whitespace-only) lines and lines containing "-DOCSTART-" end the
    current sentence and are not included in the output themselves.

    file_path: path of the UTF-8 text file to read.
    returns: a list of sentences; each sentence is a list of rows; each row is
             the list of whitespace-separated columns of one line.
    """
    sentences = []
    sentence = []
    with open(file_path, "r", encoding="utf-8") as fp:
        for line in fp:
            columns = line.split()
            if columns and "-DOCSTART-" not in line:
                sentence.append(columns)
                continue
            # Sentence boundary: blank/whitespace-only line or a document marker.
            # Checking the split result (instead of len(line) == 1) also copes
            # with whitespace-only lines and a final line without a newline.
            if sentence:
                sentences.append(sentence)
                sentence = []
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

In [6]:
def get_words_and_tags(
    data: List[List[List[str]]], tag_index: int, lowercase: bool=True
) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Split sentences of annotated rows into parallel token and tag sequences.

    data: sentences, each a list of rows; column 0 of a row is the token and
          the remaining columns are its annotations.
    tag_index: [(0,POS), (1,Syntactic Chunks), (2, NER)]
    lowercase: lowercase the returned tokens when True.

    returns: (sentences, tags) where sentences[i][j] is the j-th token of
             sentence i and tags[i][j] is its tag from column tag_index + 1.

    The input `data` is left untouched, so the function can be called again on
    the same data with different arguments.
    """
    sentences = []
    tags = []
    for sentence in data:
        # Build new lists rather than writing the lowercased token back into
        # the row, which would silently modify the caller's data.
        tokens = [item[0].lower() if lowercase else item[0] for item in sentence]
        labels = [item[tag_index + 1] for item in sentence]
        sentences.append(tokens)
        tags.append(labels)
    return sentences, tags

Load Data & Split into Words + Tags

In [7]:
# Read the three CoNLL-2003 splits in one pass over their configured paths.
train, val, test = (
    load_data_from_file(split_path)
    for split_path in (train_datapath, validation_datapath, test_datapath)
)

In [8]:
# Number of sentences in each split (train, val, test).
tuple(len(split) for split in (train, val, test))

(14041, 3250, 3453)

In [27]:
# tag_index=2 selects the NER column; tokens are lowercased by default.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    get_words_and_tags(split, tag_index=2) for split in (train, val, test)
)

Compute Max Length (for padding; we compute this only from training data)

In [10]:
# Pad to 1.3x the longest training sentence so that somewhat longer
# validation/test sentences still fit.
max_length = round(1.3 * max(map(len, x_train)))

In [11]:
# Show the padded sequence length used for every sentence.
max_length

147

Create an embedding for both special tokens: \<PAD> (all 0s) and \<UNK> (the average of all embeddings).

In [12]:
# <UNK> is represented by the mean of all pretrained vectors, as a (1, dim) row.
unk_embedding = np.mean(vectors, axis=0).reshape(1, -1)
# <PAD> is an all-zero row. np.zeros defaults to float64, which would upcast the
# whole float32 embedding matrix when the rows are concatenated, so the dtype
# is pinned to that of the pretrained vectors.
pad_embedding = np.zeros((1, vectors.shape[1]), dtype=vectors.dtype)

In [13]:
# Append the <PAD> row and then the <UNK> row after the pretrained vectors.
# The result is forced to float32 so the embedding weights match the default
# dtype of the model's other parameters.
vectors = torch.as_tensor(
    np.concatenate((vectors, pad_embedding, unk_embedding)), dtype=torch.float32
)

Set up dictionaries for converting tags to indices, tokens to indices and vice-versa.

In [14]:
# Row i of the embedding matrix belongs to words[i]; the <PAD> and <UNK> rows
# are appended directly after the pretrained vectors, in that order, so their
# indices are len(words) and len(words) + 1 (indices are 0-based).
token2index = {word: i for i, word in enumerate(words)}
pad_token_index = len(words)
unk_token_index = len(words) + 1
token2index[pad_tag] = pad_token_index
token2index[unk_tag] = unk_token_index

# Reverse lookup: embedding row index -> token string.
index2token = {i: word for word, i in token2index.items()}

In [15]:
# Collect the distinct tags seen in the training labels. They are sorted because
# iterating a set of strings has no stable order between kernel restarts, which
# would give every run a different tag -> index mapping.
tags = sorted({tag for sentence_tags in y_train for tag in sentence_tags})
tag2index = {tag: i for i, tag in enumerate(tags)}
index2tag = {i: tag for tag, i in tag2index.items()}

# Unknown tags and padded positions are both labelled with the "O" tag.
unk_tag_index = tag2index["O"]
pad_tag_index = tag2index["O"]

Prepare sentences for the model

In [24]:
def sequence_to_tensor(
    sequence: List[str], sequence_to_index: Dict[str, int], vocab: List[str], unk: int, pad_to: int, padding_token: int
) -> List[int]:
    """
    Convert a sequence of strings into a fixed-length list of indices.

    sequence: the items (tokens or tags) to encode.
    sequence_to_index: mapping from item to index; items missing from it are
                       encoded as `unk`.
    vocab: kept for backward compatibility only. Membership is decided by
           `sequence_to_index`, because testing `item in vocab` against a large
           list is a linear scan per item.
    unk: index used for items that are not in `sequence_to_index`.
    pad_to: exact length of the returned list; longer sequences are truncated,
            shorter ones are right-padded.
    padding_token: index used for padding positions.

    returns: a plain Python list of `pad_to` ints (not a tensor), so that a
             list of results can be stacked with `torch.as_tensor`.
    """
    # Truncate first so every result has exactly pad_to entries.
    indices = [sequence_to_index.get(item, unk) for item in sequence[:pad_to]]
    indices.extend([padding_token] * (pad_to - len(indices)))
    return indices

In [25]:
def tensor_to_sequence(
    tensor: torch.Tensor, index_to_sequence: Dict[int, str]
) -> List[str]:
    """
    Map every index in `tensor` back to its string via `index_to_sequence`.

    Each element is converted with int() before the lookup, so any iterable of
    int-convertible values works. Raises KeyError for an index that is not in
    the mapping.
    """
    decoded = []
    for index in tensor:
        decoded.append(index_to_sequence[int(index)])
    return decoded

In [28]:
# Encode the training tokens as padded rows of embedding indices.
x_train = torch.as_tensor(
    [
        sequence_to_tensor(tokens, token2index, words, unk_token_index, max_length, pad_token_index)
        for tokens in x_train
    ]
)

# Encode the training tags the same way, using the tag vocabulary.
y_train = torch.as_tensor(
    [
        sequence_to_tensor(labels, tag2index, tags, unk_tag_index, max_length, pad_tag_index)
        for labels in y_train
    ]
)

In [32]:
# Encode the validation tokens as padded rows of embedding indices.
x_val = torch.as_tensor([sequence_to_tensor(sentence, token2index, words, unk_token_index, max_length, pad_token_index) 
                         for sentence in x_val])

# The loop variable must not be called `tags`: inside the comprehension that
# name would shadow the global tag vocabulary, and each sentence's own tags
# would be passed as the vocabulary argument.
y_val = torch.as_tensor([sequence_to_tensor(sentence_tags, tag2index, tags, unk_tag_index, max_length, pad_tag_index) 
                         for sentence_tags in y_val])

In [33]:
# Encode the test tokens as padded rows of embedding indices.
x_test = torch.as_tensor([sequence_to_tensor(sentence, token2index, words, unk_token_index, max_length, pad_token_index) 
                          for sentence in x_test])

# The loop variable must not be called `tags`: inside the comprehension that
# name would shadow the global tag vocabulary, and each sentence's own tags
# would be passed as the vocabulary argument.
y_test = torch.as_tensor([sequence_to_tensor(sentence_tags, tag2index, tags, unk_tag_index, max_length, pad_tag_index) 
                          for sentence_tags in y_test])

In [35]:
# Pair each split's encoded tokens with its encoded tags.
train_dataset, val_dataset, test_dataset = (
    torch.utils.data.TensorDataset(inputs, targets)
    for inputs, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [42]:
# Only the training data is shuffled. Validation and test batches keep the
# dataset order so evaluation runs are repeatable and predictions can be lined
# up with the original sentences.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu, 
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=gpu, 
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=gpu, 
)

# Model Time

In [43]:
# https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
class BiLSTM(nn.Module):
    """
    Bidirectional LSTM sequence tagger on top of pretrained word embeddings.

    embeddings: float tensor of shape (vocab_size, embedding_dim) holding the
                pretrained vectors; loaded with nn.Embedding.from_pretrained.
    embedding_dim: size of one embedding vector (the LSTM input size).
    hidden_dim: hidden size of each LSTM direction.
    tagset_size: number of output classes per token.
    """

    def __init__(self, embeddings, embedding_dim, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding.from_pretrained(embeddings)
        # batch_first=True: inputs and outputs are (batch, seq, features),
        # matching the (batch, seq) index batches a DataLoader yields.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        # A bidirectional LSTM concatenates the forward and backward states,
        # so the classifier receives 2 * hidden_dim features per token.
        self.tag_classifier = nn.Linear(2 * hidden_dim, tagset_size)

    def forward(self, sentence):
        """
        Compute per-token tag scores.

        sentence: integer index tensor of shape (seq,) for one sentence or
                  (batch, seq) for a batch.
        returns: unnormalised scores of shape (seq, tagset_size) for a single
                 sentence, or (batch, seq, tagset_size) for a batch.
        """
        is_single = sentence.dim() == 1
        if is_single:
            # Treat a lone sentence as a batch of one.
            sentence = sentence.unsqueeze(0)
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        tag_space = self.tag_classifier(lstm_out)
        return tag_space.squeeze(0) if is_single else tag_space

In [53]:
# Build the tagger from the pretrained embedding matrix, then pair it with a
# cross-entropy loss and an Adam optimizer over all model parameters.
model = BiLSTM(
    embeddings=vectors,
    embedding_dim=embedding_dim,
    hidden_dim=nn_hidden_size,
    tagset_size=len(tag2index),
)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Training

# Evaluation

# Save