## Pytorch Exercise: Augmenting the LSTM part-of-speech tagger with character-level feature
This notebook implements a POS tagger using PyTorch. The model uses both word embeddings and character-level embeddings to predict the POS tags for each word in a sentence. The notebook is buit following the instructions from the PyTorch tutorial: "Sequence Models and Long Short-Term Memory Networks" (https://docs.pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html).

The text for training and validation is the first 4 chapters of "Moby Dick" by Herman Melville. 

In [1]:
import spacy
import numpy as np

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import torch
from torch import optim, nn
import torch.nn.functional as F

print("Cuda is: ", torch.cuda.is_available())


Cuda is:  True


In [28]:
class POSTagger(nn.Module):
    def __init__(self, word_embedding_dim, char_embedding_dim, char_hidden_dim, hidden_dim, vocab_size, num_chars, output_dim):
        super(POSTagger, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, word_embedding_dim)
        self.char_embedding = nn.Embedding(num_chars, char_embedding_dim)

        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim, batch_first=True)
        self.lstm = nn.LSTM(word_embedding_dim + char_embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, char_list):
        word_embedded = self.word_embedding(text)
        word_embedded = word_embedded.unsqueeze(1)

        word_char_embedded = []
        for word in char_list:
            char_embedded = self.char_embedding(word)
            char_embedded = self.char_lstm(char_embedded.unsqueeze(0))[0][:, -1, :]
            word_char_embedded.append(char_embedded)
        word_char_embedded = torch.stack(word_char_embedded, dim=0)

        combined = torch.cat((word_embedded, word_char_embedded), dim=2)
        lstm_out, _ = self.lstm(combined)
        tag_space = self.fc(lstm_out.view(-1, lstm_out.shape[2]))
        output = F.log_softmax(tag_space, dim=1)
        return output

In [None]:
from google.colab import drive
drive.mount('/content/drive')

file_loc = '/content/drive/MyDrive/Colab Notebooks/moby_dick_four_chapters.txt'

with open(file_loc, 'r') as f:
    whole_text = f.read()

import spacy

nlp_en = spacy.load('en_core_web_sm')
doc = nlp_en(whole_text)
all_sents = list(doc.sents)
len(all_sents)

Mounted at /content/drive


453

In [4]:
MAJOR_TAGS = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'AUX', 'PROPN', 'NUM', 'X']

def modify_word(word):
    return word.lower().strip()

def filter_pos_tag(pos_tag):
    return pos_tag if pos_tag in MAJOR_TAGS else 'X'

all_data = [
    (
        [modify_word(token.text) for token in sent if not token.is_punct and not token.is_space],
        [filter_pos_tag(token.pos_) for token in sent if not token.is_punct and not token.is_space]
    )
    for sent in all_sents
]

word_to_ix = {}
for sent, tags in all_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

chars = ''.join(list(word_to_ix.keys()))
chars = list(set(chars))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}

pos_to_ix = {pos: idx for idx, pos in enumerate(MAJOR_TAGS)}

In [5]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


all_data_vec = [
    (
        prepare_sequence(seq, word_to_ix),
        [prepare_sequence(word, char_to_idx) for word in seq],
        prepare_sequence(tags, pos_to_ix) 
    )
    for seq, tags in all_data
]
all_data[0], all_data_vec[0]

((['call', 'me', 'ishmael'], ['VERB', 'PRON', 'PROPN']),
 (tensor([0, 1, 2]),
  [tensor([ 7, 33, 30, 30]),
   tensor([18,  5]),
   tensor([29, 19, 16, 18, 33,  5, 30])],
  tensor([1, 4, 8])))

In [30]:
# Define model parameters

WORD_EMBEDDING_DIM = 16
CHAR_EMBEDDING_DIM = 3
CHAR_HIDDEN_DIM = 3
HIDDEN_DIM = 64
VOCAB_SIZE = len(word_to_ix)
NUM_CHARS = len(char_to_idx)
OUTPUT_DIM = len(MAJOR_TAGS)

train_selection = np.random.choice(len(all_data_vec), size=int(0.8*len(all_data_vec)), replace=False).tolist()
val_selection = [i for i in range(len(all_data_vec)) if i not in train_selection]
training_data = [all_data_vec[i] for i in train_selection]
validation_data = [all_data_vec[i] for i in val_selection]

model = POSTagger(WORD_EMBEDDING_DIM, CHAR_EMBEDDING_DIM, CHAR_HIDDEN_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_CHARS, OUTPUT_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_function = nn.NLLLoss()

print("# params: ", sum(p.numel() for p in model.parameters()))

# params:  66173


In [None]:
# Accuracy calculation
def calculate_accuracy(model, validation_data):
    preds = []
    targets = []
    for text, char, y in validation_data:
        output = model(text, char)
        output = output.argmax(dim=1)
        preds.append(output)
        targets.append(y)

    preds = torch.cat(preds, dim=0).detach().cpu().numpy()
    targets = torch.cat(targets, dim=0).detach().cpu().numpy()
    print(classification_report(targets, preds))
    return accuracy_score(targets, preds)

# Model before training
calculate_accuracy(model, validation_data)

UniqueCountsResult(values=array([2, 5, 6, 9]), counts=array([ 212, 1013,  277,  623]))
UniqueCountsResult(values=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), counts=array([404, 286, 150, 137, 261, 245, 267,  96,  55,  15, 209]))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       404
           1       0.00      0.00      0.00       286
           2       0.07      0.09      0.08       150
           3       0.00      0.00      0.00       137
           4       0.00      0.00      0.00       261
           5       0.03      0.11      0.04       245
           6       0.11      0.11      0.11       267
           7       0.00      0.00      0.00        96
           8       0.00      0.00      0.00        55
           9       0.02      0.67      0.03        15
          10       0.00      0.00      0.00       209

    accuracy                           0.04      2125
   macro avg       0.02      0.09      0.02      2125
weigh

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.03811764705882353

In [43]:
# Training loop
NUM_EPOCHS = 100
losses = []
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for sentence, char_sentence, tags in training_data:
        model.zero_grad()
        tag_scores = model(sentence, char_sentence)
        loss = loss_function(tag_scores, tags.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss:.4f}")

Epoch 10/100, Loss: 745.8057
Epoch 20/100, Loss: 664.5018
Epoch 30/100, Loss: 601.3794
Epoch 40/100, Loss: 543.6922
Epoch 50/100, Loss: 493.3460
Epoch 60/100, Loss: 453.3121
Epoch 70/100, Loss: 419.5597
Epoch 80/100, Loss: 389.6323
Epoch 90/100, Loss: 364.0924
Epoch 100/100, Loss: 341.4937


In [44]:
# Model after training
calculate_accuracy(model, validation_data)

UniqueCountsResult(values=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), counts=array([595, 255,  49,  95, 259, 219, 269, 119,  15,   9, 241]))
UniqueCountsResult(values=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), counts=array([404, 286, 150, 137, 261, 245, 267,  96,  55,  15, 209]))
              precision    recall  f1-score   support

           0       0.47      0.70      0.56       404
           1       0.48      0.43      0.45       286
           2       0.51      0.17      0.25       150
           3       0.44      0.31      0.36       137
           4       0.90      0.89      0.89       261
           5       0.92      0.82      0.87       245
           6       0.87      0.87      0.87       267
           7       0.57      0.71      0.63        96
           8       0.13      0.04      0.06        55
           9       0.89      0.53      0.67        15
          10       0.76      0.88      0.82       209

    accuracy                           0.66      2

0.6583529411764706

#### Further things to do

- Train it further to imporve accuracy.
- Experiment with different model hyperparameters.
- Batch the training data and vectorise steps in the forward funciton to make model faster.