## Pytorch Exercise: Augmenting the LSTM part-of-speech tagger with character-level feature
This notebook implements a POS tagger using PyTorch. The model uses both word embeddings and character-level embeddings to predict the POS tags for each word in a sentence. The notebook is buit following the instructions from the PyTorch tutorial: "Sequence Models and Long Short-Term Memory Networks" (https://docs.pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html).

The text for training and validation is the first 4 chapters of "Moby Dick" by Herman Melville. 

In [1]:
import spacy
import numpy as np

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import torch
from torch import optim, nn
import torch.nn.functional as F

print("Cuda is: ", torch.cuda.is_available())

Cuda is:  True


In [2]:
from google.colab import drive
drive.mount('/content/drive')

file_loc = '/content/drive/MyDrive/Colab Notebooks/moby_dick_four_chapters.txt'

with open(file_loc, 'r') as f:
    whole_text = f.read()

nlp_en = spacy.load('en_core_web_sm')
doc = nlp_en(whole_text)
all_sents = list(doc.sents)
len(all_sents)

Mounted at /content/drive


453

In [3]:
MAJOR_TAGS = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'AUX', 'PROPN', 'NUM', 'X']

def modify_word(word):
    return word.lower().strip()

def filter_pos_tag(pos_tag):
    return pos_tag if pos_tag in MAJOR_TAGS else 'X'

all_data = [
    (
        [modify_word(token.text) for token in sent if not token.is_punct and not token.is_space],
        [filter_pos_tag(token.pos_) for token in sent if not token.is_punct and not token.is_space]
    )
    for sent in all_sents
]

word_to_ix = {}
for sent, tags in all_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

chars = ''.join(list(word_to_ix.keys()))
chars = list(set(chars))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}

pos_to_ix = {pos: idx for idx, pos in enumerate(MAJOR_TAGS)}

In [85]:
class POSTagger(nn.Module):
    def __init__(self, word_embedding_dim, char_embedding_dim, char_hidden_dim, word_hidden_dim, vocab_size, num_chars, output_dim):
        super(POSTagger, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, word_embedding_dim)
        self.char_embedding = nn.Embedding(num_chars, char_embedding_dim)

        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim, batch_first=True)
        self.lstm = nn.LSTM(word_embedding_dim + char_hidden_dim, word_hidden_dim, batch_first=True)
        self.fc = nn.Linear(word_hidden_dim, output_dim)

    def forward(self, words, word_lengths, padded_chars):
        words_embedded = self.word_embedding(words)
        words_embedded = words_embedded.unsqueeze(1)

        # Embed characters (padding_idx will produce zero embeddings)
        char_embedded = self.char_embedding(padded_chars)  # (num_words, max_char_len, char_embedding_dim)
    
        # Get lengths for packing
        packed_chars = nn.utils.rnn.pack_padded_sequence(
            char_embedded, word_lengths, batch_first=True, enforce_sorted=False
        )
    
        packed_out, _ = self.char_lstm(packed_chars)
        unpacked_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
    
        # Extract last hidden state for each word
        word_char_embedded = unpacked_out[:, (-1,), :]  # (num_words, -1, char_hidden_dim)

        combined = torch.cat((words_embedded, word_char_embedded), dim=2)
        lstm_out, _ = self.lstm(combined)
        tag_space = self.fc(lstm_out.view(-1, lstm_out.shape[2]))
        output = F.log_softmax(tag_space, dim=1)
        return output

In [86]:
# Define model parameters

WORD_EMBEDDING_DIM = 8
CHAR_EMBEDDING_DIM = 3
CHAR_HIDDEN_DIM = 3
HIDDEN_DIM = 32
VOCAB_SIZE = len(word_to_ix)
print("Vocab size: ", VOCAB_SIZE)
NUM_CHARS = len(char_to_idx)
print("Num chars: ", NUM_CHARS)
OUTPUT_DIM = len(MAJOR_TAGS)

Vocab size:  2718
Num chars:  38


In [87]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


all_data_vec = [
    (
        prepare_sequence(seq, word_to_ix),
        [len(word) for word in seq],  # Get lengths for packing
        nn.utils.rnn.pad_sequence([prepare_sequence(word, char_to_idx) for word in seq], 
                                  batch_first=True,
                                  padding_value=NUM_CHARS),  # Pad with vocab size index which is out of range for chars
        prepare_sequence(tags, pos_to_ix) 
    )
    for seq, tags in all_data
]
print(all_data[0], all_data_vec[0])

train_selection = np.random.choice(len(all_data_vec), size=int(0.8*len(all_data_vec)), replace=False).tolist()
val_selection = [i for i in range(len(all_data_vec)) if i not in train_selection]
training_data = [all_data_vec[i] for i in train_selection]
validation_data = [all_data_vec[i] for i in val_selection]

(['call', 'me', 'ishmael'], ['VERB', 'PRON', 'PROPN']) (tensor([0, 1, 2]), [4, 2, 7], tensor([[34, 24,  7,  7, 38, 38, 38],
        [35,  8, 38, 38, 38, 38, 38],
        [12, 20, 16, 35, 24,  8,  7]]), tensor([1, 4, 8]))


In [88]:
model = POSTagger(WORD_EMBEDDING_DIM, CHAR_EMBEDDING_DIM, CHAR_HIDDEN_DIM, HIDDEN_DIM, 
                  VOCAB_SIZE, 
                  NUM_CHARS + 1, # Add 1 for padding index
                  OUTPUT_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_function = nn.NLLLoss()

print("# params in the model: ", sum(p.numel() for p in model.parameters()))

# params in the model:  28080


In [None]:
# Accuracy calculation
def calculate_accuracy(model, validation_data, print_report=True):
    preds = []
    targets = []
    for words, word_lengths, padded_chars, y in validation_data:
        output = model(words, word_lengths, padded_chars)
        output = output.argmax(dim=1)
        preds.append(output)
        targets.append(y)

    preds = torch.cat(preds, dim=0).detach().cpu().numpy()
    targets = torch.cat(targets, dim=0).detach().cpu().numpy()
    
    if print_report:
        print(classification_report(targets, preds))
    return accuracy_score(targets, preds)

# Model before training
calculate_accuracy(model, validation_data)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       455
           1       0.17      0.13      0.15       328
           2       0.00      0.00      0.00       188
           3       0.00      0.00      0.00       139
           4       0.00      0.00      0.00       319
           5       0.12      0.96      0.21       275
           6       0.00      0.00      0.00       294
           7       0.00      0.00      0.00       136
           8       0.00      0.00      0.00        60
           9       0.00      0.00      0.00        20
          10       0.00      0.00      0.00       255

    accuracy                           0.12      2469
   macro avg       0.03      0.10      0.03      2469
weighted avg       0.04      0.12      0.04      2469



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.12434183880113406

In [94]:
# Training loop
NUM_EPOCHS = 100
losses = []
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for words, word_lengths, padded_chars, tags in training_data:
        model.zero_grad()
        tag_scores = model(words, word_lengths, padded_chars)
        loss = loss_function(tag_scores, tags.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss:.4f}")

Epoch 10/100, Loss: 430.6575
Epoch 20/100, Loss: 414.8810
Epoch 30/100, Loss: 400.4139
Epoch 40/100, Loss: 386.9662
Epoch 50/100, Loss: 374.4065
Epoch 60/100, Loss: 362.7232
Epoch 70/100, Loss: 351.8608
Epoch 80/100, Loss: 341.7567
Epoch 90/100, Loss: 332.3723
Epoch 100/100, Loss: 323.6613


In [96]:
# Model after training
calculate_accuracy(model, validation_data)

              precision    recall  f1-score   support

           0       0.43      0.67      0.52       455
           1       0.35      0.31      0.33       328
           2       0.42      0.05      0.09       188
           3       0.47      0.35      0.40       139
           4       0.94      0.86      0.90       319
           5       0.91      0.91      0.91       275
           6       0.86      0.85      0.86       294
           7       0.76      0.76      0.76       136
           8       0.19      0.08      0.11        60
           9       1.00      0.60      0.75        20
          10       0.74      0.87      0.80       255

    accuracy                           0.64      2469
   macro avg       0.64      0.58      0.59      2469
weighted avg       0.64      0.64      0.62      2469



0.6411502632644795

#### Further things to do

- Train it further to imporve accuracy.
- Experiment with different model hyperparameters.
- Batch the training data and vectorise steps in the forward funciton to make model faster.