# Task
This assignment starts out with a working PoS tagger based on LSTMs, an untrained word embedding, and a softmax output over PoS tags. The task is to improve the given model and its evaluation.

# Imports

In [None]:
# Our standard imports for maths and basic methodology
import numpy as np
from sklearn.model_selection import train_test_split

# For user feedback
from tqdm import tqdm
import matplotlib.pyplot as plt

# Imports for pytorch
import torch
import torch.nn as nn

In [None]:
# Report which CUDA devices (if any) PyTorch can see.
if not torch.cuda.is_available():
  print("No GPU available")
else:
  for device_index in range(torch.cuda.device_count()):
    device_name = torch.cuda.get_device_name(device_index)
    print(device_name)

Tesla K80


In [None]:
!git clone https://github.com/UniversalDependencies/UD_English-GUM.git
!git clone https://github.com/UniversalDependencies/UD_German-GSD.git
!git clone https://github.com/UniversalDependencies/UD_Swedish-LinES.git

fatal: destination path 'UD_English-GUM' already exists and is not an empty directory.
fatal: destination path 'UD_German-GSD' already exists and is not an empty directory.
fatal: destination path 'UD_Swedish-LinES' already exists and is not an empty directory.


In [None]:
!pip install conllu

import conllu



For the extensions part, I decided to implement the GRU model, bidirectionality, and to use both the UD's universal and language-specific tag sets. All three of these are specified as parameters when instantiating an object of the model class. I also added the option of masking some of the tokens in the training data, using weight decay, and dropout.

In [None]:
class LSTMTagger(nn.Module):
  """PoS tagger: word embedding -> LSTM or GRU -> linear layer -> log-softmax.

  The constructor parses the given CoNLL-U training and test files, builds the
  token and tag vocabularies from the training data and creates the layers.

  Args:
    train_file: path to the CoNLL-U training file.
    test_file: path to the CoNLL-U test file.
    word_embedding_dim: dimensionality of the word embedding.
    hidden_dim: hidden size of the recurrent layer (per direction).
    lstm: use an LSTM if true, otherwise a GRU.
    bidir: make the recurrent layer bidirectional.
    universal: use the "upos" column if true, otherwise the "xpos" column.
    mask: replace some training tokens by '<MASK>' (see mask_tokens).
    decay: pass a weight_decay value to the optimizer in fitting.
    drop: apply dropout (p=0.25) to the recurrent output.
  """

  def __init__(self,  train_file,  test_file, word_embedding_dim, hidden_dim, lstm=True, bidir=False, universal=True, mask=False, decay=False, drop=False):
    super().__init__()                                                          # nn.Module must be initialised before attributes are set
    self.lstm = lstm                                                            # Store the configuration flags
    self.bidir = bidir
    self.universal = universal
    self.mask = mask
    self.decay = decay
    self.drop = drop
    self.train_file = train_file
    self.test_file = test_file
    self.preprocessing()                                                        # Sets X_train/y_train/X_test/y_test, token2idx and tag2idx
    self.hidden_dim_ = hidden_dim
    self.vocabulary_size = len(self.token2idx)
    self.tagset_size = len(self.tag2idx)
    self._word_embedding = nn.Embedding(num_embeddings=self.vocabulary_size,    # Vector space for the input words
                                        embedding_dim=word_embedding_dim,
                                        padding_idx=self.token2idx['<PAD>'])

    rnn_class = nn.LSTM if self.lstm else nn.GRU                                # Both layer types take the same constructor arguments
    recurrent = rnn_class(input_size=word_embedding_dim,
                          hidden_size=hidden_dim,
                          batch_first=True, bidirectional=bool(self.bidir))
    if self.lstm:                                                               # Attribute names are kept so existing state dicts still match
      self._lstm = recurrent
    else:
      self._gru = recurrent

    if self.drop:
      self.dropout = nn.Dropout(0.25)

    fc_inputs = hidden_dim * 2 if self.bidir else hidden_dim                    # A bidirectional RNN concatenates both directions
    self._fc = nn.Linear(fc_inputs, self.tagset_size)                           # Maps from the RNN output space to tag space
    self._softmax = nn.LogSoftmax(dim=1)                                        # Applied to the flattened (tokens, tags) matrix in forward

    self.training_loss = list()                                                 # Per-batch history, for plotting
    self.training_accuracy = list()
    self.batch_size = 256

    if torch.cuda.is_available():                                               # Move the model to the GPU (if we have one)
      self.cuda()

  def _device(self):
    """Return the device of the model parameters.

    Falls back to "cuda if available, else cpu" when the module has no
    parameters yet.
    """
    first_parameter = next(self.parameters(), None)
    if first_parameter is not None:
      return first_parameter.device
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

  def parsefile(self, file_path):
    """Parse a CoNLL-U file into token sequences and tag sequences.

    Sentences with two or fewer tokens are skipped.

    Args:
      file_path: path to a UTF-8 encoded CoNLL-U file.

    Returns:
      (X, y): X is a list of sentences (lists of word forms), y is the parallel
      list of tag lists taken from "upos" or "xpos" depending on self.universal.
    """
    pos_type = "upos" if self.universal else "xpos"                             # upos -> universal tag, xpos -> language-specific tag

    with open(file_path, mode="r", encoding="utf-8") as data:                   # The context manager closes the file handle
      annotations = data.read()
    sentences = conllu.parse(annotations)

    X = []
    y = []
    for sentence in sentences:
      words = [element["form"] for element in sentence]                         # "form" is the surface word form (not the lemma)
      if len(words) > 2:
        X.append(words)
        y.append([element[pos_type] for element in sentence])

    return X, y

  def mask_tokens(self, X):
    """Replace roughly every 7th token by '<MASK>', in place.

    A countdown that starts at 100 runs across all sentences. A token is
    replaced whenever the countdown is a non-zero multiple of 7; when the
    countdown reaches 0 it restarts and that token is kept.

    Args:
      X: list of sentences (lists of tokens); the inner lists are modified.

    Returns:
      The same list object X.
    """
    counter = 100
    masking = '<MASK>'
    for sentence in X:
      for i in range(len(sentence)):
        if counter == 0:
          counter = 100                                                         # Restart the countdown; this token is not masked
        elif counter % 7 == 0:
          sentence[i] = masking
        counter -= 1

    return X

  @staticmethod
  def _build_index(sequences):
    """Map every item in sequences to an index, '<UNK>' first and '<PAD>' last.

    Items are numbered in order of first occurrence, so the mapping is the same
    on every run (set iteration order for strings is not).
    """
    ordered = dict.fromkeys(['<UNK>'])
    ordered.update(dict.fromkeys(item for sequence in sequences for item in sequence))
    ordered.pop('<PAD>', None)                                                  # Make sure '<PAD>' is the final entry even if it occurs in the data
    ordered['<PAD>'] = None
    return {item: idx for idx, item in enumerate(ordered)}

  def token_tag_idx(self, X_train, y_train):
    """Create the token->index and tag->index dictionaries from training data.

    Both dictionaries map '<UNK>' to 0 and '<PAD>' to the last index.

    Returns:
      (token2idx, tag2idx)
    """
    token2idx = self._build_index(X_train)
    tag2idx = self._build_index(y_train)
    return token2idx, tag2idx

  def pad_and_encode(self, sentences, labels, token2idx, tag2idx):
    """Encode sentences and labels as index tensors padded to a common length.

    Tokens and tags that are missing from the dictionaries are encoded as
    '<UNK>'. Positions after the end of a sentence hold the '<PAD>' index.

    Args:
      sentences: list of token lists.
      labels: list of tag lists, parallel to sentences.
      token2idx: token -> index dictionary containing '<UNK>' and '<PAD>'.
      tag2idx: tag -> index dictionary containing '<UNK>' and '<PAD>'.

    Returns:
      (padded_sentences, padded_labels): two torch.long tensors of shape
      (len(sentences), longest sentence length); (0, 0) for empty input.
    """
    assert len(sentences) == len(labels)
    assert all(len(sentence) == len(tags) for sentence, tags in zip(sentences, labels))
    max_sentence_length = max((len(sentence) for sentence in sentences), default=0)

    def encode(sequence, index):
      ids = [index.get(item, index['<UNK>']) for item in sequence]
      return ids + [index['<PAD>']] * (max_sentence_length - len(ids))

    shape = (len(sentences), max_sentence_length)                               # reshape keeps the result 2-D for empty input
    padded_sentences = torch.tensor([encode(sentence, token2idx) for sentence in sentences],
                                    dtype=torch.long).reshape(shape)
    padded_labels = torch.tensor([encode(tags, tag2idx) for tags in labels],
                                 dtype=torch.long).reshape(shape)
    return padded_sentences, padded_labels

  def batch_iterator(self, sentences, labels):
    """Yield (inputs, targets) tensor pairs of at most self.batch_size sentences.

    Each batch is padded on its own and moved to the device of the model.
    """
    assert len(sentences) == len(labels)
    device = self._device()
    for start in range(0, len(sentences), self.batch_size):
      stop = start + self.batch_size                                            # Slicing clamps at the end of the list
      X, y = self.pad_and_encode(sentences[start:stop], labels[start:stop],
                                 self.token2idx, self.tag2idx)
      yield (X.to(device), y.to(device))

  def majority_baseline(self):
    """Return the most frequent training tag and its share of all tags in percent.

    Returns:
      (tag, percentage); (None, 0.0) if there are no training tags.
    """
    from collections import Counter

    tag_counts = Counter(tag for tags in self.y_train for tag in tags)
    if not tag_counts:
      return None, 0.0
    tag, count = tag_counts.most_common(1)[0]
    return tag, 100 * count / sum(tag_counts.values())

  def preprocessing(self):
    """Parse both data files, optionally mask training tokens, build vocabularies.

    Sets self.token2idx, self.tag2idx, self.X_train, self.y_train, self.X_test
    and self.y_test. Padding is done per batch in batch_iterator, so the full
    data set is not padded here.
    """
    X_train, y_train = self.parsefile(self.train_file)
    X_test, y_test = self.parsefile(self.test_file)

    if self.mask:
      X_train = self.mask_tokens(X_train)

    self.token2idx, self.tag2idx = self.token_tag_idx(X_train, y_train)

    self.X_train = X_train
    self.y_train = y_train
    self.X_test = X_test
    self.y_test = y_test

  def fitting(self, epochs=5, learning_rate=0.01):
    """Train the model on the training data with Adam and NLL loss.

    Args:
      epochs: number of passes over the training data.
      learning_rate: learning rate passed to Adam.

    Returns:
      (training_accuracy, training_loss): the per-batch history lists, which
      keep growing if fitting is called again.
    """
    loss_function = nn.NLLLoss(ignore_index=self.tag2idx['<PAD>'])              # Padding positions in the targets do not contribute to the loss

    if self.decay:
      # NOTE(review): 1e-20 is so small that it is effectively no weight decay - confirm the intended value.
      optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate, weight_decay=1e-20)
    else:
      optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

    n_batches = (len(self.X_train) + self.batch_size - 1) // self.batch_size    # Ceiling division: exact number of batches per epoch
    pad_tag = self.tag2idx['<PAD>']
    self.train()                                                                # predict() may have been used before; make sure dropout is on

    for epoch in range(epochs):                                                 # Times to loop over the full dataset
      with tqdm(self.batch_iterator(self.X_train, self.y_train),
                total=n_batches, unit="batch", desc="Epoch %i" % epoch) as batches:
        for inputs, targets in batches:                                         # Loop once over the training data
          self.zero_grad()                                                      # Reset gradients
          scores = self(inputs)                                                 # Forward pass
          loss = loss_function(scores.view(-1, self.tagset_size),               # Predictions and targets are flattened to one long line
                               targets.view(-1))

          loss.backward()                                                       # Backpropagate the error
          optimizer.step()                                                      # Update the weights w.r.t. the loss
          predictions = scores.argmax(dim=2)                                    # Keeps the batch dimension even for a single-sentence batch
          mask = targets != pad_tag                                             # Ignore <PAD> in the targets
          correct = (predictions[mask] == targets[mask]).sum().item()
          accuracy = correct / mask.sum().item()*100
          self.training_accuracy.append(accuracy)                               # Save the accuracy for plotting
          self.training_loss.append(loss.item())                                # Save the loss for plotting
          batches.set_postfix(loss=loss.item(), accuracy=accuracy)              # Update the progress bar

    return self.training_accuracy,self.training_loss

  def predict(self, train):
    """Return the token-level tagging accuracy in percent.

    The model is evaluated in eval mode (dropout disabled) and its previous
    training/eval mode is restored afterwards.

    Args:
      train: evaluate on the training data if true, otherwise on the test data.
    """
    if train:
      sentences = self.X_train
      labels = self.y_train
    else:
      sentences = self.X_test
      labels = self.y_test

    pad_tag = self.tag2idx['<PAD>']
    n_correct = 0
    n_total = 0
    was_training = self.training
    self.eval()                                                                 # Dropout must not be active while evaluating
    try:
      with torch.no_grad():                                                     # No gradients are needed for evaluation
        for inputs, targets in self.batch_iterator(sentences, labels):
          scores = self(inputs)
          predictions = scores.argmax(dim=2)                                    # Keeps the batch dimension even for a single-sentence batch
          mask = targets != pad_tag                                             # Ignore <PAD> in the targets
          n_correct += (predictions[mask] == targets[mask]).sum().item()
          n_total += mask.sum().item()
    finally:
      self.train(was_training)

    return 100*n_correct/n_total

  def forward(self, padded_sentences):
    """The forward pass through the network.

    Args:
      padded_sentences: long tensor of token indices, (batch, sentence length),
        padded with the '<PAD>' index.

    Returns:
      Log-probabilities over tags, shape (batch, sentence length, tagset size).
    """
    batch_size, max_sentence_length = padded_sentences.size()

    embedded_sentences = self._word_embedding(padded_sentences)                 # Integer-encoded sentences are mapped to vectors

    sentence_lengths = (padded_sentences != self.token2idx['<PAD>']).sum(dim=1) # Number of non-padding tokens per sentence
    sentence_lengths = sentence_lengths.long().cpu()                            # pack_padded_sequence wants the lengths on the CPU
    packed = nn.utils.rnn.pack_padded_sequence(embedded_sentences, sentence_lengths,
                                               batch_first=True, enforce_sorted=False)

    rnn = self._lstm if self.lstm else self._gru
    packed_out, _ = rnn(packed)                                                 # Run the recurrent layer
    X, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True,       # Unpack to the input width so the final view always
                                            total_length=max_sentence_length)   # fits, even if every sentence ends in padding

    X = X.contiguous().view(-1, X.shape[2])                                     # Flatten to (batch * sentence length, features)

    if self.drop:
      X = self.dropout(X)

    tag_space = self._fc(X)                                                     # Fully connected layer
    tag_scores = self._softmax(tag_space)                                       # Normalise the outputs over tags

    return tag_scores.view(batch_size, max_sentence_length, self.tagset_size)

# Baseline

Originally I was going to use a majority baseline (taking the most common label and checking how often it occurs in the training data, in percent). I tried this for a unidirectional LSTM model on English test data with universal tags, where the most common label is NOUN. However, since this label only accounts for about 17% of the tokens, it does not seem like a good baseline, probably because there are too many labels in total (the code for finding the majority baseline can still be found in the model class, for reference).
Instead, I have decided to use the original model (the code that was provided), train it with the training data and see what training and test accuracy that results in. These accuracies for English, German and Swedish can be found in the output below.

In [None]:
# run this to get the baselines for English, German and Swedish

def _fit_and_score(train_path, test_path):
  """Train a default tagger on one treebank; return the model, its training history and both accuracies."""
  model = LSTMTagger(train_path, test_path,
                     word_embedding_dim=32, hidden_dim=64)
  history = model.fitting()                                                     # fit the model to the training data
  train_score = model.predict(train=True)                                       # predict training accuracy
  test_score = model.predict(train=False)                                       # predict test accuracy
  return model, history, train_score, test_score


model_english, weights, score_english_train, score_english_test = _fit_and_score(
    "UD_English-GUM/en_gum-ud-train.conllu", "UD_English-GUM/en_gum-ud-test.conllu")
print("English baseline training accuracy: ", score_english_train)
print("English baseline test accuracy: ", score_english_test)

model_german, weights, score_german_train, score_german_test = _fit_and_score(
    "UD_German-GSD/de_gsd-ud-train.conllu", "UD_German-GSD/de_gsd-ud-test.conllu")
print("German baseline training accuracy: ", score_german_train)
print("German baseline test accuracy: ", score_german_test)

model_swedish, weights, score_swedish_train, score_swedish_test = _fit_and_score(
    "UD_Swedish-LinES/sv_lines-ud-train.conllu", "UD_Swedish-LinES/sv_lines-ud-test.conllu")
print("Swedish baseline training accuracy: ", score_swedish_train)
print("Swedish baseline test accuracy: ", score_swedish_test)

Epoch 0: 100%|██████████| 21/21 [00:02<00:00,  7.85batch/s, accuracy=51.4, loss=1.65]
Epoch 1: 100%|██████████| 21/21 [00:02<00:00, 10.16batch/s, accuracy=68, loss=1.1]
Epoch 2: 100%|██████████| 21/21 [00:02<00:00, 10.29batch/s, accuracy=76.1, loss=0.816]
Epoch 3: 100%|██████████| 21/21 [00:02<00:00, 10.16batch/s, accuracy=82, loss=0.614]
Epoch 4: 100%|██████████| 21/21 [00:02<00:00, 10.12batch/s, accuracy=86.8, loss=0.45]


English baseline training accuracy:  85.95667002930297
English baseline test accuracy:  79.20372983252561


Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.45batch/s, accuracy=70.4, loss=0.864]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.61batch/s, accuracy=82.5, loss=0.517]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.49batch/s, accuracy=89.5, loss=0.329]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.50batch/s, accuracy=93, loss=0.221]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.46batch/s, accuracy=95.2, loss=0.159]


German baseline training accuracy:  95.5489481567593
German baseline test accuracy:  84.2425689387609


Epoch 0: 100%|██████████| 13/13 [00:01<00:00, 11.72batch/s, accuracy=42.9, loss=1.93]
Epoch 1: 100%|██████████| 13/13 [00:01<00:00, 11.55batch/s, accuracy=60.2, loss=1.32]
Epoch 2: 100%|██████████| 13/13 [00:01<00:00, 11.57batch/s, accuracy=69.6, loss=0.963]
Epoch 3: 100%|██████████| 13/13 [00:01<00:00, 11.83batch/s, accuracy=77.4, loss=0.717]
Epoch 4: 100%|██████████| 13/13 [00:01<00:00, 11.55batch/s, accuracy=83.4, loss=0.534]


Swedish baseline training accuracy:  81.77536362652037
Swedish baseline test accuracy:  77.53755522827687


# LSTM vs. GRU, bidirectional, universal tags vs. language-specific tags
Since German got the highest accuracy for the basic model, its training and test sets are used in the following to test performance differences between some parameter combinations. The parameters are LSTM vs. GRU, bidirectional vs. unidirectional, and universal tagset vs. language-specific tagset. The parameter combination that got the highest accuracy score here is lstm=False, bidir=True, universal=False, i.e. a bidirectional GRU model using language-specific tags. Second best was the bidirectional LSTM model, also using language-specific tags.

In [None]:
# Train one German model per architecture/tagset combination and collect
# its training and test accuracy (in that order) in score_list.
configurations = [
  dict(lstm=True, bidir=False, universal=False),
  dict(lstm=False, bidir=False, universal=False),
  dict(lstm=True, bidir=True, universal=False),
  dict(lstm=False, bidir=True, universal=False),
  dict(lstm=True, bidir=True, universal=True),
  dict(lstm=False, bidir=False, universal=True),
]

score_list = []
for configuration in configurations:
  model_german = LSTMTagger("UD_German-GSD/de_gsd-ud-train.conllu", "UD_German-GSD/de_gsd-ud-test.conllu",
                            word_embedding_dim=32, hidden_dim=64, **configuration)
  weights = model_german.fitting()
  score_german_train = model_german.predict(train=True)
  score_german_test = model_german.predict(train=False)
  score_list.extend([score_german_train, score_german_test])

for score in score_list:
  print(score)

                                               

Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.25batch/s, accuracy=67.6, loss=1.07]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.35batch/s, accuracy=81.5, loss=0.624]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.27batch/s, accuracy=88.6, loss=0.404]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.27batch/s, accuracy=92.4, loss=0.274]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.26batch/s, accuracy=94.4, loss=0.196]
Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.45batch/s, accuracy=70.3, loss=0.993]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.52batch/s, accuracy=82.3, loss=0.596]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.40batch/s, accuracy=89.1, loss=0.386]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.54batch/s, accuracy=93, loss=0.255]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.47batch/s, accuracy=95.1, loss=0.176]
Epoch 0: 100%|██████████| 54/54 [00:06<00:00,  8.53batch/s, accuracy=73.2, loss=0.858]
Epoch 1: 100%|██████████| 54/54 [00:06<00:00, 

94.94943849530155
84.23063149098722
95.52212112405267
86.02124865703712
97.65934139634705
87.31646174047988
97.89221494414761
89.4532648919661
97.34226077366182
84.5529425808762
95.76430961376525
84.49325534200788


# Changing the hidden_dimensions
The default for the hidden dimensions is 64. When raising the hidden dimensions to 84 we get a slightly higher test accuracy, whereas when lowering them to 44, the test accuracy is lower. This could suggest that a higher number of hidden dimensions might be beneficial for the German model. 

In [None]:
# Train the default German model with three different hidden sizes
# and report training and test accuracy for each.
for hidden_size in (64, 84, 44):
  model_german = LSTMTagger("UD_German-GSD/de_gsd-ud-train.conllu", "UD_German-GSD/de_gsd-ud-test.conllu",
                            word_embedding_dim=32, hidden_dim=hidden_size)

  weights = model_german.fitting()

  score_german_train = model_german.predict(train=True)
  score_german_test = model_german.predict(train=False)

  print(f"Training accuracy German: {score_german_train}")
  print(f"Testing accuracy German: {score_german_test}")

Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.30batch/s, accuracy=68.4, loss=0.881]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.47batch/s, accuracy=81.9, loss=0.534]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.56batch/s, accuracy=89.3, loss=0.33]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.57batch/s, accuracy=93.8, loss=0.214]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.49batch/s, accuracy=95.5, loss=0.152]


Training accuracy German: 95.65737408061523
Testing accuracy German: 84.81556643189685


Epoch 0: 100%|██████████| 54/54 [00:05<00:00,  9.92batch/s, accuracy=70.6, loss=0.837]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00,  9.96batch/s, accuracy=82.7, loss=0.517]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00,  9.96batch/s, accuracy=89.6, loss=0.329]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.00batch/s, accuracy=93.6, loss=0.218]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00,  9.94batch/s, accuracy=96, loss=0.15]


Training accuracy German: 95.93309636121108
Testing accuracy German: 85.78846842545065


Epoch 0: 100%|██████████| 54/54 [00:04<00:00, 10.93batch/s, accuracy=67.9, loss=0.95]
Epoch 1: 100%|██████████| 54/54 [00:04<00:00, 11.03batch/s, accuracy=81.1, loss=0.555]
Epoch 2: 100%|██████████| 54/54 [00:04<00:00, 10.98batch/s, accuracy=88.6, loss=0.36]
Epoch 3: 100%|██████████| 54/54 [00:04<00:00, 10.98batch/s, accuracy=92.8, loss=0.246]
Epoch 4: 100%|██████████| 54/54 [00:04<00:00, 11.03batch/s, accuracy=95.1, loss=0.176]


Training accuracy German: 95.21323765025001
Testing accuracy German: 80.85233377103975


# Changing the word embedding dimensions
Both when lowering and when raising the word embedding dimension we get a slightly lower test accuracy. This suggests that the original word embedding dimension (32) was reasonable for German.

In [None]:
# Train the default German model with three different word embedding sizes
# and report training and test accuracy for each.
for embedding_size in (32, 22, 42):
  model_german = LSTMTagger("UD_German-GSD/de_gsd-ud-train.conllu", "UD_German-GSD/de_gsd-ud-test.conllu",
                            word_embedding_dim=embedding_size, hidden_dim=64)

  weights = model_german.fitting()

  score_german_train = model_german.predict(train=True)
  score_german_test = model_german.predict(train=False)

  print(f"Training accuracy German: {score_german_train}")
  print(f"Testing accuracy German: {score_german_test}")

Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.51batch/s, accuracy=69.5, loss=0.879]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.50batch/s, accuracy=82, loss=0.528]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.55batch/s, accuracy=89.4, loss=0.328]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.58batch/s, accuracy=93.6, loss=0.214]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.58batch/s, accuracy=95.9, loss=0.147]


Training accuracy German: 95.81833627685498
Testing accuracy German: 85.04237793959652


Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.54batch/s, accuracy=65.4, loss=0.999]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.60batch/s, accuracy=79.4, loss=0.603]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.69batch/s, accuracy=87.6, loss=0.385]
Epoch 3: 100%|██████████| 54/54 [00:04<00:00, 10.81batch/s, accuracy=92.1, loss=0.257]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.71batch/s, accuracy=94.9, loss=0.181]


Training accuracy German: 94.7396660034428
Testing accuracy German: 82.69666945207115


Epoch 0: 100%|██████████| 54/54 [00:05<00:00, 10.47batch/s, accuracy=72, loss=0.817]
Epoch 1: 100%|██████████| 54/54 [00:05<00:00, 10.55batch/s, accuracy=83.8, loss=0.482]
Epoch 2: 100%|██████████| 54/54 [00:05<00:00, 10.58batch/s, accuracy=90.9, loss=0.294]
Epoch 3: 100%|██████████| 54/54 [00:05<00:00, 10.57batch/s, accuracy=94.6, loss=0.193]
Epoch 4: 100%|██████████| 54/54 [00:05<00:00, 10.70batch/s, accuracy=96.4, loss=0.135]


Training accuracy German: 96.23042930704284
Testing accuracy German: 84.1888504237794


Final comments:

I did not test all parameter combinations, but all the ones I tested were better than the baseline. Out-of-vocabulary words are handled in the pad_and_encode function.