In [1]:
# Configuration dictionary
CONFIG_DICT = {
    "file_name": "litlat_14_github",
    "bert": "EMBEDDIA/litlat-bert", #AiLab-IMCS-UL/lvbert
    "bert_hidden_states": True,
    "learning_rate": 5e-5,
    "lstm_hidden_size": 1024,
    "lstm_num_layers": 1,
    "lstm_bidirectional": True,
    "batch_size": 32,
    "dropout": 0.25,
    "epochs": 30,
}

In [2]:
# torchtext version 0.6.0 was used
!pip show torchtext

Name: torchtext
Version: 0.6.0
Summary: Text utilities and datasets for PyTorch
Home-page: https://github.com/pytorch/text
Author: PyTorch core devs and James Bradbury
Author-email: jekbradbury@gmail.com
License: BSD
Location: c:\users\artur\appdata\local\packages\pythonsoftwarefoundation.python.3.10_qbz5n2kfra8p0\localcache\local-packages\python310\site-packages
Requires: numpy, requests, sentencepiece, six, torch, tqdm
Required-by: 




In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from statistics import mean 

import pandas as pd

from torchtext import data, datasets

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import numpy as np

import gc

import json
import time
import random
import functools
import random
import datetime

In [4]:
# Seeding for result recreation
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Determine computing device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
# Initialize tokenizer for preprocessing
tokenizer = AutoTokenizer.from_pretrained(CONFIG_DICT['bert'])

In [6]:
# Initialize special token variables
init_token = tokenizer.cls_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, pad_token, unk_token)

<s> <pad> <unk>


In [7]:
# Store special token ids
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, pad_token_idx, unk_token_idx)

0 1 3


In [8]:
print(tokenizer.max_model_input_sizes)

{'xlm-roberta-base': 512, 'xlm-roberta-large': 512, 'xlm-roberta-large-finetuned-conll02-dutch': 512, 'xlm-roberta-large-finetuned-conll02-spanish': 512, 'xlm-roberta-large-finetuned-conll03-english': 512, 'xlm-roberta-large-finetuned-conll03-german': 512}


In [9]:
# Standart BERT max input
max_input_length = 512

In [10]:
# Preprocessing for tags
def cut_and_convert_to_id(tokens, tokenizer, max_input_length):
    tokens = tokens[:max_input_length-1]
    tokens = ' '.join(tokens)
    tokens = tokens.lower()
    tokens = tokenizer.tokenize(tokens)
    tokens = tokenizer.convert_tokens_to_ids(tokens)
    return tokens

def cut_to_max_length(tokens, max_input_length):
    tokens = tokens[:max_input_length-1]
    return tokens

tag_preprocessor = functools.partial(cut_to_max_length,
                                     max_input_length = max_input_length)

In [11]:
# All possible Latvian language attributes
TAG_NAMES = ['Vārdšķira',
 'Pieturzīmes tips',
 'Skaitlis',
 'Rekcija',
 'Locījums',
 'Dzimte',
 'Skaitlis 2',
 'Lietvārda tips',
 'Laiks',
 'Persona',
 'Darbības vārda tips',
 'Izteiksme',
 'Lokāmība',
 'Noteiktība',
 'Saīsinājuma tips',
 'Apstākļa vārda tips',
 'Vietniekvārda tips',
 'Reziduāļa tips']

# Initialize attribute dictionary
tag_dict = {}

# Populate dictionary
tag_dict.update(dict(list(zip(TAG_NAMES, [data.Field(unk_token=None, init_token="<pad>", preprocessing = tag_preprocessor) for x in TAG_NAMES]))))
print(tag_dict)

{'Vārdšķira': <torchtext.data.field.Field object at 0x000002D761CC2560>, 'Pieturzīmes tips': <torchtext.data.field.Field object at 0x000002D761CC37F0>, 'Skaitlis': <torchtext.data.field.Field object at 0x000002D761CC3820>, 'Rekcija': <torchtext.data.field.Field object at 0x000002D761CC3850>, 'Locījums': <torchtext.data.field.Field object at 0x000002D761CC2CE0>, 'Dzimte': <torchtext.data.field.Field object at 0x000002D761CC2CB0>, 'Skaitlis 2': <torchtext.data.field.Field object at 0x000002D761CC15D0>, 'Lietvārda tips': <torchtext.data.field.Field object at 0x000002D761CC14E0>, 'Laiks': <torchtext.data.field.Field object at 0x000002D761CC2050>, 'Persona': <torchtext.data.field.Field object at 0x000002D761CC2080>, 'Darbības vārda tips': <torchtext.data.field.Field object at 0x000002D761CC20B0>, 'Izteiksme': <torchtext.data.field.Field object at 0x000002D761CC20E0>, 'Lokāmība': <torchtext.data.field.Field object at 0x000002D761CC2110>, 'Noteiktība': <torchtext.data.field.Field object at 0x

In [12]:
def tokenize_and_convert_to_id(tokens, tokenizer, max_input_length):
    # List for subtokens to store
    subtokens = []
    # Subtoken mapping maps which word the subtoken belongs to
    subtoken_map = []

    for i, token in enumerate(tokens):
        # Convert token to only lower characters
        token = token.lower()
        # Get a list of all the subtokens for word
        subtoken_list = tokenizer.tokenize(token)
        # Add subtoken_list to all subtokens
        subtokens.extend(subtoken_list)
        # Add mapping for stored subtokens
        subtoken_map.extend([i] * len(subtoken_list))

    # Truncate if exceeds limit
    subtokens = subtokens[:max_input_length - 1]
    # Convert subtokens to numerical values
    subtoken_ids = tokenizer.convert_tokens_to_ids(subtokens)

    return subtoken_ids, subtoken_map

In [13]:
def align_tags_with_subtokens(tags, subtoken_map):
    aligned_tags = []
    # Track current tag
    current_tag = ""
    for index in subtoken_map:
        # Check if tag is different from tag at index
        if current_tag != tags[index]:
            # Update current tag with tag at index
            current_tag = tags[index]
        aligned_tags.append(current_tag)
    return aligned_tags

In [14]:
def preprocess_example(sentence, tokenizer, max_input_length):
    # Extract all words
    text = [word["wordform"] for word in sentence]
    # Tokenize all words and convert them to ids, as well as get subtoken mapping
    input_tokens, subtoken_map = tokenize_and_convert_to_id(text, tokenizer, max_input_length)

    #Dictionary for storing tags
    tag_dictionary = {}
    
    for field in TAG_NAMES:
        # Extract all fields that are present in word's attributes
        tags = [word["gold_attributes"][field] if field in word["gold_attributes"] else "" for word in sentence]
        # Align the tags with subtoken mapping
        tag_tokens = align_tags_with_subtokens(tags, subtoken_map)
        # Store aligned tags
        tag_dictionary[field] = tag_tokens

    return input_tokens, list(tag_dictionary.values()), subtoken_map

In [15]:
# Load  JSON data files
def load_json_file(file_path):
    with open(file_path, "r", encoding='utf-8') as f:
        data = json.load(f)
    return data

# Load necessary files
train_file = load_json_file("train.json")
valid_file = load_json_file("dev.json")
test_file = load_json_file("test.json")

# Define text and subtoken mapping fields
TEXT = data.Field(use_vocab=False, init_token=init_token_idx, pad_token=pad_token_idx, unk_token=unk_token_idx)
# 769 value for padding is used to ensure it's out of sentence length borders
ST_MAP = data.Field(use_vocab=False, init_token=init_token_idx, pad_token=769, unk_token=unk_token_idx)

# Define each example case field tuple
fields = tuple((zip(["text"] + ["subtoken_map"] + list(tag_dict.keys()), [TEXT] + [ST_MAP] + list(tag_dict.values()))))

# Define examples for each file
train_examples = []
valid_examples = []
test_examples = []
test_old_examples = []
tag_dictionary = {}

# Populate tag_dictionary with attribute names
tag_dictionary.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))

# Load training examples
for sentence in train_file:
    input_tokens, tag_tokens, subtoken_map = preprocess_example(sentence, tokenizer, max_input_length)
    train_examples.append(data.Example.fromlist([input_tokens, subtoken_map, *tag_tokens], fields))

# Load validation examples
for sentence in valid_file:
    input_tokens, tag_tokens, subtoken_map = preprocess_example(sentence, tokenizer, max_input_length)
    valid_examples.append(data.Example.fromlist([input_tokens, subtoken_map, *tag_tokens], fields))

# Load test examples
for sentence in test_file:
    input_tokens, tag_tokens, subtoken_map = preprocess_example(sentence, tokenizer, max_input_length)
    test_examples.append(data.Example.fromlist([input_tokens, subtoken_map, *tag_tokens], fields))

# Create datasets
train_data = data.Dataset(train_examples, fields)
valid_data = data.Dataset(valid_examples, fields)
test_data = data.Dataset(test_examples, fields)

In [16]:
print(vars(train_data.examples[0]))

{'text': [3229, 643, 43922, 43036, 812, 24563, 265, 31, 47265, 933, 4, 2503, 39029, 408, 15, 13577, 14863, 5], 'subtoken_map': [0, 1, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 11, 11, 12, 12, 13], 'Vārdšķira': ['Pieturzīme', 'Prievārds', 'Lietvārds', 'Lietvārds', 'Darbības vārds', 'Darbības vārds', 'Darbības vārds', 'Saiklis', 'Darbības vārds', 'Darbības vārds', 'Pieturzīme', 'Pieturzīme', 'Darbības vārds', 'Saīsinājums', 'Saīsinājums', 'Lietvārds', 'Lietvārds', 'Pieturzīme'], 'Pieturzīmes tips': ['Pēdiņa', '', '', '', '', '', '', '', '', '', 'Komats', 'Pēdiņa', '', '', '', '', '', 'Punkts'], 'Skaitlis': ['', 'Daudzskaitlis', '', 'Vienskaitlis', 'Nepiemīt', 'Vienskaitlis', 'Vienskaitlis', '', 'Vienskaitlis', 'Vienskaitlis', '', '', 'Nepiemīt', '', '', 'Vienskaitlis', 'Vienskaitlis', ''], 'Rekcija': ['', 'Datīvs', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'Locījums': ['', '', 'Datīvs', 'Nominatīvs', '', 'Nominatīvs', 'Nominatīvs', '', 'Nominatīvs', 'Nominatīvs', '', '', '

In [17]:
for tag in tag_dict.keys():
    tag_dict[tag].build_vocab(train_data)
    print(tag_dict[tag].vocab.stoi)

defaultdict(None, {'<pad>': 0, 'Lietvārds': 1, 'Darbības vārds': 2, 'Pieturzīme': 3, 'Īpašības vārds': 4, 'Vietniekvārds': 5, 'Apstākļa vārds': 6, 'Saiklis': 7, 'Prievārds': 8, 'Reziduālis': 9, 'Partikula': 10, 'Saīsinājums': 11, 'Skaitļa vārds': 12, 'Izsauksmes vārds': 13})
defaultdict(None, {'<pad>': 0, '': 1, 'Komats': 2, 'Punkts': 3, 'Pēdiņa': 4, 'Domuzīme': 5, 'Iekava': 6, 'Kols': 7, 'Cita': 8})
defaultdict(None, {'<pad>': 0, 'Vienskaitlis': 1, '': 2, 'Daudzskaitlis': 3, 'Nepiemīt': 4})
defaultdict(None, {'<pad>': 0, '': 1, 'Akuzatīvs': 2, 'Datīvs': 3, 'Ģenitīvs': 4, 'Nepiemīt': 5})
defaultdict(None, {'<pad>': 0, '': 1, 'Nominatīvs': 2, 'Ģenitīvs': 3, 'Akuzatīvs': 4, 'Datīvs': 5, 'Lokatīvs': 6, 'Nepiemīt': 7, 'Vokatīvs': 8})
defaultdict(None, {'<pad>': 0, '': 1, 'Vīriešu': 2, 'Sieviešu': 3, 'Nepiemīt': 4})
defaultdict(None, {'<pad>': 0, '': 1, 'Daudzskaitlinieks': 2, 'Vienskaitlinieks': 3})
defaultdict(None, {'<pad>': 0, '': 1, 'Sugas vārds': 2, 'Īpašvārds': 3})
defaultdict(None, 

In [18]:
print(tag_dict)

{'Vārdšķira': <torchtext.data.field.Field object at 0x000002D761CC2560>, 'Pieturzīmes tips': <torchtext.data.field.Field object at 0x000002D761CC37F0>, 'Skaitlis': <torchtext.data.field.Field object at 0x000002D761CC3820>, 'Rekcija': <torchtext.data.field.Field object at 0x000002D761CC3850>, 'Locījums': <torchtext.data.field.Field object at 0x000002D761CC2CE0>, 'Dzimte': <torchtext.data.field.Field object at 0x000002D761CC2CB0>, 'Skaitlis 2': <torchtext.data.field.Field object at 0x000002D761CC15D0>, 'Lietvārda tips': <torchtext.data.field.Field object at 0x000002D761CC14E0>, 'Laiks': <torchtext.data.field.Field object at 0x000002D761CC2050>, 'Persona': <torchtext.data.field.Field object at 0x000002D761CC2080>, 'Darbības vārda tips': <torchtext.data.field.Field object at 0x000002D761CC20B0>, 'Izteiksme': <torchtext.data.field.Field object at 0x000002D761CC20E0>, 'Lokāmība': <torchtext.data.field.Field object at 0x000002D761CC2110>, 'Noteiktība': <torchtext.data.field.Field object at 0x

In [19]:
print(torch.cuda.is_available())

True


In [20]:
# Initialize batch size from config
BATCH_SIZE = CONFIG_DICT['batch_size']

# Create data iterators from given datasets
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort=False,
    device = device)

In [21]:
# Tagger class
class BERTPoSTagger(nn.Module):
    def __init__(self,
                 bert,
                 output_dict,
                 dropout):
        
        super().__init__()
        
        # Assign bert
        self.bert = bert
        
        # Bert hidden size
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        # Define LSTM layer
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=CONFIG_DICT['lstm_hidden_size'],
                            num_layers=CONFIG_DICT['lstm_num_layers'],
                            bidirectional=CONFIG_DICT['lstm_bidirectional'],
                            batch_first=True)
        
        # All linear classification layer list
        self.y = []
        
        # Store linear classification layer for each attribute
        for dim_size in output_dict:
            self.y.append(nn.Linear(CONFIG_DICT['lstm_hidden_size'] * 2 if CONFIG_DICT['lstm_bidirectional'] else CONFIG_DICT['lstm_hidden_size'], dim_size).to(device))
        
        # Define dropout
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # Make sure input corresponds to used device
        text.to(device)
        # text = [sentence length, batch size]
        
        text = text.permute(1, 0)
        # text = [batch size, sentence length]
        
        # Retrieve embeddings
        embedded = self.dropout(self.bert(text)[1][-1])

        # Retrieve LSTM output based on embeddings
        lstm_out, _ = self.lstm(embedded)
        # lstm_out = [batch_size, sentence length, embedding dimensions]
        
        lstm_out = lstm_out.permute(1, 0, 2)
        # lstm_out = [sentence length, batch_size, embedding dimensions]

        # List for all outputs
        output_list = []
    
        for y in self.y:
            # Get classifications for each attribute
            output_list.append(y(self.dropout(lstm_out)))
        
        return output_list

In [22]:
# Initialize BERT
bert = AutoModelForSequenceClassification.from_pretrained(CONFIG_DICT['bert'], output_hidden_states=CONFIG_DICT['bert_hidden_states'])
# Run BERT on device
bert = bert.to(device)

Some weights of the model checkpoint at EMBEDDIA/litlat-bert were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/litlat-bert and are newly

In [23]:
# Store output dimensions for each attribute
output_dims = []
for tag in tag_dict.keys():
    output_dims.append(len(tag_dict[tag].vocab))

# Check possible classes for each attribute
print(output_dims)

[14, 9, 5, 6, 9, 5, 4, 4, 6, 6, 9, 9, 5, 5, 8, 7, 10, 7]


In [24]:
# Define dropout ratio
DROPOUT = CONFIG_DICT['dropout']

# Initialize tagger instance
model = BERTPoSTagger(bert,
                      output_dims,
                      DROPOUT)

In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 165,406,466 trainable parameters


In [26]:
# Define optimizer
optimizer = optim.Adam(model.parameters(), CONFIG_DICT['learning_rate'])

In [27]:
criterion_list = []
tag_pad_list = []
for tag in tag_dict.keys():
    # Populate paddings in case different paddings were used in preprocessing
    tag_pad_idx = tag_dict[tag].vocab.stoi[tag_dict[tag].pad_token]
    tag_pad_list.append(tag_pad_idx)
    # Define criterion for each attribute
    criterion_list.append(nn.CrossEntropyLoss(ignore_index = tag_pad_idx))

In [28]:
# Run model and each criterion on device
model = model.to(device)

for criterion in criterion_list:
    criterion = criterion.to(device)

In [29]:
# Return accuracy per batch
def categorical_accuracy(preds, y, tag_pad_idx):
    # Get index of maximum probability
    max_preds = preds.argmax(dim = 1, keepdim = True)
    # Only take non padding elements into account
    non_pad_elements = (y != tag_pad_idx).nonzero()
    # Retrieve all correct predictions
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    # Return the ratio of correct predictions
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).to(device)

In [30]:
def train(model, iterator, optimizer, criterion_list, tag_pad_list):
    
    epoch_loss = 0

    epoch_acc_list = []
    
    # Initialize model training
    model.train()
    
    for batch in iterator:
        loss = 0
        accuracy_list = []
        
        batch_tag_dictionary = {}

        batch_tag_dictionary.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))
        
        batch_dict = vars(batch)

        # Retrieve text of batch
        text = batch.text.to(device)
        
        # Populate batch_tag_dictionary with all tags from batch
        for tag in batch_tag_dictionary.keys():
            batch_tag_dictionary[tag] = batch_dict[tag].view(-1)

        # Zero the gradients
        optimizer.zero_grad()
        
        # Retrieve predictions
        predictions_list = model(text)

        # Reshape predictions
        for i in range(0, len(predictions_list)):
            predictions_list[i] = predictions_list[i].view(-1, predictions_list[i].shape[-1])

        # Calculate loss for each attribute
        for i in range(0, len(predictions_list)):
            loss += criterion_list[i](predictions_list[i], batch_tag_dictionary[TAG_NAMES[i]])

        # Calculate accuracy for each attribute
        for i in range(0, len(predictions_list)):
            accuracy_list.append(categorical_accuracy(predictions_list[i], batch_tag_dictionary[TAG_NAMES[i]], tag_pad_list[i]))

        # Backpropogation
        loss.backward()
        
        optimizer.step()
        
        # Add batch loss to epoch loss
        epoch_loss += loss.item()

        # Add batch accuracy to epoch accuracy
        epoch_acc_list.append([accuracy.item() for accuracy in accuracy_list])
       
        # Clear GPU cache
        torch.cuda.empty_cache()
    
    # Calculate average los and accuracy for each epoch
    return epoch_loss / len(iterator), [mean(x) for x in list(zip(*epoch_acc_list))]

In [31]:
# Method for calculating whole tag accuracy:
# NOTE: This is used only during training evakluation, as it calculates predictions of subtokens
# For model test evaluation, a different method is used, due to computational resource saving
def return_category_list(preds_list, y_list, tag_pad_idx_list):
    temp_list = []
    
    dict_pred = {}
    dict_pred.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))
    dict_target = {}
    dict_target.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))
    
    # Iterate over each attribute for predictions
    for i, tag in enumerate(dict_pred.keys()):
        # Retrieve non padding elements
        non_pad_elements = (y_list[tag] != tag_pad_idx_list[i]).nonzero().view(-1)
        dict_target[tag] = y_list[tag][non_pad_elements].view(-1)
        # Retrieve maximum probability index for attribute's predictions
        dict_pred[tag] = preds_list[i].argmax(dim = 1, keepdim = True)[non_pad_elements].squeeze(1) # get the index of the max probability
   
    true_cnt = 0
    # Iterate over predictions
    for i in range(0, len(dict_pred['Vārdšķira'])):
        correct = 0
        # Check if predicted value matches target value for each attribute
        for j, tag in enumerate(dict_pred.keys()):
            if dict_target[tag][i] == dict_pred[tag][i]:
                correct = correct + 1
        # If all predicted tags match the expected - add as correct tag
        if correct == len(dict_pred):
            true_cnt = true_cnt+1
    
    # Return the count of correctly predicted tags
    return true_cnt/len(dict_pred['Vārdšķira'])

In [32]:
# Method for calculating whole tag accuracy:
# NOTE: This is used only during testing evaluation, as it calculates accuracy of predictions
# taking into account subtoken mapping.
# This method assumes first subtoken prediction represents the prediction for the whole word
def test_return_category_list(preds_list, y_list, tag_pad_idx_list, subtoken_mapping):
    subtoken_mapping = subtoken_mapping.t()
    counter = -1
    true_element_counter = []
    first_encounter = True
    
    # Iterate over subtoken mapping
    for i in range(subtoken_mapping.shape[0]):
        # If it's first encounter - it means it's the first subtoken
        first_encounter = True
        for j in range(subtoken_mapping.shape[1]):
            # Skip sequence start tokens
            if j == 0:
                continue
            # Mark token as traversed
            counter = counter + 1
            
            # Ignore indices of padding elements
            if subtoken_mapping[i, j].item() == 769:
                counter = counter - 1
                continue
            # If it's first encounter - append this index as relevant for calculation
            if first_encounter == True:
                true_element_counter.append(counter)
            # If next subtoken is same as current subtoken set first_encounter as false for next iteraiton
            if j != subtoken_mapping.shape[1] - 1:
                if subtoken_mapping[i, j+1] == subtoken_mapping[i, j]:
                    first_encounter = False
                else:
                    first_encounter = True

    temp_list = []
    dict_subtoken_mapping = {}
    
    dict_pred = {}
    dict_pred.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))
    dict_target = {}
    dict_target.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))

    # Get non-padding predictions for each attribute
    for i, tag in enumerate(dict_pred.keys()):
        non_pad_elements = (y_list[tag] != tag_pad_idx_list[i]).nonzero().view(-1)

        dict_target[tag] = y_list[tag][non_pad_elements].view(-1)

        dict_pred[tag] = preds_list[i].argmax(dim = 1, keepdim = True)[non_pad_elements].squeeze(1) # get the index of the max probability
    
    # Iterate over each word and check if all attributes are predicted correctly
    true_cnt = 0
    for i in range(0, len(dict_pred['Vārdšķira'])):
        correct = 0
        for j, tag in enumerate(dict_pred.keys()):
            # First condition skips elements that are not first subtokens
            if i in true_element_counter and dict_target[tag][i] == dict_pred[tag][i]:
                correct = correct + 1
        if correct == len(dict_pred):
            true_cnt = true_cnt+1

    # Return the count of correctly tagged first subtokens
    return true_cnt/len(true_element_counter)

In [33]:
# Evaluation used during training
def evaluate(model, iterator, optimizer, criterion_list, tag_pad_list):
    
    epoch_loss = 0
    epoch_acc_list = []
    tag_accuracy = []
    
    model.eval()
    
    # Only difference for evaluation - do not use gradient
    with torch.no_grad():
    
        for batch in iterator:
            loss = 0
            accuracy_list = []

            batch_tag_dictionary = {}

            batch_tag_dictionary.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))

            batch_dict = vars(batch)

            text = batch.text

            for tag in batch_tag_dictionary.keys():
                batch_tag_dictionary[tag] = batch_dict[tag].view(-1)

            predictions_list = model(text)

            for i in range(0, len(predictions_list)):
                predictions_list[i] = predictions_list[i].view(-1, predictions_list[i].shape[-1])
            
            for i in range(0, len(predictions_list)):
                loss += criterion_list[i](predictions_list[i], batch_tag_dictionary[TAG_NAMES[i]])

            # Utilize training evaluation tagging accuracy method
            tag_accuracy.append(return_category_list(predictions_list, batch_tag_dictionary, tag_pad_list))

            for i in range(0, len(predictions_list)):
                accuracy_list.append(categorical_accuracy(predictions_list[i], batch_tag_dictionary[TAG_NAMES[i]], tag_pad_list[i]))

            epoch_loss += loss.item()

            epoch_acc_list.append([accuracy.item() for accuracy in accuracy_list])
    
            torch.cuda.empty_cache()
            gc.collect()

    epoch_avg_acc_list = []

    return epoch_loss / len(iterator), [mean(x) for x in list(zip(*epoch_acc_list))], tag_accuracy

In [34]:
# Evaluation used during testing
def test_evaluate(model, iterator, optimizer, criterion_list, tag_pad_list):
    
    epoch_loss = 0
    epoch_acc_list = []
    tag_accuracy = []
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            loss = 0
            accuracy_list = []
            eval_dict = {}

            batch_tag_dictionary = {}
            batch_tag_dictionary.update(dict(list(zip(TAG_NAMES, [None for x in TAG_NAMES]))))

            batch_dict = vars(batch)

            text = batch.text
            subtoken_mapping = batch.subtoken_map

            for tag in batch_tag_dictionary.keys():
                batch_tag_dictionary[tag] = batch_dict[tag].view(-1)

            predictions_list = model(text)

            for i in range(0, len(predictions_list)):
                predictions_list[i] = predictions_list[i].view(-1, predictions_list[i].shape[-1])
            
            for i in range(0, len(predictions_list)):
                loss += criterion_list[i](predictions_list[i], batch_tag_dictionary[TAG_NAMES[i]])

            # Utilize testing evaluation tagging accuracy method
            tag_accuracy.append(test_return_category_list(predictions_list, batch_tag_dictionary, tag_pad_list, subtoken_mapping))

            for i in range(0, len(predictions_list)):
                accuracy_list.append(categorical_accuracy(predictions_list[i], batch_tag_dictionary[TAG_NAMES[i]], tag_pad_list[i]))

            epoch_loss += loss.item()

            epoch_acc_list.append([accuracy.item() for accuracy in accuracy_list])
    
            torch.cuda.empty_cache()
            gc.collect()

    epoch_avg_acc_list = []
  
    return epoch_loss / len(iterator), [mean(x) for x in list(zip(*epoch_acc_list))], tag_accuracy

In [35]:
# Calculate epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
# Skip this cell to not train model
# Epoch count
N_EPOCHS = CONFIG_DICT['epochs']

best_valid_loss = float('inf')


# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as a string
datetime_str = now.strftime("%Y-%m-%d_%H-%M-%S")

file_name = CONFIG_DICT['file_name']

# Variable to determine best epoch
best_valid_epoch = 0

for epoch in range(N_EPOCHS):
    print(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    # Clear GPU cache
    torch.cuda.empty_cache()
    gc.collect()
    start_time = time.time()
    
    # Perform model training
    train_loss, train_acc_list = train(model, train_iterator, optimizer, criterion_list, tag_pad_list)
    # Commence training evaluation
    valid_loss, valid_acc_list, tag_accuracy_list = evaluate(model, valid_iterator, optimizer, criterion_list, tag_pad_list)
    
    end_time = time.time()

    # Calculate epoch time
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Store best validation loss state and epoch
    if valid_loss < best_valid_loss:
        best_valid_epoch = epoch+1
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), file_name)
        
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}' )
    for train_acc in train_acc_list:
        print(f'Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f}')
    for val_acc in valid_acc_list:
        print(f'Val. Acc: {val_acc*100:.2f}%')
    print(f'Tag Acc: {mean(tag_accuracy_list)}')

In [None]:
print(f"Best Valid Epoch: {best_valid_epoch}")

In [36]:
# Run this to load pretrained model or comment to use trained
file_name = "litlat_after_accuracy_15"

In [37]:
# Load best state (or pretrained model) before testing
model.load_state_dict(torch.load(file_name))

<All keys matched successfully>

In [38]:
print(CONFIG_DICT)

# Evaluate model, using word representation by first subtoken
test_loss, test_acc_list, tag_accuracy_list = test_evaluate(model, test_iterator, optimizer, criterion_list, tag_pad_list)

print(f'Test Loss: {test_loss:.3f}')
for i, test_acc in enumerate(test_acc_list):
        print(f'Test Acc:{TAG_NAMES[i]}, {test_acc*100:.2f}%')
print(f'Tag Acc: {mean(tag_accuracy_list)}')

{'file_name': 'litlat_14_github', 'bert': 'EMBEDDIA/litlat-bert', 'bert_hidden_states': True, 'learning_rate': 5e-05, 'lstm_hidden_size': 1024, 'lstm_num_layers': 1, 'lstm_bidirectional': True, 'batch_size': 32, 'dropout': 0.25, 'epochs': 30}
Test Loss: 1.546
Test Acc:Vārdšķira, 98.59%
Test Acc:Pieturzīmes tips, 99.99%
Test Acc:Skaitlis, 97.83%
Test Acc:Rekcija, 99.75%
Test Acc:Locījums, 98.49%
Test Acc:Dzimte, 97.91%
Test Acc:Skaitlis 2, 99.40%
Test Acc:Lietvārda tips, 98.76%
Test Acc:Laiks, 99.04%
Test Acc:Persona, 99.47%
Test Acc:Darbības vārda tips, 99.23%
Test Acc:Izteiksme, 99.41%
Test Acc:Lokāmība, 99.80%
Test Acc:Noteiktība, 99.63%
Test Acc:Saīsinājuma tips, 99.78%
Test Acc:Apstākļa vārda tips, 99.26%
Test Acc:Vietniekvārda tips, 99.77%
Test Acc:Reziduāļa tips, 99.73%
Tag Acc: 0.9311444874331951


In [42]:
def tag_sentence(model, device, sentence, tokenizer, text_field, tag_field_list):
    
    model.eval()
    
    # Tokenize sentence
    if isinstance(sentence, str):
        tokens = tokenizer.tokenize(sentence)
    else:
        tokens = sentence
    
    # Convert to ids
    numericalized_tokens = tokenizer.convert_tokens_to_ids(tokens)
    numericalized_tokens = [text_field.init_token] + numericalized_tokens
    
    unk_idx = text_field.unk_token
    
    # Identify unknown tokens
    unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
    
    # Initialize token tensor
    token_tensor = torch.LongTensor(numericalized_tokens)
    
    token_tensor = token_tensor.unsqueeze(-1).to(device)
    
    # Get predictions
    predictions_list = model(token_tensor)
    
    # Retrieve top predictions
    top_predictors_list = []
    for i in range(0, len(predictions_list)):
        top_predictors_list.append(predictions_list[i].argmax(-1))
    
    # Get all predicted tags
    predicted_tags_list = []
    for i, tag in enumerate(tag_field_list):
        predicted_tags_list.append([tag_field_list[tag].vocab.itos[t.item()] for t in top_predictors_list[i]])
        predicted_tags_list[i] = predicted_tags_list[i][1:]

    # Assert prediction length matches token length
    for predicted_tags in predicted_tags_list:
        assert len(tokens) == len(predicted_tags)
    
    return tokens, predicted_tags_list, unks

In [43]:
sentence = 'Brīdi turēja to rokās.'

tokens, predicted_tags_list, unks = tag_sentence(model, 
                                  device, 
                                  sentence,
                                  tokenizer,
                                  TEXT,
                                  tag_dict)

print(unks)

[]


In [44]:
# Create a Dataframe to visualize each tag
pd.DataFrame(list(zip(tokens, *predicted_tags_list)), columns=["token"]+TAG_NAMES)

Unnamed: 0,token,Vārdšķira,Pieturzīmes tips,Skaitlis,Rekcija,Locījums,Dzimte,Skaitlis 2,Lietvārda tips,Laiks,Persona,Darbības vārda tips,Izteiksme,Lokāmība,Noteiktība,Saīsinājuma tips,Apstākļa vārda tips,Vietniekvārda tips,Reziduāļa tips
0,▁Br,Lietvārds,,Daudzskaitlis,,Nominatīvs,Vīriešu,,Sugas vārds,,,,,,,,,,
1,īdi,Lietvārds,,Daudzskaitlis,,Nominatīvs,Vīriešu,,Sugas vārds,,,,,,,,,,
2,▁tur,Darbības vārds,,Nepiemīt,,,,,,Pagātne,3.0,Patstāvīgs darbības vārds,Īstenības,,,,,,
3,ēja,Darbības vārds,,Nepiemīt,,,,,,Pagātne,3.0,Patstāvīgs darbības vārds,Īstenības,,,,,,
4,▁to,Vietniekvārds,,Vienskaitlis,,Akuzatīvs,Vīriešu,,,,3.0,,,,,,,Norādāmais,
5,▁rokās,Lietvārds,,Daudzskaitlis,,Lokatīvs,Sieviešu,,Sugas vārds,,,,,,,,,,
6,.,Pieturzīme,Punkts,,,,,,,,,,,,,,,,
