# Simple Batched Encoder Decoder with characters and tags as input

In [1]:
# Reserved symbols used by the vocabulary and indexing helpers in this notebook.
START_CHAR = '⏵'  # start-of-sequence symbol; also the first decoder input
STOP_CHAR = '⏹'  # end-of-sequence symbol; decoded output is cut at its first occurrence
UNKNOWN_CHAR = '⊗'  # fallback for characters missing from the vocabulary
UNKNOWN_TAG = '⊤'  # fallback for tag features missing from the vocabulary
PAD_CHAR = '₮'  # padding symbol; the dictionary builders map it to index 0
PAD_TAG = '<PAD>'  # padding entry of tag2index; mapped to index 0

In [2]:
from itertools import zip_longest
import os
import pickle

import Levenshtein
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

In [3]:
def load_data(file_name):
    """Loads data.

    Every non-empty line of the file must hold three tab-separated fields in
    the order lemma, inflected form, tag.

    Args:
        file_name: path to file containing the data

    Returns:
        lemmas: list of lemma
        tags: list of tags
        inflected_forms: list of inflected form

    Raises:
        ValueError: if a non-empty line does not have exactly three fields
    """

    lemmas = []
    tags = []
    inflected_forms = []

    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')
            # Skip blank lines instead of blindly dropping the last line, so
            # the final record survives when the file lacks a trailing newline.
            if not line:
                continue
            lemma, inflected_form, tag = line.split('\t')
            lemmas.append(lemma)
            inflected_forms.append(inflected_form)
            tags.append(tag)

    return lemmas, tags, inflected_forms

def get_index_dictionaries(lemmas, tags, inflected_forms):
    """Returns char2index, index2char, tag2index

    Indices are assigned in sorted order so that the same data always yields
    the same dictionaries (set iteration order changes between interpreter
    runs).

    Args:
        lemmas: list of lemma
        tags: list of tags
        inflected_forms: list of inflected form

    Returns: 
        char2index: a dictionary which maps character to index
            (PAD_CHAR is always index 0)
        index2char: a dictionary which maps index to character
        tag2index: a dictionary which maps morphological tag to index
            (PAD_TAG is always index 0)
    """

    unique_chars = set(''.join(lemmas) + ''.join(inflected_forms))
    # Special start, end and unknown symbols. A set union is used instead of
    # set.update(str), which would add the individual characters of a symbol.
    unique_chars |= {START_CHAR, STOP_CHAR, UNKNOWN_CHAR}
    # Index 0 is reserved for padding even if the symbol occurs in the data.
    unique_chars.discard(PAD_CHAR)

    char2index = {PAD_CHAR: 0}
    index2char = {0: PAD_CHAR}
    for index, char in enumerate(sorted(unique_chars), start=1):
        char2index[char] = index
        index2char[index] = char

    # Collect tag features per tag so that an empty tag list does not produce
    # a spurious '' feature.
    unique_tags = {sub_tag for tag in tags for sub_tag in tag.split(';')}
    unique_tags.add(UNKNOWN_TAG)  # special symbol for unknown tags
    unique_tags.discard(PAD_TAG)

    tag2index = {PAD_TAG: 0}
    for index, tag in enumerate(sorted(unique_tags), start=1):
        tag2index[tag] = index

    return char2index, index2char, tag2index

def get_combined_index_dictionaries(lemmas, tags, inflected_forms):
    """Returns input2index, index2input, char_vocab_length

    Characters and tag features share one index space: PAD_CHAR is index 0,
    characters follow, and tag features come after all characters. Indices
    are assigned in sorted order so that the same data always yields the same
    dictionaries (set iteration order changes between interpreter runs).

    Args:
        lemmas: list of lemma
        tags: list of tags
        inflected_forms: list of inflected form

    Returns: 
        input2index: a dictionary which maps characters and tag features to index
        index2input: a dictionary which maps index to character or tag feature
        char_vocab_length: number of character entries (including PAD_CHAR);
            every tag feature has an index >= char_vocab_length
    """

    unique_chars = set(''.join(lemmas) + ''.join(inflected_forms))
    # Special start, end and unknown symbols.
    unique_chars |= {START_CHAR, STOP_CHAR, UNKNOWN_CHAR}
    # Index 0 is reserved for padding even if the symbol occurs in the data.
    unique_chars.discard(PAD_CHAR)

    input2index = {PAD_CHAR: 0}
    index2input = {0: PAD_CHAR}

    for index, char in enumerate(sorted(unique_chars), start=1):
        input2index[char] = index
        index2input[index] = char

    char_vocab_length = len(input2index)

    # Collect tag features per tag so that an empty tag list does not produce
    # a spurious '' feature.
    unique_tags = {sub_tag for tag in tags for sub_tag in tag.split(';')}
    unique_tags.add(UNKNOWN_TAG)  # special symbol for unknown tags

    # NOTE(review): a tag feature that is spelled exactly like a character
    # (e.g. a one-letter tag) overwrites that character's entry in
    # input2index, because both share the same key space. Confirm the data
    # cannot contain such a collision.
    for index, tag in enumerate(sorted(unique_tags), start=char_vocab_length):
        input2index[tag] = index
        index2input[index] = tag

    return input2index, index2input, char_vocab_length

def words_to_indices(words, char2index, tensor=False, start_char=False, stop_char=False):
    """Converts list of words to a list with list containing indices

    Args:
        words: list of words
        char2index: dictionary which maps character to index
        tensor: if True, each word is returned as a 1D integer (long) tensor
            instead of a list
        start_char: if True, the index of START_CHAR is prepended to each word
        stop_char: if True, the index of STOP_CHAR is appended to each word

    Returns:
        tensor: list of list/tensor containing indices for a sequence of characters
    """

    list_indices = []
    for word in words:
        # Characters missing from the vocabulary fall back to UNKNOWN_CHAR.
        word_indices = [
            char2index[char] if char in char2index else char2index[UNKNOWN_CHAR]
            for char in word
        ]
        if start_char:
            word_indices.insert(0, char2index[START_CHAR])
        if stop_char:
            word_indices.append(char2index[STOP_CHAR])
        if tensor:
            # torch.Tensor(...) would build a float32 tensor; indices must be
            # integers to be usable for embedding lookups and as loss targets.
            word_indices = torch.tensor(word_indices, dtype=torch.long)
        list_indices.append(word_indices)

    return list_indices

def tag_to_vector(tags, tag2index):
    """Encodes each tag string as a 0/1 vector over the tag vocabulary.

    Args:
        tags: list of string representation of tag (eg, V;IND;PRS;2;PL)
        tag2index: dictionary which maps tag feature to index

    Returns:
        tag_vectors: list of 1D tensors of length len(tag2index); the position
            of every feature present in the tag is set to 1. Features missing
            from tag2index set the position of UNKNOWN_TAG instead.
    """

    vector_size = len(tag2index)
    encoded = []
    for tag in tags:
        vector = torch.zeros(vector_size)
        for feature in tag.split(';'):
            key = feature if feature in tag2index else UNKNOWN_TAG
            vector[tag2index[key]] = 1
        encoded.append(vector)
    return encoded

def tag_to_indices(tags, tag2index):
    """Maps every tag string to the indices of its ';'-separated features.

    Args:
        tags: list of string representation of tag (eg, V;IND;PRS;2;PL)
        tag2index: dictionary which maps tag feature to index

    Returns:
        list_indices: list of lists containing indices of sub tags; features
            missing from tag2index are mapped to the index of UNKNOWN_TAG
    """

    def feature_index(sub_tag):
        if sub_tag in tag2index:
            return tag2index[sub_tag]
        return tag2index[UNKNOWN_TAG]

    return [[feature_index(sub_tag) for sub_tag in tag.split(';')] for tag in tags]
    

def indices_to_word(indices, index2char):
    """Decodes a sequence of indices into a string, dropping the last symbol.

    Args:
        indices: list containing indices
        index2char: dictionary which maps index to character

    Returns:
        word: the decoded characters joined together, without the final one
    """

    decoded = ''.join(index2char[position] for position in indices)
    # The last decoded symbol is always cut off.
    return decoded[:-1]

def pad_lists(lists, pad_int, pad_len=None):
    """Pads or truncates every list to a common length and stacks them.

    Args:
        lists: list of lists containing indices
        pad_int: value appended to lists shorter than the common length
        pad_len: common length; defaults to the length of the longest list

    Returns:
        tensor: 2D tensor with one row per input list
    """

    if pad_len is None:
        pad_len = max(len(lst) for lst in lists)

    def fit(lst):
        missing = pad_len - len(lst)
        if missing > 0:
            return lst + [pad_int] * missing
        return lst[:pad_len]

    return torch.stack([torch.tensor(fit(lst)) for lst in lists])

def merge_lists(lists1, lists2):
    """Concatenates the lists of two lists of lists pairwise.

    The result has as many entries as the shorter of the two arguments.
    """

    return [first + second for first, second in zip(lists1, lists2)]

def grouper(iterable, n, fillvalue=None):
    """Yields consecutive tuples of n items, filling the last one with fillvalue."""
    # e.g. grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    # All n arguments are the very same iterator, so each output tuple
    # consumes n consecutive items from it.
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)

def accuracy(predictions, targets):
    """Returns the fraction of predictions that are equal to their target.

    Args:
        predictions: list of predicted items
        targets: list of expected items, aligned with predictions

    Returns:
        number of exact matches divided by len(predictions)
    """
    matches = sum(1 for guess, gold in zip(predictions, targets) if guess == gold)
    return matches / len(predictions)

def average_distance(predictions, targets):
    """Returns the mean Levenshtein distance between predictions and targets.

    Args:
        predictions: list of predicted strings
        targets: list of expected strings, aligned with predictions

    Returns:
        sum of the pairwise edit distances divided by len(predictions)
    """
    distance_sum = sum(
        Levenshtein.distance(guess, gold) for guess, gold in zip(predictions, targets)
    )
    return distance_sum / len(predictions)

def evaluate(predictions, targets):
    """Returns the tuple (accuracy, average Levenshtein distance)."""
    exact_match = accuracy(predictions, targets)
    mean_edit_distance = average_distance(predictions, targets)
    return exact_match, mean_edit_distance

In [4]:
# Load the full training file; the validation split is carved out of it below.
lemmas, tags, inflected_forms = load_data('./conll2018/task1/all/middle-french-train-high')

In [5]:
# Hold out 20% of the examples for validation; the fixed random_state keeps the split reproducible.
lemmas_train, lemmas_val, tags_train, tags_val, inflected_forms_train, inflected_forms_val = train_test_split(lemmas, tags, inflected_forms, test_size=0.2, random_state=42)

In [6]:
# Build the shared character/tag vocabulary from the training split only.
# char_vocab_size counts the character entries (including padding); tag features are indexed after them.
input2index, index2input, char_vocab_size = get_combined_index_dictionaries(lemmas_train, tags_train, inflected_forms_train)


# Train

In [7]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
batch_size = 32  # examples per optimisation step in train()
embedding_size = 300  # size of each character/tag embedding
hidden_size = 100  # LSTM hidden size (per direction in the encoder)
input_vocab_size = len(index2input.keys())  # characters plus tag features

In [9]:
# One embedding table is shared by the encoder input (characters and tags) and the decoder input.
Embedder = nn.Embedding(input_vocab_size, embedding_size, padding_idx=input2index[PAD_CHAR]).to(device)
# Bidirectional encoder; train()/test() sum its two final hidden states to initialise the decoder.
Encoder = nn.LSTM(embedding_size, hidden_size, batch_first=True, bidirectional=True).to(device)
Decoder = nn.LSTM(embedding_size, hidden_size, batch_first=True).to(device)
# Projects the decoder hidden state to one score per character; tag indices are never predicted.
linear1 = nn.Linear(hidden_size, char_vocab_size).to(device)
log_softmax = nn.LogSoftmax(dim=2).to(device)
# Index 0 is the padding index, so padded target positions do not contribute to the loss.
criterion = nn.NLLLoss(ignore_index=0)
# All trainable parameters, handed to the optimiser.
params = list(Embedder.parameters()) + list(Encoder.parameters()) + list(Decoder.parameters()) + list(linear1.parameters())

In [10]:
def test(lemmas_val, tags_val, inflected_forms_val, batch_size=32, max_length=25):
    """Greedily decodes an inflected form for every (lemma, tags) pair.

    Relies on the module-level Embedder, Encoder, Decoder, linear1,
    input2index, index2input, hidden_size and device.

    Args:
        lemmas_val: list of lemma
        tags_val: list of tags
        inflected_forms_val: list of inflected form (gold targets)
        batch_size: number of examples decoded together
        max_length: number of decoding steps, i.e. the maximum predicted length

    Returns:
        inflected_predicted: list of predicted inflected forms
        inflected_forms_true: list of gold inflected forms aligned with
            inflected_predicted. Within each batch both lists are ordered by
            decreasing input length, not by the order of the arguments.
    """

    inflected_predicted = []
    inflected_forms_true = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch in grouper(zip(lemmas_val, tags_val, inflected_forms_val), batch_size):
            # grouper pads the last batch with None.
            batch = [example for example in batch if example is not None]
            lemmas, tags, inflected_forms = zip(*batch)

            lemmas_indices = words_to_indices(lemmas, input2index, start_char=True, stop_char=True)
            tags_indices = tag_to_indices(tags, input2index)
            input_indices = merge_lists(lemmas_indices, tags_indices)

            # Sort by length of input sequence (required by pack_padded_sequence)
            input_indices, inflected_forms = zip(*sorted(zip(input_indices, inflected_forms), key=lambda x: len(x[0]), reverse=True))
            input_indices = [torch.tensor(lst) for lst in input_indices]

            # Encode
            input_tensor = pad_sequence(input_indices, padding_value=input2index[PAD_CHAR], batch_first=True).to(device)
            embedding = Embedder(input_tensor)
            lengths = [indices.shape[0] for indices in input_indices]
            packed_input = pack_padded_sequence(embedding, lengths, batch_first=True)
            _, (hidden, _) = Encoder(packed_input)

            # Decode: start from the sum of both encoder directions
            hidden_state = hidden[0, :, :] + hidden[1, :, :]
            hidden_state = hidden_state.unsqueeze(0)
            cell_state = torch.zeros(1, len(lengths), hidden_size).to(device)

            decoder_input = torch.tensor([input2index[START_CHAR]] * len(lengths)).to(device)
            decoder_input = Embedder(decoder_input).unsqueeze(1)

            outputs = []
            for _ in range(max_length):
                _, (hidden_state, cell_state) = Decoder(decoder_input, (hidden_state, cell_state))
                output = F.relu(linear1(hidden_state))
                # Feed the most likely character back in as the next input.
                decoder_input = Embedder(output.argmax(dim=2)).transpose(0, 1)
                # squeeze(0) removes only the layer dimension; a bare squeeze()
                # would also drop the batch dimension of a one-example batch.
                outputs.append(output.squeeze(0))

            batch_indices = torch.stack(outputs).argmax(dim=2).transpose(0, 1).cpu().numpy()
            for indices in batch_indices:
                # Keep everything before the first predicted STOP_CHAR.
                inflected_predicted.append(''.join([index2input[index] for index in indices]).split(STOP_CHAR)[0])
            inflected_forms_true += inflected_forms

    return inflected_predicted, inflected_forms_true

In [11]:
# Adagrad with its default hyper-parameters over all model parameters.
optimiser = optim.Adagrad(params)

In [12]:
def train(lemmas_train, tags_train, inflected_forms_train, epochs=1):
    """Trains the encoder-decoder with teacher forcing.

    Relies on the module-level Embedder, Encoder, Decoder, linear1,
    log_softmax, criterion, optimiser, input2index, batch_size, hidden_size
    and device. After every epoch the loss and the validation accuracy and
    distance (computed on the module-level lemmas_val, tags_val and
    inflected_forms_val) are printed.

    Args:
        lemmas_train: list of lemma
        tags_train: list of tags
        inflected_forms_train: list of inflected form (targets)
        epochs: number of passes over the training data
    """
    for epoch in range(epochs):

        epoch_loss = 0

        for batch in grouper(zip(lemmas_train, tags_train, inflected_forms_train), batch_size):
            # grouper pads the last batch with None.
            batch = [example for example in batch if example is not None]
            lemmas, tags, inflected_forms = zip(*batch)

            lemmas_indices = words_to_indices(lemmas, input2index, start_char=True, stop_char=True)
            tags_indices = tag_to_indices(tags, input2index)
            inflected_forms_indices = words_to_indices(inflected_forms, input2index)
            input_indices = merge_lists(lemmas_indices, tags_indices)

            # Sort by length of input sequence (required by pack_padded_sequence)
            input_indices, inflected_forms_indices = zip(*sorted(zip(input_indices, inflected_forms_indices), key=lambda x: len(x[0]), reverse=True))
            input_indices = [torch.tensor(lst) for lst in input_indices]

            optimiser.zero_grad()

            # Encode
            input_tensor = pad_sequence(input_indices, padding_value=input2index[PAD_CHAR], batch_first=True).to(device)
            embedding = Embedder(input_tensor)
            lengths = [indices.shape[0] for indices in input_indices]
            packed_input = pack_padded_sequence(embedding, lengths, batch_first=True)
            _, (hidden, _) = Encoder(packed_input)

            # Decode: start from the sum of both encoder directions
            hidden_state = hidden[0, :, :] + hidden[1, :, :]
            hidden_state = hidden_state.unsqueeze(0)
            cell_state = torch.zeros(1, len(lengths), hidden_size).to(device)

            # Target is the inflected form followed by STOP_CHAR; the decoder
            # input is the same form shifted right by one with START_CHAR.
            target = pad_lists([lst + [input2index[STOP_CHAR]] for lst in inflected_forms_indices], input2index[PAD_CHAR]).to(device)

            decoder_input = pad_lists(inflected_forms_indices, input2index[PAD_CHAR]).to(device)
            decoder_input = torch.cat([torch.tensor([input2index[START_CHAR]] * len(lengths)).unsqueeze(1).to(device), decoder_input], dim=1)
            decoder_input = Embedder(decoder_input)

            loss = 0

            max_length = target.shape[1]
            for seq in range(max_length):
                _, (hidden_state, cell_state) = Decoder(decoder_input[:, seq, :].unsqueeze(1), (hidden_state, cell_state))
                output = F.relu(linear1(hidden_state))
                # squeeze(0) removes only the layer dimension; a bare squeeze()
                # would also drop the batch dimension of a one-example batch
                # and break the loss computation.
                output = log_softmax(output).squeeze(0)
                loss += criterion(output, target[:, seq])

            epoch_loss += loss.item()
            loss.backward()
            optimiser.step()

        print("Epoch: {}/{}\tLoss: {:.4f}\tAccuracy: {:4f}\tDistance: {:.4f} ".format(epoch, epochs, epoch_loss / len(lemmas_train), *evaluate(*test(lemmas_val, tags_val, inflected_forms_val))))

In [13]:
# Request 150 epochs; one progress line is printed per epoch.
train(lemmas_train, tags_train, inflected_forms_train, epochs=150)

Epoch: 0/150	Loss: 0.7338	Accuracy: 0.001000	Distance: 6.2455 
Epoch: 1/150	Loss: 0.5336	Accuracy: 0.002000	Distance: 5.6285 
Epoch: 2/150	Loss: 0.4469	Accuracy: 0.010500	Distance: 5.2160 
Epoch: 3/150	Loss: 0.3919	Accuracy: 0.028000	Distance: 4.8560 
Epoch: 4/150	Loss: 0.3485	Accuracy: 0.035500	Distance: 4.6885 
Epoch: 5/150	Loss: 0.3141	Accuracy: 0.061000	Distance: 4.4365 
Epoch: 6/150	Loss: 0.2853	Accuracy: 0.087000	Distance: 4.1325 
Epoch: 7/150	Loss: 0.2606	Accuracy: 0.101500	Distance: 3.8835 
Epoch: 8/150	Loss: 0.2439	Accuracy: 0.124000	Distance: 3.7635 
Epoch: 9/150	Loss: 0.2298	Accuracy: 0.156500	Distance: 3.6090 
Epoch: 10/150	Loss: 0.2072	Accuracy: 0.197500	Distance: 3.3940 
Epoch: 11/150	Loss: 0.1958	Accuracy: 0.218500	Distance: 3.2905 
Epoch: 12/150	Loss: 0.1831	Accuracy: 0.242500	Distance: 3.2190 
Epoch: 13/150	Loss: 0.1718	Accuracy: 0.257500	Distance: 3.1705 
Epoch: 14/150	Loss: 0.1620	Accuracy: 0.296500	Distance: 3.0035 
Epoch: 15/150	Loss: 0.1502	Accuracy: 0.325000	Dist

KeyboardInterrupt: 

In [14]:
# Each printed tuple is (accuracy, average Levenshtein distance), despite the "accuracy" labels.
print("Train accuracy:", evaluate(*test(lemmas_train, tags_train, inflected_forms_train)), "Validation accuracy: ", evaluate(*test(lemmas_val, tags_val, inflected_forms_val)))

Train accuracy: (0.968375, 0.154375) Validation accuracy:  (0.7745, 0.7125)


In [None]:
# Pair every validation prediction with its gold inflected form for inspection.
list(zip(*test(lemmas_val, tags_val, inflected_forms_val)))

# Development Dataset

In [15]:
# Load the separate development file.
lemmas_dev, tags_dev, inflected_forms_dev = load_data('./conll2018/task1/all/middle-french-dev')

In [16]:
# Returns (accuracy, average Levenshtein distance) on the development data.
evaluate(*test(lemmas_dev, tags_dev, inflected_forms_dev))

(0.788, 0.631)

Results, reported as (accuracy, average Levenshtein distance):  
Hindi/high - (0.629, 1.427)  
Middle-French/high - (0.788, 0.631)


In [17]:
# Print every development prediction next to its gold inflected form.
for pred, true in list(zip(*test(lemmas_dev, tags_dev, inflected_forms_dev))):
    print(pred, true)

enconvenencierois enconvenencierois
pourpensiez pourpensiez
accompaignerez accompaignerez
allaictois allaictois
adoiusteryons adioustoyent
menasseroyt menassoys
pourforce pourforce
approuchie approuche
effectuas effectuas
evaporiez evaporiez
besoigneroient besoigneroient
desistas desista
baptisastes baptisastes
essayoys essayoys
afferampe afferma
armoias armoia
essilieroys essilieroys
exploreriez exploreriez
aornois aornois
esperassiez esperassiez
evityez evityez
gaigna gaigna
eslancent eslancent
baysast baysassions
desyrent desyrent
estimerions estimerions
affieroyt affieront
mesleroient mesleroient
resveroyt resveroyt
oublye oublye
cryes cryes
en deslyant en deslyant
esgratignasse esgratignasse
accomplissoyt accomplissoyy
entitulyez entitulyez
allaictyez allaictyez
gouvernyons gouvernyons
enchargeassiez enchargeassent
accointyez accointyez
eschaufferiez eschaufferiez
troussois troussois
inquietons inquietons
souspirera souspirera
esbauchons esbauchons
esquachent esquachent
esguisas e