In [1]:
import os
import sys
import timeit

import dynet as dy
import ipdb

from extractor import get_data
from helper import time
from simple_pos_tagger import SimplePOSTagger

In [2]:
# Train / dev CoNLL-U files, given relative to the notebook's working directory.
TRAIN_PATH = "data/da_ddt-ud-train.conllu"
DEV_PATH = "data/da_ddt-ud-dev.conllu"

# get_data's result is unpacked as (inputs, labels, tags, vocab).
train_inputs, train_labels, tags, vocab = get_data(TRAIN_PATH)

# Only the inputs and labels of the dev split are kept; the tag and vocab values
# returned for it are discarded, so `tags` / `vocab` come from the training file.
val_inputs, val_labels, _, _ = get_data(DEV_PATH)

In [3]:
def to_input(word2int, word, unknown = 0):
    """
    Look up the integer id of a word, with a fallback for unseen words.

    Args:
        word2int: mapping from word to integer id.
        word: the word to look up.
        unknown: value returned when `word` is not a key of `word2int`
            (defaults to 0).

    Returns:
        `word2int[word]` when the word is present, otherwise `unknown`.
    """
    # Single lookup: .get returns the fallback without touching the mapping.
    return word2int.get(word, unknown)

In [4]:
# Id 0 is reserved for the "<UNK>" placeholder; the vocabulary follows from id 1.
int2word = ["<UNK>"] + vocab
word2int = {token: index for index, token in enumerate(int2word)}

# A tag's id is its position in the `tags` list.
int2tag = tags
tag2int = {tag: index for index, tag in enumerate(int2tag)}

# NOTE: the word / tag sequences are overwritten by their integer ids below, so
# the data-loading cell has to be re-run before this cell can be run again.

# Training words are indexed directly: a word missing from word2int raises KeyError.
train_inputs = [
    [word2int[token] for token in sentence]
    for sentence in train_inputs
]
train_labels = [
    [tag2int[tag] for tag in sentence_tags]
    for sentence_tags in train_labels
]

# Validation words missing from word2int fall back to id 0 ("<UNK>") via to_input.
val_inputs = [
    [to_input(word2int, token, 0) for token in sentence]
    for sentence in val_inputs
]
val_labels = [
    [tag2int[tag] for tag in sentence_tags]
    for sentence_tags in val_labels
]

In [8]:
# Sizes derived from the data: one id per int2word entry (including "<UNK>")
# and one output per tag.
VOCAB_SIZE, OUTPUT_DIM = len(int2word), len(tags)

# Hand-picked model hyper-parameters.
EMBED_SIZE, HIDDEN_DIM = 86, 16

my_tagger = SimplePOSTagger(
    vocab_size=VOCAB_SIZE,
    output_size=OUTPUT_DIM,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_DIM,
)

In [6]:
# Single source of truth for where the tagger's parameters are stored.
MODEL_PATH = "tmp/tmp.model"

# NOTE(review): under a Jupyter kernel sys.argv presumably holds the kernel
# launcher's own arguments, so the "load" branch is likely only reachable when
# this notebook is exported and run as a script (`python script.py load`) -- confirm.
if len(sys.argv) > 1 and sys.argv[1] == "load":
    print("Loading...")
    my_tagger.load(MODEL_PATH)
else:
    print("Training...")
    # `time` is the project helper imported from helper.py (not the stdlib module);
    # its result is unpacked as (return value of fit, elapsed time).
    result, elapsed = time(my_tagger.fit, train_inputs, train_labels)
    print(f"Training time {elapsed}")
    print(result)
    # Create the target directory first so saving does not depend on a
    # pre-existing tmp/ folder (e.g. on a fresh checkout).
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    my_tagger.save(MODEL_PATH)


Training...
Training time 2.087472979999802
None


In [7]:
# Index of the validation sentence to inspect.
idx = 3

# Map the sentence's word ids back to strings; id 0 is rendered as "<UNK>".
sentence = [int2word[word_id] for word_id in val_inputs[idx]]
print(f"Predicting: {' '.join(sentence)}")

# The tagger's output and the gold labels are both decoded to tag names via int2tag.
prediction = [int2tag[tag_id] for tag_id in my_tagger.predict(val_inputs[idx])]
labels = [int2tag[tag_id] for tag_id in val_labels[idx]]

# One tab-separated row per token: word, predicted tag, gold tag, and whether they match.
print("Word\tPredicted\tExpected\tSame")
rows = zip(sentence, prediction, labels)
for word, predicted, expected in rows:
    print(f"{word}\t{predicted}\t\t{expected}\t{predicted==expected}")

Predicting: <UNK> <UNK> med vand og får et <UNK> <UNK> udseende , som man dog ikke skal lade sig narre af !
Word	Predicted	Expected	Same
<UNK>	NOUN		PROPN	False
<UNK>	NOUN		VERB	False
med	ADP		ADP	True
vand	NOUN		NOUN	True
og	CCONJ		CCONJ	True
får	VERB		VERB	True
et	DET		DET	True
<UNK>	NOUN		ADJ	False
<UNK>	NOUN		ADJ	False
udseende	NOUN		NOUN	True
,	PUNCT		PUNCT	True
som	PRON		ADP	False
man	PRON		PRON	True
dog	ADV		ADV	True
ikke	ADV		ADV	True
skal	AUX		AUX	True
lade	VERB		VERB	True
sig	PRON		PRON	True
narre	VERB		VERB	True
af	ADP		ADP	True
!	PUNCT		PUNCT	True
