In [269]:
path_to_input = "hmm-training-data/it_isdt_train_tagged.txt"
path_to_predict = "hmm-training-data/ja_gsd_train_tagged.txt"

In [270]:
sentences = []
with open(path_to_input, "r") as fp:
    for line in fp.readlines():
        sentences.append( line.strip().split(" ") )
len(sentences)

13121

In [271]:
def seperate_tag_word(word_tag):
    i = len(word_tag)-1
    while word_tag[i] != "/": i -= 1
    return word_tag[:i], word_tag[i+1:]

emission_list = defaultdict(list)
for sentence in sentences:
    for word_tag in sentence:
        word, tag = seperate_tag_word(word_tag)
        emission_list[tag].append(word)

sorted_tag_list = []
vocabulary = set()
emission_matrix = defaultdict(lambda: defaultdict(float))
for tag, words in emission_list.items():
    v = 1/len(words)
    sorted_tag_list.append( (v, tag) )
    for word in words:
        emission_matrix[tag][word] += v
        vocabulary.add(word)
len(emission_matrix)

39

In [272]:
sorted_tag_list = [ t for _,t in sorted(sorted_tag_list) ]

In [273]:
transition_list = defaultdict(list)
for sentence in sentences:
    n = len(sentence)
    tag_list = []
    for word_tag in sentence:
        tag = seperate_tag_word(word_tag)[1]
        tag_list.append(tag)
    
    transition_list[""].append( tag_list[0] )
    for i in range(1,n):
        transition_list[ tag_list[i-1] ].append( tag_list[i] )

transition_matrix = defaultdict(lambda: defaultdict(float))
for prev_tag, tags in transition_list.items():
    n = len(tags)+len(sorted_tag_list)
    count = dict()
    for tag in tags: count[tag] = count.get(tag,0)+1
    for tag in sorted_tag_list:
        transition_matrix[prev_tag][tag] = (count.get(tag,0)+1)/n
len(transition_matrix)

40

In [234]:
import json
model = {
    "transition": transition_matrix,
    "emission": emission_matrix,
    "vocab": list(vocabulary),
    "tags": sorted_tag_list  
}
with open("hmmmodel.txt", "w") as fp:
    json.dump(model, fp)

## Viterbi Decoding Algorithm

In [274]:
path_to_decode = "hmm-training-data/it_isdt_dev_raw.txt"
sentences = []
with open(path_to_decode, "r") as fp:
    for line in fp.readlines():
        sentences.append( line.strip().split(" ") )
len(sentences)

564

In [276]:
import json
with open("hmmmodel.txt", "r") as fp:
    model = json.load(fp)
transition = model["transition"]
emission = model["emission"]
vocab = set(model["vocabulary"])
sorted_tags = model["tags"]
len(transition), len(emission), len(vocab), len(tags)

(40, 39, 28307, 1)

In [284]:
def decode(sentence, emission, transition, sorted_tags, vocab):
    probability = defaultdict(float)
    backpointer = dict()
    t_n = len(sorted_tags)
    
    n = len(sentence)
    if sentence[0] not in vocab:
        for tag in sorted_tags[:t_n//2]:
            probability[ (tag,0) ] = transition[""][tag]
            backpointer[ (tag,0) ] = None
    else:
        for tag in sorted_tags:
            if sentence[0] not in emission[tag]: continue
            probability[ (tag,0) ] = transition[""][tag]*emission[tag][sentence[0]]
            backpointer[ (tag,0) ] = None
    
    for i in range(1,n):
        for prev_tag in sorted_tags:
            if sentence[i] not in vocab:
                for curr_tag in sorted_tags[:t_n//2]:
                    prob = probability[(prev_tag,i-1)]*transition[prev_tag][curr_tag]
                    if prob > probability[ (curr_tag,i) ]:
                        probability[ (curr_tag,i) ] = prob
                        backpointer[ (curr_tag,i) ] = prev_tag
            else:
                for curr_tag in sorted_tags:
                    if sentence[i] not in emission[curr_tag]:
                        prob = 0
                    else:
                        prob = probability[(prev_tag,i-1)]*transition[prev_tag][curr_tag]*emission[curr_tag][sentence[i]]
                    
                    if prob > probability[ (curr_tag,i) ]:
                        probability[ (curr_tag,i) ] = prob
                        backpointer[ (curr_tag,i) ] = prev_tag
    
    max_probable_last_tag, prob = None, 0
    for tag in sorted_tags:
        if probability.get( (tag,n-1), 0 ) > prob:
            prob = probability[(tag,n-1)]
            max_probable_last_tag = tag
    
    return backpointer, max_probable_last_tag, n

In [285]:
def get_tags_via_backpointer(backpointer, last_tag, n):
    tags, i = [last_tag], n-1
    while backpointer[ (last_tag, i) ] != None:
        last_tag = backpointer[ (last_tag, i) ]
        tags.append(last_tag)
        i -= 1
    return tags[::-1]

In [286]:
s = "Corriere Sport da pagina 23 a pagina 26".split(" ")

In [287]:
backpointer, last_tag, n = decode(s, emission, transition, sorted_tag_list, vocab)
tags = decode_via_backpointer(backpointer, last_tag, n)

In [288]:
" ".join([f"{word}/{tag}" for word, tag in zip(s, tags)])

'Corriere/SP Sport/SP da/E pagina/S 23/N a/E pagina/S 26/N'

In [294]:
prediction = "hmmoutput.txt"
truth = "hmm-training-data/it_isdt_dev_tagged.txt"

In [295]:
prediction_tags = []
with open(prediction, "r") as fp:
    for line in fp.readlines():
        prediction_tags.extend( line.strip().split(" ") )

In [296]:
prediction_tags[:10]

['Corriere/SP',
 'Sport/SP',
 'da/E',
 'pagina/S',
 '23/N',
 'a/E',
 'pagina/S',
 '26/N',
 'I/RD',
 'tre/N']

In [297]:
true_tags = []
with open(truth, "r") as fp:
    for line in fp.readlines():
        true_tags.extend( line.strip().split(" ") )

In [298]:
true_tags[:10]

['Corriere/SP',
 'Sport/SP',
 'da/E',
 'pagina/S',
 '23/N',
 'a/E',
 'pagina/S',
 '26/N',
 'I/RD',
 'tre/N']

In [300]:
correct = 0
for p,t in zip(prediction_tags, true_tags):
    if p==t: correct+=1
correct/len(true_tags)

0.9305508901578771