arnsholt/Algorithm-Viterbi

Subversion checkout URL

You can clone with HTTPS or Subversion.

Added code and tests for unsmoothed training.

commit f6a791e9286c4d797506c50cdb9eded52629afd7 1 parent 32e90de
authored
2  .gitignore
 @@ -1 +1,3 @@ .*.swp +*.pir +Makefile
108 lib/Algorithm/Viterbi.pm
 @@ -8,10 +8,39 @@ our class End {}; # TODO: our role Observation {}; +my grammar Grammar { + token TOP { + + + [ \$ || <.panic: "Syntax error"> ] + } + + token chunk { + + \n + } + + token record { + \$=[\w+] \t \$=[\w+] \n + } +} + +my class Actions { + method TOP(\$/) { + make \$>>.ast; + } + + method chunk(\$/) { + make \$>>.ast; + } + + method record(\$/) { + make ~\$ => ~\$; + } +} + has @!alphabet; # The HMM's alphabet has %!name-to-index; -has %!p-transition; -has %!p-emission; +has %.p-transition; +has %.p-emission; method BUILD(:@alphabet) { @!alphabet = @alphabet; @@ -19,6 +48,9 @@ method BUILD(:@alphabet) { for @!alphabet.kv -> \$index, \$state { %!name-to-index{\$state} = \$index; } + + %!p-transition = {}; + %!p-emission = {}; } # TODO: Algorithm::Viterbi on CPAN also computes the Forward probability of @@ -33,11 +65,10 @@ method decode(\$hmm: @input) { my @trellis; my @trace; - # TODO: Initialise the first row in the trellis with the initial - # probabilities. + my \$first = @input.shift; # Shift the first observation off the input. for ^@!alphabet -> \$state { @trellis[0][\$state] = %!p-transition{Start}{\$state} - * %!p-emission{\$state}{@input[0]}; + * %!p-emission{\$state}{\$first}; @trace[0][\$state] = \$!initial-state; } @@ -45,6 +76,17 @@ method decode(\$hmm: @input) { for @input.kv -> \$index, \$observation { for ^@!alphabet -> \$state { # TODO: Get argmax here. + my (\$max-p, \$i) = (0, 0); + for ^@!alphabet -> \$prev-state { + my \$new-p = @trellis[\$index][\$prev-state] * + %!p-transition{\$prev-state}{\$state}; + + if \$new-p > \$max-p { + \$max-p = \$new-p; + \$i = \$prev-state; + } + } + @trellis[\$index+1][\$state] = \$max-p; @trace[\$index+1][\$state] = \$i; } @@ -59,12 +101,54 @@ method decode(\$hmm: @input) { # TODO: Get the best list of events from the trellis and return it. } -# Compute unsmoothed bigram probabilities from some kind of input. An array of -# arrays perhaps? 
-#multi method train(\$hmm: Array of Array of Observation @inputs) { -multi method train(\$hmm: @inputs) { +# Compute unsmoothed bigram probabilities from an input file. +multi method train(\$hmm: Str \$file) { + my \$res = Grammar.parsefile(\$file, :actions(Actions.new)); + \$hmm.train(\$res.ast); } -# TODO: How does file IO work in P6? -#multi method train(\$hmm: \$file) { -#} +#multi method train(\$hmm: Array of Pair @input) { +multi method train(\$hmm: @input) { + # First, count the number of transitions between pairs of tags, and + # emission counts for each tag-observation pair. + for @input -> @sequence { + my \$prev = Start; + for @sequence -> \$pair { + my (\$observation, \$tag) = (\$pair.key, \$pair.value); + + # Increment transition count. + %!p-transition{\$prev} //= {}; + %!p-transition{\$prev}{\$tag}++; + # Increment emission count. + %!p-emission{\$tag} //= {}; + %!p-emission{\$tag}{\$observation}++; + + \$prev = \$tag; + } + + %!p-transition{\$prev} //= {}; + %!p-transition{\$prev}{End}++; + } + + # XXX: Development testing code + #say %!p-transition{Start}; # Should be: 77 + #say %!p-transition; # Should be: 26 + #say %!p-transition{End}; # Should be: 44 + #say %!p-emission<3>; # Should be: 20 + + # Compute the actual transition probabilities. + for %!p-transition.kv -> \$from, %to { + my \$sum = [+] %to.values; + for %to.keys -> \$k { + %to{\$k} /= \$sum; + } + } + + # Compute the actual emission probabilities. + for %!p-emission.kv -> \$tag, %value { + my \$sum = [+] %value.values; + for %value.keys -> \$k { + %value{\$k} /= \$sum; + } + } +}
9 t/00-basic.t
 @@ -1,5 +1,14 @@ use v6; +use Test; + use Algorithm::Viterbi; my Algorithm::Viterbi \$hmm .= new(:alphabet<H C>); +pass("creating new decoder"); + +\$hmm.train("t/eisner.tt"); +ok(\$hmm.p-transition<C><H> == 13/68, "C -> H == 13/68?"); +ok(\$hmm.p-emission<C><3> == 5/34, "C -> 3 == 5/34?"); + +done_testing;
459 t/eisner.tt
 @@ -0,0 +1,459 @@ +2 H +3 H +2 C + +1 H +2 H +3 H +3 H + +3 H +3 H +2 H +2 H + +1 C +2 C +1 C + +2 H +3 H +1 C + +3 H +1 C +1 C + +3 C +1 C +2 C +1 C +1 H + +1 H +1 C +2 C + +3 H +1 H +1 H + +2 H +1 H +3 H + +2 C +2 H +3 C + +2 H +2 C +3 C + +1 H +2 C +2 C + +2 H +2 H +3 H + +3 H +1 C +3 H +3 H + +2 H +2 H +3 H + +3 H +1 C +2 C +3 C +2 C + +3 H +2 H +2 H +3 H +1 C + +1 H +1 H +1 C + +2 H +3 H +2 C + +1 C +2 H +3 H + +3 H +3 H +3 H +3 H + +2 H +2 C +1 C + +1 C +1 C +2 H +3 H +2 H + +1 H +2 H +3 H + +3 H +1 H +1 H + +2 H +2 C +1 C + +3 H +1 H +1 H +3 H +1 H + +1 H +3 C +1 C + +1 H +3 H +3 H +3 H +2 H + +1 C +3 C +1 H + +1 C +1 C +1 H + +2 H +1 C +2 C + +1 C +1 C +2 H + +2 C +2 H +3 H + +2 H +1 C +1 C +1 H +2 C + +3 H +3 C +2 H + +3 H +2 H +2 C + +1 H +2 H +2 H + +1 C +1 C +2 C +2 C +1 C + +1 H +2 H +2 H +1 C + +2 H +3 H +3 C +2 H + +3 H +2 H +1 H + +2 C +2 C +3 C +2 H + +3 H +2 H +2 H + +3 H +2 H +1 C +3 C +1 H + +1 H +2 H +3 H +3 C +2 C + +1 H +1 C +3 C + +3 H +2 C +1 C + +2 H +1 H +1 H + +2 H +1 H +3 H + +1 C +3 H +1 C + +1 H +2 H +3 H + +1 H +2 H +1 C + +1 C +2 H +2 H +2 H +1 H + +3 H +3 H +1 H + +3 H +3 C +3 H + +3 C +1 C +2 C + +3 H +3 C +2 C +2 C +2 H + +3 H +3 H +2 H + +2 H +3 H +3 H + +1 H +1 H +1 H + +1 C +1 C +1 C +3 H + +2 H +3 H +2 C + +2 H +1 C +1 C +1 C + +3 C +1 C +1 C +3 C +1 C + +2 H +2 H +1 C + +1 C +1 C +1 C + +3 H +3 H +3 C +2 C +1 C + +2 H +3 H +3 H + +3 H +3 H +3 H + +2 H +1 H +1 H + +1 H +1 H +2 C +2 C +3 H + +3 H +1 H +3 H +3 H + +2 H +2 H +3 H + +2 H +1 H +3 H + +1 H +2 H +2 H +2 H + +2 H +2 C +1 C + +1 C +1 C +2 C +2 H +3 H + +1 C +2 C +3 C +1 C + +2 H +2 C +2 C + +2 H +2 H +1 C +2 H +1 C + +1 C +1 C +1 C + +1 C +2 C +2 H + +3 H +1 H +1 C + +1 H +1 H +1 H + +2 H +2 C +2 C +1 C +1 H + +2 H +2 C +2 C +2 C +2 C + +3 H +3 H +1 H +3 H + +1 H +2 H +3 H + +1 H +1 H +1 C + +2 H +2 H +1 C +3 H + +2 H +2 H +2 H +1 H +1 H + +2 H +2 H +3 H + +2 C +2 C +1 C +1 C + +2 H +2 H +2 H +2 H +3 C + +2 C +1 H +3 H +3 H + +1 H +3 H +2 C + +3 H +1 H +2 C + +3 H +2 
H +1 H +3 H +