Permalink
Browse files

Added code and tests for unsmoothed training.

  • Loading branch information...
1 parent 32e90de commit f6a791e9286c4d797506c50cdb9eded52629afd7 @arnsholt committed Jun 7, 2010
Showing with 566 additions and 12 deletions.
  1. +2 −0 .gitignore
  2. +96 −12 lib/Algorithm/Viterbi.pm
  3. +9 −0 t/00-basic.t
  4. +459 −0 t/eisner.tt
View
@@ -1 +1,3 @@
.*.swp
+*.pir
+Makefile
View
@@ -8,17 +8,49 @@ our class End {};
# TODO:
our role Observation {};
+my grammar Grammar { # Grammar for tagged training data: chunks of observation<TAB>tag records.
+ token TOP { # One or more chunks, then end of input; anything left over triggers the panic.
+ <chunk>+
+ [ $ || <.panic: "Syntax error"> ]
+ }
+
+ token chunk { # One or more records followed by one extra newline (i.e. a blank line).
+ <record>+ \n
+ }
+
+ token record { # One line: word-character observation, a tab, word-character tag, a newline.
+ $<observation>=[\w+] \t $<tag>=[\w+] \n
+ }
+}
+
+my class Actions { # Action methods that attach an AST to each Grammar match.
+ method TOP($/) { # AST of the whole parse: the ASTs of all its chunks.
+ make $<chunk>>>.ast;
+ }
+
+ method chunk($/) { # AST of a chunk: the ASTs of all its records.
+ make $<record>>>.ast;
+ }
+
+ method record($/) { # AST of a record: a Pair observation => tag, both stringified.
+ make ~$<observation> => ~$<tag>;
+ }
+}
+
has @!alphabet; # The HMM's alphabet
has %!name-to-index;
-has %!p-transition;
-has %!p-emission;
+has %.p-transition;
+has %.p-emission;
method BUILD(:@alphabet) { # Store the alphabet, index it by name and reset both probability tables.
@!alphabet = @alphabet;
for @!alphabet.kv -> $index, $state { # Record the position of every alphabet entry under its name.
%!name-to-index{$state} = $index;
}
+
+ %!p-transition = {}; # Both tables start out empty; train fills them.
+ %!p-emission = {};
}
# TODO: Algorithm::Viterbi on CPAN also computes the Forward probability of
@@ -33,18 +65,28 @@ method decode($hmm: @input) {
my @trellis;
my @trace;
- # TODO: Initialise the first row in the trellis with the initial
- # probabilities.
+ my $first = @input.shift; # Shift the first observation off the input.
for ^@!alphabet -> $state {
@trellis[0][$state] = %!p-transition{Start}{$state}
- * %!p-emission{$state}{@input[0]};
+ * %!p-emission{$state}{$first};
@trace[0][$state] = $!initial-state;
}
# TODO: Iterate over the input, calculating probabilities as we go.
for @input.kv -> $index, $observation {
for ^@!alphabet -> $state {
# TODO: Get argmax here.
+ my ($max-p, $i) = (0, 0);
+ for ^@!alphabet -> $prev-state {
+ my $new-p = @trellis[$index][$prev-state] *
+ %!p-transition{$prev-state}{$state};
+
+ if $new-p > $max-p {
+ $max-p = $new-p;
+ $i = $prev-state;
+ }
+ }
+
@trellis[$index+1][$state] = $max-p;
@trace[$index+1][$state] = $i;
}
@@ -59,12 +101,54 @@ method decode($hmm: @input) {
# TODO: Get the best list of events from the trellis and return it.
}
-# Compute unsmoothed bigram probabilities from some kind of input. An array of
-# arrays perhaps?
-#multi method train($hmm: Array of Array of Observation @inputs) {
-multi method train($hmm: @inputs) {
+# Compute unsmoothed bigram probabilities from an input file.
+#
+# $file is the path of a training file in the format accepted by Grammar:
+# chunks of "observation<TAB>tag" lines, each chunk ended by a blank line.
+# Dies if the file cannot be parsed; otherwise hands the parsed sequences to
+# the list-based train candidate.
+multi method train($hmm: Str $file) {
+ my $res = Grammar.parsefile($file, :actions(Actions.new));
+
+ # A file with no leading record makes TOP fail before it reaches the panic
+ # branch, so check the match instead of calling .ast on a failed parse.
+ die "Could not parse training file '$file'" unless $res;
+
+ $hmm.train($res.ast);
}
-# TODO: How does file IO work in P6?
-#multi method train($hmm: $file) {
-#}
+#multi method train($hmm: Array of Pair @input) {
+multi method train($hmm: @input) { # @input: list of sequences, each a list of observation => tag Pairs.
+ # First, count the number of transitions between pairs of tags, and
+ # emission counts for each tag-observation pair.
+ for @input -> @sequence {
+ my $prev = Start; # Every sequence begins in the Start pseudo-state.
+ for @sequence -> $pair {
+ my ($observation, $tag) = ($pair.key, $pair.value);
+
+ # Increment transition count.
+ %!p-transition{$prev} //= {};
+ %!p-transition{$prev}{$tag}++;
+ # Increment emission count.
+ %!p-emission{$tag} //= {};
+ %!p-emission{$tag}{$observation}++;
+
+ $prev = $tag;
+ }
+
+ %!p-transition{$prev} //= {};
+ %!p-transition{$prev}{End}++; # Close the sequence with a transition from its last tag to End.
+ }
+
+ # XXX: Development testing code
+ #say %!p-transition{Start}<H>; # Should be: 77
+ #say %!p-transition<C><H>; # Should be: 26
+ #say %!p-transition<C>{End}; # Should be: 44
+ #say %!p-emission<C><3>; # Should be: 20
+
+ # Compute the actual transition probabilities.
+ for %!p-transition.kv -> $from, %to { # Divide every count in a row by that row's total.
+ my $sum = [+] %to.values;
+ for %to.keys -> $k {
+ %to{$k} /= $sum;
+ }
+ }
+
+ # Compute the actual emission probabilities.
+ for %!p-emission.kv -> $tag, %value { # Same normalisation, per tag over its observations.
+ my $sum = [+] %value.values;
+ for %value.keys -> $k {
+ %value{$k} /= $sum;
+ }
+ }
+} # NOTE(review): counts are normalised in place, so a second train call would add raw counts to probabilities - confirm single-call use.
View
@@ -1,5 +1,14 @@
use v6;
+use Test;
+
use Algorithm::Viterbi;
my Algorithm::Viterbi $hmm .= new(:alphabet<H C>); # Decoder over the two-entry alphabet H, C.
+pass("creating new decoder");
+
+$hmm.train("t/eisner.tt"); # Exercises the file-name candidate of train.
+ok($hmm.p-transition<C><H> == 13/68, "C -> H == 13/68?"); # Trained transition probability C -> H.
+ok($hmm.p-emission<C><3> == 5/34, "C -> 3 == 5/34?"); # Trained emission probability of 3 under tag C.
+
+done_testing;
Oops, something went wrong.

0 comments on commit f6a791e

Please sign in to comment.