Fixed grammar packing, big change

- GrammarPacker now uses the appropriate {Moses,Hiero}FormatReader objects in explore() and binarize() passes, instead of doing its own parsing - MosesFormatReader chains to HieroFormatReader after munging input, removes some redundancy - Updated test case
apache · May 24, 2016 · f5adcde · fhieber · May 25, 2016 · fhieber
1 parent 210573b
commit f5adcde
Show file tree

Hide file tree

Showing 10 changed files with 66 additions and 127 deletions.
diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
@@ -90,7 +90,7 @@ sub usage {
 my $grammars = join(" ", @sorted_grammars);
 my $outputs  = join(" ", @outputs);
 my $alignments = $opts{a} ? "--ga" : "";
-my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
+my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/lib/guava-19.0.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
 print STDERR "Packing with $cmd...\n" if $opts{v};
 
 my $retval = system($cmd);

diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@ -18,6 +18,8 @@
  */
 package joshua.decoder.ff.tm.format;
 
+import java.io.IOException;
+
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.tm.GrammarReader;
 import joshua.decoder.ff.tm.Rule;
@@ -41,7 +43,7 @@ public HieroFormatReader() {
     super();
   }
 
-  public HieroFormatReader(String grammarFile) {
+  public HieroFormatReader(String grammarFile) throws IOException {
     super(grammarFile);
   }
 
@@ -52,7 +54,7 @@ public Rule parseLine(String line) {
       throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
     }
 
-    int lhs = Vocabulary.id(FormatUtils.stripNonTerminalIndex(fields[0]));
+    int lhs = Vocabulary.id(fields[0]);
 
     /**
      * On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
@@ -61,16 +63,14 @@ public Rule parseLine(String line) {
     String[] sourceWords = fields[1].split("\\s+");
     int[] sourceIDs = new int[sourceWords.length];
     for (int i = 0; i < sourceWords.length; i++) {
+      sourceIDs[i] = Vocabulary.id(sourceWords[i]);
       if (FormatUtils.isNonterminal(sourceWords[i])) {
-        Vocabulary.id(sourceWords[i]);
         sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
         arity++;
 
         // TODO: the arity here (after incrementing) should match the rule index. Should
         // check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
         // error if not
-      } else {
-        sourceIDs[i] = Vocabulary.id(sourceWords[i]);
       }
     }
 
@@ -86,10 +86,9 @@ public Rule parseLine(String line) {
     String[] targetWords = fields[2].split("\\s+");
     int[] targetIDs = new int[targetWords.length];
     for (int i = 0; i < targetWords.length; i++) {
+      targetIDs[i] = Vocabulary.id(targetWords[i]);
       if (FormatUtils.isNonterminal(targetWords[i])) {
         targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
-      } else {
-        targetIDs[i] = Vocabulary.id(targetWords[i]);
       }
     }
 

diff --git a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -18,6 +18,8 @@
  */
 package joshua.decoder.ff.tm.format;
 
+import java.io.IOException;
+
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.tm.Rule;
 import joshua.util.io.LineReader;
@@ -44,7 +46,7 @@ public class MosesFormatReader extends HieroFormatReader {
 
   private int lhs;
 
-  public MosesFormatReader(String grammarFile) {
+  public MosesFormatReader(String grammarFile) throws IOException {
     super(grammarFile);
     this.lhs = Vocabulary.id("[X]");
   }
@@ -76,44 +78,23 @@ public Rule parseLine(String line) {
     String[] fields = line.split(fieldDelimiter);
 
     int arity = 1;
-    int fieldIndex = 0;
-
-    // foreign side
-    String[] foreignWords = fields[fieldIndex].split("\\s+");
-    int[] french = new int[foreignWords.length + 1];
-    french[0] = lhs; 
-    for (int i = 0; i < foreignWords.length; i++) {
-      french[i+1] = Vocabulary.id(foreignWords[i]);
-    }
 
-    // English side
-    fieldIndex++;
-    String[] englishWords = fields[fieldIndex].split("\\s+");
-    int[] english = new int[englishWords.length + 1];
-    english[0] = -1;
-    for (int i = 0; i < englishWords.length; i++) {
-      english[i+1] = Vocabulary.id(englishWords[i]);
-    }
+    StringBuffer hieroLine = new StringBuffer();
+    hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");
 
-    // transform feature values
-    fieldIndex++;
-
-    String mosesFeatureString = fields[fieldIndex];
-    StringBuffer values = new StringBuffer();
+    String mosesFeatureString = fields[2];
     for (String value: mosesFeatureString.split(" ")) {
       float f = Float.parseFloat(value);
-      values.append(String.format("%f ", f <= 0.0 ? -100 : -Math.log(f)));
+      hieroLine.append(String.format(" %f", f <= 0.0 ? -100 : -Math.log(f)));
     }
 
-    String sparse_features = values.toString().trim();
-
-//    System.out.println(String.format("parseLine: %s\n  ->%s", line, sparse_features));
-
     // alignments
-    fieldIndex++;
-    String alignment = (fields.length > fieldIndex) ? fields[fieldIndex] : null;
+    if (fields.length >= 4)
+      hieroLine.append(" ||| " + fields[3]);
 
-    return new Rule(lhs, french, english, sparse_features, arity, alignment);
+    System.err.println(String.format("LINE: %s -> %s", line, hieroLine.toString()));
+
+    return super.parseLine(hieroLine.toString());
   }
 
   /**

diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -123,7 +123,7 @@ public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String
     this.printGrammar();
   }
 
-  protected GrammarReader<Rule> createReader(String format, String grammarFile) {
+  protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
 
     if (grammarFile != null) {
       if ("hiero".equals(format) || "thrax".equals(format) || "regexp".equals(format)) {

diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
@@ -37,6 +37,9 @@
 import java.util.logging.Logger;
 
 import joshua.corpus.Vocabulary;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.ff.tm.format.HieroFormatReader;
+import joshua.decoder.ff.tm.format.MosesFormatReader;
 import joshua.util.FormatUtils;
 import joshua.util.encoding.EncoderConfiguration;
 import joshua.util.encoding.FeatureTypeAnalyzer;
@@ -154,13 +157,12 @@ private void readConfig(String config_filename) throws IOException {
    */
   public void pack() throws IOException {
     logger.info("Beginning exploration pass.");
-    LineReader grammar_reader = null;
-    LineReader alignment_reader = null;
 
     // Explore pass. Learn vocabulary and feature value histograms.
     logger.info("Exploring: " + grammar);
-    grammar_reader = new LineReader(grammar);
-    explore(grammar_reader);
+
+    HieroFormatReader grammarReader = getGrammarReader();
+    explore(grammarReader);
 
     logger.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
     if (dump != null) {
@@ -194,78 +196,56 @@ public void pack() throws IOException {
 
     logger.info("Beginning packing pass.");
     // Actual binarization pass. Slice and pack source, target and data.
-    grammar_reader = new LineReader(grammar);
-
+    grammarReader = getGrammarReader();
+    LineReader alignment_reader = null;
     if (packAlignments && !grammarAlignments)
       alignment_reader = new LineReader(alignments);
-    binarize(grammar_reader, alignment_reader);
+    binarize(grammarReader, alignment_reader);
     logger.info("Packing complete.");
 
     logger.info("Packed grammar in: " + output);
     logger.info("Done.");
   }
 
-  private void explore(LineReader grammar) {
+  /**
+   * Returns a reader that turns whatever file format is found into Hiero grammar rules.
+   * 
+   * @param grammarFile
+   * @return
+   * @throws IOException 
+   */
+  private HieroFormatReader getGrammarReader() throws IOException {
+    LineReader reader = new LineReader(grammar);
+    String line = reader.next();
+    if (line.startsWith("[")) {
+      return new HieroFormatReader(grammar);
+    } else {
+      return new MosesFormatReader(grammar);
+    }
+  }
+
+  private void explore(HieroFormatReader reader) {
     int counter = 0;
     // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
     // appear in the same order. They are assigned numeric names in order of appearance.
     this.types.setLabeled(true);
 
-    while (grammar.hasNext()) {
-      String line = grammar.next().trim();
+    for (Rule rule: reader) {
       counter++;
-      ArrayList<String> fields = new ArrayList<String>(Arrays.asList(line.split("\\s\\|{3}\\s")));
-
-      String lhs = null;
-      if (line.startsWith("[")) {
-        // hierarchical model
-        if (fields.size() < 4) {
-          logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line));
-          continue;
-        }
-        lhs = fields.remove(0);
-      } else {
-        // phrase-based model
-        if (fields.size() < 3) {
-          logger.warning("Incomplete phrase line at line " + counter);
-          logger.warning(line);
-          continue;
-        }
-        lhs = "[X]";
-      }
 
-      String[] source = fields.get(0).split("\\s");
-      String[] target = fields.get(1).split("\\s");
-      String[] features = fields.get(2).split("\\s");
-
-      max_source_len = Math.max(max_source_len, source.length);
-
-      Vocabulary.id(lhs);
-      try {
-        /* Add symbols to vocabulary.
-         * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
-         * and "[X,1]" to the vocabulary.
-         */
-        for (String source_word : source) {
-          Vocabulary.id(source_word);
-          if (FormatUtils.isNonterminal(source_word)) {
-            Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_word));
-          }
-        }
-        for (String target_word : target) {
-          Vocabulary.id(target_word);
-          if (FormatUtils.isNonterminal(target_word)) {
-            Vocabulary.id(FormatUtils.stripNonTerminalIndex(target_word));
-          }
-        }
-      } catch (java.lang.StringIndexOutOfBoundsException e) {
-        System.err.println(String.format("* Skipping bad grammar line '%s'", line));
-        continue;
-      }
+      max_source_len = Math.max(max_source_len, rule.getFrench().length);
+
+      /* Add symbols to vocabulary.
+       * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
+       * and "[X,1]" to the vocabulary.
+       * 
+       * TODO: MJP May 2016: do we need to add [X,1]? If so, should be done in FormatReaders.
+       */
 
       // Add feature names to vocabulary and pass the value through the
       // appropriate encoder.
       int feature_counter = 0;
+      String[] features = rule.getFeatureString().split("\\s+");
       for (int f = 0; f < features.length; ++f) {
         if (features[f].contains("=")) {
           String[] fe = features[f].split("=");
@@ -278,6 +258,7 @@ private void explore(LineReader grammar) {
         }
       }
     }
+    System.err.println("COUNTER: " + counter);
   }
 
   /**
@@ -288,7 +269,7 @@ private String getFirstTwoSourceWords(final String[] source_words) {
     return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
   }
 
-  private void binarize(LineReader grammar_reader, LineReader alignment_reader) throws IOException {
+  private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException {
     int counter = 0;
     int slice_counter = 0;
     int num_slices = 0;
@@ -306,36 +287,14 @@ private void binarize(LineReader grammar_reader, LineReader alignment_reader) th
       alignment_buffer = new AlignmentBuffer();
 
     TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
-    while (grammar_reader.hasNext()) {
-      String grammar_line = grammar_reader.next().trim();
+    for (Rule rule: grammarReader) {
       counter++;
       slice_counter++;
 
-      ArrayList<String> fields = new ArrayList<String>(Arrays.asList(grammar_line.split("\\s\\|{3}\\s")));
-      String lhs_word;
-      String[] source_words;
-      String[] target_words;
-      String[] feature_entries;
-      if (grammar_line.startsWith("[")) {
-        if (fields.size() < 4)
-          continue;
-
-        lhs_word = fields.remove(0);
-        source_words = fields.get(0).split("\\s");
-        target_words = fields.get(1).split("\\s");
-        feature_entries = fields.get(2).split("\\s");
-
-      } else {
-        if (fields.size() < 3)
-          continue;
-
-        lhs_word = "[X]";
-        String tmp = "[X,1] " + fields.get(0);
-        source_words = tmp.split("\\s");
-        tmp = "[X,1] " + fields.get(1);
-        target_words = tmp.split("\\s");
-        feature_entries = fields.get(2).split("\\s");
-      }
+      String lhs_word = Vocabulary.word(rule.getLHS());
+      String[] source_words = rule.getFrenchWords().split("\\s+");
+      String[] target_words = rule.getEnglishWords().split("\\s+");
+      String[] feature_entries = rule.getFeatureString().split("\\s+");
 
       // Reached slice limit size, indicate that we're closing up.
       if (!ready_to_flush
@@ -373,7 +332,7 @@ private void binarize(LineReader grammar_reader, LineReader alignment_reader) th
       if (packAlignments) {
         String alignment_line;
         if (grammarAlignments) {
-          alignment_line = fields.get(3);
+          alignment_line = rule.getAlignmentString();
         } else {
           if (!alignment_reader.hasNext()) {
             logger.severe("No more alignments starting in line " + counter);

diff --git a/test/decoder/phrase/decode/rules.packed/config b/test/decoder/phrase/decode/rules.packed/config
@@ -1 +1 @@
-max-source-len = 5
+max-source-len = 3
diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.features b/test/decoder/phrase/decode/rules.packed/slice_00000.features
diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.source b/test/decoder/phrase/decode/rules.packed/slice_00000.source
diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.target b/test/decoder/phrase/decode/rules.packed/slice_00000.target
diff --git a/test/decoder/phrase/decode/rules.packed/vocabulary b/test/decoder/phrase/decode/rules.packed/vocabulary