large commit moving in the direction of simplifying grammar reading a…

…nd packing - SAMT grammars are no longer supported - removed special PhraseRule, now just have Rule again; Thrax needs to be updated to add [X,1] to the source and target sides of rules - removed redundant functions in GrammarReader and Vocabulary dealing with identifying and parsing nonterminals (and using expensive regexs), now all moved to FormatUtils - cleaned up the GrammarReader interface - Moses phrase table packing updated -
apache · May 24, 2016 · 76eb958 · fhieber · May 25, 2016 · mjpost
1 parent d0f7b53
commit 76eb958
Show file tree

Hide file tree

Showing 12 changed files with 89 additions and 678 deletions.
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
@@ -155,7 +155,7 @@ public static int id(String token) {
       if (stringToId.containsKey(token)) {
         return stringToId.get(token);
       }
-      int id = idToString.size() * (nt(token) ? -1 : 1);
+      int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1);
 
       // register this (token,id) mapping with each language
       // model, so that they can map it to their own private
@@ -237,10 +237,6 @@ public static boolean nt(int id) {
     return (id < 0);
   }
 
-  public static boolean nt(String word) {
-    return FormatUtils.isNonterminal(word);
-  }
-
   public static int size() {
     long lock_stamp = lock.readLock();
     try {

diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/joshua/decoder/ff/tm/GrammarReader.java
@@ -36,8 +36,6 @@
 public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
 
   protected static String fieldDelimiter;
-  protected static String nonTerminalRegEx;
-  protected static String nonTerminalCleanRegEx;
 
   protected static String description;
 
@@ -165,43 +163,4 @@ public R next() {
   }
 
   protected abstract R parseLine(String line);
-
-  // TODO: keep these around or not?
-  public abstract String toWords(R rule);
-
-  public abstract String toWordsWithoutFeatureScores(R rule);
-
-  /**
-   * Removes square brackets (and index, if present) from nonterminal id 
-   * @param tokenID
-   * @return cleaned ID
-   */
-  public static int cleanNonTerminal(int tokenID) {
-    // cleans NT of any markup, e.g., [X,1] may becomes [X], depending
-    return Vocabulary.id(cleanNonTerminal(Vocabulary.word(tokenID)));
-  }
-
-  /**
-   * Removes square brackets (and index, if present) from nonterminal id 
-   * @param token
-   * @return cleaned token
-   */
-  public static String cleanNonTerminal(String token) {
-    // cleans NT of any markup, e.g., [X,1] may becomes [X], depending on nonTerminalCleanRegEx
-    return token.replaceAll(nonTerminalCleanRegEx, "");
-  }
-
-  public static boolean isNonTerminal(final String word) {
-    // checks if word matches NT regex
-    return word.matches(nonTerminalRegEx);
-  }
-
-  public String getNonTerminalRegEx() {
-    return nonTerminalRegEx;
-  }
-
-  public String getNonTerminalCleanRegEx() {
-    return nonTerminalCleanRegEx;
-  }
-
 }
diff --git a/src/joshua/decoder/ff/tm/PhraseRule.java b/src/joshua/decoder/ff/tm/PhraseRule.java
diff --git a/src/joshua/decoder/ff/tm/Rule.java b/src/joshua/decoder/ff/tm/Rule.java
@@ -56,7 +56,7 @@
 public class Rule implements Comparator<Rule>, Comparable<Rule> {
 
   private int lhs; // tag of this rule
-  private int[] pFrench; // pointer to the RuleCollection, as all the rules under it share the same
+  private int[] source; // pointer to the RuleCollection, as all the rules under it share the same
                          // Source side
   protected int arity;
 
@@ -81,7 +81,7 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> {
 
   private float precomputableCost = Float.NEGATIVE_INFINITY;
 
-  private int[] english;
+  private int[] target;
 
   // The alignment string, e.g., 0-0 0-1 1-1 2-1
   private String alignmentString;
@@ -96,18 +96,18 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> {
    * Constructor used by other constructors below;
    * 
    * @param lhs Left-hand side of the rule.
-   * @param sourceRhs Source language right-hand side of the rule.
-   * @param targetRhs Target language right-hand side of the rule.
+   * @param source Source language right-hand side of the rule.
+   * @param target Target language right-hand side of the rule.
    * @param sparseFeatures Feature value scores for the rule.
    * @param arity Number of nonterminals in the source language right-hand side.
    * @param owner
    */
-  public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, int owner) {
+  public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) {
     this.lhs = lhs;
-    this.pFrench = sourceRhs;
+    this.source = source;
     this.arity = arity;
     this.owner = owner;
-    this.english = targetRhs;
+    this.target = target;
     this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; });
     this.featuresSupplier = initializeFeatureSupplierFromString();
     this.alignmentSupplier = initializeAlignmentSupplier();
@@ -118,10 +118,10 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, in
    */
   public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) {
     this.lhs = lhs;
-    this.pFrench = sourceRhs;
+    this.source = sourceRhs;
     this.arity = arity;
     this.owner = owner;
-    this.english = targetRhs;
+    this.target = targetRhs;
     this.featuresSupplier = Suppliers.memoize(() -> { return features; });
     this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
     this.alignmentSupplier = initializeAlignmentSupplier();
@@ -199,11 +199,11 @@ private Supplier<String> initializeSparseFeaturesStringSupplier() {
   // ===============================================================
 
   public void setEnglish(int[] eng) {
-    this.english = eng;
+    this.target = eng;
   }
 
   public int[] getEnglish() {
-    return this.english;
+    return this.target;
   }
 
   /**
@@ -224,7 +224,7 @@ public boolean equals(Object o) {
     if (!Arrays.equals(getFrench(), other.getFrench())) {
       return false;
     }
-    if (!Arrays.equals(english, other.getEnglish())) {
+    if (!Arrays.equals(target, other.getEnglish())) {
       return false;
     }
     return true;
@@ -234,7 +234,7 @@ public int hashCode() {
     // I just made this up. If two rules are equal they'll have the
     // same hashcode. Maybe someone else can do a better job though?
     int frHash = Arrays.hashCode(getFrench());
-    int enHash = Arrays.hashCode(english);
+    int enHash = Arrays.hashCode(target);
     return frHash ^ enHash ^ getLHS();
   }
 
@@ -267,11 +267,11 @@ public int getLHS() {
   }
 
   public void setFrench(int[] french) {
-    this.pFrench = french;
+    this.source = french;
   }
 
   public int[] getFrench() {
-    return this.pFrench;
+    return this.source;
   }
 
   /**

diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@ -21,6 +21,7 @@
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.tm.GrammarReader;
 import joshua.decoder.ff.tm.Rule;
+import joshua.util.FormatUtils;
 
 /**
  * This class implements reading files in the format defined by David Chiang for Hiero. 
@@ -33,10 +34,6 @@ public class HieroFormatReader extends GrammarReader<Rule> {
 
   static {
     fieldDelimiter = "\\s\\|{3}\\s";
-    nonTerminalRegEx = "^\\[[^\\s]+\\,[0-9]*\\]$";
-    nonTerminalCleanRegEx = ",[0-9\\s]+";
-    // nonTerminalRegEx = "^\\[[A-Z]+\\,[0-9]*\\]$";
-    // nonTerminalCleanRegEx = "[\\[\\]\\,0-9\\s]+";
     description = "Original Hiero format";
   }
 
@@ -55,69 +52,58 @@ public Rule parseLine(String line) {
       throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
     }
 
-    int lhs = Vocabulary.id(cleanNonTerminal(fields[0]));
+    int lhs = Vocabulary.id(FormatUtils.stripNonTerminalIndex(fields[0]));
 
+    /**
+     * On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
+     */
     int arity = 0;
-    // foreign side
-    String[] foreignWords = fields[1].split("\\s+");
-    int[] french = new int[foreignWords.length];
-    for (int i = 0; i < foreignWords.length; i++) {
-      french[i] = Vocabulary.id(foreignWords[i]);
-      if (Vocabulary.nt(french[i])) {
+    String[] sourceWords = fields[1].split("\\s+");
+    int[] sourceIDs = new int[sourceWords.length];
+    for (int i = 0; i < sourceWords.length; i++) {
+      if (FormatUtils.isNonterminal(sourceWords[i])) {
+        Vocabulary.id(sourceWords[i]);
+        sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
         arity++;
-        french[i] = cleanNonTerminal(french[i]);
+
+        // TODO: the arity here (after incrementing) should match the rule index. Should
+        // check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
+        // error if not
+      } else {
+        sourceIDs[i] = Vocabulary.id(sourceWords[i]);
       }
     }
 
-    // English side
-    String[] englishWords = fields[2].split("\\s+");
-    int[] english = new int[englishWords.length];
-    for (int i = 0; i < englishWords.length; i++) {
-      english[i] = Vocabulary.id(englishWords[i]);
-      if (Vocabulary.nt(english[i])) {
-        english[i] = -Vocabulary.getTargetNonterminalIndex(english[i]);
+    /**
+     * The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked
+     * to the index of the source-side nonterminal they are linked to. So for a rule
+     * 
+     * [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ...
+     * 
+     * the English side nonterminals will be -2, -1, -3. This assumes that the source side of
+     * the rule is always listed monotonically.
+     */
+    String[] targetWords = fields[2].split("\\s+");
+    int[] targetIDs = new int[targetWords.length];
+    for (int i = 0; i < targetWords.length; i++) {
+      if (FormatUtils.isNonterminal(targetWords[i])) {
+        targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
+      } else {
+        targetIDs[i] = Vocabulary.id(targetWords[i]);
       }
     }
 
     String sparse_features = (fields.length > 3 ? fields[3] : "");
     String alignment = (fields.length > 4) ? fields[4] : null;
 
-    return new Rule(lhs, french, english, sparse_features, arity, alignment);
+    return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
   }
 
-  @Override
-  public String toWords(Rule rule) {
-    StringBuffer sb = new StringBuffer("");
-    sb.append(Vocabulary.word(rule.getLHS()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getFrench()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getEnglish()));
-    sb.append(" |||");
-    sb.append(" " + rule.getFeatureVector());
-
-    return sb.toString();
-  }
-
-  @Override
-  public String toWordsWithoutFeatureScores(Rule rule) {
-    StringBuffer sb = new StringBuffer();
-    sb.append(rule.getLHS());
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getFrench()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getEnglish()));
-    sb.append(" |||");
-
-    return sb.toString();
-  }
-
-
   public static String getFieldDelimiter() {
     return fieldDelimiter;
   }
 
   public static boolean isNonTerminal(final String word) {
-    return GrammarReader.isNonTerminal(word);
+    return FormatUtils.isNonterminal(word);
   }
 }