diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java index d79170d6..6f72ad87 100644 --- a/src/joshua/corpus/Vocabulary.java +++ b/src/joshua/corpus/Vocabulary.java @@ -155,7 +155,7 @@ public static int id(String token) { if (stringToId.containsKey(token)) { return stringToId.get(token); } - int id = idToString.size() * (nt(token) ? -1 : 1); + int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1); // register this (token,id) mapping with each language // model, so that they can map it to their own private @@ -237,10 +237,6 @@ public static boolean nt(int id) { return (id < 0); } - public static boolean nt(String word) { - return FormatUtils.isNonterminal(word); - } - public static int size() { long lock_stamp = lock.readLock(); try { diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/joshua/decoder/ff/tm/GrammarReader.java index f94a472b..7edab7c0 100644 --- a/src/joshua/decoder/ff/tm/GrammarReader.java +++ b/src/joshua/decoder/ff/tm/GrammarReader.java @@ -36,8 +36,6 @@ public abstract class GrammarReader implements Iterable, Iterator { protected static String fieldDelimiter; - protected static String nonTerminalRegEx; - protected static String nonTerminalCleanRegEx; protected static String description; @@ -165,43 +163,4 @@ public R next() { } protected abstract R parseLine(String line); - - // TODO: keep these around or not? 
- public abstract String toWords(R rule); - - public abstract String toWordsWithoutFeatureScores(R rule); - - /** - * Removes square brackets (and index, if present) from nonterminal id - * @param tokenID - * @return cleaned ID - */ - public static int cleanNonTerminal(int tokenID) { - // cleans NT of any markup, e.g., [X,1] may becomes [X], depending - return Vocabulary.id(cleanNonTerminal(Vocabulary.word(tokenID))); - } - - /** - * Removes square brackets (and index, if present) from nonterminal id - * @param token - * @return cleaned token - */ - public static String cleanNonTerminal(String token) { - // cleans NT of any markup, e.g., [X,1] may becomes [X], depending on nonTerminalCleanRegEx - return token.replaceAll(nonTerminalCleanRegEx, ""); - } - - public static boolean isNonTerminal(final String word) { - // checks if word matches NT regex - return word.matches(nonTerminalRegEx); - } - - public String getNonTerminalRegEx() { - return nonTerminalRegEx; - } - - public String getNonTerminalCleanRegEx() { - return nonTerminalCleanRegEx; - } - } diff --git a/src/joshua/decoder/ff/tm/PhraseRule.java b/src/joshua/decoder/ff/tm/PhraseRule.java deleted file mode 100644 index 8f5d2497..00000000 --- a/src/joshua/decoder/ff/tm/PhraseRule.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package joshua.decoder.ff.tm; - -import com.google.common.base.Supplier; -import com.google.common.base.Suppliers; - -/*** - * A class for reading in rules from a Moses phrase table. Most of the conversion work is done - * in {@link joshua.decoder.ff.tm.format.PhraseFormatReader}. This includes prepending every - * rule with a nonterminal, so that the phrase-based decoder can assume the same hypergraph - * format as the hierarchical decoder (by pretending to be a strictly left-branching grammar and - * dispensing with the notion of coverage spans). However, prepending the nonterminals means all - * the alignments are off by 1. We do not want to fix those when reading in due to the expense, - * so instead we use this rule which adjust the alignments on the fly. - * - * Also, we only convert the Moses dense features on the fly, via this class. - * - * TODO: this class should also be responsible for prepending the nonterminals. - * - * @author Matt Post - * - */ -public class PhraseRule extends Rule { - - - private final String mosesFeatureString; - private final Supplier alignmentSupplier; - private final Supplier sparseFeaturesStringSupplier; - - public PhraseRule(int lhs, int[] french, int[] english, String sparse_features, int arity, - String alignment) { - super(lhs, french, english, null, arity, alignment); - this.mosesFeatureString = sparse_features; - this.alignmentSupplier = initializeAlignmentSupplier(); - this.sparseFeaturesStringSupplier = initializeSparseFeaturesStringSupplier(); - } - - /** - * Moses features are probabilities; we need to convert them here by taking the negative log prob. - * We do this only when the rule is used to amortize. 
- */ - private Supplier initializeSparseFeaturesStringSupplier() { - return Suppliers.memoize(() ->{ - StringBuffer values = new StringBuffer(); - for (String value: mosesFeatureString.split(" ")) { - float f = Float.parseFloat(value); - values.append(String.format("%f ", f <= 0.0 ? -100 : -Math.log(f))); - } - return values.toString().trim(); - }); - } - - /** - * This is the exact same as the parent implementation, but we need to add 1 to each alignment - * point to account for the nonterminal [X] that was prepended to each rule. - */ - private Supplier initializeAlignmentSupplier(){ - return Suppliers.memoize(() ->{ - String[] tokens = getAlignmentString().split("[-\\s]+"); - byte[] alignmentArray = new byte[tokens.length + 2]; - alignmentArray[0] = alignmentArray[1] = 0; - for (int i = 0; i < tokens.length; i++) - alignmentArray[i + 2] = (byte) (Short.parseShort(tokens[i]) + 1); - return alignmentArray; - }); - } - - @Override - public String getFeatureString() { - return this.sparseFeaturesStringSupplier.get(); - } - - @Override - public byte[] getAlignment() { - return this.alignmentSupplier.get(); - } -} diff --git a/src/joshua/decoder/ff/tm/Rule.java b/src/joshua/decoder/ff/tm/Rule.java index 9f1fb8fe..89bb2a2d 100644 --- a/src/joshua/decoder/ff/tm/Rule.java +++ b/src/joshua/decoder/ff/tm/Rule.java @@ -56,7 +56,7 @@ public class Rule implements Comparator, Comparable { private int lhs; // tag of this rule - private int[] pFrench; // pointer to the RuleCollection, as all the rules under it share the same + private int[] source; // pointer to the RuleCollection, as all the rules under it share the same // Source side protected int arity; @@ -81,7 +81,7 @@ public class Rule implements Comparator, Comparable { private float precomputableCost = Float.NEGATIVE_INFINITY; - private int[] english; + private int[] target; // The alignment string, e.g., 0-0 0-1 1-1 2-1 private String alignmentString; @@ -96,18 +96,18 @@ public class Rule implements Comparator, 
Comparable { * Constructor used by other constructors below; * * @param lhs Left-hand side of the rule. - * @param sourceRhs Source language right-hand side of the rule. - * @param targetRhs Target language right-hand side of the rule. + * @param source Source language right-hand side of the rule. + * @param target Target language right-hand side of the rule. * @param sparseFeatures Feature value scores for the rule. * @param arity Number of nonterminals in the source language right-hand side. * @param owner */ - public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, int owner) { + public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) { this.lhs = lhs; - this.pFrench = sourceRhs; + this.source = source; this.arity = arity; this.owner = owner; - this.english = targetRhs; + this.target = target; this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; }); this.featuresSupplier = initializeFeatureSupplierFromString(); this.alignmentSupplier = initializeAlignmentSupplier(); @@ -118,10 +118,10 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, in */ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) { this.lhs = lhs; - this.pFrench = sourceRhs; + this.source = sourceRhs; this.arity = arity; this.owner = owner; - this.english = targetRhs; + this.target = targetRhs; this.featuresSupplier = Suppliers.memoize(() -> { return features; }); this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier(); this.alignmentSupplier = initializeAlignmentSupplier(); @@ -199,11 +199,11 @@ private Supplier initializeSparseFeaturesStringSupplier() { // =============================================================== public void setEnglish(int[] eng) { - this.english = eng; + this.target = eng; } public int[] getEnglish() { - return this.english; + return this.target; } /** @@ -224,7 +224,7 @@ 
public boolean equals(Object o) { if (!Arrays.equals(getFrench(), other.getFrench())) { return false; } - if (!Arrays.equals(english, other.getEnglish())) { + if (!Arrays.equals(target, other.getEnglish())) { return false; } return true; @@ -234,7 +234,7 @@ public int hashCode() { // I just made this up. If two rules are equal they'll have the // same hashcode. Maybe someone else can do a better job though? int frHash = Arrays.hashCode(getFrench()); - int enHash = Arrays.hashCode(english); + int enHash = Arrays.hashCode(target); return frHash ^ enHash ^ getLHS(); } @@ -267,11 +267,11 @@ public int getLHS() { } public void setFrench(int[] french) { - this.pFrench = french; + this.source = french; } public int[] getFrench() { - return this.pFrench; + return this.source; } /** diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java index a47813dd..d2a01ebd 100644 --- a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java +++ b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java @@ -21,6 +21,7 @@ import joshua.corpus.Vocabulary; import joshua.decoder.ff.tm.GrammarReader; import joshua.decoder.ff.tm.Rule; +import joshua.util.FormatUtils; /** * This class implements reading files in the format defined by David Chiang for Hiero. 
@@ -33,10 +34,6 @@ public class HieroFormatReader extends GrammarReader { static { fieldDelimiter = "\\s\\|{3}\\s"; - nonTerminalRegEx = "^\\[[^\\s]+\\,[0-9]*\\]$"; - nonTerminalCleanRegEx = ",[0-9\\s]+"; - // nonTerminalRegEx = "^\\[[A-Z]+\\,[0-9]*\\]$"; - // nonTerminalCleanRegEx = "[\\[\\]\\,0-9\\s]+"; description = "Original Hiero format"; } @@ -55,69 +52,58 @@ public Rule parseLine(String line) { throw new RuntimeException(String.format("Rule '%s' does not have four fields", line)); } - int lhs = Vocabulary.id(cleanNonTerminal(fields[0])); + int lhs = Vocabulary.id(FormatUtils.stripNonTerminalIndex(fields[0])); + /** + * On the source side, we map nonterminals to negative IDs, and terminals to positive IDs. + */ int arity = 0; - // foreign side - String[] foreignWords = fields[1].split("\\s+"); - int[] french = new int[foreignWords.length]; - for (int i = 0; i < foreignWords.length; i++) { - french[i] = Vocabulary.id(foreignWords[i]); - if (Vocabulary.nt(french[i])) { + String[] sourceWords = fields[1].split("\\s+"); + int[] sourceIDs = new int[sourceWords.length]; + for (int i = 0; i < sourceWords.length; i++) { + if (FormatUtils.isNonterminal(sourceWords[i])) { + Vocabulary.id(sourceWords[i]); + sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i])); arity++; - french[i] = cleanNonTerminal(french[i]); + + // TODO: the arity here (after incrementing) should match the rule index. Should + // check that arity == FormatUtils.getNonterminalIndex(sourceWords[i]), throw runtime + // error if not + } else { + sourceIDs[i] = Vocabulary.id(sourceWords[i]); } } - // English side - String[] englishWords = fields[2].split("\\s+"); - int[] english = new int[englishWords.length]; - for (int i = 0; i < englishWords.length; i++) { - english[i] = Vocabulary.id(englishWords[i]); - if (Vocabulary.nt(english[i])) { - english[i] = -Vocabulary.getTargetNonterminalIndex(english[i]); + /** + * The English side maps terminal symbols to positive IDs.
Nonterminal symbols are mapped + * to the negated index of the source-side nonterminal they are linked to. So for a rule + * + * [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ... + * + * the English side nonterminals will be -2, -1, -3. This assumes that the source side of + * the rule is always listed monotonically. + */ + String[] targetWords = fields[2].split("\\s+"); + int[] targetIDs = new int[targetWords.length]; + for (int i = 0; i < targetWords.length; i++) { + if (FormatUtils.isNonterminal(targetWords[i])) { + targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]); + } else { + targetIDs[i] = Vocabulary.id(targetWords[i]); } } String sparse_features = (fields.length > 3 ? fields[3] : ""); String alignment = (fields.length > 4) ? fields[4] : null; - return new Rule(lhs, french, english, sparse_features, arity, alignment); + return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment); } - @Override - public String toWords(Rule rule) { - StringBuffer sb = new StringBuffer(""); - sb.append(Vocabulary.word(rule.getLHS())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getFrench())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getEnglish())); - sb.append(" |||"); - sb.append(" " + rule.getFeatureVector()); - - return sb.toString(); - } - - @Override - public String toWordsWithoutFeatureScores(Rule rule) { - StringBuffer sb = new StringBuffer(); - sb.append(rule.getLHS()); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getFrench())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getEnglish())); - sb.append(" |||"); - - return sb.toString(); - } - - public static String getFieldDelimiter() { return fieldDelimiter; } public static boolean isNonTerminal(final String word) { - return GrammarReader.isNonTerminal(word); + return FormatUtils.isNonterminal(word); } } diff --git a/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
similarity index 75% rename from src/joshua/decoder/ff/tm/format/PhraseFormatReader.java rename to src/joshua/decoder/ff/tm/format/MosesFormatReader.java index be4d5221..0b33ba1c 100644 --- a/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java +++ b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java @@ -19,38 +19,37 @@ package joshua.decoder.ff.tm.format; import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.PhraseRule; +import joshua.decoder.ff.tm.Rule; import joshua.util.io.LineReader; /*** * This class reads in the Moses phrase table format, with support for the source and target side, - * list of features, and word alignments. It works by simply casting the phrase-based rules to - * left-branching hierarchical rules and passing them on to its parent class, {@HieroFormatReader}. + * list of features, and word alignments. It works by + * + * - casting the phrase-based rules to left-branching hierarchical rules and passing them on + * to its parent class, {@link HieroFormatReader}. + * - converting the probabilities to -log probabilities * * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage: * *
- *     cat PHRASE_TABLE | java -cp $JOSHUA/class joshua.decoder.ff.tm.format.PhraseFormatReader > grammar
+ *     cat PHRASE_TABLE | java -cp $JOSHUA/class joshua.decoder.ff.tm.format.MosesFormatReader > grammar
  * 
* * @author Matt Post * */ -public class PhraseFormatReader extends HieroFormatReader { +public class MosesFormatReader extends HieroFormatReader { private int lhs; - /* Whether we are reading a Moses phrase table or Thrax phrase table */ - private boolean moses_format = false; - - public PhraseFormatReader(String grammarFile, boolean is_moses) { + public MosesFormatReader(String grammarFile) { super(grammarFile); this.lhs = Vocabulary.id("[X]"); - this.moses_format = is_moses; } - public PhraseFormatReader() { + public MosesFormatReader() { super(); this.lhs = Vocabulary.id("[X]"); } @@ -73,15 +72,11 @@ public PhraseFormatReader() { * [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0 */ @Override - public PhraseRule parseLine(String line) { + public Rule parseLine(String line) { String[] fields = line.split(fieldDelimiter); int arity = 1; - - /* For Thrax phrase-based grammars, skip over the beginning nonterminal */ int fieldIndex = 0; - if (! moses_format) - fieldIndex++; // foreign side String[] foreignWords = fields[fieldIndex].split("\\s+"); @@ -102,7 +97,15 @@ public PhraseRule parseLine(String line) { // transform feature values fieldIndex++; - String sparse_features = fields[fieldIndex]; + + String mosesFeatureString = fields[fieldIndex]; + StringBuffer values = new StringBuffer(); + for (String value: mosesFeatureString.split(" ")) { + float f = Float.parseFloat(value); + values.append(String.format("%f ", f <= 0.0 ? -100 : -Math.log(f))); + } + + String sparse_features = values.toString().trim(); // System.out.println(String.format("parseLine: %s\n ->%s", line, sparse_features)); @@ -110,7 +113,7 @@ public PhraseRule parseLine(String line) { fieldIndex++; String alignment = (fields.length > fieldIndex) ? 
fields[fieldIndex] : null; - return new PhraseRule(lhs, french, english, sparse_features, arity, alignment); + return new Rule(lhs, french, english, sparse_features, arity, alignment); } /** @@ -119,9 +122,9 @@ public PhraseRule parseLine(String line) { * @param args */ public static void main(String[] args) { - PhraseFormatReader reader = new PhraseFormatReader(); + MosesFormatReader reader = new MosesFormatReader(); for (String line: new LineReader(System.in)) { - PhraseRule rule = reader.parseLine(line); + Rule rule = reader.parseLine(line); System.out.println(rule.textFormat()); } } diff --git a/src/joshua/decoder/ff/tm/format/SamtFormatReader.java b/src/joshua/decoder/ff/tm/format/SamtFormatReader.java deleted file mode 100644 index 6539d38a..00000000 --- a/src/joshua/decoder/ff/tm/format/SamtFormatReader.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package joshua.decoder.ff.tm.format; - -import java.util.logging.Logger; - -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.GrammarReader; - -public class SamtFormatReader extends GrammarReader { - - private static final Logger logger = Logger.getLogger(SamtFormatReader.class.getName()); - - private static final String samtNonTerminalMarkup; - - static { - fieldDelimiter = "#"; - nonTerminalRegEx = "^@[^\\s]+"; - nonTerminalCleanRegEx = ",[0-9\\s]+"; - - samtNonTerminalMarkup = "@"; - - description = "Original SAMT format"; - } - - public SamtFormatReader(String grammarFile) { - super(grammarFile); - } - - // Format example: - // @VZ-HD @APPR-DA+ART-DA minutes#@2 protokoll @1#@PP-MO+VZ-HD#0 1 1 -0 0.5 -0 - - @Override - protected Rule parseLine(String line) { - String[] fields = line.split(fieldDelimiter); - if (fields.length != 4) { - logger.severe("Rule line does not have four fields: " + line); - logger.severe("Skipped."); - return null; - } - - int lhs = Vocabulary.id(adaptNonTerminalMarkup(fields[2])); - - int arity = 0; - - // foreign side - String[] foreignWords = fields[0].split("\\s+"); - int[] french = new int[foreignWords.length]; - for (int i = 0; i < foreignWords.length; i++) { - if (isNonTerminal(foreignWords[i])) { - arity++; - french[i] = Vocabulary.id(adaptNonTerminalMarkup(foreignWords[i], arity)); - } else { - french[i] = Vocabulary.id(foreignWords[i]); - } - } - - // english side - String[] englishWords = fields[1].split("\\s+"); - int[] english = new int[englishWords.length]; - for (int i = 0; i < englishWords.length; i++) { - if (isNonTerminal(englishWords[i])) { - english[i] = -Integer.parseInt(cleanSamtNonTerminal(englishWords[i])); - } else { - english[i] = Vocabulary.id(englishWords[i]); - } - } - - // feature scores - String sparseFeatures = fields[3]; - - return new Rule(lhs, french, english, sparseFeatures, arity); - } - - protected String cleanSamtNonTerminal(String word) { - // 
changes SAMT markup to Hiero-style - return word.replaceAll(samtNonTerminalMarkup, ""); - } - - protected String adaptNonTerminalMarkup(String word) { - // changes SAMT markup to Hiero-style - return "[" - + word.replaceAll(",", "_COMMA_").replaceAll("\\$", "_DOLLAR_") - .replaceAll(samtNonTerminalMarkup, "") + "]"; - } - - protected String adaptNonTerminalMarkup(String word, int ntIndex) { - // changes SAMT markup to Hiero-style - return "[" - + word.replaceAll(",", "_COMMA_").replaceAll("\\$", "_DOLLAR_") - .replaceAll(samtNonTerminalMarkup, "") + "," + ntIndex + "]"; - } - - @Override - public String toWords(Rule rule) { - StringBuffer sb = new StringBuffer(); - sb.append(Vocabulary.word(rule.getLHS())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getFrench())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getEnglish())); - sb.append(" ||| " + rule.getFeatureString()); - - return sb.toString(); - } - - @Override - public String toWordsWithoutFeatureScores(Rule rule) { - StringBuffer sb = new StringBuffer(); - sb.append(Vocabulary.word(rule.getLHS())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getFrench())); - sb.append(" ||| "); - sb.append(Vocabulary.getWords(rule.getEnglish())); - sb.append(" |||"); - - return sb.toString(); - } -} diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java index 4ba514a5..e85ce09c 100644 --- a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java +++ b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java @@ -33,8 +33,7 @@ import joshua.decoder.ff.tm.GrammarReader; import joshua.decoder.ff.tm.Trie; import joshua.decoder.ff.tm.format.HieroFormatReader; -import joshua.decoder.ff.tm.format.PhraseFormatReader; -import joshua.decoder.ff.tm.format.SamtFormatReader; +import joshua.decoder.ff.tm.format.MosesFormatReader; import joshua.util.FormatUtils; /** @@ -130,10 +129,8 @@ 
protected GrammarReader createReader(String format, String grammarFile) { if (grammarFile != null) { if ("hiero".equals(format) || "thrax".equals(format) || "regexp".equals(format)) { return new HieroFormatReader(grammarFile); - } else if ("samt".equals(format)) { - return new SamtFormatReader(grammarFile); - } else if ("phrase".equals(format) || "moses".equals(format)) { - return new PhraseFormatReader(grammarFile, format.equals("moses")); + } else if ("moses".equals(format)) { + return new MosesFormatReader(grammarFile); } else { throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format)); } diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java index fb38cf04..cc58578b 100644 --- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java +++ b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java @@ -652,10 +652,7 @@ public List getRules() { rules = new ArrayList(num_rules); for (int i = 0; i < num_rules; i++) { - if (type.equals("moses") || type.equals("phrase")) - rules.add(new PackedPhrasePair(rule_position + 3 * i)); - else - rules.add(new PackedRule(rule_position + 3 * i)); + rules.add(new PackedRule(rule_position + 3 * i)); } cached_rules.put(this, rules); @@ -794,105 +791,6 @@ public void remove() { } } - /** - * A packed phrase pair represents a rule of the form of a phrase pair, packed with the - * grammar-packer.pl script, which simply adds a nonterminal [X] to the left-hand side of - * all phrase pairs (and converts the Moses features). The packer then packs these. We have - * to then put a nonterminal on the source and target sides to treat the phrase pairs like - * left-branching rules, which is how Joshua deals with phrase decoding. 
- * - * @author Matt Post - * - */ - public final class PackedPhrasePair extends PackedRule { - - private final Supplier englishSupplier; - private final Supplier alignmentSupplier; - - public PackedPhrasePair(int address) { - super(address); - englishSupplier = initializeEnglishSupplier(); - alignmentSupplier = initializeAlignmentSupplier(); - } - - @Override - public int getArity() { - return PackedTrie.this.getArity() + 1; - } - - /** - * Initialize a number of suppliers which get evaluated when their respective getters - * are called. - * Inner lambda functions are guaranteed to only be called once, because of this underlying - * structures are accessed in a threadsafe way. - * Guava's implementation makes sure only one read of a volatile variable occurs per get. - * This means this implementation should be as thread-safe and performant as possible. - */ - - private Supplier initializeEnglishSupplier(){ - Supplier result = Suppliers.memoize(() ->{ - int[] phrase = getTarget(source[address + 1]); - int[] tgt = new int[phrase.length + 1]; - tgt[0] = -1; - for (int i = 0; i < phrase.length; i++) - tgt[i+1] = phrase[i]; - return tgt; - }); - return result; - } - - private Supplier initializeAlignmentSupplier(){ - Supplier result = Suppliers.memoize(() ->{ - byte[] raw_alignment = getAlignmentArray(source[address + 2]); - byte[] points = new byte[raw_alignment.length + 2]; - points[0] = points[1] = 0; - for (int i = 0; i < raw_alignment.length; i++) - points[i + 2] = (byte) (raw_alignment[i] + 1); - return points; - }); - return result; - } - - /** - * Take the English phrase of the underlying rule and prepend an [X]. - * - * @return - */ - @Override - public int[] getEnglish() { - return this.englishSupplier.get(); - } - - /** - * Take the French phrase of the underlying rule and prepend an [X]. 
- * - * @return - */ - @Override - public int[] getFrench() { - int phrase[] = new int[src.length + 1]; - int ntid = Vocabulary.id(PackedGrammar.this.joshuaConfiguration.default_non_terminal); - phrase[0] = ntid; - System.arraycopy(src, 0, phrase, 1, src.length); - return phrase; - } - - /** - * Similarly the alignment array needs to be shifted over by one. - * - * @return - */ - @Override - public byte[] getAlignment() { - // if no alignments in grammar do not fail - if (alignments == null) { - return null; - } - - return this.alignmentSupplier.get(); - } - } - public class PackedRule extends Rule { protected final int address; private final Supplier englishSupplier; diff --git a/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java b/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java index 12e79c59..a4df7e5a 100644 --- a/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java +++ b/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java @@ -27,6 +27,7 @@ import joshua.decoder.ff.tm.Rule; import joshua.decoder.ff.tm.format.HieroFormatReader; import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import joshua.util.FormatUtils; /** * This walker function builds up a new context-free grammar by visiting each node in a hypergraph. 
@@ -79,8 +80,7 @@ private static int getLabelWithSpan(HGNode node) { private static String getLabelWithSpanAsString(HGNode node) { String label = Vocabulary.word(node.lhs); - String cleanLabel = HieroFormatReader.cleanNonTerminal(label); - String unBracketedCleanLabel = cleanLabel.substring(1, cleanLabel.length() - 1); + String unBracketedCleanLabel = label.substring(1, label.length() - 1); return String.format("[%d-%s-%d]", node.i, unBracketedCleanLabel, node.j); } diff --git a/src/joshua/util/CompareGrammars.java b/src/joshua/util/CompareGrammars.java deleted file mode 100644 index 109d7a19..00000000 --- a/src/joshua/util/CompareGrammars.java +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package joshua.util; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.HashSet; -import java.util.Scanner; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; - -import joshua.decoder.ff.tm.format.HieroFormatReader; - -/** - * This class allows two grammars (loaded from disk) to be compared. - * - * @author Lane Schwartz - */ -public class CompareGrammars { - - /** Logger for this class. 
*/ - private static final Logger logger = Logger.getLogger(CompareGrammars.class.getName()); - - /** - * Gets a set containing all unique instances of the specified field. - * - * @param grammarFile File containing a grammar. - * @param fieldDelimiter Regular expression to split each line - * @param fieldNumber Field from each rule to extract - * @return set containing all unique instances of the specified field - * @throws FileNotFoundException - */ - public static Set getFields(File grammarFile, String fieldDelimiter, int fieldNumber) - throws FileNotFoundException { - - Scanner grammarScanner = new Scanner(grammarFile); - - Set set = new HashSet(); - - while (grammarScanner.hasNextLine()) { - - String line = grammarScanner.nextLine(); - - String[] fields = line.split(fieldDelimiter); - - set.add(fields[fieldNumber]); - } - - grammarScanner.close(); - - return set; - } - - public static void compareValues(File grammarFile1, File grammarFile2, String fieldDelimiter, - int fieldNumber, String scoresDelimiter, int scoresFieldNumber, float delta) - throws FileNotFoundException { - - Scanner grammarScanner1 = new Scanner(grammarFile1); - Scanner grammarScanner2 = new Scanner(grammarFile2); - - Set set = new HashSet(); - - int counter = 0; - float totalOverDiffs = 0.0f; - while (grammarScanner1.hasNextLine() && grammarScanner2.hasNextLine()) { - - counter++; - - String line1 = grammarScanner1.nextLine(); - String[] fields1 = line1.split(fieldDelimiter); - String[] scores1 = fields1[fieldNumber].split(scoresDelimiter); - float score1 = Float.valueOf(scores1[scoresFieldNumber]); - - String line2 = grammarScanner2.nextLine(); - String[] fields2 = line2.split(fieldDelimiter); - String[] scores2 = fields2[fieldNumber].split(scoresDelimiter); - float score2 = Float.valueOf(scores2[scoresFieldNumber]); - - if (fields1[0].endsWith(fields2[0]) && fields1[1].endsWith(fields2[1]) - && fields1[1].endsWith(fields2[1])) { - - float diff1 = Math.abs(score1 - score2); - float diff2 = 
Math.abs(score2 - score1); - float diff = (diff1 < diff2) ? diff1 : diff2; - - if (diff > delta) { - logger.fine("Line " + counter + ": Score mismatch: " + score1 + " vs " + score2); - set.add(line1); - totalOverDiffs += diff; - } else if (logger.isLoggable(Level.FINEST)) { - logger.finest("Line " + counter + ": Scores MATCH: " + score1 + " vs " + score2); - } - - } else { - throw new RuntimeException("Lines don't match: " + line1 + " and " + line2); - } - } - - grammarScanner1.close(); - grammarScanner2.close(); - - if (set.isEmpty()) { - logger.info("No score mismatches"); - } else { - logger.warning("Number of mismatches: " + set.size() + " out of " + counter); - logger.warning("Total mismatch logProb mass: " + totalOverDiffs + " (" + totalOverDiffs - / set.size() + ") (" + totalOverDiffs / counter + ")"); - } - } - - /** - * Main method. - * - * @param args names of the two grammars to be compared - * @throws FileNotFoundException - */ - public static void main(String[] args) throws FileNotFoundException { - - if (args.length != 2) { - logger.severe("Usage: " + CompareGrammars.class.toString() + " grammarFile1 grammarFile2"); - System.exit(-1); - } - - // Tell standard in and out to use UTF-8 - FormatUtils.useUTF8(); - logger.finest("Using UTF-8"); - - logger.info("Comparing grammar files " + args[0] + " and " + args[1]); - - File grammarFile1 = new File(args[0]); - File grammarFile2 = new File(args[1]); - - String fieldDelimiter = HieroFormatReader.getFieldDelimiter(); - - boolean compareScores = true; - - // Compare left-hand sides - { - Set leftHandSides1 = getFields(grammarFile1, fieldDelimiter, 0); - Set leftHandSides2 = getFields(grammarFile2, fieldDelimiter, 0); - - if (leftHandSides1.equals(leftHandSides2)) { - logger.info("Grammar files have the same set of left-hand sides"); - } else { - logger.warning("Grammar files have differing sets of left-hand sides"); - compareScores = false; - } - } - - // Compare source right-hand sides - { - Set sourceRHSs1 
= getFields(grammarFile1, fieldDelimiter, 1); - Set sourceRHSs2 = getFields(grammarFile2, fieldDelimiter, 1); - - if (sourceRHSs1.equals(sourceRHSs2)) { - logger.info("Grammar files have the same set of source right-hand sides"); - } else { - logger.warning("Grammar files have differing sets of source right-hand sides"); - compareScores = false; - } - } - - - // Compare target right-hand sides - { - Set targetRHSs1 = getFields(grammarFile1, fieldDelimiter, 2); - Set targetRHSs2 = getFields(grammarFile2, fieldDelimiter, 2); - - if (targetRHSs1.equals(targetRHSs2)) { - logger.info("Grammar files have the same set of target right-hand sides"); - } else { - logger.warning("Grammar files have differing sets of target right-hand sides"); - compareScores = false; - } - } - - // Compare translation probs - if (compareScores) { - float delta = 0.001f; - compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", 0, delta); - compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", 1, delta); - compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", 2, delta); - - } - - } - - - -} diff --git a/src/joshua/util/FormatUtils.java b/src/joshua/util/FormatUtils.java index 67b2bf33..c925cbac 100644 --- a/src/joshua/util/FormatUtils.java +++ b/src/joshua/util/FormatUtils.java @@ -78,6 +78,15 @@ public static String stripNonTerminalIndex(String nt) { return markup(cleanNonTerminal(nt)); } + /** + * Nonterminals on source and target sides are represented as [X,1], where 1 is an integer + * that links the two sides. This function extracts the index, e.g., + * + * getNonterminalIndex("[X,7]") -> 7 + * + * @param nt the nonterminal, e.g., [X,7] + * @return the nonterminal index + */ public static int getNonterminalIndex(String nt) { return Integer.parseInt(nt.substring(nt.indexOf(INDEX_SEPARATOR) + 1, nt.length() - 1)); }