Commit
…nd packing - SAMT grammars are no longer supported - removed special PhraseRule, now just have Rule again; Thrax needs to be updated to add [X,1] to the source and target sides of rules - removed redundant functions in GrammarReader and Vocabulary dealing with identifying and parsing nonterminals (and using expensive regexs), now all moved to FormatUtils - cleaned up the GrammarReader interface - Moses phrase table packing updated -
- Loading branch information
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,7 +56,7 @@ | |
public class Rule implements Comparator<Rule>, Comparable<Rule> { | ||
|
||
private int lhs; // tag of this rule | ||
private int[] pFrench; // pointer to the RuleCollection, as all the rules under it share the same | ||
private int[] source; // pointer to the RuleCollection, as all the rules under it share the same | ||
// Source side | ||
protected int arity; | ||
|
||
|
@@ -81,7 +81,7 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { | |
|
||
private float precomputableCost = Float.NEGATIVE_INFINITY; | ||
|
||
private int[] english; | ||
private int[] target; | ||
|
||
// The alignment string, e.g., 0-0 0-1 1-1 2-1 | ||
private String alignmentString; | ||
|
@@ -96,18 +96,18 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { | |
* Constructor used by other constructors below; | ||
* | ||
* @param lhs Left-hand side of the rule. | ||
* @param sourceRhs Source language right-hand side of the rule. | ||
* @param targetRhs Target language right-hand side of the rule. | ||
* @param source Source language right-hand side of the rule. | ||
* @param target Target language right-hand side of the rule. | ||
* @param sparseFeatures Feature value scores for the rule. | ||
* @param arity Number of nonterminals in the source language right-hand side. | ||
* @param owner | ||
*/ | ||
public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, int owner) { | ||
public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) { | ||
this.lhs = lhs; | ||
this.pFrench = sourceRhs; | ||
this.source = source; | ||
this.arity = arity; | ||
this.owner = owner; | ||
this.english = targetRhs; | ||
this.target = target; | ||
this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; }); | ||
this.featuresSupplier = initializeFeatureSupplierFromString(); | ||
this.alignmentSupplier = initializeAlignmentSupplier(); | ||
|
@@ -118,10 +118,10 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, in | |
*/ | ||
public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) { | ||
this.lhs = lhs; | ||
this.pFrench = sourceRhs; | ||
this.source = sourceRhs; | ||
this.arity = arity; | ||
this.owner = owner; | ||
this.english = targetRhs; | ||
this.target = targetRhs; | ||
this.featuresSupplier = Suppliers.memoize(() -> { return features; }); | ||
this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier(); | ||
this.alignmentSupplier = initializeAlignmentSupplier(); | ||
|
@@ -199,11 +199,11 @@ private Supplier<String> initializeSparseFeaturesStringSupplier() { | |
// =============================================================== | ||
|
||
public void setEnglish(int[] eng) { | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
mjpost
Author
Contributor
|
||
this.english = eng; | ||
this.target = eng; | ||
} | ||
|
||
public int[] getEnglish() { | ||
return this.english; | ||
return this.target; | ||
} | ||
|
||
/** | ||
|
@@ -224,7 +224,7 @@ public boolean equals(Object o) { | |
if (!Arrays.equals(getFrench(), other.getFrench())) { | ||
return false; | ||
} | ||
if (!Arrays.equals(english, other.getEnglish())) { | ||
if (!Arrays.equals(target, other.getEnglish())) { | ||
return false; | ||
} | ||
return true; | ||
|
@@ -234,7 +234,7 @@ public int hashCode() { | |
// I just made this up. If two rules are equal they'll have the | ||
// same hashcode. Maybe someone else can do a better job though? | ||
int frHash = Arrays.hashCode(getFrench()); | ||
int enHash = Arrays.hashCode(english); | ||
int enHash = Arrays.hashCode(target); | ||
return frHash ^ enHash ^ getLHS(); | ||
} | ||
|
||
|
@@ -267,11 +267,11 @@ public int getLHS() { | |
} | ||
|
||
public void setFrench(int[] french) { | ||
this.pFrench = french; | ||
this.source = french; | ||
} | ||
|
||
public int[] getFrench() { | ||
return this.pFrench; | ||
return this.source; | ||
} | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
import joshua.corpus.Vocabulary; | ||
import joshua.decoder.ff.tm.GrammarReader; | ||
import joshua.decoder.ff.tm.Rule; | ||
import joshua.util.FormatUtils; | ||
|
||
/** | ||
* This class implements reading files in the format defined by David Chiang for Hiero. | ||
|
@@ -33,10 +34,6 @@ public class HieroFormatReader extends GrammarReader<Rule> { | |
|
||
static { | ||
fieldDelimiter = "\\s\\|{3}\\s"; | ||
nonTerminalRegEx = "^\\[[^\\s]+\\,[0-9]*\\]$"; | ||
nonTerminalCleanRegEx = ",[0-9\\s]+"; | ||
// nonTerminalRegEx = "^\\[[A-Z]+\\,[0-9]*\\]$"; | ||
// nonTerminalCleanRegEx = "[\\[\\]\\,0-9\\s]+"; | ||
description = "Original Hiero format"; | ||
} | ||
|
||
|
@@ -55,69 +52,58 @@ public Rule parseLine(String line) { | |
throw new RuntimeException(String.format("Rule '%s' does not have four fields", line)); | ||
} | ||
|
||
int lhs = Vocabulary.id(cleanNonTerminal(fields[0])); | ||
int lhs = Vocabulary.id(FormatUtils.stripNonTerminalIndex(fields[0])); | ||
|
||
/** | ||
* On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs. | ||
*/ | ||
int arity = 0; | ||
// foreign side | ||
String[] foreignWords = fields[1].split("\\s+"); | ||
int[] french = new int[foreignWords.length]; | ||
for (int i = 0; i < foreignWords.length; i++) { | ||
french[i] = Vocabulary.id(foreignWords[i]); | ||
if (Vocabulary.nt(french[i])) { | ||
String[] sourceWords = fields[1].split("\\s+"); | ||
int[] sourceIDs = new int[sourceWords.length]; | ||
for (int i = 0; i < sourceWords.length; i++) { | ||
if (FormatUtils.isNonterminal(sourceWords[i])) { | ||
Vocabulary.id(sourceWords[i]); | ||
sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i])); | ||
arity++; | ||
french[i] = cleanNonTerminal(french[i]); | ||
|
||
// TODO: the arity here (after incrementing) should match the rule index. Should | ||
// check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime | ||
// error if not | ||
} else { | ||
sourceIDs[i] = Vocabulary.id(sourceWords[i]); | ||
} | ||
} | ||
|
||
// English side | ||
String[] englishWords = fields[2].split("\\s+"); | ||
int[] english = new int[englishWords.length]; | ||
for (int i = 0; i < englishWords.length; i++) { | ||
english[i] = Vocabulary.id(englishWords[i]); | ||
if (Vocabulary.nt(english[i])) { | ||
english[i] = -Vocabulary.getTargetNonterminalIndex(english[i]); | ||
/** | ||
* The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked | ||
* to the index of the source-side nonterminal they are linked to. So for a rule | ||
* | ||
* [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ... | ||
* | ||
* the English side nonterminals will be -2, -1, -3. This assumes that the source side of | ||
This comment has been minimized.
Sorry, something went wrong.
fhieber
Contributor
|
||
* the rule is always listed monotonically. | ||
*/ | ||
String[] targetWords = fields[2].split("\\s+"); | ||
int[] targetIDs = new int[targetWords.length]; | ||
for (int i = 0; i < targetWords.length; i++) { | ||
if (FormatUtils.isNonterminal(targetWords[i])) { | ||
targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]); | ||
} else { | ||
targetIDs[i] = Vocabulary.id(targetWords[i]); | ||
} | ||
} | ||
|
||
String sparse_features = (fields.length > 3 ? fields[3] : ""); | ||
String alignment = (fields.length > 4) ? fields[4] : null; | ||
|
||
return new Rule(lhs, french, english, sparse_features, arity, alignment); | ||
return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment); | ||
} | ||
|
||
@Override | ||
public String toWords(Rule rule) { | ||
StringBuffer sb = new StringBuffer(""); | ||
sb.append(Vocabulary.word(rule.getLHS())); | ||
sb.append(" ||| "); | ||
sb.append(Vocabulary.getWords(rule.getFrench())); | ||
sb.append(" ||| "); | ||
sb.append(Vocabulary.getWords(rule.getEnglish())); | ||
sb.append(" |||"); | ||
sb.append(" " + rule.getFeatureVector()); | ||
|
||
return sb.toString(); | ||
} | ||
|
||
@Override | ||
public String toWordsWithoutFeatureScores(Rule rule) { | ||
StringBuffer sb = new StringBuffer(); | ||
sb.append(rule.getLHS()); | ||
sb.append(" ||| "); | ||
sb.append(Vocabulary.getWords(rule.getFrench())); | ||
sb.append(" ||| "); | ||
sb.append(Vocabulary.getWords(rule.getEnglish())); | ||
sb.append(" |||"); | ||
|
||
return sb.toString(); | ||
} | ||
|
||
|
||
public static String getFieldDelimiter() { | ||
return fieldDelimiter; | ||
} | ||
|
||
public static boolean isNonTerminal(final String word) { | ||
return GrammarReader.isNonTerminal(word); | ||
return FormatUtils.isNonterminal(word); | ||
} | ||
} |
1 comment
on commit 76eb958
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great change!
Fantasic commit! Would it make sense to refactor these methods to setTarget/setSource as well?