Skip to content
This repository has been archived by the owner on Dec 13, 2021. It is now read-only.

Commit

Permalink
large commit moving in the direction of simplifying grammar reading a…
Browse files Browse the repository at this point in the history
…nd packing

-  SAMT grammars are no longer supported
-  removed special PhraseRule, now just have Rule again; Thrax needs to be updated to add [X,1] to the source and target sides of rules
-  removed redundant functions in GrammarReader and Vocabulary dealing with identifying and parsing nonterminals (and using expensive regexs), now all moved to FormatUtils
-  cleaned up the GrammarReader interface
-  Moses phrase table packing updated
-
  • Loading branch information
mjpost committed May 24, 2016
1 parent d0f7b53 commit 76eb958
Show file tree
Hide file tree
Showing 12 changed files with 89 additions and 678 deletions.
6 changes: 1 addition & 5 deletions src/joshua/corpus/Vocabulary.java
Expand Up @@ -155,7 +155,7 @@ public static int id(String token) {
if (stringToId.containsKey(token)) {
return stringToId.get(token);
}
int id = idToString.size() * (nt(token) ? -1 : 1);
int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1);

// register this (token,id) mapping with each language
// model, so that they can map it to their own private
Expand Down Expand Up @@ -237,10 +237,6 @@ public static boolean nt(int id) {
return (id < 0);
}

public static boolean nt(String word) {
return FormatUtils.isNonterminal(word);
}

public static int size() {
long lock_stamp = lock.readLock();
try {
Expand Down
41 changes: 0 additions & 41 deletions src/joshua/decoder/ff/tm/GrammarReader.java
Expand Up @@ -36,8 +36,6 @@
public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {

protected static String fieldDelimiter;
protected static String nonTerminalRegEx;
protected static String nonTerminalCleanRegEx;

protected static String description;

Expand Down Expand Up @@ -165,43 +163,4 @@ public R next() {
}

protected abstract R parseLine(String line);

// TODO: keep these around or not?
public abstract String toWords(R rule);

public abstract String toWordsWithoutFeatureScores(R rule);

/**
* Removes square brackets (and index, if present) from nonterminal id
* @param tokenID
* @return cleaned ID
*/
public static int cleanNonTerminal(int tokenID) {
// cleans NT of any markup, e.g., [X,1] may becomes [X], depending
return Vocabulary.id(cleanNonTerminal(Vocabulary.word(tokenID)));
}

/**
* Removes square brackets (and index, if present) from nonterminal id
* @param token
* @return cleaned token
*/
public static String cleanNonTerminal(String token) {
// cleans NT of any markup, e.g., [X,1] may becomes [X], depending on nonTerminalCleanRegEx
return token.replaceAll(nonTerminalCleanRegEx, "");
}

public static boolean isNonTerminal(final String word) {
// checks if word matches NT regex
return word.matches(nonTerminalRegEx);
}

public String getNonTerminalRegEx() {
return nonTerminalRegEx;
}

public String getNonTerminalCleanRegEx() {
return nonTerminalCleanRegEx;
}

}
94 changes: 0 additions & 94 deletions src/joshua/decoder/ff/tm/PhraseRule.java

This file was deleted.

30 changes: 15 additions & 15 deletions src/joshua/decoder/ff/tm/Rule.java
Expand Up @@ -56,7 +56,7 @@
public class Rule implements Comparator<Rule>, Comparable<Rule> {

private int lhs; // tag of this rule
private int[] pFrench; // pointer to the RuleCollection, as all the rules under it share the same
private int[] source; // pointer to the RuleCollection, as all the rules under it share the same
// Source side
protected int arity;

Expand All @@ -81,7 +81,7 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> {

private float precomputableCost = Float.NEGATIVE_INFINITY;

private int[] english;
private int[] target;

// The alignment string, e.g., 0-0 0-1 1-1 2-1
private String alignmentString;
Expand All @@ -96,18 +96,18 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> {
* Constructor used by other constructors below;
*
* @param lhs Left-hand side of the rule.
* @param sourceRhs Source language right-hand side of the rule.
* @param targetRhs Target language right-hand side of the rule.
* @param source Source language right-hand side of the rule.
* @param target Target language right-hand side of the rule.
* @param sparseFeatures Feature value scores for the rule.
* @param arity Number of nonterminals in the source language right-hand side.
* @param owner
*/
public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, int owner) {
public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) {
this.lhs = lhs;
this.pFrench = sourceRhs;
this.source = source;
this.arity = arity;
this.owner = owner;
this.english = targetRhs;
this.target = target;
this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; });
this.featuresSupplier = initializeFeatureSupplierFromString();
this.alignmentSupplier = initializeAlignmentSupplier();
Expand All @@ -118,10 +118,10 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, in
*/
public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) {
this.lhs = lhs;
this.pFrench = sourceRhs;
this.source = sourceRhs;
this.arity = arity;
this.owner = owner;
this.english = targetRhs;
this.target = targetRhs;
this.featuresSupplier = Suppliers.memoize(() -> { return features; });
this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
this.alignmentSupplier = initializeAlignmentSupplier();
Expand Down Expand Up @@ -199,11 +199,11 @@ private Supplier<String> initializeSparseFeaturesStringSupplier() {
// ===============================================================

public void setEnglish(int[] eng) {

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

Fantasic commit! Would it make sense to refactor these methods to setTarget/setSource as well?

This comment has been minimized.

Copy link
@mjpost

mjpost May 25, 2016

Author Contributor

Yes, definitely, will do. I was originally trying to limit the impact of the merge given the huge impending merge with JOSHUA-252, but by now, with the other stuff, it's too late :)

this.english = eng;
this.target = eng;
}

public int[] getEnglish() {
return this.english;
return this.target;
}

/**
Expand All @@ -224,7 +224,7 @@ public boolean equals(Object o) {
if (!Arrays.equals(getFrench(), other.getFrench())) {
return false;
}
if (!Arrays.equals(english, other.getEnglish())) {
if (!Arrays.equals(target, other.getEnglish())) {
return false;
}
return true;
Expand All @@ -234,7 +234,7 @@ public int hashCode() {
// I just made this up. If two rules are equal they'll have the
// same hashcode. Maybe someone else can do a better job though?
int frHash = Arrays.hashCode(getFrench());
int enHash = Arrays.hashCode(english);
int enHash = Arrays.hashCode(target);
return frHash ^ enHash ^ getLHS();
}

Expand Down Expand Up @@ -267,11 +267,11 @@ public int getLHS() {
}

public void setFrench(int[] french) {
this.pFrench = french;
this.source = french;
}

public int[] getFrench() {
return this.pFrench;
return this.source;
}

/**
Expand Down
84 changes: 35 additions & 49 deletions src/joshua/decoder/ff/tm/format/HieroFormatReader.java
Expand Up @@ -21,6 +21,7 @@
import joshua.corpus.Vocabulary;
import joshua.decoder.ff.tm.GrammarReader;
import joshua.decoder.ff.tm.Rule;
import joshua.util.FormatUtils;

/**
* This class implements reading files in the format defined by David Chiang for Hiero.
Expand All @@ -33,10 +34,6 @@ public class HieroFormatReader extends GrammarReader<Rule> {

static {
fieldDelimiter = "\\s\\|{3}\\s";
nonTerminalRegEx = "^\\[[^\\s]+\\,[0-9]*\\]$";
nonTerminalCleanRegEx = ",[0-9\\s]+";
// nonTerminalRegEx = "^\\[[A-Z]+\\,[0-9]*\\]$";
// nonTerminalCleanRegEx = "[\\[\\]\\,0-9\\s]+";
description = "Original Hiero format";
}

Expand All @@ -55,69 +52,58 @@ public Rule parseLine(String line) {
throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
}

int lhs = Vocabulary.id(cleanNonTerminal(fields[0]));
int lhs = Vocabulary.id(FormatUtils.stripNonTerminalIndex(fields[0]));

/**
* On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
*/
int arity = 0;
// foreign side
String[] foreignWords = fields[1].split("\\s+");
int[] french = new int[foreignWords.length];
for (int i = 0; i < foreignWords.length; i++) {
french[i] = Vocabulary.id(foreignWords[i]);
if (Vocabulary.nt(french[i])) {
String[] sourceWords = fields[1].split("\\s+");
int[] sourceIDs = new int[sourceWords.length];
for (int i = 0; i < sourceWords.length; i++) {
if (FormatUtils.isNonterminal(sourceWords[i])) {
Vocabulary.id(sourceWords[i]);
sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
arity++;
french[i] = cleanNonTerminal(french[i]);

// TODO: the arity here (after incrementing) should match the rule index. Should
// check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
// error if not
} else {
sourceIDs[i] = Vocabulary.id(sourceWords[i]);
}
}

// English side
String[] englishWords = fields[2].split("\\s+");
int[] english = new int[englishWords.length];
for (int i = 0; i < englishWords.length; i++) {
english[i] = Vocabulary.id(englishWords[i]);
if (Vocabulary.nt(english[i])) {
english[i] = -Vocabulary.getTargetNonterminalIndex(english[i]);
/**
* The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked
* to the index of the source-side nonterminal they are linked to. So for a rule
*
* [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ...
*
* the English side nonterminals will be -2, -1, -3. This assumes that the source side of

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

to be consistent with the above changes (English/French -> target/source) the comment could be adopted

This comment has been minimized.

Copy link
@mjpost

mjpost May 25, 2016

Author Contributor

I started doing this, but it's touching too many files. I will make a note to do it after the 252 merge.

* the rule is always listed monotonically.
*/
String[] targetWords = fields[2].split("\\s+");
int[] targetIDs = new int[targetWords.length];
for (int i = 0; i < targetWords.length; i++) {
if (FormatUtils.isNonterminal(targetWords[i])) {
targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
} else {
targetIDs[i] = Vocabulary.id(targetWords[i]);
}
}

String sparse_features = (fields.length > 3 ? fields[3] : "");
String alignment = (fields.length > 4) ? fields[4] : null;

return new Rule(lhs, french, english, sparse_features, arity, alignment);
return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
}

@Override
public String toWords(Rule rule) {
StringBuffer sb = new StringBuffer("");
sb.append(Vocabulary.word(rule.getLHS()));
sb.append(" ||| ");
sb.append(Vocabulary.getWords(rule.getFrench()));
sb.append(" ||| ");
sb.append(Vocabulary.getWords(rule.getEnglish()));
sb.append(" |||");
sb.append(" " + rule.getFeatureVector());

return sb.toString();
}

@Override
public String toWordsWithoutFeatureScores(Rule rule) {
StringBuffer sb = new StringBuffer();
sb.append(rule.getLHS());
sb.append(" ||| ");
sb.append(Vocabulary.getWords(rule.getFrench()));
sb.append(" ||| ");
sb.append(Vocabulary.getWords(rule.getEnglish()));
sb.append(" |||");

return sb.toString();
}


public static String getFieldDelimiter() {
return fieldDelimiter;
}

public static boolean isNonTerminal(final String word) {
return GrammarReader.isNonTerminal(word);
return FormatUtils.isNonterminal(word);
}
}

1 comment on commit 76eb958

@fhieber
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great change!

Please sign in to comment.