Skip to content
This repository has been archived by the owner on Dec 13, 2021. It is now read-only.

Commit

Permalink
Fixed grammar packing, big change
Browse files Browse the repository at this point in the history
- GrammarPacker now uses the appropriate {Moses,Hiero}FormatReader objects in explore() and binarize() passes, instead of doing its own parsing
- MosesFormatReader chains to HieroFormatReader after munging input, removes some redundancy
- Updated test case
  • Loading branch information
mjpost committed May 24, 2016
1 parent 210573b commit f5adcde
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 127 deletions.
2 changes: 1 addition & 1 deletion scripts/support/grammar-packer.pl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ sub usage {
my $grammars = join(" ", @sorted_grammars);
my $outputs = join(" ", @outputs);
my $alignments = $opts{a} ? "--ga" : "";
my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/lib/guava-19.0.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
print STDERR "Packing with $cmd...\n" if $opts{v};

my $retval = system($cmd);
Expand Down
13 changes: 6 additions & 7 deletions src/joshua/decoder/ff/tm/format/HieroFormatReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package joshua.decoder.ff.tm.format;

import java.io.IOException;

import joshua.corpus.Vocabulary;
import joshua.decoder.ff.tm.GrammarReader;
import joshua.decoder.ff.tm.Rule;
Expand All @@ -41,7 +43,7 @@ public HieroFormatReader() {
super();
}

public HieroFormatReader(String grammarFile) {
public HieroFormatReader(String grammarFile) throws IOException {
super(grammarFile);
}

Expand All @@ -52,7 +54,7 @@ public Rule parseLine(String line) {
throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
}

int lhs = Vocabulary.id(FormatUtils.stripNonTerminalIndex(fields[0]));
int lhs = Vocabulary.id(fields[0]);

/**
* On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
Expand All @@ -61,16 +63,14 @@ public Rule parseLine(String line) {
String[] sourceWords = fields[1].split("\\s+");
int[] sourceIDs = new int[sourceWords.length];
for (int i = 0; i < sourceWords.length; i++) {
sourceIDs[i] = Vocabulary.id(sourceWords[i]);
if (FormatUtils.isNonterminal(sourceWords[i])) {
Vocabulary.id(sourceWords[i]);
sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
arity++;

// TODO: the arity here (after incrementing) should match the rule index. Should
// check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
// error if not
} else {
sourceIDs[i] = Vocabulary.id(sourceWords[i]);
}
}

Expand All @@ -86,10 +86,9 @@ public Rule parseLine(String line) {
String[] targetWords = fields[2].split("\\s+");
int[] targetIDs = new int[targetWords.length];
for (int i = 0; i < targetWords.length; i++) {
targetIDs[i] = Vocabulary.id(targetWords[i]);
if (FormatUtils.isNonterminal(targetWords[i])) {
targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
} else {
targetIDs[i] = Vocabulary.id(targetWords[i]);
}
}

Expand Down
43 changes: 12 additions & 31 deletions src/joshua/decoder/ff/tm/format/MosesFormatReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package joshua.decoder.ff.tm.format;

import java.io.IOException;

import joshua.corpus.Vocabulary;
import joshua.decoder.ff.tm.Rule;
import joshua.util.io.LineReader;
Expand All @@ -44,7 +46,7 @@ public class MosesFormatReader extends HieroFormatReader {

private int lhs;

public MosesFormatReader(String grammarFile) {
public MosesFormatReader(String grammarFile) throws IOException {
super(grammarFile);
this.lhs = Vocabulary.id("[X]");
}
Expand Down Expand Up @@ -76,44 +78,23 @@ public Rule parseLine(String line) {
String[] fields = line.split(fieldDelimiter);

int arity = 1;
int fieldIndex = 0;

// foreign side
String[] foreignWords = fields[fieldIndex].split("\\s+");
int[] french = new int[foreignWords.length + 1];
french[0] = lhs;
for (int i = 0; i < foreignWords.length; i++) {
french[i+1] = Vocabulary.id(foreignWords[i]);
}

// English side
fieldIndex++;
String[] englishWords = fields[fieldIndex].split("\\s+");
int[] english = new int[englishWords.length + 1];
english[0] = -1;
for (int i = 0; i < englishWords.length; i++) {
english[i+1] = Vocabulary.id(englishWords[i]);
}
StringBuffer hieroLine = new StringBuffer();

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

you can initialize the StringBuffer directly with the following line:
new StringBuffer("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||")

hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");

// transform feature values
fieldIndex++;

String mosesFeatureString = fields[fieldIndex];
StringBuffer values = new StringBuffer();
String mosesFeatureString = fields[2];
for (String value: mosesFeatureString.split(" ")) {
float f = Float.parseFloat(value);
values.append(String.format("%f ", f <= 0.0 ? -100 : -Math.log(f)));
hieroLine.append(String.format(" %f", f <= 0.0 ? -100 : -Math.log(f)));
}

String sparse_features = values.toString().trim();

// System.out.println(String.format("parseLine: %s\n ->%s", line, sparse_features));

// alignments
fieldIndex++;
String alignment = (fields.length > fieldIndex) ? fields[fieldIndex] : null;
if (fields.length >= 4)
hieroLine.append(" ||| " + fields[3]);

return new Rule(lhs, french, english, sparse_features, arity, alignment);
System.err.println(String.format("LINE: %s -> %s", line, hieroLine.toString()));

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

this should probably go at some point or changed to logging


return super.parseLine(hieroLine.toString());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String
this.printGrammar();
}

protected GrammarReader<Rule> createReader(String format, String grammarFile) {
protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {

if (grammarFile != null) {
if ("hiero".equals(format) || "thrax".equals(format) || "regexp".equals(format)) {
Expand Down
131 changes: 45 additions & 86 deletions src/joshua/tools/GrammarPacker.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@
import java.util.logging.Logger;

import joshua.corpus.Vocabulary;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.format.HieroFormatReader;
import joshua.decoder.ff.tm.format.MosesFormatReader;
import joshua.util.FormatUtils;
import joshua.util.encoding.EncoderConfiguration;
import joshua.util.encoding.FeatureTypeAnalyzer;
Expand Down Expand Up @@ -154,13 +157,12 @@ private void readConfig(String config_filename) throws IOException {
*/
public void pack() throws IOException {
logger.info("Beginning exploration pass.");
LineReader grammar_reader = null;
LineReader alignment_reader = null;

// Explore pass. Learn vocabulary and feature value histograms.
logger.info("Exploring: " + grammar);
grammar_reader = new LineReader(grammar);
explore(grammar_reader);

HieroFormatReader grammarReader = getGrammarReader();
explore(grammarReader);

logger.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
if (dump != null) {
Expand Down Expand Up @@ -194,78 +196,56 @@ public void pack() throws IOException {

logger.info("Beginning packing pass.");
// Actual binarization pass. Slice and pack source, target and data.
grammar_reader = new LineReader(grammar);

grammarReader = getGrammarReader();
LineReader alignment_reader = null;
if (packAlignments && !grammarAlignments)
alignment_reader = new LineReader(alignments);
binarize(grammar_reader, alignment_reader);
binarize(grammarReader, alignment_reader);
logger.info("Packing complete.");

logger.info("Packed grammar in: " + output);
logger.info("Done.");
}

private void explore(LineReader grammar) {
/**
* Returns a reader that turns whatever file format is found into Hiero grammar rules.
*
* @param grammarFile
* @return
* @throws IOException
*/
private HieroFormatReader getGrammarReader() throws IOException {
LineReader reader = new LineReader(grammar);
String line = reader.next();
if (line.startsWith("[")) {
return new HieroFormatReader(grammar);
} else {
return new MosesFormatReader(grammar);
}
}

private void explore(HieroFormatReader reader) {
int counter = 0;
// We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
// appear in the same order. They are assigned numeric names in order of appearance.
this.types.setLabeled(true);

while (grammar.hasNext()) {
String line = grammar.next().trim();
for (Rule rule: reader) {
counter++;
ArrayList<String> fields = new ArrayList<String>(Arrays.asList(line.split("\\s\\|{3}\\s")));

String lhs = null;
if (line.startsWith("[")) {
// hierarchical model
if (fields.size() < 4) {
logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line));
continue;
}
lhs = fields.remove(0);
} else {
// phrase-based model
if (fields.size() < 3) {
logger.warning("Incomplete phrase line at line " + counter);
logger.warning(line);
continue;
}
lhs = "[X]";
}

String[] source = fields.get(0).split("\\s");
String[] target = fields.get(1).split("\\s");
String[] features = fields.get(2).split("\\s");

max_source_len = Math.max(max_source_len, source.length);

Vocabulary.id(lhs);
try {
/* Add symbols to vocabulary.
* NOTE: In case of nonterminals, we add both stripped versions ("[X]")
* and "[X,1]" to the vocabulary.
*/
for (String source_word : source) {
Vocabulary.id(source_word);
if (FormatUtils.isNonterminal(source_word)) {
Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_word));
}
}
for (String target_word : target) {
Vocabulary.id(target_word);
if (FormatUtils.isNonterminal(target_word)) {
Vocabulary.id(FormatUtils.stripNonTerminalIndex(target_word));
}
}
} catch (java.lang.StringIndexOutOfBoundsException e) {
System.err.println(String.format("* Skipping bad grammar line '%s'", line));
continue;
}
max_source_len = Math.max(max_source_len, rule.getFrench().length);

/* Add symbols to vocabulary.
* NOTE: In case of nonterminals, we add both stripped versions ("[X]")
* and "[X,1]" to the vocabulary.
*
* TODO: MJP May 2016: do we need to add [X,1]? If so, should be done in FormatReaders.

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

I have to look into this. I think we found a case where it was necessary but I cannot remember why.

This comment has been minimized.

Copy link
@mjpost

mjpost May 25, 2016

Author Contributor

Okay. Currently it's doing it in the GrammarReaders, but it's a redundancy. I'll make a comment so it's clearer in the code.

*/

// Add feature names to vocabulary and pass the value through the
// appropriate encoder.
int feature_counter = 0;
String[] features = rule.getFeatureString().split("\\s+");
for (int f = 0; f < features.length; ++f) {
if (features[f].contains("=")) {
String[] fe = features[f].split("=");
Expand All @@ -278,6 +258,7 @@ private void explore(LineReader grammar) {
}
}
}
System.err.println("COUNTER: " + counter);

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

should probably be removed

}

/**
Expand All @@ -288,7 +269,7 @@ private String getFirstTwoSourceWords(final String[] source_words) {
return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
}

private void binarize(LineReader grammar_reader, LineReader alignment_reader) throws IOException {
private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException {
int counter = 0;
int slice_counter = 0;
int num_slices = 0;
Expand All @@ -306,36 +287,14 @@ private void binarize(LineReader grammar_reader, LineReader alignment_reader) th
alignment_buffer = new AlignmentBuffer();

TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
while (grammar_reader.hasNext()) {
String grammar_line = grammar_reader.next().trim();
for (Rule rule: grammarReader) {
counter++;
slice_counter++;

ArrayList<String> fields = new ArrayList<String>(Arrays.asList(grammar_line.split("\\s\\|{3}\\s")));
String lhs_word;
String[] source_words;
String[] target_words;
String[] feature_entries;
if (grammar_line.startsWith("[")) {
if (fields.size() < 4)
continue;

lhs_word = fields.remove(0);
source_words = fields.get(0).split("\\s");
target_words = fields.get(1).split("\\s");
feature_entries = fields.get(2).split("\\s");

} else {
if (fields.size() < 3)
continue;

lhs_word = "[X]";
String tmp = "[X,1] " + fields.get(0);
source_words = tmp.split("\\s");
tmp = "[X,1] " + fields.get(1);
target_words = tmp.split("\\s");
feature_entries = fields.get(2).split("\\s");
}
String lhs_word = Vocabulary.word(rule.getLHS());
String[] source_words = rule.getFrenchWords().split("\\s+");

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

The "\s+" string should probably defined as a token separator constant somewhere in the project to avoid hard-coding it everywhere.
It might be useful to add a method to Vocabulary.java that returns a list of strings instead of the joined version to avoid the splitting here:
public List getSourceWordList(int[] ids)

This comment has been minimized.

Copy link
@mjpost

mjpost May 25, 2016

Author Contributor

These are all fine ideas. I think they're the kind of thing that one of us should just pick off sometime we're in the code doing something else.

This comment has been minimized.

Copy link
@fhieber

fhieber May 25, 2016

Contributor

Completely agreed :)

String[] target_words = rule.getEnglishWords().split("\\s+");
String[] feature_entries = rule.getFeatureString().split("\\s+");

// Reached slice limit size, indicate that we're closing up.
if (!ready_to_flush
Expand Down Expand Up @@ -373,7 +332,7 @@ private void binarize(LineReader grammar_reader, LineReader alignment_reader) th
if (packAlignments) {
String alignment_line;
if (grammarAlignments) {
alignment_line = fields.get(3);
alignment_line = rule.getAlignmentString();
} else {
if (!alignment_reader.hasNext()) {
logger.severe("No more alignments starting in line " + counter);
Expand Down
2 changes: 1 addition & 1 deletion test/decoder/phrase/decode/rules.packed/config
Original file line number Diff line number Diff line change
@@ -1 +1 @@
max-source-len = 5
max-source-len = 3
Binary file modified test/decoder/phrase/decode/rules.packed/slice_00000.features
Binary file not shown.
Binary file modified test/decoder/phrase/decode/rules.packed/slice_00000.source
Binary file not shown.
Binary file modified test/decoder/phrase/decode/rules.packed/slice_00000.target
Binary file not shown.
Binary file modified test/decoder/phrase/decode/rules.packed/vocabulary
Binary file not shown.

1 comment on commit f5adcde

@fhieber
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love this!

Please sign in to comment.