-
Notifications
You must be signed in to change notification settings - Fork 64
Commit
- GrammarPacker now uses the appropriate {Moses,Hiero}FormatReader objects in the explore() and binarize() passes, instead of doing its own parsing
- MosesFormatReader chains to HieroFormatReader after munging its input, which removes some redundancy
- Updated the test case
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,9 @@ | |
import java.util.logging.Logger; | ||
|
||
import joshua.corpus.Vocabulary; | ||
import joshua.decoder.ff.tm.Rule; | ||
import joshua.decoder.ff.tm.format.HieroFormatReader; | ||
import joshua.decoder.ff.tm.format.MosesFormatReader; | ||
import joshua.util.FormatUtils; | ||
import joshua.util.encoding.EncoderConfiguration; | ||
import joshua.util.encoding.FeatureTypeAnalyzer; | ||
|
@@ -154,13 +157,12 @@ private void readConfig(String config_filename) throws IOException { | |
*/ | ||
public void pack() throws IOException { | ||
logger.info("Beginning exploration pass."); | ||
LineReader grammar_reader = null; | ||
LineReader alignment_reader = null; | ||
|
||
// Explore pass. Learn vocabulary and feature value histograms. | ||
logger.info("Exploring: " + grammar); | ||
grammar_reader = new LineReader(grammar); | ||
explore(grammar_reader); | ||
|
||
HieroFormatReader grammarReader = getGrammarReader(); | ||
explore(grammarReader); | ||
|
||
logger.info("Exploration pass complete. Freezing vocabulary and finalizing encoders."); | ||
if (dump != null) { | ||
|
@@ -194,78 +196,56 @@ public void pack() throws IOException { | |
|
||
logger.info("Beginning packing pass."); | ||
// Actual binarization pass. Slice and pack source, target and data. | ||
grammar_reader = new LineReader(grammar); | ||
|
||
grammarReader = getGrammarReader(); | ||
LineReader alignment_reader = null; | ||
if (packAlignments && !grammarAlignments) | ||
alignment_reader = new LineReader(alignments); | ||
binarize(grammar_reader, alignment_reader); | ||
binarize(grammarReader, alignment_reader); | ||
logger.info("Packing complete."); | ||
|
||
logger.info("Packed grammar in: " + output); | ||
logger.info("Done."); | ||
} | ||
|
||
private void explore(LineReader grammar) { | ||
/** | ||
* Returns a reader that turns whatever file format is found into Hiero grammar rules. | ||
* | ||
* The format is detected by reading the first line of the grammar file: a line that starts | ||
* with "[" selects a HieroFormatReader, anything else selects a MosesFormatReader. | ||
* | ||
* @return a HieroFormatReader or a MosesFormatReader constructed over the grammar file | ||
* @throws IOException declared by this method; presumably raised when the grammar file | ||
*         cannot be opened or read (TODO confirm against LineReader and the format readers) | ||
*/ | ||
private HieroFormatReader getGrammarReader() throws IOException { | ||
// This LineReader is used only to look at the first line; the returned reader is | ||
// constructed from the grammar again rather than from this one. | ||
// NOTE(review): the LineReader is not closed in this method -- confirm whether that leaks. | ||
LineReader reader = new LineReader(grammar); | ||
String line = reader.next(); | ||
// NOTE(review): if next() can return null (e.g. for an empty grammar file), the | ||
// startsWith call below would throw -- verify against LineReader. | ||
if (line.startsWith("[")) { | ||
return new HieroFormatReader(grammar); | ||
} else { | ||
return new MosesFormatReader(grammar); | ||
} | ||
} | ||
|
||
private void explore(HieroFormatReader reader) { | ||
int counter = 0; | ||
// We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always | ||
// appear in the same order. They are assigned numeric names in order of appearance. | ||
this.types.setLabeled(true); | ||
|
||
while (grammar.hasNext()) { | ||
String line = grammar.next().trim(); | ||
for (Rule rule: reader) { | ||
counter++; | ||
ArrayList<String> fields = new ArrayList<String>(Arrays.asList(line.split("\\s\\|{3}\\s"))); | ||
|
||
String lhs = null; | ||
if (line.startsWith("[")) { | ||
// hierarchical model | ||
if (fields.size() < 4) { | ||
logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line)); | ||
continue; | ||
} | ||
lhs = fields.remove(0); | ||
} else { | ||
// phrase-based model | ||
if (fields.size() < 3) { | ||
logger.warning("Incomplete phrase line at line " + counter); | ||
logger.warning(line); | ||
continue; | ||
} | ||
lhs = "[X]"; | ||
} | ||
|
||
String[] source = fields.get(0).split("\\s"); | ||
String[] target = fields.get(1).split("\\s"); | ||
String[] features = fields.get(2).split("\\s"); | ||
|
||
max_source_len = Math.max(max_source_len, source.length); | ||
|
||
Vocabulary.id(lhs); | ||
try { | ||
/* Add symbols to vocabulary. | ||
* NOTE: In case of nonterminals, we add both stripped versions ("[X]") | ||
* and "[X,1]" to the vocabulary. | ||
*/ | ||
for (String source_word : source) { | ||
Vocabulary.id(source_word); | ||
if (FormatUtils.isNonterminal(source_word)) { | ||
Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_word)); | ||
} | ||
} | ||
for (String target_word : target) { | ||
Vocabulary.id(target_word); | ||
if (FormatUtils.isNonterminal(target_word)) { | ||
Vocabulary.id(FormatUtils.stripNonTerminalIndex(target_word)); | ||
} | ||
} | ||
} catch (java.lang.StringIndexOutOfBoundsException e) { | ||
System.err.println(String.format("* Skipping bad grammar line '%s'", line)); | ||
continue; | ||
} | ||
max_source_len = Math.max(max_source_len, rule.getFrench().length); | ||
|
||
/* Add symbols to vocabulary. | ||
* NOTE: In case of nonterminals, we add both stripped versions ("[X]") | ||
* and "[X,1]" to the vocabulary. | ||
* | ||
* TODO: MJP May 2016: do we need to add [X,1]? If so, should be done in FormatReaders. | ||
This comment has been minimized.
Sorry, something went wrong.
fhieber
Contributor
|
||
*/ | ||
|
||
// Add feature names to vocabulary and pass the value through the | ||
// appropriate encoder. | ||
int feature_counter = 0; | ||
String[] features = rule.getFeatureString().split("\\s+"); | ||
for (int f = 0; f < features.length; ++f) { | ||
if (features[f].contains("=")) { | ||
String[] fe = features[f].split("="); | ||
|
@@ -278,6 +258,7 @@ private void explore(LineReader grammar) { | |
} | ||
} | ||
} | ||
System.err.println("COUNTER: " + counter); | ||
This comment has been minimized.
Sorry, something went wrong. |
||
} | ||
|
||
/** | ||
|
@@ -288,7 +269,7 @@ private String getFirstTwoSourceWords(final String[] source_words) { | |
return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : ""); | ||
} | ||
|
||
private void binarize(LineReader grammar_reader, LineReader alignment_reader) throws IOException { | ||
private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException { | ||
int counter = 0; | ||
int slice_counter = 0; | ||
int num_slices = 0; | ||
|
@@ -306,36 +287,14 @@ private void binarize(LineReader grammar_reader, LineReader alignment_reader) th | |
alignment_buffer = new AlignmentBuffer(); | ||
|
||
TreeMap<Integer, Float> features = new TreeMap<Integer, Float>(); | ||
while (grammar_reader.hasNext()) { | ||
String grammar_line = grammar_reader.next().trim(); | ||
for (Rule rule: grammarReader) { | ||
counter++; | ||
slice_counter++; | ||
|
||
ArrayList<String> fields = new ArrayList<String>(Arrays.asList(grammar_line.split("\\s\\|{3}\\s"))); | ||
String lhs_word; | ||
String[] source_words; | ||
String[] target_words; | ||
String[] feature_entries; | ||
if (grammar_line.startsWith("[")) { | ||
if (fields.size() < 4) | ||
continue; | ||
|
||
lhs_word = fields.remove(0); | ||
source_words = fields.get(0).split("\\s"); | ||
target_words = fields.get(1).split("\\s"); | ||
feature_entries = fields.get(2).split("\\s"); | ||
|
||
} else { | ||
if (fields.size() < 3) | ||
continue; | ||
|
||
lhs_word = "[X]"; | ||
String tmp = "[X,1] " + fields.get(0); | ||
source_words = tmp.split("\\s"); | ||
tmp = "[X,1] " + fields.get(1); | ||
target_words = tmp.split("\\s"); | ||
feature_entries = fields.get(2).split("\\s"); | ||
} | ||
String lhs_word = Vocabulary.word(rule.getLHS()); | ||
String[] source_words = rule.getFrenchWords().split("\\s+"); | ||
This comment has been minimized.
Sorry, something went wrong.
fhieber
Contributor
|
||
String[] target_words = rule.getEnglishWords().split("\\s+"); | ||
String[] feature_entries = rule.getFeatureString().split("\\s+"); | ||
|
||
// Reached slice limit size, indicate that we're closing up. | ||
if (!ready_to_flush | ||
|
@@ -373,7 +332,7 @@ private void binarize(LineReader grammar_reader, LineReader alignment_reader) th | |
if (packAlignments) { | ||
String alignment_line; | ||
if (grammarAlignments) { | ||
alignment_line = fields.get(3); | ||
alignment_line = rule.getAlignmentString(); | ||
} else { | ||
if (!alignment_reader.hasNext()) { | ||
logger.severe("No more alignments starting in line " + counter); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
max-source-len = 5 | ||
max-source-len = 3 |
1 comment
on commit f5adcde
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I love this!
You can initialize the StringBuffer directly with the following line:
new StringBuffer("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||")