Skip to content

Commit

Permalink
Further strengthen and comment unit-tests, mark exactly where empty s…
Browse files Browse the repository at this point in the history
…entence is happening
  • Loading branch information
Jerin Philip committed Apr 1, 2021
1 parent d8d89f3 commit f10d6d2
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 51 deletions.
155 changes: 110 additions & 45 deletions src/tests/annotation_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,70 +11,125 @@ TEST_CASE("Test Annotation API with random sentences") {
/// which sentence went in where and try to use accessor methods on
/// AnnotatedText to check if what we have as ground-truth by construction is
/// consistent with what is returned.
size_t sentences = 20;
size_t sentences = 500;
size_t maxWords = 40;

bool debug{false}; // Set in case needed to see output.
// Set in case needed to see output. The output is in lines of #sentences +
// header, which can be split and compared for easy understanding. The ideal
// way to inspect what is going wrong is to redirect output and use to split
// the different stages by sentences + 1 lines and check the diff.
bool debug{false};

std::mt19937 randomIntGen_;
randomIntGen_.seed(42);

AnnotatedText testAnnotation;
std::vector<std::vector<ByteRange>> sentenceWords;
std::vector<ByteRange> wordByteRanges;
std::vector<ByteRange> sentenceByteRanges;
AnnotatedText testAnnotation; // This the container we add through API and
// check if the access is correct.

// External book-keeping so we have ground truths. Each element represents a
// sentence.

// word byte ranges - for testAnnotation.word(sId, wId)
std::vector<std::vector<ByteRange>> groundTruthWords;
// sentence byte ranges - for testAnnotation.sentence(sId, wId)
std::vector<ByteRange> groundTruthSentences;

// Prepare the text and construct ByteRanges as intended for sentences and
// words. The ByteRanges we construct here are expected to be the
// ground-truths for words and sentences.
// ground-truths for words and sentences. The string being constructed is like
// as follows:
//
// 0-0 0-1 0-2 0-3
// 1-0 1-1 1-2 1-3 1-4
// 2-0 2-1
//
// 4-0 4-1 4-2 4-3
//
// Words are separated by space units.
//
// Below, we accumulate the text with intended structure as above, and
// ground-truth tables populated to be aware of the ByteRanges where they are
// meant to be.
if (debug) {
std::cout << "Preparing text and ground truth-tables" << std::endl;
}
for (size_t idx = 0; idx < sentences; idx++) {
if (idx != 0)
testAnnotation.text += "\n";

wordByteRanges.clear();
size_t words = randomIntGen_() % maxWords + 1;
wordByteRanges.reserve(words);
size_t sentenceBegin, sentenceEnd;
for (size_t idw = 0; idw < words; idw++) {
if (idw != 0)
// Words can be zero, we need to support empty word sentences as well.
size_t numWords = randomIntGen_() % maxWords;

std::vector<ByteRange> wordByteRanges;
wordByteRanges.reserve(numWords);

// For empty sentence, we expect it to be empty and marked in position where
// the existing string is if needed to be pointed out.
size_t before = testAnnotation.text.size() - 1;
size_t sentenceBegin{before}, sentenceEnd{before};

for (size_t idw = 0; idw < numWords; idw++) {
if (idw != 0) {
testAnnotation.text += " ";
size_t before = testAnnotation.text.size();
if (debug) {
std::cout << " ";
}
}

// Get new beginning, accounting for space above.
before = testAnnotation.text.size();

// Add the word
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
testAnnotation.text += word;

// Do math, before, before + new-word's size.
wordByteRanges.push_back((ByteRange){before, before + word.size()});

if (debug) {
std::cout << word;
}

if (idw == 0) {
sentenceBegin = before;
}
if (idw == words - 1) {
if (idw == numWords - 1) {
sentenceEnd = before + word.size();
}
}
if (debug) {
std::cout << std::endl;
}

sentenceWords.push_back(wordByteRanges);
sentenceByteRanges.push_back((ByteRange){sentenceBegin, sentenceEnd});
groundTruthWords.push_back(wordByteRanges);
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
}

// Use the AnnotatedText's addSentence() API to add sentences to transparently
// convert from string_views to ByteRanges, rebasing/working out the math
// underneath.
// We prepare string_views now with the known ByteRanges and use the
// string_view based AnnotatedText.addSentence(...) API to add sentences to
// transparently convert from string_views to ByteRanges, rebasing/working out
// the math underneath.

if (debug) {
std::cout << "Inserting words onto container:" << std::endl;
std::cout << "Inserting words onto container and save ground-truth-table:"
<< std::endl;
}

///
std::vector<std::vector<marian::string_view>> wordStringViews;
for (auto &sentence : sentenceWords) {
for (auto &sentence : groundTruthWords) {
std::vector<marian::string_view> wordByteRanges;
bool first{true};
for (auto &word : sentence) {
marian::string_view wordView(&testAnnotation.text[word.begin],
word.size());
wordByteRanges.push_back(wordView);
if (debug) {
std::cout << std::string(wordView) << " ";
if (first) {
first = false;
} else {
std::cout << " ";
}
std::cout << std::string(wordView);
}
}
testAnnotation.addSentence(wordByteRanges);
Expand All @@ -84,25 +139,32 @@ TEST_CASE("Test Annotation API with random sentences") {
}
}

// Access from the sentence(sentenceIdx) API and confirm that the ground truth
// we expect is same as what comes out of the contianer.
if (debug) {
std::cout
<< "Inserting sentences onto container and save ground-truth-table"
<< std::endl;
}
std::vector<marian::string_view> sentenceStringViews;
for (auto &sentenceByteRange : sentenceByteRanges) {
marian::string_view sentenceView(
&testAnnotation.text[sentenceByteRange.begin],
sentenceByteRange.size());
for (auto &sentenceByteRange : groundTruthSentences) {
char *data = &(testAnnotation.text[sentenceByteRange.begin]);
marian::string_view sentenceView(data, sentenceByteRange.size());
sentenceStringViews.push_back(sentenceView);

if (debug) {
std::cout << sentenceView << std::endl;
}
}

// Access from the sentence(sentenceIdx) API and confirm that the ground truth
// we expect is same as what comes out of the container.
if (debug) {
std::cout << "From container: " << std::endl;
std::cout << "Sentences: " << std::endl;
std::cout << "From container: Sentences" << std::endl;
}
for (int idx = 0; idx < sentenceByteRanges.size(); idx++) {
ByteRange expected = sentenceByteRanges[idx];
for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
ByteRange expected = groundTruthSentences[idx];
ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
if (debug) {
std::cout << std::string(testAnnotation.sentence(idx)) << " ";
std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
}
CHECK(expected.begin == obtained.begin);
CHECK(expected.end == obtained.end);
Expand All @@ -112,20 +174,19 @@ TEST_CASE("Test Annotation API with random sentences") {
}

/// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
/// expected words are the same as those obtained.
/// expected words are the same as those obtained from the container.
if (debug) {
std::cout << "From container: " << std::endl;
std::cout << "Words: " << std::endl;
std::cout << "From container: Words" << std::endl;
}

CHECK(sentenceWords.size() == testAnnotation.numSentences());
for (int idx = 0; idx < sentenceWords.size(); idx++) {
CHECK(sentenceWords[idx].size() == testAnnotation.numWords(idx));
CHECK(groundTruthWords.size() == testAnnotation.numSentences());
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
}

for (int idx = 0; idx < sentenceWords.size(); idx++) {
for (int idw = 0; idw < sentenceWords[idx].size(); idw++) {
ByteRange expected = sentenceWords[idx][idw];
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
ByteRange expected = groundTruthWords[idx][idw];
ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
if (debug) {
std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
Expand All @@ -142,12 +203,16 @@ TEST_CASE("Test Annotation API with random sentences") {
}
}

// Try inserting an empty Sentence.
// Try inserting an empty Sentence. This is ensuring we check for empty
// Sentence if the random test above does not cover it for some reason.
int emptySentenceIdx = sentences;
std::vector<marian::string_view> emptySentence;
testAnnotation.addSentence(emptySentence);

// There are no words.
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);

// Empty sentence expected at output.
std::string expectedEmptyString = "";
marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
std::string obtainedString = std::string(emptyView.data(), emptyView.size());
Expand Down
18 changes: 12 additions & 6 deletions src/translator/sentence_ranges.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,18 @@ ByteRange Annotation::sentence(size_t sentenceIdx) const {
: sentenceEndIds_[sentenceIdx - 1]; // Half interval, so;

eosId = sentenceEndIds_[sentenceIdx];
ByteRange bos = flatByteRanges_[bosId];
ByteRange sentenceByteRange{bos.end, bos.end};
if (bosId != eosId) {
auto bos = flatByteRanges_[bosId];
auto eos = flatByteRanges_[eosId - 1];

ByteRange sentenceByteRange;

if (bosId == eosId) {
// We have an empty sentence. However, we want to be able to point where in
// target this happened through the ranges. We are looking for the end of
// the flatByteRange and non-empty sentence before and construct empty
// string-view equivalent ByteRange.
ByteRange eos = flatByteRanges_[eosId - 1];
sentenceByteRange = (ByteRange){eos.end, eos.end};
} else {
ByteRange bos = flatByteRanges_[bosId];
ByteRange eos = flatByteRanges_[eosId - 1];
sentenceByteRange = (ByteRange){bos.begin, eos.end};
}
return sentenceByteRange;
Expand Down

0 comments on commit f10d6d2

Please sign in to comment.