From f10d6d2d3967f7dad4ada6e1c053cc730c668bc6 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 1 Apr 2021 17:39:01 +0000 Subject: [PATCH] Further strengthen and comment unit-tests, mark exactly where empty sentence is happening --- src/tests/annotation_tests.cpp | 155 ++++++++++++++++++++--------- src/translator/sentence_ranges.cpp | 18 ++-- 2 files changed, 122 insertions(+), 51 deletions(-) diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp index 20ddda8a8..3e42baedb 100644 --- a/src/tests/annotation_tests.cpp +++ b/src/tests/annotation_tests.cpp @@ -11,42 +11,89 @@ TEST_CASE("Test Annotation API with random sentences") { /// which sentence went in where and try to use accessor methods on /// AnnotatedText to check if what we have as ground-truth by construction is /// consistent with what is returned. - size_t sentences = 20; + size_t sentences = 500; size_t maxWords = 40; - bool debug{false}; // Set in case needed to see output. + // Set in case needed to see output. The output is in lines of #sentences + + // header, which can be split and compared for easy understanding. The ideal + // way to inspect what is going wrong is to redirect output and use to split + // the different stages by sentences + 1 lines and check the diff. + bool debug{false}; std::mt19937 randomIntGen_; randomIntGen_.seed(42); - AnnotatedText testAnnotation; - std::vector> sentenceWords; - std::vector wordByteRanges; - std::vector sentenceByteRanges; + AnnotatedText testAnnotation; // This the container we add through API and + // check if the access is correct. + + // External book-keeping so we have ground truths. Each element represents a + // sentence. + + // word byte ranges - for testAnnotation.word(sId, wId) + std::vector> groundTruthWords; + // sentence byte ranges - for testAnnotation.sentence(sId, wId) + std::vector groundTruthSentences; // Prepare the text and construct ByteRanges as intended for sentences and // words. The ByteRanges we construct here are expected to be the - // ground-truths for words and sentences. + // ground-truths for words and sentences. The string being constructed is like + // as follows: + // + // 0-0 0-1 0-2 0-3 + // 1-0 1-1 1-2 1-3 1-4 + // 2-0 2-1 + // + // 4-0 4-1 4-2 4-3 + // + // Words are separated by space units. + // + // Below, we accumulate the text with intended structure as above, and + // ground-truth tables populated to be aware of the ByteRanges where they are + // meant to be. + if (debug) { + std::cout << "Preparing text and ground truth-tables" << std::endl; + } for (size_t idx = 0; idx < sentences; idx++) { if (idx != 0) testAnnotation.text += "\n"; - wordByteRanges.clear(); - size_t words = randomIntGen_() % maxWords + 1; - wordByteRanges.reserve(words); - size_t sentenceBegin, sentenceEnd; - for (size_t idw = 0; idw < words; idw++) { - if (idw != 0) + // Words can be zero, we need to support empty word sentences as well. + size_t numWords = randomIntGen_() % maxWords; + + std::vector wordByteRanges; + wordByteRanges.reserve(numWords); + + // For empty sentence, we expect it to be empty and marked in position where + // the existing string is if needed to be pointed out. + size_t before = testAnnotation.text.size() - 1; + size_t sentenceBegin{before}, sentenceEnd{before}; + + for (size_t idw = 0; idw < numWords; idw++) { + if (idw != 0) { testAnnotation.text += " "; - size_t before = testAnnotation.text.size(); + if (debug) { + std::cout << " "; + } + } + + // Get new beginning, accounting for space above. + before = testAnnotation.text.size(); + + // Add the word std::string word = std::to_string(idx) + "-" + std::to_string(idw); testAnnotation.text += word; + + // Do math, before, before + new-word's size. wordByteRanges.push_back((ByteRange){before, before + word.size()}); + if (debug) { + std::cout << word; + } + if (idw == 0) { sentenceBegin = before; } - if (idw == words - 1) { + if (idw == numWords - 1) { sentenceEnd = before + word.size(); } } @@ -54,27 +101,35 @@ TEST_CASE("Test Annotation API with random sentences") { std::cout << std::endl; } - sentenceWords.push_back(wordByteRanges); - sentenceByteRanges.push_back((ByteRange){sentenceBegin, sentenceEnd}); + groundTruthWords.push_back(wordByteRanges); + groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd}); } - // Use the AnnotatedText's addSentence() API to add sentences to transparently - // convert from string_views to ByteRanges, rebasing/working out the math - // underneath. + // We prepare string_views now with the known ByteRanges and use the + // string_view based AnnotatedText.addSentence(...) API to add sentences to + // transparently convert from string_views to ByteRanges, rebasing/working out + // the math underneath. + if (debug) { - std::cout << "Inserting words onto container:" << std::endl; + std::cout << "Inserting words onto container and save ground-truth-table:" + << std::endl; } - /// std::vector> wordStringViews; - for (auto &sentence : sentenceWords) { + for (auto &sentence : groundTruthWords) { std::vector wordByteRanges; + bool first{true}; for (auto &word : sentence) { marian::string_view wordView(&testAnnotation.text[word.begin], word.size()); wordByteRanges.push_back(wordView); if (debug) { - std::cout << std::string(wordView) << " "; + if (first) { + first = false; + } else { + std::cout << " "; + } + std::cout << std::string(wordView); } } testAnnotation.addSentence(wordByteRanges); @@ -84,25 +139,32 @@ TEST_CASE("Test Annotation API with random sentences") { } } - // Access from the sentence(sentenceIdx) API and confirm that the ground truth - // we expect is same as what comes out of the contianer. + if (debug) { + std::cout + << "Inserting sentences onto container and save ground-truth-table" + << std::endl; + } std::vector sentenceStringViews; - for (auto &sentenceByteRange : sentenceByteRanges) { - marian::string_view sentenceView( - &testAnnotation.text[sentenceByteRange.begin], - sentenceByteRange.size()); + for (auto &sentenceByteRange : groundTruthSentences) { + char *data = &(testAnnotation.text[sentenceByteRange.begin]); + marian::string_view sentenceView(data, sentenceByteRange.size()); sentenceStringViews.push_back(sentenceView); + + if (debug) { + std::cout << sentenceView << std::endl; + } } + // Access from the sentence(sentenceIdx) API and confirm that the ground truth + // we expect is same as what comes out of the container. if (debug) { - std::cout << "From container: " << std::endl; - std::cout << "Sentences: " << std::endl; + std::cout << "From container: Sentences" << std::endl; } - for (int idx = 0; idx < sentenceByteRanges.size(); idx++) { - ByteRange expected = sentenceByteRanges[idx]; + for (int idx = 0; idx < groundTruthSentences.size(); idx++) { + ByteRange expected = groundTruthSentences[idx]; ByteRange obtained = testAnnotation.sentenceAsByteRange(idx); if (debug) { - std::cout << std::string(testAnnotation.sentence(idx)) << " "; + std::cout << std::string(testAnnotation.sentence(idx)) << std::endl; } CHECK(expected.begin == obtained.begin); CHECK(expected.end == obtained.end); @@ -112,20 +174,19 @@ TEST_CASE("Test Annotation API with random sentences") { } /// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as - /// expected words are the same as those obtained. + /// expected words are the same as those obtained from the container. if (debug) { - std::cout << "From container: " << std::endl; - std::cout << "Words: " << std::endl; + std::cout << "From container: Words" << std::endl; } - CHECK(sentenceWords.size() == testAnnotation.numSentences()); - for (int idx = 0; idx < sentenceWords.size(); idx++) { - CHECK(sentenceWords[idx].size() == testAnnotation.numWords(idx)); + CHECK(groundTruthWords.size() == testAnnotation.numSentences()); + for (int idx = 0; idx < groundTruthWords.size(); idx++) { + CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx)); } - for (int idx = 0; idx < sentenceWords.size(); idx++) { - for (int idw = 0; idw < sentenceWords[idx].size(); idw++) { - ByteRange expected = sentenceWords[idx][idw]; + for (int idx = 0; idx < groundTruthWords.size(); idx++) { + for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) { + ByteRange expected = groundTruthWords[idx][idw]; ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw); if (debug) { std::cout << std::string(testAnnotation.word(idx, idw)) << " "; @@ -142,12 +203,16 @@ TEST_CASE("Test Annotation API with random sentences") { } } - // Try inserting an empty Sentence. + // Try inserting an empty Sentence. This is ensuring we check for empty + // Sentence if the random test above does not cover it for some reason. int emptySentenceIdx = sentences; std::vector emptySentence; testAnnotation.addSentence(emptySentence); + + // There are no words. CHECK(testAnnotation.numWords(emptySentenceIdx) == 0); + // Empty sentence expected at output. std::string expectedEmptyString = ""; marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx); std::string obtainedString = std::string(emptyView.data(), emptyView.size()); diff --git a/src/translator/sentence_ranges.cpp b/src/translator/sentence_ranges.cpp index 9c7c37062..cbda56a87 100644 --- a/src/translator/sentence_ranges.cpp +++ b/src/translator/sentence_ranges.cpp @@ -30,12 +30,18 @@ ByteRange Annotation::sentence(size_t sentenceIdx) const { : sentenceEndIds_[sentenceIdx - 1]; // Half interval, so; eosId = sentenceEndIds_[sentenceIdx]; - ByteRange bos = flatByteRanges_[bosId]; - ByteRange sentenceByteRange{bos.end, bos.end}; - if (bosId != eosId) { - auto bos = flatByteRanges_[bosId]; - auto eos = flatByteRanges_[eosId - 1]; - + ByteRange sentenceByteRange; + + if (bosId == eosId) { + // We have an empty sentence. However, we want to be able to point where in + // target this happened through the ranges. We are looking for the end of + // the flatByteRange and non-empty sentence before and construct empty + // string-view equivalent ByteRange. + ByteRange eos = flatByteRanges_[eosId - 1]; + sentenceByteRange = (ByteRange){eos.end, eos.end}; + } else { + ByteRange bos = flatByteRanges_[bosId]; + ByteRange eos = flatByteRanges_[eosId - 1]; sentenceByteRange = (ByteRange){bos.begin, eos.end}; } return sentenceByteRange;