Further strengthen and comment unit-tests, mark exactly where empty s…

…entence is happening
browsermt · Apr 1, 2021 · f10d6d2 · f10d6d2
1 parent d8d89f3
commit f10d6d2
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 51 deletions.
diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp
@@ -11,70 +11,125 @@ TEST_CASE("Test Annotation API with random sentences") {
   /// which sentence went in where and try to use accessor methods on
   /// AnnotatedText to check if what we have as ground-truth by construction is
   /// consistent with what is returned.
-  size_t sentences = 20;
+  size_t sentences = 500;
   size_t maxWords = 40;
 
-  bool debug{false}; // Set in case needed to see output.
+  // Set in case needed to see output. The output is in lines of #sentences +
+  // header, which can be split and compared for easy understanding. The ideal
+  // way to inspect what is going wrong is to redirect output and use to split
+  // the different stages by sentences + 1 lines and check the diff.
+  bool debug{false};
 
   std::mt19937 randomIntGen_;
   randomIntGen_.seed(42);
 
-  AnnotatedText testAnnotation;
-  std::vector<std::vector<ByteRange>> sentenceWords;
-  std::vector<ByteRange> wordByteRanges;
-  std::vector<ByteRange> sentenceByteRanges;
+  AnnotatedText testAnnotation; // This the container we add through API and
+                                // check if the access is correct.
+
+  // External book-keeping so we have ground truths. Each element represents a
+  // sentence.
+
+  // word byte ranges - for testAnnotation.word(sId, wId)
+  std::vector<std::vector<ByteRange>> groundTruthWords;
+  // sentence byte ranges - for testAnnotation.sentence(sId, wId)
+  std::vector<ByteRange> groundTruthSentences;
 
   // Prepare the text and construct ByteRanges as intended for sentences and
   // words. The ByteRanges we construct here are expected to be the
-  // ground-truths for words and sentences.
+  // ground-truths for words and sentences. The string being constructed is like
+  // as follows:
+  //
+  //     0-0 0-1 0-2 0-3
+  //     1-0 1-1 1-2 1-3 1-4
+  //     2-0 2-1
+  //
+  //     4-0 4-1 4-2 4-3
+  //
+  // Words are separated by space units.
+  //
+  // Below, we accumulate the text with intended structure as above, and
+  // ground-truth tables populated to be aware of the ByteRanges where they are
+  // meant to be.
+  if (debug) {
+    std::cout << "Preparing text and ground truth-tables" << std::endl;
+  }
   for (size_t idx = 0; idx < sentences; idx++) {
     if (idx != 0)
       testAnnotation.text += "\n";
 
-    wordByteRanges.clear();
-    size_t words = randomIntGen_() % maxWords + 1;
-    wordByteRanges.reserve(words);
-    size_t sentenceBegin, sentenceEnd;
-    for (size_t idw = 0; idw < words; idw++) {
-      if (idw != 0)
+    // Words can be zero, we need to support empty word sentences as well.
+    size_t numWords = randomIntGen_() % maxWords;
+
+    std::vector<ByteRange> wordByteRanges;
+    wordByteRanges.reserve(numWords);
+
+    // For empty sentence, we expect it to be empty and marked in position where
+    // the existing string is if needed to be pointed out.
+    size_t before = testAnnotation.text.size() - 1;
+    size_t sentenceBegin{before}, sentenceEnd{before};
+
+    for (size_t idw = 0; idw < numWords; idw++) {
+      if (idw != 0) {
         testAnnotation.text += " ";
-      size_t before = testAnnotation.text.size();
+        if (debug) {
+          std::cout << " ";
+        }
+      }
+
+      // Get new beginning, accounting for space above.
+      before = testAnnotation.text.size();
+
+      // Add the word
       std::string word = std::to_string(idx) + "-" + std::to_string(idw);
       testAnnotation.text += word;
+
+      // Do math, before, before + new-word's size.
       wordByteRanges.push_back((ByteRange){before, before + word.size()});
 
+      if (debug) {
+        std::cout << word;
+      }
+
       if (idw == 0) {
         sentenceBegin = before;
       }
-      if (idw == words - 1) {
+      if (idw == numWords - 1) {
         sentenceEnd = before + word.size();
       }
     }
     if (debug) {
       std::cout << std::endl;
     }
 
-    sentenceWords.push_back(wordByteRanges);
-    sentenceByteRanges.push_back((ByteRange){sentenceBegin, sentenceEnd});
+    groundTruthWords.push_back(wordByteRanges);
+    groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
   }
 
-  // Use the AnnotatedText's addSentence() API to add sentences to transparently
-  // convert from string_views to ByteRanges, rebasing/working out the math
-  // underneath.
+  // We prepare string_views now with the known ByteRanges and use the
+  // string_view based AnnotatedText.addSentence(...) API to add sentences to
+  // transparently convert from string_views to ByteRanges, rebasing/working out
+  // the math underneath.
+
   if (debug) {
-    std::cout << "Inserting words onto container:" << std::endl;
+    std::cout << "Inserting words onto container and save ground-truth-table:"
+              << std::endl;
   }
 
-  ///
   std::vector<std::vector<marian::string_view>> wordStringViews;
-  for (auto &sentence : sentenceWords) {
+  for (auto &sentence : groundTruthWords) {
     std::vector<marian::string_view> wordByteRanges;
+    bool first{true};
     for (auto &word : sentence) {
       marian::string_view wordView(&testAnnotation.text[word.begin],
                                    word.size());
       wordByteRanges.push_back(wordView);
       if (debug) {
-        std::cout << std::string(wordView) << " ";
+        if (first) {
+          first = false;
+        } else {
+          std::cout << " ";
+        }
+        std::cout << std::string(wordView);
       }
     }
     testAnnotation.addSentence(wordByteRanges);
@@ -84,25 +139,32 @@ TEST_CASE("Test Annotation API with random sentences") {
     }
   }
 
-  // Access from the sentence(sentenceIdx) API and confirm that the ground truth
-  // we expect is same as what comes out of the contianer.
+  if (debug) {
+    std::cout
+        << "Inserting sentences onto container and save ground-truth-table"
+        << std::endl;
+  }
   std::vector<marian::string_view> sentenceStringViews;
-  for (auto &sentenceByteRange : sentenceByteRanges) {
-    marian::string_view sentenceView(
-        &testAnnotation.text[sentenceByteRange.begin],
-        sentenceByteRange.size());
+  for (auto &sentenceByteRange : groundTruthSentences) {
+    char *data = &(testAnnotation.text[sentenceByteRange.begin]);
+    marian::string_view sentenceView(data, sentenceByteRange.size());
     sentenceStringViews.push_back(sentenceView);
+
+    if (debug) {
+      std::cout << sentenceView << std::endl;
+    }
   }
 
+  // Access from the sentence(sentenceIdx) API and confirm that the ground truth
+  // we expect is same as what comes out of the container.
   if (debug) {
-    std::cout << "From container: " << std::endl;
-    std::cout << "Sentences: " << std::endl;
+    std::cout << "From container: Sentences" << std::endl;
   }
-  for (int idx = 0; idx < sentenceByteRanges.size(); idx++) {
-    ByteRange expected = sentenceByteRanges[idx];
+  for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
+    ByteRange expected = groundTruthSentences[idx];
     ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
     if (debug) {
-      std::cout << std::string(testAnnotation.sentence(idx)) << " ";
+      std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
     }
     CHECK(expected.begin == obtained.begin);
     CHECK(expected.end == obtained.end);
@@ -112,20 +174,19 @@ TEST_CASE("Test Annotation API with random sentences") {
   }
 
   /// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
-  /// expected words are the same as those obtained.
+  /// expected words are the same as those obtained from the container.
   if (debug) {
-    std::cout << "From container: " << std::endl;
-    std::cout << "Words: " << std::endl;
+    std::cout << "From container: Words" << std::endl;
   }
 
-  CHECK(sentenceWords.size() == testAnnotation.numSentences());
-  for (int idx = 0; idx < sentenceWords.size(); idx++) {
-    CHECK(sentenceWords[idx].size() == testAnnotation.numWords(idx));
+  CHECK(groundTruthWords.size() == testAnnotation.numSentences());
+  for (int idx = 0; idx < groundTruthWords.size(); idx++) {
+    CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
   }
 
-  for (int idx = 0; idx < sentenceWords.size(); idx++) {
-    for (int idw = 0; idw < sentenceWords[idx].size(); idw++) {
-      ByteRange expected = sentenceWords[idx][idw];
+  for (int idx = 0; idx < groundTruthWords.size(); idx++) {
+    for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
+      ByteRange expected = groundTruthWords[idx][idw];
       ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
       if (debug) {
         std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
@@ -142,12 +203,16 @@ TEST_CASE("Test Annotation API with random sentences") {
     }
   }
 
-  // Try inserting an empty Sentence.
+  // Try inserting an empty Sentence. This is ensuring we check for empty
+  // Sentence if the random test above does not cover it for some reason.
   int emptySentenceIdx = sentences;
   std::vector<marian::string_view> emptySentence;
   testAnnotation.addSentence(emptySentence);
+
+  // There are no words.
   CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
 
+  // Empty sentence expected at output.
   std::string expectedEmptyString = "";
   marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
   std::string obtainedString = std::string(emptyView.data(), emptyView.size());

diff --git a/src/translator/sentence_ranges.cpp b/src/translator/sentence_ranges.cpp
@@ -30,12 +30,18 @@ ByteRange Annotation::sentence(size_t sentenceIdx) const {
               : sentenceEndIds_[sentenceIdx - 1]; // Half interval, so;
 
   eosId = sentenceEndIds_[sentenceIdx];
-  ByteRange bos = flatByteRanges_[bosId];
-  ByteRange sentenceByteRange{bos.end, bos.end};
-  if (bosId != eosId) {
-    auto bos = flatByteRanges_[bosId];
-    auto eos = flatByteRanges_[eosId - 1];
-
+  ByteRange sentenceByteRange;
+
+  if (bosId == eosId) {
+    // We have an empty sentence. However, we want to be able to point where in
+    // target this happened through the ranges. We are looking for the end of
+    // the flatByteRange and non-empty sentence before and construct empty
+    // string-view equivalent ByteRange.
+    ByteRange eos = flatByteRanges_[eosId - 1];
+    sentenceByteRange = (ByteRange){eos.end, eos.end};
+  } else {
+    ByteRange bos = flatByteRanges_[bosId];
+    ByteRange eos = flatByteRanges_[eosId - 1];
     sentenceByteRange = (ByteRange){bos.begin, eos.end};
   }
   return sentenceByteRange;