Addressed most fixes from Niklas

- Mostly used the UTF-8 tolowercase.
- Currently the umlauts do not yet end up at a useful position (this takes more effort).
- The whole UTF-8 business should be handled by proper libraries with iterators etc.
ad-freiburg · Mar 19, 2019 · fb3ae82 · fb3ae82
1 parent 5cc1602
commit fb3ae82
Show file tree

Hide file tree

Showing 7 changed files with 93 additions and 92 deletions.
diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp
@@ -512,8 +512,7 @@ void Filter::computeResultFixedValue(
           switch (_type) {
             case SparqlFilter::GE:
             case SparqlFilter::LT: {
-              std::transform(rhs_string.begin(), rhs_string.end(),
-                             rhs_string.begin(), ::toupper);
+              rhs_string = ad_utility::getUppercaseUtf8(rhs_string);
               auto split = StringSortComparator::extractComparable(rhs_string);
               if (split.isLiteral && !split.langtag.empty()) {
                 // get rid of possible langtags to move to the beginning of the
@@ -525,8 +524,7 @@ void Filter::computeResultFixedValue(
             break;
             case SparqlFilter::GT:
             case SparqlFilter::LE: {
-              std::transform(rhs_string.begin(), rhs_string.end(),
-                             rhs_string.begin(), ::tolower);
+              rhs_string = ad_utility::getLowercaseUtf8(rhs_string);
               auto split2 = StringSortComparator::extractComparable(rhs_string);
               if (split2.isLiteral) {
                 rhs_string =

diff --git a/src/index/Index.cpp b/src/index/Index.cpp
@@ -204,43 +204,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
       LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
     }
     if (i % linesPerPartial == 0) {
-      std::future<void> fut1, fut2;
-      LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
-      LOG(INFO) << "Actual number of Triples in this section (include "
-                   "langfilter triples): "
-                << actualCurrentPartialSize << '\n';
-      string partialFilename =
-          _onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);
-
-      LOG(INFO) << "writing partial vocabulary to " << partialFilename
-                << std::endl;
-      LOG(INFO) << "it contains " << items.size() << " elements\n";
-      fut1 = std::async([this, &items, partialFilename]() {
-        writePartialIdMapToBinaryFileForMerging(items, partialFilename,
-                                                _vocab.getCaseComparator());
-      });
-
-      if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
-        // we also have to create the "ordinary" vocabulary order to make the
-        // prefix compression work
-        string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
-                                    PARTIAL_VOCAB_FILE_NAME +
-                                    std::to_string(numFiles);
-        LOG(INFO) << "writing partial temporary vocabulary to "
-                  << partialTmpFilename << std::endl;
-        LOG(INFO) << "it contains " << items.size() << " elements\n";
-        fut2 = std::async([&items, partialTmpFilename]() {
-          writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
-                                                  StringSortComparator(false));
-        });
-      }
-      if (fut1.valid()) {
-        fut1.get();
-      }
-      if (fut2.valid()) {
-        fut2.get();
-      }
-      LOG(INFO) << "Done\n";
+      writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
       numFiles++;
       // Save the information how many triples this partial vocabulary actually
       // deals with we will use this later for mapping from partial to global
@@ -255,32 +219,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
   }
   // deal with remainder
   if (items.size() > 0) {
-    LOG(INFO) << "Lines processed: " << i << '\n';
-    LOG(INFO) << "Actual number of Triples in this section: "
-              << actualCurrentPartialSize << '\n';
-    string partialFilename =
-        _onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);
-
-    LOG(INFO) << "writing partial vocabular to " << partialFilename
-              << std::endl;
-    LOG(INFO) << "it contains " << items.size() << " elements\n";
-    writePartialIdMapToBinaryFileForMerging(items, partialFilename,
-                                            _vocab.getCaseComparator());
-    LOG(INFO) << "Done\n";
-
-    if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
-      // we also have to create the "ordinary" vocabulary order to make the
-      // prefix compression work
-      string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
-                                  PARTIAL_VOCAB_FILE_NAME +
-                                  std::to_string(numFiles);
-      LOG(INFO) << "writing partial temporary vocabulary to "
-                << partialTmpFilename << std::endl;
-      LOG(INFO) << "it contains " << items.size() << " elements\n";
-      writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
-                                              StringSortComparator(false));
-      LOG(INFO) << "Done\n";
-    }
+    writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
     numFiles++;
     actualPartialSizes.push_back(actualCurrentPartialSize);
   }
@@ -1915,3 +1854,45 @@ Id Index::assignNextId(Map* mapPtr, const string& key) {
     return map[key];
   }
 }
+
+// ___________________________________________________________________________
+void Index::writeNextPartialVocabulary(
+    size_t numLines, size_t numFiles, size_t actualCurrentPartialSize,
+    const ad_utility::HashMap<string, Id>& items) {
+  LOG(INFO) << "Lines (from KB-file) processed: " << numLines << '\n';
+  LOG(INFO) << "Actual number of Triples in this section (include "
+               "langfilter triples): "
+            << actualCurrentPartialSize << '\n';
+  std::future<void> fut1, fut2;
+  string partialFilename =
+      _onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);
+
+  LOG(INFO) << "writing partial vocabulary to " << partialFilename << std::endl;
+  LOG(INFO) << "it contains " << items.size() << " elements\n";
+  fut1 = std::async([this, &items, partialFilename]() {
+    writePartialIdMapToBinaryFileForMerging(items, partialFilename,
+                                            _vocab.getCaseComparator());
+  });
+
+  if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
+    // we also have to create the "ordinary" vocabulary order to make the
+    // prefix compression work
+    string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
+                                PARTIAL_VOCAB_FILE_NAME +
+                                std::to_string(numFiles);
+    LOG(INFO) << "writing partial temporary vocabulary to "
+              << partialTmpFilename << std::endl;
+    LOG(INFO) << "it contains " << items.size() << " elements\n";
+    fut2 = std::async([&items, partialTmpFilename]() {
+      writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
+                                              StringSortComparator(false));
+    });
+  }
+  if (fut1.valid()) {
+    fut1.get();
+  }
+  if (fut2.valid()) {
+    fut2.get();
+  }
+  LOG(INFO) << "Done.";
+}
diff --git a/src/index/Index.h b/src/index/Index.h
@@ -382,6 +382,24 @@ class Index {
   VocabularyData passFileForVocabulary(const string& ntFile,
                                        size_t linesPerPartial = 100000000);
 
+  /**
+   * @brief Everything that has to be done when we have seen all the triples
+   * that belong to one partial vocabulary, including Log output used inside
+   * passFileForVocabulary
+   *
+   * @param numLines How many Lines from the KB have we already parsed (only for
+   * Logging)
+   * @param numFiles How many partial vocabularies have we seen before/which is
+   * the index of the voc we are going to write
+   * @param actualCurrentPartialSize How many triples belong to this partition
+   * (including extra langfilter triples)
+   * @param items Contains our unsorted vocabulary. Maps words to their local
+   * ids within this vocabulary.
+   */
+  void writeNextPartialVocabulary(size_t numLines, size_t numFiles,
+                                  size_t actualCurrentPartialSize,
+                                  const ad_utility::HashMap<string, Id>& items);
+
   void convertPartialToGlobalIds(TripleVec& data,
                                  const vector<size_t>& actualLinesPerPartial,
                                  size_t linesPerPartial);

diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h
@@ -105,14 +105,10 @@ class StringSortComparator {
 
   bool getIgnoreCase() const { return _ignoreCase; }
 
-  bool operator()(const std::string& a, const std::string& b) const {
+  bool operator()(std::string_view a, std::string_view b) const {
     if (!_ignoreCase) {
       return a < b;
     } else {
-      // TODO<Johannes>
-      // BUG<Johannes>
-      // We have to make sure that the literals are all in contiguous space to
-      // make this work.
       // TODO<Johannes> Ideally we want to have this also when doing
       // case-insensitive compare, but it currently breaks the prefix
       // compression (there we really need ordering by correct bytes)
@@ -135,7 +131,7 @@ class StringSortComparator {
     std::string_view langtag;
   };
 
-  static SplitVal extractComparable(const std::string& a) {
+  static SplitVal extractComparable(std::string_view a) {
     std::string_view res = a;
     bool isLiteral = false;
     std::string_view langtag;
@@ -154,31 +150,30 @@ class StringSortComparator {
   }
 
   static bool caseInsensitiveCompare(const SplitVal& a, const SplitVal& b) {
-    const auto result =
-        std::mismatch(a.val.cbegin(), a.val.cend(), b.val.cbegin(),
-                      b.val.cend(), [](const auto& lhs, const auto& rhs) {
-                        return tolower(lhs) == tolower(rhs);
-                      });
-    if (result.second == b.val.end()) {
-      if (result.first == a.val.end()) {
+    auto aLower = ad_utility::getLowercaseUtf8(a.val);
+    auto bLower = ad_utility::getLowercaseUtf8(b.val);
+    const auto result = std::mismatch(aLower.cbegin(), aLower.cend(),
+                                      bLower.cbegin(), bLower.cend());
+    if (result.second == bLower.end()) {
+      if (result.first == aLower.end()) {
         // In case a and b are equal wrt case-insensitivity we sort by the
         // language tag. If this also matches we return the actual order of the
-        // innter string value. Thus we have a unique ordering that makes life
+        // inner string value. Thus we have a unique ordering that makes life
         // easier.
         return a.langtag != b.langtag ? a.langtag < b.langtag : a.val < b.val;
       }
       // b is a prefix of a, thus a is strictly "bigger"
       return false;
     }
 
-    if (result.first == a.val.end()) {
+    if (result.first == aLower.end()) {
       // a is a prefix of b
       return true;
     }
 
     // neither string is a prefix of the other, look at the first mismatch
     // character if we have reach here, both iterators are save to dereference.
-    return tolower(*result.first) < tolower(*result.second);
+    return *result.first < *result.second;
   }
   bool _ignoreCase;
 };

diff --git a/src/index/VocabularyImpl.h b/src/index/VocabularyImpl.h
@@ -280,10 +280,16 @@ bool PrefixComparator<S>::operator()(const string& lhs,
 template <class S>
 bool PrefixComparator<S>::operator()(const string& lhs,
                                      const string& rhs) const {
-  // TODO<joka921> use string_view for the substrings
+  // We cannot use string_views as parameters as they will unfortunately lead
+  // to ambiguous overloads (even though the CompressedString overload wouldn't
+  // work).
   return _vocab->getCaseComparator()(
-      lhs.size() > _prefixLength ? lhs.substr(0, _prefixLength) : lhs,
-      rhs.size() > _prefixLength ? rhs.substr(0, _prefixLength) : rhs);
+      lhs.size() > _prefixLength
+          ? std::string_view(lhs).substr(0, _prefixLength)
+          : lhs,
+      rhs.size() > _prefixLength
+          ? std::string_view(rhs).substr(0, _prefixLength)
+          : rhs);
 }
 
 // _____________________________________________________

diff --git a/src/util/StringUtils.h b/src/util/StringUtils.h
@@ -68,9 +68,9 @@ inline string getLowercase(const string& orig);
 
 inline string getUppercase(const string& orig);
 
-inline string getLowercaseUtf8(const string& orig);
+inline string getLowercaseUtf8(std::string_view orig);
 
-inline string getUppercaseUtf8(const string& orig);
+inline string getUppercaseUtf8(std::string_view orig);
 
 inline string firstCharToUpperUtf8(const string& orig);
 
@@ -234,7 +234,7 @@ string getUppercase(const string& orig) {
 }
 
 // ____________________________________________________________________________
-string getLowercaseUtf8(const string& orig) {
+string getLowercaseUtf8(std::string_view orig) {
   string retVal;
   retVal.reserve(orig.size());
   std::mbstate_t state = std::mbstate_t();
@@ -263,7 +263,7 @@ string getLowercaseUtf8(const string& orig) {
 }
 
 // ____________________________________________________________________________
-string getUppercaseUtf8(const string& orig) {
+string getUppercaseUtf8(std::string_view orig) {
   string retVal;
   retVal.reserve(orig.size());
   std::mbstate_t state = std::mbstate_t();

diff --git a/test/VocabularyTest.cpp b/test/VocabularyTest.cpp
@@ -132,8 +132,11 @@ TEST(VocabularyTest, StringSortComparator) {
   ASSERT_FALSE(comp("alpha", "ALPHA"));
   ASSERT_TRUE(comp("ALPHA", "alpha"));
 
-  // TODO: check what to do about these cases
-  // ASSERT_TRUE(comp("\"Hannibal\"@en", "\"Hannibal Hamlin\"@en"));
+  ASSERT_TRUE(comp("\"Hannibal\"@en", "\"Hannibal Hamlin\"@en"));
+  ASSERT_TRUE(comp("\"Hannibal\"@af", "\"Hannibal\"@en"));
+  ASSERT_TRUE(comp("\"HAnnibal\"@en", "\"Hannibal\"@en"));
+
+  // TODO<joka921>: test cases for UTF-8
 
   // something is not smaller thant itself
   ASSERT_FALSE(comp("beta", "beta"));