F.case insensitive label sorting #209

Closed
36 changes: 36 additions & 0 deletions src/engine/Filter.cpp
@@ -3,6 +3,7 @@
// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de)

#include "Filter.h"
#include <algorithm>
#include <optional>
#include <regex>
#include <sstream>
@@ -504,6 +505,41 @@ void Filter::computeResultFixedValue(
rhs_string = ad_utility::convertValueLiteralToIndexWord(rhs_string);
} else if (ad_utility::isNumeric(_rhs)) {
rhs_string = ad_utility::convertNumericToIndexWord(rhs_string);
} else {
if (getIndex().getVocab().getCaseInsensitiveOrdering()) {
// We have to move to the correct end of the
// "same letters but different case" range
// to make the filters work.
// TODO<kalmbach, schnelle>: thoroughly test this
// (End-To-End or unit tests? probably both but the unit tests
// would also be in an end-to-end fashion for those nested
// mechanisms).
switch (_type) {
case SparqlFilter::GE:
case SparqlFilter::LT: {
rhs_string = ad_utility::getUppercaseUtf8(rhs_string);
auto split = StringSortComparator::extractComparable(rhs_string);
if (split.isLiteral && !split.langtag.empty()) {
// get rid of possible langtags to move to the beginning of the
// range
rhs_string = '\"' + std::string(split.val) + '\"';
}
}

break;
case SparqlFilter::GT:
case SparqlFilter::LE: {
rhs_string = ad_utility::getLowercaseUtf8(rhs_string);
auto split2 = StringSortComparator::extractComparable(rhs_string);
if (split2.isLiteral) {
rhs_string =
'\"' + std::string(split2.val) + '\"' + "@" + char(127);
}
} break;
default:
break;
}
}
}
if (_type == SparqlFilter::EQ || _type == SparqlFilter::NE) {
if (!getIndex().getVocab().getId(_rhs, &rhs)) {
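The GE/LT vs. GT/LE split above is needed because, under a case-insensitive primary ordering, all casings of the same word form one contiguous block in the sorted vocabulary, and a filter boundary has to land on the correct end of that block. A minimal stand-alone sketch of the effect (toy ASCII comparator, hypothetical names, not QLever code):

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Toy case-insensitive "less than" that only looks at the primary level.
bool lessCI(const std::string& a, const std::string& b) {
  return std::lexicographical_compare(
      a.begin(), a.end(), b.begin(), b.end(),
      [](unsigned char x, unsigned char y) {
        return std::tolower(x) < std::tolower(y);
      });
}

int main() {
  // Vocabulary sorted case-insensitively: "FOO", "Foo" and "foo" compare equal.
  std::vector<std::string> vocab = {"bar", "FOO", "Foo", "foo", "quux"};
  std::sort(vocab.begin(), vocab.end(), lessCI);

  const std::string query = "foo";
  // FILTER(?x >= "foo") must start at the *beginning* of the "foo" block ...
  auto ge = std::lower_bound(vocab.begin(), vocab.end(), query, lessCI);
  // ... while FILTER(?x > "foo") must start directly *behind* that block.
  auto gt = std::upper_bound(vocab.begin(), vocab.end(), query, lessCI);

  std::cout << *ge << '\n';  // some casing of "foo" (start of the block)
  std::cout << *gt << '\n';  // "quux" (first word behind the block)
}
```

The patch reaches the same two positions indirectly, by rewriting rhs_string before the ordinary vocabulary lookup: uppercasing it for GE/LT, and lowercasing it and appending a langtag separator with a high sentinel byte for GT/LE.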
3 changes: 3 additions & 0 deletions src/index/ConstantsIndexCreation.h
@@ -50,3 +50,6 @@ static const size_t THRESHOLD_RELATION_CREATION = 2 << 20;
// ________________________________________________________________
static const std::string PARTIAL_VOCAB_FILE_NAME = ".partial-vocabulary";
static const std::string PARTIAL_MMAP_IDS = ".partial-ids-mmap";

// ________________________________________________________________
static const std::string TMP_BASENAME_COMPRESSION = ".tmp.compression_index";
117 changes: 88 additions & 29 deletions src/index/Index.cpp
@@ -82,7 +82,15 @@ void Index::createFromFile(const string& filename) {
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
if (_vocabPrefixCompressed) {
prefixes = calculatePrefixes(vocabFile, NUM_COMPRESSION_PREFIXES, 1, true);
string vocabFileForPrefixCalculation = vocabFile;
if (_vocab.getCaseInsensitiveOrdering()) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
vocabFileForPrefixCalculation =
_onDiskBase + TMP_BASENAME_COMPRESSION + ".vocabulary";
}
prefixes = calculatePrefixes(vocabFileForPrefixCalculation,
NUM_COMPRESSION_PREFIXES, 1, true);
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
@@ -165,18 +173,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
}
if (i % linesPerPartial == 0) {
LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
LOG(INFO) << "Actual number of Triples in this section (include "
"langfilter triples): "
<< actualCurrentPartialSize << '\n';
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabulary to " << partialFilename
<< std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
writePartialIdMapToBinaryFileForMerging(items, partialFilename);
LOG(INFO) << "Done\n";
writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
numFiles++;
// Save the information how many triples this partial vocabulary actually
// deals with we will use this later for mapping from partial to global
@@ -191,34 +188,45 @@
}
// deal with remainder
if (items.size() > 0) {
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Actual number of Triples in this section: "
<< actualCurrentPartialSize << '\n';
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabular to " << partialFilename
<< std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
writePartialIdMapToBinaryFileForMerging(items, partialFilename);
LOG(INFO) << "Done\n";
writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
numFiles++;
actualPartialSizes.push_back(actualCurrentPartialSize);
}
writer.finish();

std::future<void> tmpVocFut;
if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
LOG(INFO) << "Merging temporary vocabulary for prefix compression";
Id tmp1, tmp2;
auto f = [this, numFiles, &tmp1, &tmp2]() {
mergeVocabulary(_onDiskBase + TMP_BASENAME_COMPRESSION, numFiles, &tmp1,
&tmp2, StringSortComparator(false));
};
tmpVocFut = std::async(f);
LOG(INFO) << "Pass done.\n";
}

LOG(INFO) << "Merging vocabulary\n";
VocabularyData res;
res.nofWords = mergeVocabulary(_onDiskBase, numFiles, &res.langPredLowerBound,
&res.langPredUpperBound);
res.nofWords =
mergeVocabulary(_onDiskBase, numFiles, &res.langPredLowerBound,
&res.langPredUpperBound, _vocab.getCaseComparator());
res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);
LOG(INFO) << "Finished Merging Vocabulary.\n";

// if we had to create the additional vocabulary, wait for its completion
if (tmpVocFut.valid()) {
tmpVocFut.get();
LOG(INFO) << "Finished merging additional Vocabulary.";
}

for (size_t i = 0; i < numFiles; ++i) {
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(i);
deleteTemporaryFile(partialFilename);
}
res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);
LOG(INFO) << "Pass done.\n";

return res;
}
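The extra vocabulary written under TMP_BASENAME_COMPRESSION exists because the query vocabulary is now ordered case-insensitively, while calculatePrefixes is fed a byte-wise ("normally") sorted copy, presumably because adjacent entries then share longer byte-level prefixes. A small stand-alone illustration of that intuition (toy ASCII comparator, not QLever code):

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> words = {"<http://x/Berlin>", "<http://x/berlinale>",
                                    "<http://x/Bern>", "<http://x/bernstein>"};

  // Byte-wise order (the copy used for prefix calculation): adjacent entries
  // share long byte prefixes such as "<http://x/Ber".
  auto byteOrder = words;
  std::sort(byteOrder.begin(), byteOrder.end());

  // Case-insensitive primary order (what the index itself uses when
  // "ignore-case" is set): upper- and lowercase spellings interleave, so
  // adjacent entries share shorter byte-level prefixes.
  auto ciOrder = words;
  std::sort(ciOrder.begin(), ciOrder.end(),
            [](const std::string& a, const std::string& b) {
              return std::lexicographical_compare(
                  a.begin(), a.end(), b.begin(), b.end(),
                  [](unsigned char x, unsigned char y) {
                    return std::tolower(x) < std::tolower(y);
                  });
            });

  for (const auto& w : byteOrder) std::cout << w << '\n';
  std::cout << "---\n";
  for (const auto& w : ciOrder) std::cout << w << '\n';
}
```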

@@ -1269,6 +1277,10 @@ void Index::readConfiguration() {
_configurationJson["prefixes-external"]);
}

if (_configurationJson.count("ignore-case")) {
_vocab.setCaseInsensitiveOrdering(_configurationJson["ignore-case"]);
}

if (_configurationJson.find("languages-internal") !=
_configurationJson.end()) {
_vocab.initializeInternalizedLangs(
@@ -1311,6 +1323,11 @@ void Index::initializeVocabularySettingsBuild() {
_configurationJson["prefixes-external"] = j["prefixes-external"];
}

if (j.count("ignore-case")) {
_vocab.setCaseInsensitiveOrdering(j["ignore-case"]);
_configurationJson["ignore-case"] = j["ignore-case"];
}

if (j.find("languages-internal") != j.end()) {
_vocab.initializeInternalizedLangs(j["languages-internal"]);
_configurationJson["languages-internal"] = j["languages-internal"];
@@ -1329,3 +1346,45 @@ Id Index::assignNextId(Map* mapPtr, const string& key) {
return map[key];
}
}

// ___________________________________________________________________________
void Index::writeNextPartialVocabulary(
size_t numLines, size_t numFiles, size_t actualCurrentPartialSize,
const ad_utility::HashMap<string, Id>& items) {
LOG(INFO) << "Lines (from KB-file) processed: " << numLines << '\n';
LOG(INFO) << "Actual number of Triples in this section (include "
"langfilter triples): "
<< actualCurrentPartialSize << '\n';
std::future<void> fut1, fut2;
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabulary to " << partialFilename << std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
fut1 = std::async([this, &items, partialFilename]() {
writePartialIdMapToBinaryFileForMerging(items, partialFilename,
_vocab.getCaseComparator());
});

if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
// we also have to create the "ordinary" vocabulary order to make the
// prefix compression work
string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
PARTIAL_VOCAB_FILE_NAME +
std::to_string(numFiles);
LOG(INFO) << "writing partial temporary vocabulary to "
<< partialTmpFilename << std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
fut2 = std::async([&items, partialTmpFilename]() {
writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
StringSortComparator(false));
});
}
if (fut1.valid()) {
fut1.get();
}
if (fut2.valid()) {
fut2.get();
}
LOG(INFO) << "Done.";
}
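The new option travels through the settings JSON read by initializeVocabularySettingsBuild and is persisted into _configurationJson so that readConfiguration can restore it when the index is loaded. A settings file enabling the new ordering could look roughly like this; only "ignore-case" is the new key, and the other entry (and its value) is merely an illustrative example of an option the same function already handles:

```json
{
  "ignore-case": true,
  "languages-internal": ["en"]
}
```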
18 changes: 18 additions & 0 deletions src/index/Index.h
@@ -494,6 +494,24 @@ class Index {
VocabularyData passFileForVocabulary(const string& ntFile,
size_t linesPerPartial = 100000000);

/**
* @brief Everything that has to be done when we have seen all the triples
* that belong to one partial vocabulary, including the log output used inside
* passFileForVocabulary
*
* @param numLines How many lines from the KB file have already been parsed
* (only used for logging)
* @param numFiles How many partial vocabularies have been written before, i.e.
* the index of the partial vocabulary we are about to write
* @param actualCurrentPartialSize How many triples belong to this partition
* (including extra langfilter triples)
* @param items Contains our unsorted vocabulary. Maps words to their local
* ids within this vocabulary.
*/
void writeNextPartialVocabulary(size_t numLines, size_t numFiles,
size_t actualCurrentPartialSize,
const ad_utility::HashMap<string, Id>& items);

void convertPartialToGlobalIds(TripleVec& data,
const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial);
2 changes: 2 additions & 0 deletions src/index/IndexBuilderMain.cpp
@@ -207,6 +207,8 @@ int main(int argc, char** argv) {
cout << endl
<< "! ERROR in processing options (getopt returned '" << c
<< "' = 0x" << std::setbase(16) << c << ")" << endl
<< "Corresponding ascii option : -" << std::string(1, c) << endl
<< "This is either an unsupported option or there was an error"
<< endl;
exit(1);
}