First compress the vocabulary, then build the permutations (#541)
Also improved various log messages along the way.
joka921 committed Jan 13, 2022
1 parent 2e43053 commit 634d84a
Showing 6 changed files with 72 additions and 48 deletions.
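
In outline, the change moves vocabulary compression ahead of permutation building, so the compressed vocabulary and the configuration are already on disk before the long-running permutation phase starts. A hedged sketch of the reordered control flow, with the real steps stubbed out (names abbreviated from the diff below, not the actual signatures):

#include <string>

// Sketch only: each stub stands for the corresponding step in Index.cpp.
namespace sketch {
void compressAndRenameVocabulary() { /* prefixCompressFile + std::rename */ }
void writeConfiguration() { /* persist the configuration JSON */ }
void createPermutationPair(const std::string& /* pair */) { /* PSO/POS, ... */ }

void createFromFile() {
  // Before #541: build all permutations first, compress the vocabulary last.
  // After #541: compress first, so a failure while building permutations can
  // no longer leave the index without its final vocabulary and configuration.
  compressAndRenameVocabulary();
  writeConfiguration();  // written early, available if a permutation fails
  for (const char* pair : {"PSO/POS", "SPO/SOP", "OSP/OPS"}) {
    createPermutationPair(pair);
  }
  writeConfiguration();  // again, in case the permutations added information
}
}  // namespace sketch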
2 changes: 1 addition & 1 deletion src/TurtleParserMain.cpp
@@ -92,7 +92,7 @@ void writeNTDispatch(std::ostream& out, const string& fileFormat,
if (regexEngine == "re2") {
writeNT<Tokenizer>(out, fileFormat, filename);
} else if (regexEngine == "ctre") {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
writeNT<TokenizerCtre>(out, fileFormat, filename);
} else {
LOG(ERROR)
75 changes: 43 additions & 32 deletions src/index/Index.cpp
@@ -84,37 +84,31 @@ void Index::createFromFile(const string& filename) {
vocabData = createIdTriplesAndVocab<Parser>(filename);
}

// also perform unique for first permutation
createPermutationPair<IndexMetaDataHmapDispatcher>(&vocabData, _PSO, _POS,
true);
// also create Patterns after the Spo permutation if specified
createPermutationPair<IndexMetaDataMmapDispatcher>(&vocabData, _SPO, _SOP,
false, _usePatterns);
createPermutationPair<IndexMetaDataMmapDispatcher>(&vocabData, _OSP, _OPS);

// if we have no compression, this will also copy the whole vocabulary.
// If we have no compression, this will also copy the whole vocabulary.
// but since we expect compression to be the default case, this should not
// hurt
// hurt.
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
LOG(INFO) << "Finished writing permutations" << std::endl;
if (_vocabPrefixCompressed) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
// We have to use the "normally" sorted vocabulary for the prefix
// compression.
std::string vocabFileForPrefixCalculation =
_onDiskBase + TMP_BASENAME_COMPRESSION + ".vocabulary";
prefixes = calculatePrefixes(vocabFileForPrefixCalculation,
NUM_COMPRESSION_PREFIXES, 1, true);
deleteTemporaryFile(vocabFileForPrefixCalculation);
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << std::endl;
}
}
_configurationJson["prefixes"] = _vocabPrefixCompressed;
LOG(INFO) << "Writing compressed vocabulary to disk" << std::endl;
Vocabulary<CompressedString, TripleComponentComparator>::prefixCompressFile(
vocabFile, vocabFileTmp, prefixes);
LOG(INFO) << "Finished writing compressed vocabulary" << std::endl;

// TODO<joka921> maybe move this to its own function
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
@@ -123,10 +117,27 @@ void Index::createFromFile(const string& filename) {
<< ". Terminating..." << std::endl;
AD_CHECK(false);
}

// Write the configuration already at this point, so we have it available in
// case any of the permutations fail.
writeConfiguration();

// For the first permutation, perform a unique.
createPermutationPair<IndexMetaDataHmapDispatcher>(&vocabData, _PSO, _POS,
PerformUnique::True);
// After the SPO permutation, create patterns if so desired.
createPermutationPair<IndexMetaDataMmapDispatcher>(
&vocabData, _SPO, _SOP, PerformUnique::False, _usePatterns);
createPermutationPair<IndexMetaDataMmapDispatcher>(&vocabData, _OSP, _OPS);
LOG(INFO) << "Finished writing permutations" << std::endl;

// Dump the configuration again in case the permutations have added some
// information.
writeConfiguration();
LOG(INFO) << "Index build completed" << std::endl;
}

// explicit instantiations
// Explicit instantiations.
template void Index::createFromFile<TsvParser>(const string& filename);
template void Index::createFromFile<TurtleStreamParser<Tokenizer>>(
const string& filename);
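
prefixCompressFile itself is not part of this diff; as a rough illustration of what the call in the hunk above does, each selected prefix gets a short code and every word is stored as that code plus the remaining suffix. A minimal sketch, assuming one reserved code byte per prefix and a prefix list sorted by decreasing length (all names here are hypothetical):

#include <string>
#include <string_view>
#include <vector>

// Hypothetical sketch of per-word prefix compression. Assumes `prefixes` is
// sorted by decreasing length so the first match is the longest one, and
// that bytes >= 0x80 do not occur in the raw vocabulary data.
std::string compressWord(std::string_view word,
                         const std::vector<std::string>& prefixes) {
  for (size_t i = 0; i < prefixes.size(); ++i) {
    const std::string& prefix = prefixes[i];
    if (word.substr(0, prefix.size()) == prefix) {
      return std::string(1, static_cast<char>(0x80 + i)) +
             std::string(word.substr(prefix.size()));
    }
  }
  return std::string(word);  // no prefix matched: store the word as-is
}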
@@ -248,13 +259,14 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
LOG(INFO) << "Pass done." << endl;

if (_vocabPrefixCompressed) {
LOG(INFO) << "Merging temporary vocabulary for prefix compression";
LOG(INFO) << "Merging temporary vocabulary for prefix compression"
<< std::endl;
{
VocabularyMerger m;
m._ignoreExternalVocabulary = true;
m._noIdMapsAndIgnoreExternalVocab = true;
m.mergeVocabulary(_onDiskBase + TMP_BASENAME_COMPRESSION, numFiles,
std::less<>());
LOG(INFO) << "Finished merging additional Vocabulary.";
LOG(INFO) << "Finished merging additional vocabulary" << std::endl;
}
}

@@ -268,7 +280,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,

return v.mergeVocabulary(_onDiskBase, numFiles, sortPred);
}();
LOG(INFO) << "Finished Merging Vocabulary.\n";
LOG(INFO) << "Finished merging vocabulary\n";
VocabularyData res;
res.nofWords = mergeRes._numWordsTotal;
res.langPredLowerBound = mergeRes._langPredLowerBound;
@@ -301,10 +313,6 @@ void Index::convertPartialToGlobalIds(
// iterate over all partial vocabularies
for (size_t partialNum = 0; partialNum < actualLinesPerPartial.size();
partialNum++) {
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Corresponding number of statements in original knowledgeBase:"
<< linesPerPartial * partialNum << '\n';

std::string mmapFilename(_onDiskBase + PARTIAL_MMAP_IDS +
std::to_string(partialNum));
LOG(INFO) << "Reading IdMap from " << mmapFilename << " ...\n";
Expand Down Expand Up @@ -337,9 +345,12 @@ void Index::convertPartialToGlobalIds(
LOG(INFO) << "Lines processed: " << i << '\n';
}
}
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(DEBUG)
<< "Corresponding number of statements in original knowledge base: "
<< linesPerPartial * (partialNum + 1) << '\n';
}
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Pass done.\n";
LOG(INFO) << "Pass done\n";
}
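
For context on the function above: convertPartialToGlobalIds rewrites the locally assigned IDs in every triple to the IDs from the merged global vocabulary. A small in-memory analogue of that core step, with a plain std::unordered_map standing in for the memory-mapped IdMap (an assumption for the sketch):

#include <array>
#include <cstdint>
#include <unordered_map>
#include <vector>

using Id = uint64_t;
using IdMap = std::unordered_map<Id, Id>;  // stand-in for the mmap-based map

// Rewrite each component of every triple from its partial-vocabulary ID to
// the ID it received in the merged global vocabulary.
void convertTriples(std::vector<std::array<Id, 3>>& triples,
                    const IdMap& localToGlobal) {
  for (auto& triple : triples) {
    for (Id& id : triple) {
      id = localToGlobal.at(id);  // throws if a local ID is unmapped
    }
  }
}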

// _____________________________________________________________________________
@@ -473,13 +484,13 @@ Index::createPermutations(
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique) {
LOG(INFO) << "Sorting for " << p1._readableName << " permutation..."
PerformUnique performUnique) {
LOG(INFO) << "Sorting for " << p1._readableName << " permutation"
<< std::endl;
stxxl::sort(begin(*vec), end(*vec), p1._comp, STXXL_MEMORY_TO_USE);
LOG(INFO) << "Sort done." << std::endl;

if (performUnique) {
if (performUnique == PerformUnique::True) {
// this only has to be done for the first permutation (PSO)
LOG(INFO) << "Removing duplicate triples as these are not supported in RDF"
<< std::endl;
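
The duplicate removal itself is collapsed in this view; after the sort, equal triples are adjacent, so it boils down to std::unique plus erase. A small in-memory analogue (the real code operates on an stxxl::vector sorted by stxxl::sort):

#include <algorithm>
#include <array>
#include <cstdint>
#include <vector>

using Id = uint64_t;

// After sorting, equal triples are adjacent, so one std::unique pass removes
// the duplicates that RDF does not allow.
void sortAndUnique(std::vector<std::array<Id, 3>>& triples) {
  std::sort(triples.begin(), triples.end());
  triples.erase(std::unique(triples.begin(), triples.end()), triples.end());
}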
@@ -499,18 +510,18 @@ Index::createPermutations(
// ________________________________________________________________________
template <class MetaDataDispatcher, class Comparator1, class Comparator2>
void Index::createPermutationPair(
VocabularyData* vocabData,
VocabularyData* vocabularyData,
const PermutationImpl<Comparator1, typename MetaDataDispatcher::ReadType>&
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique, bool createPatternsAfterFirst) {
PerformUnique performUnique, bool createPatternsAfterFirst) {
auto metaData = createPermutations<MetaDataDispatcher>(
&(*vocabData->idTriples), p1, p2, performUnique);
&(*vocabularyData->idTriples), p1, p2, performUnique);
if (createPatternsAfterFirst) {
// the second permutation does not alter the original triple vector,
// so this does still work.
createPatterns(true, vocabData);
createPatterns(true, vocabularyData);
}
if (metaData) {
LOG(INFO) << "Exchanging Multiplicities for " << p1._readableName << " and "
@@ -1333,7 +1344,7 @@ void Index::initializeVocabularySettingsBuild() {
if constexpr (std::is_same_v<std::decay_t<Parser>, TurtleParserAuto>) {
bool v{j["ascii-prefixes-only"]};
if (v) {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
_onlyAsciiTurtlePrefixes = true;
} else {
_onlyAsciiTurtlePrefixes = false;
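
The bool v{j["ascii-prefixes-only"]} read above pulls a flag out of the build-settings JSON; the brace-initialization syntax matches nlohmann::json, which is an assumption in the following minimal analogue:

#include <iostream>
#include <nlohmann/json.hpp>

int main() {
  // Assumed shape of the index-build settings file.
  auto j = nlohmann::json::parse(R"({"ascii-prefixes-only": true})");
  bool onlyAsciiPrefixes{j["ascii-prefixes-only"]};
  std::cout << std::boolalpha << onlyAsciiPrefixes << std::endl;
}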
9 changes: 6 additions & 3 deletions src/index/Index.h
@@ -521,14 +521,17 @@ class Index {
// createPatternsAfterFirst is only valid when the pair is SPO-SOP because
// the SPO permutation is also needed for patterns (see usage in
// Index::createFromFile function)

enum class PerformUnique { True, False };
template <class MetaDataDispatcher, class Comparator1, class Comparator2>
void createPermutationPair(
VocabularyData* vec,
VocabularyData* vocabularyData,
const PermutationImpl<Comparator1, typename MetaDataDispatcher::ReadType>&
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique = false, bool createPatternsAfterFirst = false);
PerformUnique performUnique = PerformUnique::False,
bool createPatternsAfterFirst = false);

// The pairs of permutations are PSO-POS, OSP-OPS and SPO-SOP
// the multiplicity of column 1 in partner 1 of the pair is equal to the
@@ -556,7 +559,7 @@ class Index {
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique);
PerformUnique performUnique);

/**
* @brief Creates the data required for the "pattern-trick" used for fast
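
The motivation for replacing the bool parameter with PerformUnique: at a call site, a bare true or false says nothing, and any integral value converts to bool silently. A toy illustration of the difference (not code from this repository):

#include <cstdio>

enum class PerformUnique { True, False };

void createPermutationPairSketch(PerformUnique performUnique) {
  std::puts(performUnique == PerformUnique::True
                ? "removing duplicate triples"
                : "keeping triples as-is");
}

int main() {
  // With a bool parameter this call would read
  // createPermutationPairSketch(true) and the meaning would be invisible;
  // the enum names the intent, and a stray integer no longer compiles.
  createPermutationPairSketch(PerformUnique::True);
}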
14 changes: 7 additions & 7 deletions src/index/PrefixHeuristic.cpp
@@ -182,7 +182,7 @@ std::vector<string> calculatePrefixes(const string& filename,
size_t totalSavings = 0;
size_t numWords = 0;

LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
LOG(INFO) << "Start reading words and building prefix tree" << std::endl;
// insert all prefix candidates into the tree
while (std::getline(ifs, nextWord)) {
nextWord = RdfEscaping::unescapeNewlinesAndBackslashes(nextWord);
@@ -205,7 +205,7 @@ std::vector<string> calculatePrefixes(const string& filename,
}
}

LOG(INFO) << "Finished building prefix tree!" << std::endl;
LOG(INFO) << "Finished building prefix tree" << std::endl;
LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
std::vector<string> res;
res.reserve(numPrefixes);
@@ -215,8 +215,8 @@ std::vector<string> calculatePrefixes(const string& filename,
break;
}
totalSavings += p.first;
LOG(INFO) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << std::endl;
LOG(DEBUG) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << std::endl;
res.push_back(std::move(p.second));
}
// if we always add an encoding we have calculated with a codelength of 0 so
@@ -225,9 +225,9 @@ std::vector<string> calculatePrefixes(const string& filename,
totalSavings -= codelength * numWords;
}
double efficiency = static_cast<double>(totalSavings) / totalChars;
std::cout << "total number of bytes : " << totalChars << std::endl;
std::cout << "total chars compressed : " << totalSavings << '\n';
std::cout << "percentage of chars compressed : " << efficiency << std::endl;
LOG(INFO) << "Total number of bytes : " << totalChars << std::endl;
LOG(INFO) << "Total chars compressed : " << totalSavings << '\n';
LOG(INFO) << "Percentage of chars compressed : " << efficiency << std::endl;
return res;
}
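
For reference, the efficiency figure logged above is simply saved characters over total characters. A toy illustration of the bookkeeping, ignoring the prefix tree and using made-up words:

#include <cstdio>
#include <string>
#include <string_view>
#include <vector>

// Toy version of the savings bookkeeping: each word that starts with a
// chosen prefix saves (prefix length - 1) characters, since one byte
// remains as the prefix code.
int main() {
  const std::vector<std::string> words = {"<http://example.org/a>",
                                          "<http://example.org/b>", "\"x\""};
  const std::string_view prefix = "<http://example.org/";
  size_t totalChars = 0, totalSavings = 0;
  for (const std::string& w : words) {
    totalChars += w.size();
    if (w.compare(0, prefix.size(), prefix) == 0) {
      totalSavings += prefix.size() - 1;
    }
  }
  std::printf("Percentage of chars compressed: %f\n",
              static_cast<double>(totalSavings) / totalChars);
}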

5 changes: 4 additions & 1 deletion src/index/VocabularyGenerator.h
@@ -28,7 +28,10 @@ using TripleVec = stxxl::vector<array<Id, 3>>;
*/
class VocabularyMerger {
public:
bool _ignoreExternalVocabulary = false;
// If this is set, then we will only output the internal vocabulary.
// This is useful for the prefix compression, where we don't need the
// external part of the vocabulary and the mapping from local to global IDs.
bool _noIdMapsAndIgnoreExternalVocab = false;
// result of a call to mergeVocabulary
struct VocMergeRes {
size_t _numWordsTotal; // that many distinct words were found (size of the
Expand Down
15 changes: 11 additions & 4 deletions src/index/VocabularyGeneratorImpl.h
@@ -40,8 +40,10 @@ VocabularyMerger::VocMergeRes VocabularyMerger::mergeVocabulary(

_outfile.open(basename + ".vocabulary");
AD_CHECK(_outfile.is_open());
_outfileExternal.open(basename + EXTERNAL_LITS_TEXT_FILE_NAME);
AD_CHECK(_outfileExternal.is_open());
if (!_noIdMapsAndIgnoreExternalVocab) {
_outfileExternal.open(basename + EXTERNAL_LITS_TEXT_FILE_NAME);
AD_CHECK(_outfileExternal.is_open());
}
std::vector<bool> endOfFile(numFiles, false);

// Priority queue for the k-way merge
@@ -52,7 +54,9 @@ VocabularyMerger::VocMergeRes VocabularyMerger::mergeVocabulary(
for (size_t i = 0; i < numFiles; i++) {
infiles.emplace_back(basename + PARTIAL_VOCAB_FILE_NAME +
std::to_string(i));
_idVecs.emplace_back(0, basename + PARTIAL_MMAP_IDS + std::to_string(i));
if (!_noIdMapsAndIgnoreExternalVocab) {
_idVecs.emplace_back(0, basename + PARTIAL_MMAP_IDS + std::to_string(i));
}
AD_CHECK(infiles.back().is_open());

// read the first entry of the vocabulary and add it to the queue
@@ -78,7 +82,7 @@ VocabularyMerger::VocMergeRes VocabularyMerger::mergeVocabulary(
while (!queue.empty()) {
// for the prefix compression vocabulary, we don't need the external
// vocabulary
if (_ignoreExternalVocabulary &&
if (_noIdMapsAndIgnoreExternalVocab &&
queue.top()._value >= EXTERNALIZED_LITERALS_PREFIX) {
break;
}
@@ -245,6 +249,9 @@ void VocabularyMerger::writeQueueWordsToIdVec(
// ____________________________________________________________________________________________________________
void VocabularyMerger::doActualWrite(
const std::vector<std::pair<size_t, std::pair<size_t, size_t>>>& buffer) {
if (_noIdMapsAndIgnoreExternalVocab) {
return;
}
for (const auto& [id, value] : buffer) {
_idVecs[id].push_back(value);
}
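
The merge loop further up (while (!queue.empty()) ...) is a classic k-way merge: a priority queue always yields the smallest current word across all partial vocabularies, and equal words arriving from different files are written only once. A compact self-contained analogue on in-memory word lists:

#include <functional>
#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

// Minimal analogue of the k-way merge in mergeVocabulary: repeatedly take
// the lexicographically smallest head among all partial vocabularies.
int main() {
  std::vector<std::vector<std::string>> partials = {
      {"a", "c"}, {"b", "c", "d"}, {"a", "e"}};
  using Entry = std::pair<std::string, size_t>;  // (word, source index)
  std::priority_queue<Entry, std::vector<Entry>, std::greater<>> queue;
  std::vector<size_t> pos(partials.size(), 0);
  for (size_t i = 0; i < partials.size(); ++i) {
    if (!partials[i].empty()) queue.emplace(partials[i][0], i);
  }
  std::string lastWritten;
  while (!queue.empty()) {
    auto [word, src] = queue.top();
    queue.pop();
    if (word != lastWritten) {  // equal words from different files: once
      std::cout << word << '\n';
      lastWritten = word;
    }
    if (++pos[src] < partials[src].size()) {
      queue.emplace(partials[src][pos[src]], src);
    }
  }
}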
