Merge pull request #302 from joka921/f.pipelinedIndexBuild
Speeding up the first phase of Index Building
niklas88 committed Jan 30, 2020
2 parents 5e70bed + 3783f38 commit 335deb2
Showing 38 changed files with 2,000 additions and 377 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "third_party/re2"]
path = third_party/re2
url = https://github.com/google/re2.git
[submodule "third_party/abseil-cpp"]
path = third_party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -65,6 +65,13 @@ include_directories(third_party/googletest/googletest/include)
# Header only, nothing to include
include_directories(third_party/json/)

################################
# ABSEIL
################################
set(BUILD_TESTING OFF CACHE BOOL "Don't build tests for abseil" FORCE)
add_subdirectory(third_party/abseil-cpp)
include_directories(third_party/abseil-cpp/)

if (USE_PARALLEL)
include(FindOpenMP)
if(OPENMP_FOUND)
@@ -163,7 +170,7 @@ add_executable(PrefixHeuristicEvaluatorMain src/PrefixHeuristicEvaluatorMain.cpp
target_link_libraries (PrefixHeuristicEvaluatorMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(TurtleParserMain src/TurtleParserMain.cpp)
target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT} absl::flat_hash_map)

add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})
4 changes: 4 additions & 0 deletions e2e/e2e-build-settings.json
@@ -0,0 +1,4 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000
}
1 change: 1 addition & 0 deletions e2e/e2e.sh
@@ -61,6 +61,7 @@ if [ "$1" != "no-index" ]; then
./IndexBuilderMain -l -i "$INDEX" \
-F ttl \
-f "$INPUT.nt" \
-s "$PROJECT_DIR/e2e/e2e-build-settings.json" \
-w "$INPUT.wordsfile.tsv" \
-d "$INPUT.docsfile.tsv" || bail "Building Index failed"
popd
32 changes: 32 additions & 0 deletions misc/check_binary_index_equality.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# Compare two indices for binary equality. Can be used for regression testing
# of the index-building procedure.

if [ "$#" -ne 2 ]
then
  echo "Usage: $0 <indexPrefix1> <indexPrefix2>"
  echo "Passed $# command line arguments"
  exit 1
fi

error=0
for i in .index.pso .index.pos \
  .index.spo .index.spo.meta-mmap .index.sop .index.sop.meta-mmap \
  .index.osp .index.osp.meta-mmap .index.ops .index.ops.meta-mmap \
  .index.patterns .prefixes .vocabulary .meta-data.json .literals-index
do
  f1="$1$i"
  f2="$2$i"
  echo "Comparing $f1 and $f2"
  if cmp "$f1" "$f2"
  then
    echo "$f1 and $f2 match, continuing"
  else
    echo "Error, $f1 and $f2 are not equal"
    error=1
  fi
done

[ "$error" -eq 0 ] || { echo "Indices with prefixes $1 and $2 differ"; exit 1; }
echo "Indices with prefixes $1 and $2 are binary equal"
10 changes: 4 additions & 6 deletions src/engine/QueryExecutionTree.cpp
@@ -164,13 +164,11 @@ size_t QueryExecutionTree::getSizeEstimate() {
if (_sizeEstimate == std::numeric_limits<size_t>::max()) {
if (_cachedResult && _cachedResult->status() == ResultTable::FINISHED) {
_sizeEstimate = _cachedResult->size();
} else if (_qec) {
_sizeEstimate = _rootOperation->getSizeEstimate();
} else {
// For test cases without index only:
// Make it deterministic by using the asString.
_sizeEstimate =
1000 + std::hash<string>{}(_rootOperation->asString()) % 1000;
// If we are in a unit test setting and there is no QueryExecutionContext
// specified, it is the _rootOperation's obligation to handle this case
// correctly.
_sizeEstimate = _rootOperation->getSizeEstimate();
}
}
return _sizeEstimate;
73 changes: 39 additions & 34 deletions src/engine/QueryPlanner.cpp
@@ -89,17 +89,8 @@ QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq) {
vector<SubtreePlan>& lastRow = plans.back();

AD_CHECK_GT(lastRow.size(), 0);
auto minInd = findCheapestExecutionTree(lastRow);

size_t minCost = lastRow[0].getCostEstimate();
size_t minInd = 0;

for (size_t i = 1; i < lastRow.size(); ++i) {
size_t thisCost = lastRow[i].getCostEstimate();
if (thisCost < minCost) {
minCost = thisCost;
minInd = i;
}
}
lastRow[minInd]._isOptional = pq._rootGraphPattern->_optional;

SubtreePlan final = lastRow[minInd];
@@ -320,16 +311,7 @@ std::vector<QueryPlanner::SubtreePlan> QueryPlanner::optimize(
if (pattern == rootPattern) {
return lastRow;
} else {
AD_CHECK_GT(lastRow.size(), 0);
size_t minCost = lastRow[0].getCostEstimate();
size_t minInd = 0;
for (size_t i = 1; i < lastRow.size(); ++i) {
size_t thisCost = lastRow[i].getCostEstimate();
if (thisCost < minCost) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
}
auto minInd = findCheapestExecutionTree(lastRow);
lastRow[minInd]._isOptional = pattern->_optional;
patternPlans[pattern->_id] = lastRow[minInd];
}
@@ -2114,21 +2096,12 @@ vector<QueryPlanner::SubtreePlan> QueryPlanner::merge(
// as key.
LOG(TRACE) << "Pruning...\n";
vector<SubtreePlan> prunedPlans;
size_t nofCandidates = 0;
for (auto it = candidates.begin(); it != candidates.end(); ++it) {
size_t minCost = std::numeric_limits<size_t>::max();
size_t minIndex = 0;
for (size_t i = 0; i < it->second.size(); ++i) {
++nofCandidates;
if (it->second[i].getCostEstimate() < minCost) {
minCost = it->second[i].getCostEstimate();
minIndex = i;
}
}
prunedPlans.push_back(it->second[minIndex]);
for (const auto& [key, value] : candidates) {
(void)key; // silence unused warning
size_t minIndex = findCheapestExecutionTree(value);
prunedPlans.push_back(value[minIndex]);
}
LOG(TRACE) << "Got " << prunedPlans.size() << " pruned plans from "
<< nofCandidates << " candidates.\n";
LOG(TRACE) << "Got " << prunedPlans.size() << " pruned plans from \n";
return prunedPlans;
}

@@ -2904,3 +2877,35 @@ QueryPlanner::createVariableColumnsMapForTextOperation(
void QueryPlanner::setEnablePatternTrick(bool enablePatternTrick) {
_enablePatternTrick = enablePatternTrick;
}

// _________________________________________________________________________________
size_t QueryPlanner::findCheapestExecutionTree(
const std::vector<SubtreePlan>& lastRow) const {
AD_CHECK_GT(lastRow.size(), 0);
size_t minCost = std::numeric_limits<size_t>::max();
size_t minInd = 0;
LOG(TRACE) << "\nFinding the cheapest row in the optimizer\n";
for (size_t i = 0; i < lastRow.size(); ++i) {
[[maybe_unused]] auto repr = lastRow[i]._qet->asString();
std::transform(repr.begin(), repr.end(), repr.begin(),
[](char c) { return c == '\n' ? ' ' : c; });

size_t thisSize = lastRow[i].getSizeEstimate();
size_t thisCost = lastRow[i].getCostEstimate();
LOG(TRACE) << "Estimated cost and size of " << thisCost << " " << thisSize
<< " for Tree " << repr << '\n';
if (thisCost < minCost) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
// make the tiebreaking deterministic for the UnitTests. The asString
// should never be on a hot code path in practice.
else if (thisCost == minCost && isInTestMode() &&
lastRow[i]._qet->asString() < lastRow[minInd]._qet->asString()) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
}
LOG(TRACE) << "Finished\n";
return minInd;
};
14 changes: 14 additions & 0 deletions src/engine/QueryPlanner.h
@@ -323,4 +323,18 @@ class QueryPlanner {
*/
bool checkUsePatternTrick(ParsedQuery* pq,
SparqlTriple* patternTrickTriple) const;

/**
* @brief Return the index of the cheapest execution tree in the argument.
*
* If we are in unit test mode, ties between equally cheap trees are broken
* deterministically by additionally comparing their cache keys (asString);
* otherwise the index of the first tree with minimal cost is returned.
*/
size_t findCheapestExecutionTree(
const std::vector<SubtreePlan>& lastRow) const;

/// If this planner is not associated with a QueryExecutionContext, we can
/// only be in unit test mode.
[[nodiscard]] bool isInTestMode() const { return _qec == nullptr; }
};
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
@@ -14,7 +14,7 @@ add_library(index
FTSAlgorithms.cpp FTSAlgorithms.h
PrefixHeuristic.cpp PrefixHeuristic.h)

target_link_libraries(index parser ${STXXL_LIBRARIES} ${ICU_LIBRARIES})
target_link_libraries(index parser ${STXXL_LIBRARIES} ${ICU_LIBRARIES} absl::flat_hash_map)

add_library(metaConverter
MetaDataConverter.cpp MetaDataConverter.h)
7 changes: 7 additions & 0 deletions src/index/ConstantsIndexCreation.h
@@ -53,3 +53,10 @@ static const std::string PARTIAL_MMAP_IDS = ".partial-ids-mmap";

// ________________________________________________________________
static const std::string TMP_BASENAME_COMPRESSION = ".tmp.compression_index";

// _________________________________________________________________
// The degree of parallelism that is used for the index-building step in which
// the unique elements of the vocabulary are identified via hash maps.
// Typically, 4 is a good value. On systems with very few CPUs, a lower value
// might be beneficial.
constexpr size_t NUM_PARALLEL_ITEM_MAPS = 4;
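
To illustrate where this constant matters, the following is a minimal sketch of deduplicating vocabulary items with one hash map per partition, so that NUM_PARALLEL_ITEM_MAPS threads can work on disjoint partitions without locking. It is not the actual QLever pipeline; the function name buildPartialMaps and the hash-based partitioning are illustrative assumptions.

#include <array>
#include <cstddef>
#include <functional>
#include <string>
#include <thread>
#include <vector>

#include "absl/container/flat_hash_map.h"

constexpr size_t NUM_PARALLEL_ITEM_MAPS = 4;

// Illustrative sketch: deduplicate vocabulary items using one hash map per
// partition. Each thread owns exactly one map, so no synchronization is needed.
std::array<absl::flat_hash_map<std::string, size_t>, NUM_PARALLEL_ITEM_MAPS>
buildPartialMaps(const std::vector<std::string>& items) {
  std::array<absl::flat_hash_map<std::string, size_t>, NUM_PARALLEL_ITEM_MAPS>
      maps;
  std::vector<std::thread> threads;
  for (size_t t = 0; t < NUM_PARALLEL_ITEM_MAPS; ++t) {
    threads.emplace_back([t, &maps, &items] {
      for (const auto& item : items) {
        // A thread only touches items that hash into its own partition.
        if (std::hash<std::string>{}(item) % NUM_PARALLEL_ITEM_MAPS == t) {
          // The map's current size serves as a preliminary local ID.
          maps[t].emplace(item, maps[t].size());
        }
      }
    });
  }
  for (auto& thread : threads) thread.join();
  return maps;
}

In QLever itself the partial results are presumably written out and merged in a later step; the sketch only shows how such a constant bounds the number of hash maps that are filled concurrently.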
14 changes: 4 additions & 10 deletions src/index/FTSAlgorithms.cpp
@@ -517,9 +517,7 @@ void FTSAlgorithms::multVarsAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
Id currentCid = cids[0];
Score cscore = scores[0];
@@ -645,7 +643,7 @@ void FTSAlgorithms::multVarsAggScoresAndTakeTopContext(
IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
Id currentCid = cids[0];
Score cscore = scores[0];
@@ -1027,9 +1025,7 @@ void FTSAlgorithms::multVarsFilterAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
vector<Id> filteredEntitiesInContext;
Id currentCid = cids[0];
@@ -1201,9 +1197,7 @@ void FTSAlgorithms::multVarsFilterAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
vector<Id> filteredEntitiesInContext;
Id currentCid = cids[0];
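
These hunks drop the designated empty and deleted sentinel keys that Google dense-hash-map style containers require before use; together with the new abseil submodule and the absl::flat_hash_map link targets above, this suggests that ad_utility::HashMap is now backed by absl::flat_hash_map. A minimal, self-contained illustration of the difference, assuming only that the abseil headers are on the include path:

#include <string>

#include "absl/container/flat_hash_map.h"

int main() {
  // A default-constructed absl::flat_hash_map is immediately usable; no
  // set_empty_key()/set_deleted_key() calls or sentinel constructor arguments
  // are needed, which is why the emptyKey/deletedKey lines above were dropped.
  absl::flat_hash_map<std::string, int> counts;
  counts["qlever"] += 1;
  counts.erase("qlever");  // erasing works without a deleted-key sentinel
  return static_cast<int>(counts.size());
}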
