Merge pull request #302 from joka921/f.pipelinedIndexBuild
Speeding up the first phase of Index Building
niklas88 committed Jan 30, 2020
2 parents 5e70bed + 3783f38 commit 335deb2
Showing 38 changed files with 2,000 additions and 377 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "third_party/re2"]
path = third_party/re2
url = https://github.com/google/re2.git
[submodule "third_party/abseil-cpp"]
path = third_party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -65,6 +65,13 @@ include_directories(third_party/googletest/googletest/include)
# Header only, nothing to include
include_directories(third_party/json/)

################################
# ABSEIL
################################
set(BUILD_TESTING OFF CACHE BOOL "Don't build tests for abseil" FORCE)
add_subdirectory(third_party/abseil-cpp)
include_directories(third_party/abseil-cpp/)

if (USE_PARALLEL)
include(FindOpenMP)
if(OPENMP_FOUND)
@@ -163,7 +170,7 @@ add_executable(PrefixHeuristicEvaluatorMain src/PrefixHeuristicEvaluatorMain.cpp
target_link_libraries (PrefixHeuristicEvaluatorMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(TurtleParserMain src/TurtleParserMain.cpp)
target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT} absl::flat_hash_map)

add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})
4 changes: 4 additions & 0 deletions e2e/e2e-build-settings.json
@@ -0,0 +1,4 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000
}
1 change: 1 addition & 0 deletions e2e/e2e.sh
@@ -61,6 +61,7 @@ if [ "$1" != "no-index" ]; then
./IndexBuilderMain -l -i "$INDEX" \
-F ttl \
-f "$INPUT.nt" \
-s "$PROJECT_DIR/e2e/e2e-build-settings.json" \
-w "$INPUT.wordsfile.tsv" \
-d "$INPUT.docsfile.tsv" || bail "Building Index failed"
popd
32 changes: 32 additions & 0 deletions misc/check_binary_index_equality.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# Compare two indices for binary equality. Can be used for regression testing
# of the index-building procedure.

if [ "$#" -ne 2 ]
then
  echo "Usage: $0 <indexPrefix1> <indexPrefix2>"
  echo "Passed $# command line arguments"
  exit 1
fi

error=0
for i in .index.pso .index.pos \
  .index.spo .index.spo.meta-mmap .index.sop .index.sop.meta-mmap \
  .index.osp .index.osp.meta-mmap .index.ops .index.ops.meta-mmap \
  .index.patterns .prefixes .vocabulary .meta-data.json .literals-index
do
  f1="$1$i"
  f2="$2$i"
  echo "Comparing $f1 and $f2"
  if cmp "$f1" "$f2"
  then
    echo "$f1 and $f2 match, continuing"
  else
    echo "Error, $f1 and $f2 are not equal"
    error=1
  fi
done

[ "$error" -eq 0 ] || { echo "Indices with prefixes $1 and $2 differ"; exit 1; }
echo "Indices with prefixes $1 and $2 are binary equal"
10 changes: 4 additions & 6 deletions src/engine/QueryExecutionTree.cpp
@@ -164,13 +164,11 @@ size_t QueryExecutionTree::getSizeEstimate() {
if (_sizeEstimate == std::numeric_limits<size_t>::max()) {
if (_cachedResult && _cachedResult->status() == ResultTable::FINISHED) {
_sizeEstimate = _cachedResult->size();
} else if (_qec) {
_sizeEstimate = _rootOperation->getSizeEstimate();
} else {
// For test cases without index only:
// Make it deterministic by using the asString.
_sizeEstimate =
1000 + std::hash<string>{}(_rootOperation->asString()) % 1000;
// If we are in a unit test setting and there is no QueryExecutionContext
// specified, it is the _rootOperation's obligation to handle this case
// correctly.
_sizeEstimate = _rootOperation->getSizeEstimate();
}
}
return _sizeEstimate;
73 changes: 39 additions & 34 deletions src/engine/QueryPlanner.cpp
@@ -89,17 +89,8 @@ QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq) {
vector<SubtreePlan>& lastRow = plans.back();

AD_CHECK_GT(lastRow.size(), 0);
auto minInd = findCheapestExecutionTree(lastRow);

size_t minCost = lastRow[0].getCostEstimate();
size_t minInd = 0;

for (size_t i = 1; i < lastRow.size(); ++i) {
size_t thisCost = lastRow[i].getCostEstimate();
if (thisCost < minCost) {
minCost = thisCost;
minInd = i;
}
}
lastRow[minInd]._isOptional = pq._rootGraphPattern->_optional;

SubtreePlan final = lastRow[minInd];
@@ -320,16 +311,7 @@ std::vector<QueryPlanner::SubtreePlan> QueryPlanner::optimize(
if (pattern == rootPattern) {
return lastRow;
} else {
AD_CHECK_GT(lastRow.size(), 0);
size_t minCost = lastRow[0].getCostEstimate();
size_t minInd = 0;
for (size_t i = 1; i < lastRow.size(); ++i) {
size_t thisCost = lastRow[i].getCostEstimate();
if (thisCost < minCost) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
}
auto minInd = findCheapestExecutionTree(lastRow);
lastRow[minInd]._isOptional = pattern->_optional;
patternPlans[pattern->_id] = lastRow[minInd];
}
@@ -2114,21 +2096,12 @@ vector<QueryPlanner::SubtreePlan> QueryPlanner::merge(
// as key.
LOG(TRACE) << "Pruning...\n";
vector<SubtreePlan> prunedPlans;
size_t nofCandidates = 0;
for (auto it = candidates.begin(); it != candidates.end(); ++it) {
size_t minCost = std::numeric_limits<size_t>::max();
size_t minIndex = 0;
for (size_t i = 0; i < it->second.size(); ++i) {
++nofCandidates;
if (it->second[i].getCostEstimate() < minCost) {
minCost = it->second[i].getCostEstimate();
minIndex = i;
}
}
prunedPlans.push_back(it->second[minIndex]);
for (const auto& [key, value] : candidates) {
(void)key; // silence unused warning
size_t minIndex = findCheapestExecutionTree(value);
prunedPlans.push_back(value[minIndex]);
}
LOG(TRACE) << "Got " << prunedPlans.size() << " pruned plans from "
<< nofCandidates << " candidates.\n";
LOG(TRACE) << "Got " << prunedPlans.size() << " pruned plans from \n";
return prunedPlans;
}

@@ -2904,3 +2877,35 @@ QueryPlanner::createVariableColumnsMapForTextOperation(
void QueryPlanner::setEnablePatternTrick(bool enablePatternTrick) {
_enablePatternTrick = enablePatternTrick;
}

// _________________________________________________________________________________
size_t QueryPlanner::findCheapestExecutionTree(
const std::vector<SubtreePlan>& lastRow) const {
AD_CHECK_GT(lastRow.size(), 0);
size_t minCost = std::numeric_limits<size_t>::max();
size_t minInd = 0;
LOG(TRACE) << "\nFinding the cheapest row in the optimizer\n";
for (size_t i = 0; i < lastRow.size(); ++i) {
[[maybe_unused]] auto repr = lastRow[i]._qet->asString();
std::transform(repr.begin(), repr.end(), repr.begin(),
[](char c) { return c == '\n' ? ' ' : c; });

size_t thisSize = lastRow[i].getSizeEstimate();
size_t thisCost = lastRow[i].getCostEstimate();
LOG(TRACE) << "Estimated cost and size of " << thisCost << " " << thisSize
<< " for Tree " << repr << '\n';
if (thisCost < minCost) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
// make the tiebreaking deterministic for the UnitTests. The asString
// should never be on a hot code path in practice.
else if (thisCost == minCost && isInTestMode() &&
lastRow[i]._qet->asString() < lastRow[minInd]._qet->asString()) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
}
LOG(TRACE) << "Finished\n";
return minInd;
};
14 changes: 14 additions & 0 deletions src/engine/QueryPlanner.h
@@ -323,4 +323,18 @@ class QueryPlanner {
*/
bool checkUsePatternTrick(ParsedQuery* pq,
SparqlTriple* patternTrickTriple) const;

/**
* @brief Return the index of the cheapest execution tree in the argument.
*
* If we are in unit test mode, ties between equally cheap trees are broken
* deterministically by additionally comparing their cache keys (asString);
* otherwise the index of the first tree with minimal cost is returned.
*/
size_t findCheapestExecutionTree(
const std::vector<SubtreePlan>& lastRow) const;

/// If this planner is not associated with a QueryExecutionContext, we can
/// only be in unit test mode.
[[nodiscard]] bool isInTestMode() const { return _qec == nullptr; }
};
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
@@ -14,7 +14,7 @@ add_library(index
FTSAlgorithms.cpp FTSAlgorithms.h
PrefixHeuristic.cpp PrefixHeuristic.h)

target_link_libraries(index parser ${STXXL_LIBRARIES} ${ICU_LIBRARIES})
target_link_libraries(index parser ${STXXL_LIBRARIES} ${ICU_LIBRARIES} absl::flat_hash_map)

add_library(metaConverter
MetaDataConverter.cpp MetaDataConverter.h)
7 changes: 7 additions & 0 deletions src/index/ConstantsIndexCreation.h
@@ -53,3 +53,10 @@ static const std::string PARTIAL_MMAP_IDS = ".partial-ids-mmap";

// ________________________________________________________________
static const std::string TMP_BASENAME_COMPRESSION = ".tmp.compression_index";

// _________________________________________________________________
// The degree of parallelism that is used for the index-building step in which
// the unique elements of the vocabulary are identified via hash maps.
// Typically, 4 is a good value. On systems with very few CPUs, a lower value
// might be beneficial.
constexpr size_t NUM_PARALLEL_ITEM_MAPS = 4;
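
To illustrate where this constant matters, the following is a minimal sketch of deduplicating vocabulary items with one hash map per partition, so that NUM_PARALLEL_ITEM_MAPS threads can work on disjoint partitions without locking. It is not the actual QLever pipeline; the function name buildPartialMaps and the hash-based partitioning are illustrative assumptions.

#include <array>
#include <cstddef>
#include <functional>
#include <string>
#include <thread>
#include <vector>

#include "absl/container/flat_hash_map.h"

constexpr size_t NUM_PARALLEL_ITEM_MAPS = 4;

// Illustrative sketch: deduplicate vocabulary items using one hash map per
// partition. Each thread owns exactly one map, so no synchronization is needed.
std::array<absl::flat_hash_map<std::string, size_t>, NUM_PARALLEL_ITEM_MAPS>
buildPartialMaps(const std::vector<std::string>& items) {
  std::array<absl::flat_hash_map<std::string, size_t>, NUM_PARALLEL_ITEM_MAPS>
      maps;
  std::vector<std::thread> threads;
  for (size_t t = 0; t < NUM_PARALLEL_ITEM_MAPS; ++t) {
    threads.emplace_back([t, &maps, &items] {
      for (const auto& item : items) {
        // A thread only touches items that hash into its own partition.
        if (std::hash<std::string>{}(item) % NUM_PARALLEL_ITEM_MAPS == t) {
          // The map's current size serves as a preliminary local ID.
          maps[t].emplace(item, maps[t].size());
        }
      }
    });
  }
  for (auto& thread : threads) thread.join();
  return maps;
}

In QLever itself the partial results are presumably written out and merged in a later step; the sketch only shows how such a constant bounds the number of hash maps that are filled concurrently.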
14 changes: 4 additions & 10 deletions src/index/FTSAlgorithms.cpp
@@ -517,9 +517,7 @@ void FTSAlgorithms::multVarsAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
Id currentCid = cids[0];
Score cscore = scores[0];
@@ -645,7 +643,7 @@ void FTSAlgorithms::multVarsAggScoresAndTakeTopContext(
IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
Id currentCid = cids[0];
Score cscore = scores[0];
@@ -1027,9 +1025,7 @@ void FTSAlgorithms::multVarsFilterAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
vector<Id> filteredEntitiesInContext;
Id currentCid = cids[0];
@@ -1201,9 +1197,7 @@ void FTSAlgorithms::multVarsFilterAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
vector<Id> filteredEntitiesInContext;
Id currentCid = cids[0];
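
These hunks drop the designated empty and deleted sentinel keys that Google dense-hash-map style containers require before use; together with the new abseil submodule and the absl::flat_hash_map link targets above, this suggests that ad_utility::HashMap is now backed by absl::flat_hash_map. A minimal, self-contained illustration of the difference, assuming only that the abseil headers are on the include path:

#include <string>

#include "absl/container/flat_hash_map.h"

int main() {
  // A default-constructed absl::flat_hash_map is immediately usable; no
  // set_empty_key()/set_deleted_key() calls or sentinel constructor arguments
  // are needed, which is why the emptyKey/deletedKey lines above were dropped.
  absl::flat_hash_map<std::string, int> counts;
  counts["qlever"] += 1;
  counts.erase("qlever");  // erasing works without a deleted-key sentinel
  return static_cast<int>(counts.size());
}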
