Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speeding up the first phase of Index Building #302

Merged
merged 14 commits into from
Jan 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
[submodule "third_party/re2"]
path = third_party/re2
url = https://github.com/google/re2.git
[submodule "third_party/abseil-cpp"]
path = third_party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ include_directories(third_party/googletest/googletest/include)
# Header only, nothing to include
include_directories(third_party/json/)

################################
# ABSEIL
################################
set(BUILD_TESTING OFF CACHE BOOL "Don't build tests for abseil" FORCE)
add_subdirectory(third_party/abseil-cpp)
include_directories(third_party/abseil-cpp/)

if (USE_PARALLEL)
include(FindOpenMP)
if(OPENMP_FOUND)
Expand Down Expand Up @@ -163,7 +170,7 @@ add_executable(PrefixHeuristicEvaluatorMain src/PrefixHeuristicEvaluatorMain.cpp
target_link_libraries (PrefixHeuristicEvaluatorMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(TurtleParserMain src/TurtleParserMain.cpp)
target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT} absl::flat_hash_map)

add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})
Expand Down
4 changes: 4 additions & 0 deletions e2e/e2e-build-settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000
}
1 change: 1 addition & 0 deletions e2e/e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ if [ "$1" != "no-index" ]; then
./IndexBuilderMain -l -i "$INDEX" \
-F ttl \
-f "$INPUT.nt" \
-s "$PROJECT_DIR/e2e/e2e-build-settings.json" \
-w "$INPUT.wordsfile.tsv" \
-d "$INPUT.docsfile.tsv" || bail "Building Index failed"
popd
Expand Down
32 changes: 32 additions & 0 deletions misc/check_binary_index_equality.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Compare two indices for binary equality. Can be used for regression testing
# of the Index Building procedure.
#
# Usage: check_binary_index_equality.sh <indexPrefix1> <indexPrefix2>
#
# Every index file is compared, even after a mismatch has been found, so that
# a single run reports all differing files. The exit status is 0 only if all
# files are binary equal and 1 otherwise (or on wrong usage).

if [ "$#" -ne 2 ]
then
  echo "Usage: $0 <indexPrefix1> <indexPrefix2>"
  echo "Passed $# command line arguments"
  exit 1
fi

# Set to 1 as soon as any pair of files differs (or cannot be read).
mismatch=0

for i in .index.pso .index.pos \
  .index.spo .index.spo.meta-mmap .index.sop .index.sop.meta-mmap \
  .index.osp .index.osp.meta-mmap .index.ops .index.ops.meta-mmap \
  .index.patterns .prefixes .vocabulary .meta-data.json .literals-index
do
  f1="$1$i"
  f2="$2$i"
  echo "Comparing $f1 and $f2"
  # Quote the paths so that prefixes containing whitespace work; `--` keeps
  # prefixes starting with `-` from being parsed as options.
  if cmp -- "$f1" "$f2"
  then
    echo "$f1 and $f2 match, continuing"
  else
    echo "Error, $f1 and $f2 are not equal"
    # Do not abort here: keep going so that all differences are reported.
    mismatch=1
  fi

done

if [ "$mismatch" -ne 0 ]
then
  echo "Indices with prefixes $1 and $2 are NOT binary equal"
  exit 1
fi

echo "Indices with prefixes $1 and $2 are binary equal"
10 changes: 4 additions & 6 deletions src/engine/QueryExecutionTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,11 @@ size_t QueryExecutionTree::getSizeEstimate() {
if (_sizeEstimate == std::numeric_limits<size_t>::max()) {
if (_cachedResult && _cachedResult->status() == ResultTable::FINISHED) {
_sizeEstimate = _cachedResult->size();
} else if (_qec) {
_sizeEstimate = _rootOperation->getSizeEstimate();
} else {
// For test cases without index only:
// Make it deterministic by using the asString.
_sizeEstimate =
1000 + std::hash<string>{}(_rootOperation->asString()) % 1000;
// If we are in a unit test setting and there is no QueryExecutionContext
// specified, it is the _rootOperation's obligation to handle this case
// correctly.
_sizeEstimate = _rootOperation->getSizeEstimate();
}
}
return _sizeEstimate;
Expand Down
73 changes: 39 additions & 34 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,8 @@ QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq) {
vector<SubtreePlan>& lastRow = plans.back();

AD_CHECK_GT(lastRow.size(), 0);
auto minInd = findCheapestExecutionTree(lastRow);

size_t minCost = lastRow[0].getCostEstimate();
size_t minInd = 0;

for (size_t i = 1; i < lastRow.size(); ++i) {
size_t thisCost = lastRow[i].getCostEstimate();
if (thisCost < minCost) {
minCost = thisCost;
minInd = i;
}
}
lastRow[minInd]._isOptional = pq._rootGraphPattern->_optional;

SubtreePlan final = lastRow[minInd];
Expand Down Expand Up @@ -320,16 +311,7 @@ std::vector<QueryPlanner::SubtreePlan> QueryPlanner::optimize(
if (pattern == rootPattern) {
return lastRow;
} else {
AD_CHECK_GT(lastRow.size(), 0);
size_t minCost = lastRow[0].getCostEstimate();
size_t minInd = 0;
for (size_t i = 1; i < lastRow.size(); ++i) {
size_t thisCost = lastRow[i].getCostEstimate();
if (thisCost < minCost) {
minCost = lastRow[i].getCostEstimate();
minInd = i;
}
}
auto minInd = findCheapestExecutionTree(lastRow);
lastRow[minInd]._isOptional = pattern->_optional;
patternPlans[pattern->_id] = lastRow[minInd];
}
Expand Down Expand Up @@ -2114,21 +2096,12 @@ vector<QueryPlanner::SubtreePlan> QueryPlanner::merge(
// as key.
LOG(TRACE) << "Pruning...\n";
vector<SubtreePlan> prunedPlans;
size_t nofCandidates = 0;
for (auto it = candidates.begin(); it != candidates.end(); ++it) {
size_t minCost = std::numeric_limits<size_t>::max();
size_t minIndex = 0;
for (size_t i = 0; i < it->second.size(); ++i) {
++nofCandidates;
if (it->second[i].getCostEstimate() < minCost) {
minCost = it->second[i].getCostEstimate();
minIndex = i;
}
}
prunedPlans.push_back(it->second[minIndex]);
for (const auto& [key, value] : candidates) {
(void)key; // silence unused warning
size_t minIndex = findCheapestExecutionTree(value);
prunedPlans.push_back(value[minIndex]);
}
LOG(TRACE) << "Got " << prunedPlans.size() << " pruned plans from "
<< nofCandidates << " candidates.\n";
LOG(TRACE) << "Got " << prunedPlans.size() << " pruned plans from \n";
return prunedPlans;
}

Expand Down Expand Up @@ -2904,3 +2877,35 @@ QueryPlanner::createVariableColumnsMapForTextOperation(
void QueryPlanner::setEnablePatternTrick(bool enablePatternTrick) {
_enablePatternTrick = enablePatternTrick;
}

// _________________________________________________________________________________
// Return the index of the plan in `lastRow` that has the smallest cost
// estimate. `lastRow` must not be empty (checked via AD_CHECK_GT).
//
// Ties are resolved as follows: outside of the test mode the first plan with
// the minimal cost wins. In test mode (see `isInTestMode()`), among equally
// cheap plans the one whose `asString()` is lexicographically smallest wins,
// which makes the result independent of the order of the plans.
size_t QueryPlanner::findCheapestExecutionTree(
    const std::vector<SubtreePlan>& lastRow) const {
  AD_CHECK_GT(lastRow.size(), 0);
  size_t minCost = std::numeric_limits<size_t>::max();
  size_t minInd = 0;
  LOG(TRACE) << "\nFinding the cheapest row in the optimizer\n";
  for (size_t i = 0; i < lastRow.size(); ++i) {
    // Single-line representation of the tree; only used for the trace output
    // below.
    [[maybe_unused]] auto repr = lastRow[i]._qet->asString();
    std::replace(repr.begin(), repr.end(), '\n', ' ');

    size_t thisSize = lastRow[i].getSizeEstimate();
    size_t thisCost = lastRow[i].getCostEstimate();
    LOG(TRACE) << "Estimated cost and size of " << thisCost << " " << thisSize
               << " for Tree " << repr << '\n';
    if (thisCost < minCost) {
      minCost = thisCost;
      minInd = i;
    }
    // Make the tiebreaking deterministic for the unit tests. The asString
    // should never be on a hot code path in practice.
    else if (thisCost == minCost && isInTestMode() &&
             lastRow[i]._qet->asString() < lastRow[minInd]._qet->asString()) {
      // The costs are equal here, so only the index has to be updated.
      minInd = i;
    }
  }
  LOG(TRACE) << "Finished\n";
  return minInd;
}
14 changes: 14 additions & 0 deletions src/engine/QueryPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,4 +323,18 @@ class QueryPlanner {
*/
bool checkUsePatternTrick(ParsedQuery* pq,
SparqlTriple* patternTrickTriple) const;

/**
 * @brief Return the index of the cheapest execution tree in the argument.
 *
 * The argument must not be empty. If we are in the unit test mode, the result
 * is made deterministic by additionally comparing the asString()
 * representation of equally cheap trees (the smaller string wins); otherwise
 * the first element that has the minimum cost is returned.
 *
 * @param lastRow The candidate plans to choose from.
 * @return The index into lastRow of the chosen plan.
 */
size_t findCheapestExecutionTree(
const std::vector<SubtreePlan>& lastRow) const;

/// A planner that has no queryExecutionContext attached (`_qec` is null) is
/// only ever used in the unit test mode; report whether that is the case.
[[nodiscard]] bool isInTestMode() const {
  const bool hasNoExecutionContext = (_qec == nullptr);
  return hasNoExecutionContext;
}
};
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ add_library(index
FTSAlgorithms.cpp FTSAlgorithms.h
PrefixHeuristic.cpp PrefixHeuristic.h)

target_link_libraries(index parser ${STXXL_LIBRARIES} ${ICU_LIBRARIES})
target_link_libraries(index parser ${STXXL_LIBRARIES} ${ICU_LIBRARIES} absl::flat_hash_map)

add_library(metaConverter
MetaDataConverter.cpp MetaDataConverter.h)
Expand Down
7 changes: 7 additions & 0 deletions src/index/ConstantsIndexCreation.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ static const std::string PARTIAL_MMAP_IDS = ".partial-ids-mmap";

// ________________________________________________________________
static const std::string TMP_BASENAME_COMPRESSION = ".tmp.compression_index";

// _________________________________________________________________
// The degree of parallelism that is used for the index-building step in which
// the unique elements of the vocabulary are identified via hash maps.
// Typically, 4 is a good value. On systems with very few CPUs, a lower value
// might be beneficial.
constexpr size_t NUM_PARALLEL_ITEM_MAPS = 4;
14 changes: 4 additions & 10 deletions src/index/FTSAlgorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -517,9 +517,7 @@ void FTSAlgorithms::multVarsAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
Id currentCid = cids[0];
Score cscore = scores[0];
Expand Down Expand Up @@ -645,7 +643,7 @@ void FTSAlgorithms::multVarsAggScoresAndTakeTopContext(
IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
Id currentCid = cids[0];
Score cscore = scores[0];
Expand Down Expand Up @@ -1027,9 +1025,7 @@ void FTSAlgorithms::multVarsFilterAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
vector<Id> filteredEntitiesInContext;
Id currentCid = cids[0];
Expand Down Expand Up @@ -1201,9 +1197,7 @@ void FTSAlgorithms::multVarsFilterAggScoresAndTakeTopKContexts(
using ScoreToContext = std::set<pair<Score, Id>>;
using ScoreAndStC = pair<Score, ScoreToContext>;
using AggMap = ad_utility::HashMap<vector<Id>, ScoreAndStC, IdVectorHash>;
vector<Id> emptyKey = {{std::numeric_limits<Id>::max()}};
vector<Id> deletedKey = {{std::numeric_limits<Id>::max() - 1}};
AggMap map(emptyKey, deletedKey);
AggMap map;
vector<Id> entitiesInContext;
vector<Id> filteredEntitiesInContext;
Id currentCid = cids[0];
Expand Down