Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented several more changes requested during the pull request review #69

Merged
merged 4 commits into from
Jul 4, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/engine/Filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ string Filter::asString(size_t indent) const {
break;
case SparqlFilter::LE:
os << " <= ";
break;
case SparqlFilter::GT:
os << " > ";
break;
Expand Down
55 changes: 16 additions & 39 deletions src/engine/GroupBy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
#include "../util/HashSet.h"

GroupBy::GroupBy(QueryExecutionContext* qec,
std::shared_ptr<QueryExecutionTree> subtree,
const vector<string>& groupByVariables,
const std::vector<ParsedQuery::Alias>& aliases)
: Operation(qec), _subtree(subtree), _groupByVariables(groupByVariables) {
: Operation(qec), _subtree(nullptr), _groupByVariables(groupByVariables) {
_aliases.reserve(aliases.size());
for (const ParsedQuery::Alias& a : aliases) {
// Only aggregate aliases need to be processed by GroupBy, other aliases
Expand Down Expand Up @@ -41,6 +40,10 @@ GroupBy::GroupBy(QueryExecutionContext* qec,
}
}

void GroupBy::setSubtree(std::shared_ptr<QueryExecutionTree> subtree) {
_subtree = subtree;
}

string GroupBy::asString(size_t indent) const {
std::ostringstream os;
for (size_t i = 0; i < indent; ++i) {
Expand All @@ -63,47 +66,21 @@ size_t GroupBy::getResultWidth() const { return _varColMap.size(); }
size_t GroupBy::resultSortedOn() const { return -1; }

vector<pair<size_t, bool>> GroupBy::computeSortColumns(
std::shared_ptr<QueryExecutionTree> subtree,
const vector<string>& groupByVariables,
const std::vector<ParsedQuery::Alias>& aliases) {
// Create sorted lists of the aliases and the group by variables to determine
// the output column order, on which the sorting depends. Then populate
// the vector of columns which should be sorted by using the subtrees
// variable column map.
std::shared_ptr<QueryExecutionTree> inputTree) {
vector<pair<size_t, bool>> cols;
if (groupByVariables.empty()) {
if (_groupByVariables.empty()) {
// the entire input is a single group, no sorting needs to be done
return cols;
}

std::vector<ParsedQuery::Alias> sortedAliases;
sortedAliases.reserve(aliases.size());
for (const ParsedQuery::Alias& a : aliases) {
if (a._isAggregate) {
sortedAliases.push_back(a);
}
}
// sort the aliases to ensure the cache key is order invariant
std::sort(sortedAliases.begin(), sortedAliases.end(),
[](const ParsedQuery::Alias& a1, const ParsedQuery::Alias& a2) {
return a1._outVarName < a2._outVarName;
});

std::vector<std::string> sortedGroupByVars;
sortedGroupByVars.insert(sortedGroupByVars.end(), groupByVariables.begin(),
groupByVariables.end());

// sort the groupByVariables to ensure the cache key is order invariant
std::sort(sortedGroupByVars.begin(), sortedGroupByVars.end());

std::unordered_map<string, size_t> inVarColMap =
subtree->getVariableColumnMap();
inputTree->getVariableColumnMap();

// The returned columns are all groupByVariables followed by aggregates
for (std::string var : sortedGroupByVars) {
for (std::string var : _groupByVariables) {
cols.push_back({inVarColMap[var], false});
}
for (const ParsedQuery::Alias& a : sortedAliases) {
for (const ParsedQuery::Alias& a : _aliases) {
cols.push_back({inVarColMap[a._outVarName], false});
}
return cols;
Expand Down Expand Up @@ -211,7 +188,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
}
}
} else if (inputTypes[a._inCol] == ResultTable::ResultType::TEXT ||
inputTypes[a._inCol] == ResultTable::ResultType::STRING) {
inputTypes[a._inCol] == ResultTable::ResultType::LOCAL_VOCAB) {
res = std::numeric_limits<float>::quiet_NaN();
} else {
if (a._distinct) {
Expand Down Expand Up @@ -337,7 +314,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
}
out << index.getTextExcerpt((*input)[blockEnd][a._inCol]);
}
} else if (inputTypes[a._inCol] == ResultTable::ResultType::STRING) {
} else if (inputTypes[a._inCol] == ResultTable::ResultType::LOCAL_VOCAB) {
if (a._distinct) {
for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
const auto it = distinctHashSet.find((*input)[i][a._inCol]);
Expand Down Expand Up @@ -423,7 +400,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
resultRow[a._outCol] = 0;
std::memcpy(&resultRow[a._outCol], &res, sizeof(float));
} else if (inputTypes[a._inCol] == ResultTable::ResultType::TEXT ||
inputTypes[a._inCol] == ResultTable::ResultType::STRING) {
inputTypes[a._inCol] == ResultTable::ResultType::LOCAL_VOCAB) {
resultRow[a._outCol] = ID_NO_VALUE;
} else {
Id res = std::numeric_limits<Id>::lowest();
Expand Down Expand Up @@ -453,7 +430,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
resultRow[a._outCol] = 0;
std::memcpy(&resultRow[a._outCol], &res, sizeof(float));
} else if (inputTypes[a._inCol] == ResultTable::ResultType::TEXT ||
inputTypes[a._inCol] == ResultTable::ResultType::STRING) {
inputTypes[a._inCol] == ResultTable::ResultType::LOCAL_VOCAB) {
resultRow[a._outCol] = ID_NO_VALUE;
} else {
Id res = std::numeric_limits<Id>::max();
Expand Down Expand Up @@ -504,7 +481,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
}
}
} else if (inputTypes[a._inCol] == ResultTable::ResultType::TEXT ||
inputTypes[a._inCol] == ResultTable::ResultType::STRING) {
inputTypes[a._inCol] == ResultTable::ResultType::LOCAL_VOCAB) {
res = std::numeric_limits<float>::quiet_NaN();
} else {
if (a._distinct) {
Expand Down Expand Up @@ -901,7 +878,7 @@ void GroupBy::computeResult(ResultTable* result) const {
result->_resultTypes[i] = ResultTable::ResultType::VERBATIM;
break;
case AggregateType::GROUP_CONCAT:
result->_resultTypes[i] = ResultTable::ResultType::STRING;
result->_resultTypes[i] = ResultTable::ResultType::LOCAL_VOCAB;
break;
case AggregateType::MAX:
result->_resultTypes[i] =
Expand Down
34 changes: 25 additions & 9 deletions src/engine/GroupBy.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,18 @@ class GroupBy : public Operation {
bool _distinct;
};

GroupBy(QueryExecutionContext* qec,
std::shared_ptr<QueryExecutionTree> subtree,
const vector<string>& groupByVariables,
/**
* @brief This constructor does not take a subtree as an argument to allow
* for creating the GroupBy operation before the OrderBy operation
* that is required by this GroupBy. This avoids having to compute
* the order of the aggregate aliases and of the group by columns
* in two places. The subtree must be set by calling setSubtree
* before calling computeResult.
* @param qec
* @param groupByVariables
* @param aliases
*/
GroupBy(QueryExecutionContext* qec, const vector<string>& groupByVariables,
const std::vector<ParsedQuery::Alias>& aliases);

virtual string asString(size_t indent = 0) const;
Expand All @@ -69,16 +78,23 @@ class GroupBy : public Operation {

virtual size_t getCostEstimate();

/**
* @brief To allow for creating an OrderBy Operation after the GroupBy,
* the subtree is not an argument to the constructor, as it is with
* other operations. Instead it needs to be set using this function.
* @param subtree The QueryExecutionTree that contains the operations creating
* this operation's input.
*/
void setSubtree(std::shared_ptr<QueryExecutionTree> subtree);

/**
* @return The columns on which the input data should be sorted or an empty
* list if no particular order is required for the grouping.
* The columns need to be known before the GroupBy Operation can be
* created, as the groupBy requires its parent operation on creation.
* @param inputTree The QueryExecutionTree that contains the operations
* creating the sorting operation inputs.
*/
static vector<pair<size_t, bool>> computeSortColumns(
std::shared_ptr<QueryExecutionTree> subtree,
const vector<string>& groupByVariables,
const std::vector<ParsedQuery::Alias>& aliases);
vector<pair<size_t, bool>> computeSortColumns(
std::shared_ptr<QueryExecutionTree> inputTree);

private:
std::shared_ptr<QueryExecutionTree> _subtree;
Expand Down
6 changes: 3 additions & 3 deletions src/engine/QueryExecutionTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class QueryExecutionTree {
os << f << "\",\"";
break;
}
case ResultTable::ResultType::STRING: {
case ResultTable::ResultType::LOCAL_VOCAB: {
os << ad_utility::escapeForJson(
res->idToString(row[validIndices[j].first]))
<< "\",\"";
Expand Down Expand Up @@ -201,7 +201,7 @@ class QueryExecutionTree {
os << f << "\"]";
break;
}
case ResultTable::ResultType::STRING: {
case ResultTable::ResultType::LOCAL_VOCAB: {
os << ad_utility::escapeForJson(res->idToString(
row[validIndices[validIndices.size() - 1].first]))
<< "\"]";
Expand Down Expand Up @@ -250,7 +250,7 @@ class QueryExecutionTree {
out << f;
break;
}
case ResultTable::ResultType::STRING: {
case ResultTable::ResultType::LOCAL_VOCAB: {
out << res->idToString(row[validIndices[j].first]);
break;
}
Expand Down
44 changes: 25 additions & 19 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq) const {
const ParsedQuery::Alias* countAlias = nullptr;
for (const ParsedQuery::Alias& a : pq._aliases) {
if (a._inVarName == t._o && a._isAggregate &&
ad_utility::startsWith(a._function, "COUNT")) {
a._function.find("DISTINCT") == std::string::npos &&
a._function.find("distinct") == std::string::npos &&
(ad_utility::startsWith(a._function, "COUNT") ||
ad_utility::startsWith(a._function, "count"))) {
countAlias = &a;
}
}
Expand Down Expand Up @@ -287,31 +290,34 @@ QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq) const {
std::cout << "Plan after pattern trick: " << endl
<< final._qet->asString() << endl;
} else if (doGrouping) {
// First order the result to match the ordering needed by the group by
// operation.
std::vector<std::pair<size_t, bool>> sortColumns =
GroupBy::computeSortColumns(final._qet, pq._groupByVariables,
pq._aliases);

SubtreePlan orderByPlan(_qec);
// Create a group by operation to determine on which columns the input
// needs to be sorted
SubtreePlan groupByPlan(_qec);
std::shared_ptr<Operation> groupBy =
std::make_shared<GroupBy>(_qec, pq._groupByVariables, pq._aliases);
QueryExecutionTree& groupByTree = *groupByPlan._qet.get();

std::shared_ptr<Operation> orderBy(
new OrderBy(_qec, final._qet, sortColumns));
if (!sortColumns.empty()) {
// Then compute the sort columns
std::vector<std::pair<size_t, bool>> sortColumns =
static_cast<GroupBy*>(groupBy.get())->computeSortColumns(final._qet);

if (!sortColumns.empty() &&
!(sortColumns.size() == 1 &&
final._qet->resultSortedOn() == sortColumns[0].first)) {
// Create an order by operation as required by the group by
std::shared_ptr<Operation> orderBy =
std::make_shared<OrderBy>(_qec, final._qet, sortColumns);
SubtreePlan orderByPlan(_qec);
QueryExecutionTree& orderByTree = *orderByPlan._qet.get();
orderByTree.setVariableColumns(final._qet->getVariableColumnMap());
orderByTree.setOperation(QueryExecutionTree::ORDER_BY, orderBy);
final = orderByPlan;
}

// Then run the actual group by
SubtreePlan groupByPlan(_qec);
std::shared_ptr<Operation> groupBy(
new GroupBy(_qec, sortColumns.empty() ? final._qet : orderByPlan._qet,
pq._groupByVariables, pq._aliases));
QueryExecutionTree& tree = *groupByPlan._qet.get();
tree.setVariableColumns(
static_cast<GroupBy*>(groupBy.get())->setSubtree(final._qet);
groupByTree.setVariableColumns(
static_cast<GroupBy*>(groupBy.get())->getVariableColumns());
tree.setOperation(QueryExecutionTree::GROUP_BY, groupBy);
groupByTree.setOperation(QueryExecutionTree::GROUP_BY, groupBy);
final = groupByPlan;
}

Expand Down
6 changes: 3 additions & 3 deletions src/engine/ResultTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ class ResultTable {
KB,
// An unsigned integer (size_t)
VERBATIM,
// An entry in the text index
// A byte offset in the text index
TEXT,
// A 32 bit float, stored in the first 4 bytes of the entry. The last four
// bytes have to be zero.
FLOAT,
// An entry in the ResultTable _localVocab
STRING
// An entry in the ResultTable's _localVocab
LOCAL_VOCAB
};

size_t _nofColumns;
Expand Down
7 changes: 5 additions & 2 deletions src/parser/ParsedQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,11 @@ void ParsedQuery::parseAliases() {
a._isAggregate = true;
size_t pos = inner.find("as");
if (pos == std::string::npos) {
throw ParseException("Alias " + var +
" is malformed: keyword as is missing.");
pos = inner.find("AS");
if (pos == std::string::npos) {
throw ParseException("Alias " + var +
" is malformed: keyword as is missing.");
}
}
std::string newVarName = inner.substr(pos + 2, var.size() - pos - 2);
newVarName = ad_utility::strip(newVarName, " \t\n");
Expand Down
2 changes: 1 addition & 1 deletion test/GroupByTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ TEST_F(GroupByTest, doGroupBy) {
std::vector<ResultTable::ResultType> inputTypes = {
ResultTable::ResultType::KB, ResultTable::ResultType::KB,
ResultTable::ResultType::VERBATIM, ResultTable::ResultType::TEXT,
ResultTable::ResultType::FLOAT, ResultTable::ResultType::STRING};
ResultTable::ResultType::FLOAT, ResultTable::ResultType::LOCAL_VOCAB};

/*
COUNT,
Expand Down
2 changes: 1 addition & 1 deletion test/SparqlParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ TEST(ParserTest, testSolutionModifiers) {
ASSERT_EQ(1u, pq._orderBy.size());
ASSERT_EQ("?r", pq._groupByVariables[0]);
ASSERT_EQ("?avg", pq._orderBy[0]._key);
ASSERT_EQ(false, pq._orderBy[0]._desc);
ASSERT_FALSE(pq._orderBy[0]._desc);

pq = SparqlParser::parse(
"SELECT ?r (COUNT(DISTINCT ?r) as ?count) WHERE {"
Expand Down