Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SPARQL parser by using proper token lexing #271

Merged
merged 15 commits into from
Aug 6, 2019
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ set(USE_OPENMP OFF CACHE BOOL "Don't use OPENMP as default" FORCE)
add_subdirectory(third_party/stxxl)
# apply STXXL CXXFLAGS
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STXXL_CXX_FLAGS}")
include_directories(${STXXL_INCLUDE_DIRS})
include_directories(SYSTEM ${STXXL_INCLUDE_DIRS})
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, on GCC 9.x the warnings were getting a bit annoying.


################################
# RE2
Expand All @@ -91,7 +91,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")

set(RE2_BUILD_TESTING OFF CACHE BOOL "enable testing for RE2" FORCE)
add_subdirectory(third_party/re2)
include_directories(third_party/re2)
include_directories(SYSTEM third_party/re2)

# reinstate original flags including all warnings
set(CMAKE_CXX_FLAGS "${LOCAL_CXX_BACKUP_FLAGS}")
Expand Down
4 changes: 2 additions & 2 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ queries:
?x ql:has-predicate ?predicate .
}
GROUP BY ?predicate
HAVING (?predicate < <Z) (?predicate = <Religion>)
HAVING (?predicate < "<Z") (?predicate = <Religion>)
checks:
- num_rows: 1
- num_cols: 2
Expand Down Expand Up @@ -535,7 +535,7 @@ queries:
?subject <Profession> ?object
}
GROUP BY ?object
ORDER BY DESC((COUNT(?object) AS ?count))
ORDER BY DESC(?count)
checks:
- num_rows: 836
- num_cols: 2
Expand Down
4 changes: 2 additions & 2 deletions src/SparqlEngineMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ int main(int argc, char** argv) {
void processQuery(QueryExecutionContext& qec, const string& query) {
ad_utility::Timer t;
t.start();
SparqlParser sp;
ParsedQuery pq = sp.parse(query);
SparqlParser sp(query);
ParsedQuery pq = sp.parse();
pq.expandPrefixes();
QueryPlanner qp(&qec);
ad_utility::Timer timer;
Expand Down
9 changes: 5 additions & 4 deletions src/WriteIndexListsMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,12 @@ int main(int argc, char** argv) {
QueryExecutionContext qec(index, engine);
ParsedQuery q;
if (!freebase) {
q = SparqlParser::parse("SELECT ?x WHERE {?x <is-a> <Scientist>}");
q = SparqlParser("SELECT ?x WHERE {?x <is-a> <Scientist>}").parse();
} else {
q = SparqlParser::parse(
"PREFIX fb: <http://rdf.freebase.com/ns/> SELECT ?p WHERE {?p "
"fb:people.person.profession fb:m.06q2q}");
q = SparqlParser(
"PREFIX fb: <http://rdf.freebase.com/ns/> SELECT ?p WHERE {?p "
"fb:people.person.profession fb:m.06q2q}")
.parse();
q.expandPrefixes();
}
QueryPlanner queryPlanner(&qec);
Expand Down
23 changes: 21 additions & 2 deletions src/engine/Filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ Filter::Filter(QueryExecutionContext* qec,
_type(type),
_lhs(lhs),
_rhs(rhs),
_regexIgnoreCase(false) {}
_regexIgnoreCase(false),
_lhsAsString(false) {}

// _____________________________________________________________________________
string Filter::asString(size_t indent) const {
Expand Down Expand Up @@ -510,6 +511,11 @@ void Filter::computeResultFixedValue(
IdTableStatic<WIDTH> result = resultTable->_data.moveToStatic<WIDTH>();
const IdTableStatic<WIDTH> input = subRes->_data.asStaticView<WIDTH>();

if (_lhsAsString) {
AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED,
"The str function is not yet supported within filters.");
}

// interpret the filters right hand side
size_t lhs = _subtree->getVariableColumn(_lhs);
Id rhs;
Expand All @@ -521,6 +527,19 @@ void Filter::computeResultFixedValue(
} else if (ad_utility::isNumeric(_rhs)) {
rhs_string = ad_utility::convertNumericToIndexWord(rhs_string);
} else {
// TODO: This is not standard-conformant, but is currently required
// because our vocabulary stores IRIs with their angle brackets and
// literals with their quotation marks.
if (rhs_string.size() > 2 && rhs_string[1] == '<' &&
rhs_string[0] == '"' && rhs_string.back() == '"') {
// Remove the quotation marks surrounding the string.
rhs_string = rhs_string.substr(1, rhs_string.size() - 2);
} else if (std::count(rhs_string.begin(), rhs_string.end(), '"') > 2 &&
rhs_string.back() == '"') {
// Remove the quotation marks surrounding the string.
rhs_string = rhs_string.substr(1, rhs_string.size() - 2);
}

if (getIndex().getVocab().isCaseInsensitiveOrdering()) {
// We have to move to the correct end of the
// "same letters but different case" - range
Expand All @@ -544,7 +563,7 @@ void Filter::computeResultFixedValue(
}
}
if (_type == SparqlFilter::EQ || _type == SparqlFilter::NE) {
if (!getIndex().getVocab().getId(_rhs, &rhs)) {
if (!getIndex().getVocab().getId(rhs_string, &rhs)) {
rhs = std::numeric_limits<size_t>::max() - 1;
}
} else if (_type == SparqlFilter::GE) {
Expand Down
2 changes: 2 additions & 0 deletions src/engine/Filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class Filter : public Operation {
}

void setRegexIgnoreCase(bool i) { _regexIgnoreCase = i; }
void setLhsAsString(bool i) { _lhsAsString = i; }

std::shared_ptr<QueryExecutionTree> getSubtree() const { return _subtree; };

Expand All @@ -93,6 +94,7 @@ class Filter : public Operation {
string _lhs;
string _rhs;
bool _regexIgnoreCase;
bool _lhsAsString;

/**
* @brief Uses the result type and the filter type (_type) to apply the filter
Expand Down
129 changes: 24 additions & 105 deletions src/engine/GroupBy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
const Index& index,
ad_utility::HashSet<size_t>& distinctHashSet) {
switch (a._type) {
case GroupBy::AggregateType::AVG: {
case ParsedQuery::AggregateType::AVG: {
float res = 0;
if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
if (a._distinct) {
Expand Down Expand Up @@ -253,7 +253,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
std::memcpy(&(*result)(resultRow, a._outCol), &res, sizeof(float));
break;
}
case GroupBy::AggregateType::COUNT:
case ParsedQuery::AggregateType::COUNT:
if (a._distinct) {
size_t count = 0;
for (size_t i = blockStart; i <= blockEnd; i++) {
Expand All @@ -269,7 +269,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
(*result)(resultRow, a._outCol) = blockEnd - blockStart + 1;
}
break;
case GroupBy::AggregateType::GROUP_CONCAT: {
case ParsedQuery::AggregateType::GROUP_CONCAT: {
std::ostringstream out;
std::string* delim = reinterpret_cast<string*>(a._userData);
if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
Expand Down Expand Up @@ -421,7 +421,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
outTable->_localVocab->push_back(out.str());
break;
}
case GroupBy::AggregateType::MAX: {
case ParsedQuery::AggregateType::MAX: {
if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
Id res = std::numeric_limits<Id>::lowest();
for (size_t i = blockStart; i <= blockEnd; i++) {
Expand Down Expand Up @@ -451,7 +451,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
}
break;
}
case GroupBy::AggregateType::MIN: {
case ParsedQuery::AggregateType::MIN: {
if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
Id res = std::numeric_limits<Id>::max();
for (size_t i = blockStart; i <= blockEnd; i++) {
Expand Down Expand Up @@ -481,10 +481,10 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
}
break;
}
case GroupBy::AggregateType::SAMPLE:
case ParsedQuery::AggregateType::SAMPLE:
(*result)(resultRow, a._outCol) = input(blockEnd, a._inCol);
break;
case GroupBy::AggregateType::SUM: {
case ParsedQuery::AggregateType::SUM: {
float res = 0;
if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
if (a._distinct) {
Expand Down Expand Up @@ -561,12 +561,12 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
std::memcpy(&(*result)(resultRow, a._outCol), &res, sizeof(float));
break;
}
case GroupBy::AggregateType::FIRST:
case ParsedQuery::AggregateType::FIRST:
// This does the same as sample, as the non grouping rows have no
// inherent order.
(*result)(resultRow, a._outCol) = input(blockStart, a._inCol);
break;
case GroupBy::AggregateType::LAST:
case ParsedQuery::AggregateType::LAST:
// This does the same as sample, as the non grouping rows have no
// inherent order.
(*result)(resultRow, a._outCol) = input(blockEnd, a._inCol);
Expand Down Expand Up @@ -674,7 +674,7 @@ void GroupBy::computeResult(ResultTable* result) {
// Add an "identity" aggregate in the form of a sample aggregate to
// facilitate the passthrough of the groupBy columns into the result
aggregates.emplace_back();
aggregates.back()._type = AggregateType::SAMPLE;
aggregates.back()._type = ParsedQuery::AggregateType::SAMPLE;
aggregates.back()._inCol = it->second;
aggregates.back()._outCol = _varColMap.find(var)->second;
aggregates.back()._userData = nullptr;
Expand All @@ -685,95 +685,14 @@ void GroupBy::computeResult(ResultTable* result) {
for (const ParsedQuery::Alias& alias : _aliases) {
if (alias._isAggregate) {
aggregates.emplace_back();
if (ad_utility::startsWith(alias._function, "COUNT")) {
aggregates.back()._type = AggregateType::COUNT;
} else if (ad_utility::startsWith(alias._function, "GROUP_CONCAT")) {
aggregates.back()._type = AggregateType::GROUP_CONCAT;
} else if (ad_utility::startsWith(alias._function, "SAMPLE")) {
aggregates.back()._type = AggregateType::SAMPLE;
} else if (ad_utility::startsWith(alias._function, "MIN")) {
aggregates.back()._type = AggregateType::MIN;
} else if (ad_utility::startsWith(alias._function, "MAX")) {
aggregates.back()._type = AggregateType::MAX;
} else if (ad_utility::startsWith(alias._function, "SUM")) {
aggregates.back()._type = AggregateType::SUM;
} else if (ad_utility::startsWith(alias._function, "AVG")) {
aggregates.back()._type = AggregateType::AVG;
aggregates.back()._type = alias._type;
aggregates.back()._distinct = alias._isDistinct;
if (alias._type == ParsedQuery::AggregateType::GROUP_CONCAT) {
aggregates.back()._userData = new std::string(alias._delimiter);
} else {
LOG(WARN) << "Unknown aggregate " << alias._function << std::endl;
aggregates.pop_back();
continue;
aggregates.back()._userData = nullptr;
}

std::string inVarName;
if (aggregates.back()._type == AggregateType::GROUP_CONCAT) {
size_t varStart = alias._function.find('(');
size_t varStop = alias._function.rfind(')');
size_t delimitorPos = alias._function.find(';');
if (varStop > varStart && varStop != std::string::npos &&
varStart != std::string::npos) {
// found a matching pair of brackets
// look for a distinct keyword

if (alias._function.find("DISTINCT") != std::string::npos ||
alias._function.find("distinct") != std::string::npos) {
aggregates.back()._distinct = true;
} else {
aggregates.back()._distinct = false;
}

if (delimitorPos != std::string::npos) {
// found a delimiter, need to look for a separator assignment
inVarName = alias._function.substr(varStart + 1,
delimitorPos - varStart - 1);
if (aggregates.back()._distinct) {
inVarName = ad_utility::strip(inVarName, " \t").substr(8);
}
std::string concatString = alias._function.substr(
delimitorPos + 1, varStop - delimitorPos - 1);
concatString = ad_utility::strip(concatString, " ");
size_t startConcat = concatString.find('"');
size_t stopConcat = concatString.rfind('"');
if (stopConcat > startConcat && stopConcat != std::string::npos &&
startConcat != std::string::npos) {
aggregates.back()._userData = new std::string(concatString.substr(
startConcat + 1, stopConcat - startConcat - 1));
} else {
LOG(WARN) << "Unable to parse the delimiter in GROUP_CONCAT"
"aggregrate "
<< alias._function;
aggregates.back()._userData = new std::string(" ");
}
} else {
// found no delimiter, using the default separator ' '
inVarName =
alias._function.substr(varStart + 1, varStop - varStart - 1);
if (aggregates.back()._distinct) {
inVarName = ad_utility::strip(inVarName, " \t").substr(8);
}
aggregates.back()._userData = new std::string(" ");
}
}
} else {
size_t varStart = alias._function.find('(');
size_t varStop = alias._function.rfind(')');
if (varStop > varStart && varStop != std::string::npos &&
varStart != std::string::npos) {
if (alias._function.find("DISTINCT") != std::string::npos ||
alias._function.find("distinct") != std::string::npos) {
aggregates.back()._distinct = true;
} else {
aggregates.back()._distinct = false;
}
inVarName =
alias._function.substr(varStart + 1, varStop - varStart - 1);
if (aggregates.back()._distinct) {
inVarName = ad_utility::strip(inVarName, " \t").substr(8);
}
}
}
inVarName = ad_utility::strip(inVarName, " \t");
auto inIt = subtreeVarCols.find(inVarName);
auto inIt = subtreeVarCols.find(alias._inVarName);
if (inIt == subtreeVarCols.end()) {
LOG(WARN) << "The aggregate alias " << alias._function << " refers to "
<< "a column not present in the query." << std::endl;
Expand All @@ -794,28 +713,28 @@ void GroupBy::computeResult(ResultTable* result) {
result->_resultTypes.resize(result->_data.cols());
for (size_t i = 0; i < result->_data.cols(); i++) {
switch (aggregates[i]._type) {
case AggregateType::AVG:
case ParsedQuery::AggregateType::AVG:
result->_resultTypes[i] = ResultTable::ResultType::FLOAT;
break;
case AggregateType::COUNT:
case ParsedQuery::AggregateType::COUNT:
result->_resultTypes[i] = ResultTable::ResultType::VERBATIM;
break;
case AggregateType::GROUP_CONCAT:
case ParsedQuery::AggregateType::GROUP_CONCAT:
result->_resultTypes[i] = ResultTable::ResultType::LOCAL_VOCAB;
break;
case AggregateType::MAX:
case ParsedQuery::AggregateType::MAX:
result->_resultTypes[i] =
subresult->getResultType(aggregates[i]._inCol);
break;
case AggregateType::MIN:
case ParsedQuery::AggregateType::MIN:
result->_resultTypes[i] =
subresult->getResultType(aggregates[i]._inCol);
break;
case AggregateType::SAMPLE:
case ParsedQuery::AggregateType::SAMPLE:
result->_resultTypes[i] =
subresult->getResultType(aggregates[i]._inCol);
break;
case AggregateType::SUM:
case ParsedQuery::AggregateType::SUM:
result->_resultTypes[i] = ResultTable::ResultType::FLOAT;
break;
default:
Expand Down Expand Up @@ -843,7 +762,7 @@ void GroupBy::computeResult(ResultTable* result) {

// Free the user data used by GROUP_CONCAT aggregates.
for (Aggregate& a : aggregates) {
if (a._type == AggregateType::GROUP_CONCAT) {
if (a._type == ParsedQuery::AggregateType::GROUP_CONCAT) {
delete static_cast<std::string*>(a._userData);
}
}
Expand Down
17 changes: 1 addition & 16 deletions src/engine/GroupBy.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,11 @@ using std::vector;

class GroupBy : public Operation {
public:
/**
* @brief All supported types of aggregate aliases
*/
enum class AggregateType {
COUNT,
GROUP_CONCAT,
FIRST,
LAST,
SAMPLE,
MIN,
MAX,
SUM,
AVG
};

/**
* @brief Represents an aggregate alias in the select part of the query.
*/
struct Aggregate {
AggregateType _type;
ParsedQuery::AggregateType _type;
size_t _inCol, _outCol;
// Used to store the string necessary for the group concat aggregate.
// A void pointer is used to allow for storing arbitrary data should any
Expand Down