Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removed a lot of redundant code in the SparqlLexer. #432

Merged
merged 7 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
185 changes: 90 additions & 95 deletions src/parser/SparqlLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "SparqlLexer.h"

#include "../util/HashSet.h"
#include "../util/StringUtils.h"
#include "ParseException.h"
#include "Tokenizer.h"
Expand All @@ -12,77 +13,83 @@ const std::string SparqlToken::TYPE_NAMES[] = {
"IRI", "WS", "KEYWORD", "VARIABLE", "SYMBOL",
"AGGREGATE", "RDFLITERAL", "INTEGER", "FLOAT", "LOGICAL_OR"};

const std::string SparqlLexer::LANGTAG = "@[a-zA-Z]+(-[a-zA-Z0-9]+)*";
const std::string SparqlLexer::IRIREF =
"(<[^<>\"{}|^`\\\\\\[\\]\\x00-\\x20]*>)";
const std::string SparqlLexer::PN_CHARS_BASE =
const std::string LANGTAG = "@[a-zA-Z]+(-[a-zA-Z0-9]+)*";
const std::string IRIREF = "(<[^<>\"{}|^`\\\\\\[\\]\\x00-\\x20]*>)";
const std::string PN_CHARS_BASE =
"[A-Z]|[a-z]|[\\x{00C0}-\\x{00D6}]|[\\x{00D8}-\\x{00F6}]|"
"[\\x{00F8}-\\x{02FF}]|[\\x{0370}-\\x{037D}]|[\\x{037F}-\\x{1FFF}]|"
"[\\x{200C}-\\x{200D}]|[\\x{2070}-\\x{218F}]|[\\x{2C00}-\\x{2FEF}]|"
"[\\x{3001}-\\x{D7FF}]|[\\x{F900}-\\x{FDCF}]|[\\x{FDF0}-\\x{FFFD}]|"
"[\\x{10000}-\\x{EFFFF}]";
const std::string SparqlLexer::WS = R"((\x20|\x09|\x0D|\x0A))";
const std::string SparqlLexer::ECHAR = R"(\\[tbnrf\\"'])";
const std::string SparqlLexer::INTEGER = "(-?[0-9]+)";
const std::string SparqlLexer::FLOAT = "(-?[0-9]+\\.[0-9]+)";
const std::string WS = R"((\x20|\x09|\x0D|\x0A))";
const std::string ECHAR = R"(\\[tbnrf\\"'])";
const std::string INTEGER = "(-?[0-9]+)";
const std::string FLOAT = "(-?[0-9]+\\.[0-9]+)";

const std::string SparqlLexer::PN_CHARS_U = PN_CHARS_BASE + "|_";
const std::string SparqlLexer::PN_CHARS =
const std::string PN_CHARS_U = PN_CHARS_BASE + "|_";
const std::string PN_CHARS =
PN_CHARS_U +
"|-|[0-9]|\\x{00B7}|[\\x{0300}-\\x{036F}]|[\\x{203F}-\\x{2040}]";
const std::string SparqlLexer::PN_PREFIX =
const std::string PN_PREFIX =
"(" + PN_CHARS_BASE + ")((" + PN_CHARS + "|\\.)*(" + PN_CHARS + "))?";
const std::string SparqlLexer::PLX =
const std::string PLX =
"(%[0-9a-fA-F][0-9a-fA-F])|(\\\\(_|~|\\.|-|!|$|&|'|\\(|\\)|\\*|\\+|,|;|=|/"
"|\\?|#|@|%))";
const std::string SparqlLexer::PN_LOCAL =
"(" + PN_CHARS_U + "|:|[0-9]|" + PLX + ")((" + PN_CHARS + "|\\.|:|" + PLX +
")*(" + PN_CHARS + "|:|" + PLX + "))?";
const std::string PN_LOCAL = "(" + PN_CHARS_U + "|:|[0-9]|" + PLX + ")((" +
PN_CHARS + "|\\.|:|" + PLX + ")*(" + PN_CHARS +
"|:|" + PLX + "))?";

const std::string SparqlLexer::PNAME_NS = "(" + PN_PREFIX + ")?:";
const std::string SparqlLexer::PNAME_LN =
"(" + PNAME_NS + ")(" + PN_LOCAL + ")";
const std::string PNAME_NS = "(" + PN_PREFIX + ")?:";
const std::string PNAME_LN = "(" + PNAME_NS + ")(" + PN_LOCAL + ")";

const std::string SparqlLexer::IRI = "((" + LANGTAG + "@)?((" + IRIREF + ")|(" +
PNAME_LN + ")|(" + PNAME_NS + ")))";
const std::string SparqlLexer::VARNAME =
const std::string IRI = "((" + LANGTAG + "@)?((" + IRIREF + ")|(" + PNAME_LN +
")|(" + PNAME_NS + ")))";
const std::string VARNAME =
"(" + PN_CHARS_U + "|[0-9])(" + PN_CHARS_U +
"|[0-9]|\\x{00B7}|[\\x{0300}-\\x{036F}]|[\\x{203F}-\\x{2040}])*";
const std::string SparqlLexer::GROUP_BY = "(?i)(GROUP(\\s)*BY)";
const std::string SparqlLexer::ORDER_BY = "(?i)(ORDER(\\s)*BY)";
const std::string SparqlLexer::KEYWORD =
const std::string GROUP_BY = "(?i)(GROUP(\\s)*BY)";
const std::string ORDER_BY = "(?i)(ORDER(\\s)*BY)";
const std::string KEYWORD =
"(?i)(TEXTLIMIT|PREFIX|SELECT|DISTINCT|REDUCED|"
"HAVING|WHERE|ASC|AS|LIMIT|OFFSET|DESC|FILTER|VALUES|"
"OPTIONAL|UNION|LANGMATCHES|LANG|TEXT|SCORE|REGEX|PREFIX|SEPARATOR|STR|"
"BIND|MINUS)";
const std::string SparqlLexer::AGGREGATE =
"(?i)(SAMPLE|COUNT|MIN|MAX|AVG|SUM|GROUP_CONCAT)";
const std::string SparqlLexer::VARIABLE = "(\\?" + VARNAME + ")";
const std::string SparqlLexer::SYMBOL =
"([\\.\\{\\}\\(\\)\\=\\*,;:<>!\\|/\\^\\?\\*\\+-])";

const std::string SparqlLexer::STRING_LITERAL =
"(('([^\\x27\\x5C\\x0A\\x0D]|(" + ECHAR +
"))*')|"
"(\"([^\\x22\\x5C\\x0A\\x0D]|(" +
ECHAR + "))*\"))";
const std::string SparqlLexer::RDFLITERAL =
const std::string AGGREGATE = "(?i)(SAMPLE|COUNT|MIN|MAX|AVG|SUM|GROUP_CONCAT)";
const std::string VARIABLE = "(\\?" + VARNAME + ")";
const std::string SYMBOL = "([\\.\\{\\}\\(\\)\\=\\*,;:<>!\\|/\\^\\?\\*\\+-])";

const std::string STRING_LITERAL = "(('([^\\x27\\x5C\\x0A\\x0D]|(" + ECHAR +
"))*')|"
"(\"([^\\x22\\x5C\\x0A\\x0D]|(" +
ECHAR + "))*\"))";
const std::string RDFLITERAL =
STRING_LITERAL + "((" + LANGTAG + ")|(\\^\\^" + IRI + "))?";

const std::string SparqlLexer::LOGICAL_OR = "(\\|\\|)";

const re2::RE2 SparqlLexer::RE_IRI = re2::RE2(IRI);
const re2::RE2 SparqlLexer::RE_WS = re2::RE2("(" + WS + "+)");
const re2::RE2 SparqlLexer::RE_KEYWORD = re2::RE2(KEYWORD);
const re2::RE2 SparqlLexer::RE_GROUP_BY = re2::RE2(GROUP_BY);
const re2::RE2 SparqlLexer::RE_ORDER_BY = re2::RE2(ORDER_BY);
const re2::RE2 SparqlLexer::RE_VARIABLE = re2::RE2(VARIABLE);
const re2::RE2 SparqlLexer::RE_SYMBOL = re2::RE2(SYMBOL);
const re2::RE2 SparqlLexer::RE_AGGREGATE = re2::RE2(AGGREGATE);
const re2::RE2 SparqlLexer::RE_RDFLITERAL = re2::RE2("(" + RDFLITERAL + ")");
const re2::RE2 SparqlLexer::RE_INTEGER = re2::RE2(INTEGER);
const re2::RE2 SparqlLexer::RE_FLOAT = re2::RE2(FLOAT);
const re2::RE2 SparqlLexer::RE_LOGICAL_OR = re2::RE2(LOGICAL_OR);
const std::string LOGICAL_OR = "(\\|\\|)";

const SparqlLexer::RegexTokenMap& SparqlLexer::getRegexTokenMap() {
using T = SparqlToken::Type;
static const RegexTokenMap regexTokenMap = [=]() {
RegexTokenMap m;
auto emplace = [&m](const std::string& regex, T type) {
m.push_back(std::make_pair(std::make_unique<re2::RE2>(regex), type));
};
emplace(KEYWORD, T::KEYWORD);
emplace(GROUP_BY, T::GROUP_BY);
emplace(ORDER_BY, T::ORDER_BY);
emplace(AGGREGATE, T::AGGREGATE);
emplace(LOGICAL_OR, T::LOGICAL_OR);
emplace(VARIABLE, T::VARIABLE);
emplace(IRI, T::IRI);
emplace("(" + RDFLITERAL + ")", T::RDFLITERAL);
emplace(FLOAT, T::FLOAT);
emplace(INTEGER, T::INTEGER);
emplace(SYMBOL, T::SYMBOL);
emplace("(" + WS + "+)", T::WS);
return m;
}();
return regexTokenMap;
}

SparqlLexer::SparqlLexer(const std::string& sparql)
: _sparql(sparql), _re_string(_sparql) {
Expand All @@ -96,54 +103,42 @@ void SparqlLexer::readNext() {
_next.type = SparqlToken::Type::WS;
std::string raw;
// Return the first token type matched.
static const ad_utility::HashSet<SparqlToken::Type>
tokensThatRequireLowercasing = {
SparqlToken::Type::KEYWORD, SparqlToken::Type::GROUP_BY,
SparqlToken::Type::ORDER_BY, SparqlToken::Type::AGGREGATE};
while (_next.type == SparqlToken::Type::WS && !empty()) {
_next.pos = _sparql.size() - _re_string.size();
if (re2::RE2::Consume(&_re_string, RE_KEYWORD, &raw)) {
_next.type = SparqlToken::Type::KEYWORD;
raw = ad_utility::getLowercaseUtf8(raw);
} else if (re2::RE2::Consume(&_re_string, RE_GROUP_BY, &raw)) {
_next.type = SparqlToken::Type::GROUP_BY;
raw = ad_utility::getLowercaseUtf8(raw);
} else if (re2::RE2::Consume(&_re_string, RE_ORDER_BY, &raw)) {
_next.type = SparqlToken::Type::ORDER_BY;
raw = ad_utility::getLowercaseUtf8(raw);
} else if (re2::RE2::Consume(&_re_string, RE_AGGREGATE, &raw)) {
_next.type = SparqlToken::Type::AGGREGATE;
raw = ad_utility::getLowercaseUtf8(raw);
} else if (re2::RE2::Consume(&_re_string, RE_KEYWORD, &raw)) {
_next.type = SparqlToken::Type::KEYWORD;
raw = ad_utility::getLowercaseUtf8(raw);
} else if (re2::RE2::Consume(&_re_string, RE_LOGICAL_OR, &raw)) {
_next.type = SparqlToken::Type::LOGICAL_OR;
} else if (re2::RE2::Consume(&_re_string, RE_VARIABLE, &raw)) {
_next.type = SparqlToken::Type::VARIABLE;
} else if (re2::RE2::Consume(&_re_string, RE_IRI, &raw)) {
_next.type = SparqlToken::Type::IRI;
} else if (re2::RE2::Consume(&_re_string, RE_RDFLITERAL, &raw)) {
_next.type = SparqlToken::Type::RDFLITERAL;
auto lastQuote = raw.rfind('"');
std::string_view quoted{raw.begin(), raw.begin() + lastQuote + 1};
std::string_view langtagOrDatatype{raw.begin() + lastQuote + 1,
raw.end()};
raw = RdfEscaping::normalizeRDFLiteral(quoted) + langtagOrDatatype;
// TODO<joka921, kramerfl> proper (un-)escaping of the RDFLITERAL type
// requires splitting it up into the different parts (string content + iri
// of the datatype) which require different escaping
} else if (re2::RE2::Consume(&_re_string, RE_FLOAT, &raw)) {
_next.type = SparqlToken::Type::FLOAT;
} else if (re2::RE2::Consume(&_re_string, RE_INTEGER, &raw)) {
_next.type = SparqlToken::Type::INTEGER;
} else if (re2::RE2::Consume(&_re_string, RE_SYMBOL, &raw)) {
_next.type = SparqlToken::Type::SYMBOL;
} else if (re2::RE2::Consume(&_re_string, RE_WS, &raw)) {
_next.type = SparqlToken::Type::WS;
} else if (_re_string[0] == '#') {
// Start of a comment. Consume everything up to the next newline.
while (!_re_string.empty() && _re_string[0] != '\n') {
_re_string.remove_prefix(1);
bool regexMatched = false;
for (const auto& [regexPtr, type] : getRegexTokenMap()) {
if (re2::RE2::Consume(&_re_string, *regexPtr, &raw)) {
regexMatched = true;
_next.type = type;
if (tokensThatRequireLowercasing.contains(type)) {
raw = ad_utility::getLowercaseUtf8(raw);
}
if (type == SparqlToken::Type::RDFLITERAL) {
// unescaping of RDFLiteral, only applied to the actual literal and
// not the datatype/langtag
auto lastQuote = raw.rfind('"');
std::string_view quoted{raw.begin(), raw.begin() + lastQuote + 1};
std::string_view langtagOrDatatype{raw.begin() + lastQuote + 1,
raw.end()};
raw = RdfEscaping::normalizeRDFLiteral(quoted) + langtagOrDatatype;
}
break; // we check the regexes in an order that ensures that stopping
// at the first match is indeed correct.
}
}
if (!regexMatched) {
if (_re_string[0] == '#') {
// Start of a comment. Consume everything up to the next newline.
while (!_re_string.empty() && _re_string[0] != '\n') {
_re_string.remove_prefix(1);
}
} else {
throw ParseException("Unexpected input: " + _re_string.as_string());
}
} else {
throw ParseException("Unexpected input: " + _re_string.as_string());
}
}
_next.raw = raw;
Expand Down
47 changes: 9 additions & 38 deletions src/parser/SparqlLexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <re2/re2.h>

#include <iostream>
#include <memory>
#include <string>

struct SparqlToken {
Expand Down Expand Up @@ -35,46 +36,16 @@ struct SparqlToken {
};

class SparqlLexer {
public:
private:
// The rules for the lexer
static const std::string IRIREF;
static const std::string IRI;
static const std::string PN_CHARS_BASE;
static const std::string PN_CHARS_U;
static const std::string PN_CHARS;
static const std::string PN_PREFIX;
static const std::string PLX;
static const std::string PN_LOCAL;
static const std::string VARNAME;
static const std::string WS;
static const std::string GROUP_BY;
static const std::string ORDER_BY;
static const std::string KEYWORD;
static const std::string VARIABLE;
static const std::string SYMBOL;
static const std::string AGGREGATE;
static const std::string ECHAR;
static const std::string LANGTAG;
static const std::string STRING_LITERAL;
static const std::string RDFLITERAL;
static const std::string PNAME_NS;
static const std::string PNAME_LN;
static const std::string INTEGER;
static const std::string FLOAT;
static const std::string LOGICAL_OR;
using RegexTokenMap =
std::vector<std::pair<std::unique_ptr<re2::RE2>, SparqlToken::Type>>;

static const re2::RE2 RE_IRI;
static const re2::RE2 RE_WS;
static const re2::RE2 RE_GROUP_BY;
static const re2::RE2 RE_ORDER_BY;
static const re2::RE2 RE_KEYWORD;
static const re2::RE2 RE_VARIABLE;
static const re2::RE2 RE_SYMBOL;
static const re2::RE2 RE_AGGREGATE;
static const re2::RE2 RE_RDFLITERAL;
static const re2::RE2 RE_INTEGER;
static const re2::RE2 RE_FLOAT;
static const re2::RE2 RE_LOGICAL_OR;
// contains pairs of <regex, the corresponding token type>
// These regexes have to be checked in the correct order, because
// this lexer currently does not perform longest matches. For this
// reason the result is a vector with the correct order.
static const RegexTokenMap& getRegexTokenMap();

public:
SparqlLexer(const std::string& sparql);
Expand Down