Skip to content

Commit

Permalink
Began changing the parser to use the lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
floriankramer committed Jul 11, 2019
1 parent f256af1 commit e8e654b
Show file tree
Hide file tree
Showing 11 changed files with 923 additions and 1,112 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ set(USE_OPENMP OFF CACHE BOOL "Don't use OPENMP as default" FORCE)
add_subdirectory(third_party/stxxl)
# apply STXXL CXXFLAGS
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STXXL_CXX_FLAGS}")
include_directories(${STXXL_INCLUDE_DIRS})
include_directories(SYSTEM ${STXXL_INCLUDE_DIRS})

################################
# RE2
Expand All @@ -91,7 +91,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")

set(RE2_BUILD_TESTING OFF CACHE BOOL "enable testing for RE2" FORCE)
add_subdirectory(third_party/re2)
include_directories(third_party/re2)
include_directories(SYSTEM third_party/re2)

# reinstate original flags including all warnings
set(CMAKE_CXX_FLAGS "${LOCAL_CXX_BACKUP_FLAGS}")
Expand Down
4 changes: 2 additions & 2 deletions src/SparqlEngineMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ int main(int argc, char** argv) {
void processQuery(QueryExecutionContext& qec, const string& query) {
ad_utility::Timer t;
t.start();
SparqlParser sp;
ParsedQuery pq = sp.parse(query);
SparqlParser sp(query);
ParsedQuery pq = sp.parse();
pq.expandPrefixes();
QueryPlanner qp(&qec);
ad_utility::Timer timer;
Expand Down
9 changes: 5 additions & 4 deletions src/WriteIndexListsMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,12 @@ int main(int argc, char** argv) {
QueryExecutionContext qec(index, engine);
ParsedQuery q;
if (!freebase) {
q = SparqlParser::parse("SELECT ?x WHERE {?x <is-a> <Scientist>}");
q = SparqlParser("SELECT ?x WHERE {?x <is-a> <Scientist>}").parse();
} else {
q = SparqlParser::parse(
"PREFIX fb: <http://rdf.freebase.com/ns/> SELECT ?p WHERE {?p "
"fb:people.person.profession fb:m.06q2q}");
q = SparqlParser(
"PREFIX fb: <http://rdf.freebase.com/ns/> SELECT ?p WHERE {?p "
"fb:people.person.profession fb:m.06q2q}")
.parse();
q.expandPrefixes();
}
QueryPlanner queryPlanner(&qec);
Expand Down
2 changes: 1 addition & 1 deletion src/engine/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ void Server::process(Socket* client, QueryExecutionContext* qec) const {
#endif
query = createQueryFromHttpParams(params);
LOG(INFO) << "Query:\n" << query << '\n';
ParsedQuery pq = SparqlParser::parse(query);
ParsedQuery pq = SparqlParser(query).parse();
pq.expandPrefixes();

// QueryGraph qg(qec);
Expand Down
22 changes: 15 additions & 7 deletions src/parser/SparqlLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "ParseException.h"

// Human-readable names for SparqlToken::Type, indexed by the enumerator's
// integer value (operator<< looks a name up as TYPE_NAMES[(int)type]).
// There must be exactly one entry per enumerator, in declaration order:
// every enumerator, including INTEGER and FLOAT, needs a name here, otherwise
// printing such a token reads past the end of the array.
const std::string SparqlToken::TYPE_NAMES[] = {
    "IRIREF",       "WS",        "KEYWORD",    "VARIABLE", "SYMBOL",
    "PROPERTYPATH", "AGGREGATE", "RDFLITERAL", "INTEGER",  "FLOAT"};

const std::string SparqlLexer::IRIREF =
Expand All @@ -21,6 +21,8 @@ const std::string SparqlLexer::PN_CHARS_BASE =
// Regex source strings for simple lexer rules. They are plain strings so
// that other rules can be built from them by concatenation.

// A single whitespace character: space, tab, carriage return or line feed.
const std::string SparqlLexer::WS = "(\\x20|\\x09|\\x0D|\\x0A)";
// An escape sequence: a backslash followed by one of t b n r f " '.
const std::string SparqlLexer::ECHAR = "\\\\[tbnrf\"']";
// A language tag: '@', one or more letters, then any number of
// '-'-separated alphanumeric subtags.
const std::string SparqlLexer::LANGTAG = "@[a-zA-Z]+(-[a-zA-Z0-9]+)*";
// An optionally negative run of decimal digits.
const std::string SparqlLexer::INTEGER = "(-?[0-9]+)";
// An optionally negative decimal number with digits on both sides of the
// dot (no exponent form).
// NOTE(review): readNext() tries RE_INTEGER before RE_FLOAT, and
// RE2::Consume matches at the start of the remaining input, so for "1.5" the
// INTEGER rule would take "1" first. FLOAT probably has to be tried before
// INTEGER - confirm against readNext().
const std::string SparqlLexer::FLOAT = "(-?[0-9]+\\.[0-9]+)";

const std::string SparqlLexer::PN_CHARS_U = PN_CHARS_BASE + "|_";
const std::string SparqlLexer::PN_CHARS =
Expand All @@ -44,14 +46,14 @@ const std::string SparqlLexer::IRI =
// The name part of a variable, without the leading '?' (VARIABLE adds that).
// First character: one of the PN_CHARS_U alternatives or a digit. Following
// characters: any number of PN_CHARS_U alternatives, digits, U+00B7,
// U+0300-U+036F or U+203F-U+2040.
const std::string SparqlLexer::VARNAME =
"(" + PN_CHARS_U + "|[0-9])(" + PN_CHARS_U +
"|[0-9]|\\x{00B7}|[\\x{0300}-\\x{036F}]|[\\x{203F}-\\x{2040}])*";
// Case-insensitive ("(?i)") alternation of the reserved words that the lexer
// reports as KEYWORD tokens. Each word is listed exactly once. Words that
// have another keyword as a prefix come first (ASC before AS, LANGMATCHES
// before LANG) because the first matching alternative wins.
const std::string SparqlLexer::KEYWORD =
    "(?i)(PREFIX|SELECT|DISTINCT|REDUCED|"
    "HAVING|WHERE|ASC|AS|GROUP|BY|LIMIT|OFFSET|ORDER|DESC|FILTER|VALUES|"
    "OPTIONAL|UNION|LANGMATCHES|LANG|TEXT|SCORE|REGEX)";
// Case-insensitive ("(?i)") alternation of the aggregate function names.
const std::string SparqlLexer::AGGREGATE =
"(?i)(SAMPLE|COUNT|MIN|MAX|AVG|SUM|GROUP_CONCAT)";
// A variable: a literal '?' followed by a VARNAME, captured as one group.
const std::string SparqlLexer::VARIABLE = "(\\?" + VARNAME + ")";
const std::string SparqlLexer::SYMBOL = "([\\.\\{\\}\\(\\)\\=\\*,])";
const std::string SparqlLexer::SYMBOL = "([\\.\\{\\}\\(\\)\\=\\*,<>!])";
// A property path, captured as one group: at least one IRIREF, with any
// number of further IRIREFs or of the characters ? * + / | ( ) ^ and digits
// before and after it.
const std::string SparqlLexer::PPATH = "((" + IRIREF + "|[?*+/|()^0-9])*" +
IRIREF + "(" + IRIREF +
"|[?*+/|()^0-9])*)";
Expand All @@ -65,12 +67,14 @@ const std::string SparqlLexer::RDFLITERAL =

const re2::RE2 SparqlLexer::RE_IRI = re2::RE2(IRI);
const re2::RE2 SparqlLexer::RE_WS = re2::RE2("(" + WS + "+)");
const re2::RE2 SparqlLexer::RE_CONTROL = re2::RE2(CONTROL);
const re2::RE2 SparqlLexer::RE_KEYWORD = re2::RE2(KEYWORD);
const re2::RE2 SparqlLexer::RE_VARIABLE = re2::RE2(VARIABLE);
const re2::RE2 SparqlLexer::RE_SYMBOL = re2::RE2(SYMBOL);
const re2::RE2 SparqlLexer::RE_PPATH = re2::RE2(PPATH);
const re2::RE2 SparqlLexer::RE_AGGREGATE = re2::RE2(AGGREGATE);
const re2::RE2 SparqlLexer::RE_RDFLITERAL = re2::RE2("(" + RDFLITERAL + ")");
const re2::RE2 SparqlLexer::RE_INTEGER = re2::RE2(INTEGER);
const re2::RE2 SparqlLexer::RE_FLOAT = re2::RE2(FLOAT);

SparqlLexer::SparqlLexer(const std::string& sparql)
: _sparql(sparql), _re_string(_sparql) {
Expand All @@ -85,8 +89,8 @@ void SparqlLexer::readNext() {
std::string raw;
while (_next.type == SparqlToken::Type::WS && !empty()) {
_next.pos = _sparql.size() - _re_string.size();
if (re2::RE2::Consume(&_re_string, RE_CONTROL, &raw)) {
_next.type = SparqlToken::Type::CONTROL;
if (re2::RE2::Consume(&_re_string, RE_KEYWORD, &raw)) {
_next.type = SparqlToken::Type::KEYWORD;
raw = ad_utility::getLowercaseUtf8(raw);
} else if (re2::RE2::Consume(&_re_string, RE_AGGREGATE, &raw)) {
_next.type = SparqlToken::Type::AGGREGATE;
Expand All @@ -97,6 +101,10 @@ void SparqlLexer::readNext() {
_next.type = SparqlToken::Type::IRI;
} else if (re2::RE2::Consume(&_re_string, RE_RDFLITERAL, &raw)) {
_next.type = SparqlToken::Type::RDFLITERAL;
} else if (re2::RE2::Consume(&_re_string, RE_INTEGER, &raw)) {
_next.type = SparqlToken::Type::INTEGER;
} else if (re2::RE2::Consume(&_re_string, RE_FLOAT, &raw)) {
_next.type = SparqlToken::Type::FLOAT;
} else if (re2::RE2::Consume(&_re_string, RE_PPATH, &raw)) {
_next.type = SparqlToken::Type::PROPERTYPATH;
} else if (re2::RE2::Consume(&_re_string, RE_SYMBOL, &raw)) {
Expand Down
41 changes: 27 additions & 14 deletions src/parser/SparqlLexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,37 @@
// Author: Florian Kramer (florian.kramer@neptun.uni-freiburg.de)

#include <re2/re2.h>
#include <string>
#include <iostream>
#include <string>

// A single token produced by SparqlLexer.
struct SparqlToken {
  // The token categories. The enumerator order is also the index into
  // TYPE_NAMES, so the two must be kept in sync.
  enum class Type {
    IRI,
    WS,
    KEYWORD,
    VARIABLE,
    SYMBOL,
    PROPERTYPATH,
    AGGREGATE,
    RDFLITERAL,
    INTEGER,
    FLOAT
  };
  // Display names for Type, indexed by the enumerator value; defined in
  // SparqlLexer.cpp.
  static const std::string TYPE_NAMES[];

  // The matched text (the lexer lower-cases it for keywords).
  std::string raw;
  Type type = Type::IRI;
  // Offset into the query string at which matching of this token started.
  // Zero-initialised so a default-constructed token never carries an
  // indeterminate value.
  size_t pos = 0;

  // Prints the token as "<raw> : <type name>".
  friend std::ostream& operator<<(std::ostream& os, const SparqlToken& t) {
    os << t.raw << " : " << TYPE_NAMES[static_cast<int>(t.type)];
    return os;
  }
};

class SparqlLexer {
private:
// The rules for the lexer
private:
// The rules for the lexer
static const std::string IRIREF;
static const std::string IRI;
static const std::string PN_CHARS_BASE;
Expand All @@ -35,7 +44,7 @@ class SparqlLexer {
static const std::string PN_LOCAL;
static const std::string VARNAME;
static const std::string WS;
static const std::string CONTROL;
static const std::string KEYWORD;
static const std::string VARIABLE;
static const std::string SYMBOL;
static const std::string PPATH;
Expand All @@ -46,34 +55,38 @@ class SparqlLexer {
static const std::string RDFLITERAL;
static const std::string PNAME_NS;
static const std::string PNAME_LN;
static const std::string INTEGER;
static const std::string FLOAT;

static const re2::RE2 RE_IRI;
static const re2::RE2 RE_WS;
static const re2::RE2 RE_CONTROL;
static const re2::RE2 RE_KEYWORD;
static const re2::RE2 RE_VARIABLE;
static const re2::RE2 RE_SYMBOL;
static const re2::RE2 RE_PPATH;
static const re2::RE2 RE_AGGREGATE;
static const re2::RE2 RE_RDFLITERAL;
static const re2::RE2 RE_INTEGER;
static const re2::RE2 RE_FLOAT;

public:
SparqlLexer(const std::string &sparql);
public:
SparqlLexer(const std::string& sparql);

// True if the entire input stream was consumed
bool empty() const;

bool accept(SparqlToken::Type type);
bool accept(const std::string &raw, bool match_case = true);
bool accept(const std::string& raw, bool match_case = true);
// Accepts any token
void accept();

void expect(SparqlToken::Type type);
void expect(const std::string &raw, bool match_case = true);
void expect(const std::string& raw, bool match_case = true);
void expectEmpty();

const SparqlToken &current();
const SparqlToken& current();

private:
private:
void readNext();

const std::string _sparql;
Expand Down

0 comments on commit e8e654b

Please sign in to comment.