Skip to content

Commit

Permalink
Correct handling of escaped literals and IRIREFs during index build.
Browse files Browse the repository at this point in the history
- also apply the normalization of literals correctly during index build time
- Adapt the Index unit tests to "legal" knowledge bases
- Disable the CTRE parser for now, since it becomes awfully slow with the PnameNS and PnLocal changes for some reason.
- TODO: Maybe we want to re-enable the CTRE Parser with the old "wrong"
  behavior as a very fast way to parse Wikidata
  • Loading branch information
joka921 committed Apr 13, 2020
1 parent b460307 commit b8aabfd
Show file tree
Hide file tree
Showing 13 changed files with 280 additions and 180 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000,
"ascii-prefixes-only":true
"ascii-prefixes-only":false
}
3 changes: 3 additions & 0 deletions src/TurtleParserMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ void writeNTDispatch(std::ostream& out, const string& fileFormat,
writeNT<Tokenizer>(out, fileFormat, filename);
} else if (regexEngine == "ctre") {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
throw std::runtime_error(
"The ctre engine is currently disabled due to serious performance "
"problems");
writeNT<TokenizerCtre>(out, fileFormat, filename);
} else {
LOG(ERROR)
Expand Down
4 changes: 3 additions & 1 deletion src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"regex engine for Tokenization. This means "
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";
"makes parsing faster and works e.g. for wikidata dumps\n"
"ALSO CURRENTLY THIS SETTING IS BROKEN (serious performance issues) AND "
"THUS FORBIDDEN";

static const std::string LOCALE_DEFAULT_LANG = "en";
static const std::string LOCALE_DEFAULT_COUNTRY = "US";
Expand Down
14 changes: 13 additions & 1 deletion src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
std::array<ItemMapManager, NUM_PARALLEL_ITEM_MAPS> itemArray;

{
auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
auto p = ad_pipeline::setupParallelPipeline<1, 1, NUM_PARALLEL_ITEM_MAPS>(
_parserBatchSize,
// when called, returns an optional to the next triple. If
// <linexPerPartial> triples were parsed, return std::nullopt. when
Expand All @@ -169,6 +169,15 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
// as a first step in the parallel Pipeline.
ParserBatcher(parser, linesPerPartial,
[&]() { parserExhausted = true; }),
// do all the unescaping from Sparql (ToDo<joka921>:: move this into
// its own pipeline within the parser
[this](Triple&& t) {
Triple res;
std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
return TurtleToken::normalizeRDFLiteral(s);
});
return res;
},
// convert each triple to the internal representation (e.g. special
// values for Numbers, externalized literals, etc.)
[this](Triple&& t) {
Expand Down Expand Up @@ -1504,6 +1513,9 @@ void Index::initializeVocabularySettingsBuild() {
if (v) {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
_onlyAsciiTurtlePrefixes = true;
throw std::runtime_error(
"the ascii-prefixes-only setting is forbidden due to performance "
"problems at the moment");
} else {
_onlyAsciiTurtlePrefixes = false;
}
Expand Down
2 changes: 2 additions & 0 deletions src/parser/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ const RE2& Tokenizer::idToRegex(const TokId reg) {
return _tokens.PnameNS;
case TokId::PnameLN:
return _tokens.PnameLN;
case TokId::PnLocal:
return _tokens.PnLocal;
case TokId::BlankNodeLabel:
return _tokens.BlankNodeLabel;
case TokId::WsMultiple:
Expand Down
82 changes: 65 additions & 17 deletions src/parser/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
#include <ctre/ctre.h>
#include <gtest/gtest.h>
#include <re2/re2.h>
#include <unicode/ustream.h>
#include <regex>
#include "../util/Exception.h"
#include "../util/Log.h"

using re2::RE2;
using namespace std::string_literals;

Expand Down Expand Up @@ -93,6 +93,7 @@ enum class TokId : int {
Iriref,
PnameNS,
PnameLN,
PnLocal,
BlankNodeLabel,
WsMultiple,
Anon,
Expand Down Expand Up @@ -215,6 +216,8 @@ struct TurtleTokenCtre {
grp(cls(PnCharsUString + u8":0-9") + "|" + PlxString) +
grp(u8"\\.*" + grp(TmpNoDot)) + "*";

static constexpr fixed_string PnLocal = grp(PnLocalString);

static constexpr fixed_string PnameLNString =
grp(PnameNSString) + grp(PnLocalString);

Expand Down Expand Up @@ -246,6 +249,16 @@ struct TurtleTokenCtre {
* at runtime
*/
struct TurtleToken {
/// turn a number of hex-chars like '00e4' into utf-8
static std::string unescapeUchar(std::string_view hex) {
UChar32 x;
std::stringstream sstream;
sstream << std::hex << hex;
sstream >> x;
std::string res;
icu::UnicodeString(x).toUTF8String(res);
return res;
}
/**
* @brief convert a RDF Literal to a unified form that is used inside QLever
*
Expand All @@ -266,23 +279,42 @@ struct TurtleToken {
* @param literal
* @return
*/
static std::string normalizeRDFLiteral(std::string_view literal) {
std::string res = "\"";
auto lastQuot = literal.find_last_of("\"\'");
AD_CHECK(lastQuot != std::string_view::npos);
auto langtagOrDatatype = literal.substr(lastQuot + 1);
literal.remove_suffix(literal.size() - lastQuot - 1);
if (ad_utility::startsWith(literal, "\"\"\"") ||
ad_utility::startsWith(literal, "'''")) {
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
literal.remove_prefix(3);
literal.remove_suffix(3);
} else {
AD_CHECK(ad_utility::startsWith(literal, "\"") ||
ad_utility::startsWith(literal, "'"));
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
static std::string normalizeRDFLiteral(std::string_view origLiteral) {
auto literal = origLiteral;
std::string res;
char endDelimiter = '\0';
std::string_view langtagOrDatatype;
if (ad_utility::startsWith(literal, "<")) {
// this must be an <iriref>
if (!ad_utility::endsWith(literal, ">")) {
throw std::runtime_error("Error: Rdf Triple element "s + origLiteral +
"could not be normalized properly"s);
}
res = "<";
endDelimiter = '>';
literal.remove_prefix(1);
literal.remove_suffix(1);
} else {
res = "\"";
endDelimiter = '\"';
auto lastQuot = literal.find_last_of("\"\'");
if (lastQuot != std::string_view::npos) {
langtagOrDatatype = literal.substr(lastQuot + 1);
literal.remove_suffix(literal.size() - lastQuot - 1);
} else {
}
if (ad_utility::startsWith(literal, "\"\"\"") ||
ad_utility::startsWith(literal, "'''")) {
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
literal.remove_prefix(3);
literal.remove_suffix(3);
} else {
AD_CHECK(ad_utility::startsWith(literal, "\"") ||
ad_utility::startsWith(literal, "'"));
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
literal.remove_prefix(1);
literal.remove_suffix(1);
}
}
auto pos = literal.find('\\');
while (pos != literal.npos) {
Expand Down Expand Up @@ -313,6 +345,20 @@ struct TurtleToken {
case '\\':
res.push_back('\\');
break;
case 'u': {
AD_CHECK(pos + 5 <= literal.size());
auto unesc = unescapeUchar(literal.substr(pos + 2, 4));
res.insert(res.end(), unesc.begin(), unesc.end());
literal.remove_prefix(4);
break;
}
case 'U': {
AD_CHECK(pos + 9 <= literal.size());
auto unesc = unescapeUchar(literal.substr(pos + 2, 8));
res.insert(res.end(), unesc.begin(), unesc.end());
literal.remove_prefix(8);
break;
}

default:
throw std::runtime_error(
Expand All @@ -323,7 +369,7 @@ struct TurtleToken {
pos = literal.find('\\');
}
res.append(literal);
res.push_back('"');
res.push_back(endDelimiter);
res.append(langtagOrDatatype);
return res;
}
Expand Down Expand Up @@ -703,6 +749,8 @@ class TokenizerCtre {
return F::template process<TurtleTokenCtre::PnameNS>(_data);
} else if constexpr (id == TokId::PnameLN) {
return F::template process<TurtleTokenCtre::PnameLN>(_data);
} else if constexpr (id == TokId::PnLocal) {
return F::template process<TurtleTokenCtre::PnLocal>(_data);
} else if constexpr (id == TokId::BlankNodeLabel) {
return F::template process<TurtleTokenCtre::BlankNodeLabel>(_data);
} else {
Expand Down
9 changes: 5 additions & 4 deletions src/parser/TurtleParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ bool TurtleParser<T>::prefixID() {
_lastParseResult.substr(1, _lastParseResult.size() - 2);
return true;
} else {
throw raise("prefixID");
raise("prefixID");
}
} else {
return false;
Expand Down Expand Up @@ -318,16 +318,17 @@ bool TurtleParser<T>::iri() {
// _____________________________________________________________________
template <class T>
bool TurtleParser<T>::prefixedName() {
if (!parseTerminal(tokens().PnameNS)) {
if (!parseTerminal<TokId::PnameNS>()) {
return false;
} else {
// this also includes a ":" which we do not need, hence the "-1"
_activePrefix = _lastParseResult.substr(0, _lastParseResult.size() - 1);
_lastParseResult = "";
}
_lastParseResult.clear();
parseTerminal<false>(tokens().PnLocal);
parseTerminal<TokId::PnLocal, false>();
_lastParseResult = '<' + expandPrefix(_activePrefix) + _lastParseResult + '>';
LOG(INFO) << "Parsed a prefixed name\n";
return true;
}

Expand Down Expand Up @@ -591,7 +592,7 @@ bool TurtleStreamParser<T>::getLine(std::array<string, 3>* triple) {
throw ex;

} else {
raise(
this->raise(
"Too many bytes parsed without finishing a turtle "
"statement");
}
Expand Down
2 changes: 1 addition & 1 deletion src/parser/TurtleParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class TurtleStringParser : public TurtleParser<Tokenizer_T> {

// _____________________________________________________________
size_t getParsePosition() const override {
return _tmpToParse.size() - _tok.data().size();
return _tmpToParse.size() - this->_tok.data().size();
}

void initialize(const string&) override {
Expand Down
6 changes: 3 additions & 3 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
add_executable(SparqlParserTest SparqlParserTest.cpp)
add_test(SparqlParserTest SparqlParserTest)
target_link_libraries(SparqlParserTest gtest_main parser ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(SparqlParserTest gtest_main parser ${CMAKE_THREAD_LIBS_INIT} ${ICU_LIBRARIES})

add_executable(StringUtilsTest StringUtilsTest.cpp)
add_test(StringUtilsTest StringUtilsTest)
Expand Down Expand Up @@ -96,7 +96,7 @@ target_link_libraries(UnionTest gtest_main engine ${CMAKE_THREAD_LIBS_INIT})

add_executable(TokenTest TokenTest.cpp)
add_test(TokenTest TokenTest)
target_link_libraries(TokenTest parser re2 gtest_main -pthread)
target_link_libraries(TokenTest parser re2 gtest_main -pthread ${ICU_LIBRARIES})

add_executable(TurtleParserTest TurtleParserTest.cpp)
add_test(TurtleParserTest TurtleParserTest)
Expand All @@ -116,7 +116,7 @@ target_link_libraries(TransitivePathTest engine gtest_main ${CMAKE_THREAD_LIBS_I

add_executable(SparqlLexerTest SparqlLexerTest.cpp)
add_test(SparqlLexerTest SparqlLexerTest)
target_link_libraries(SparqlLexerTest parser gtest_main ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(SparqlLexerTest parser gtest_main ${CMAKE_THREAD_LIBS_INIT} ${ICU_LIBRARIES})

add_executable(Utf8RegexTest Utf8RegexTest.cpp)
add_test(Utf8RegexTest Utf8RegexTest)
Expand Down

0 comments on commit b8aabfd

Please sign in to comment.