Correct Handling of Escape Sequences in the TurtleParser
- SPARQL also allows escape sequences in PrefixedNames such as rdfs:l\,abel.
  These were previously unsupported; this is now fixed.

- We now also transform escape sequences in SPARQL literals to their
  correct form during the index build (see the sketch after this list).

- Several changes to the Index class unit tests were necessary, because
  they used knowledge-base elements like a instead of <a>, which is no
  longer supported by any of the parsers.

- Disable the CTRE parser for now, since it becomes awfully slow with the
  fixes for the prefixed names. TODO: Maybe we want to reimplement the
  old and wrong behavior and make CTRE a general "WikidataUnsafe" parser.

- Get rid of a misleading warning in case of whitespace at the end of a TTL
  file. Previously a "parsing of ttl has failed, but there is still content
  left" warning was issued, although the remainder of the TTL input was only
  whitespace.

- Made the ad_utility::hash_set also use absl. With this we have completely
  removed the dependency on google::sparsehash and migrated to absl.
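A minimal sketch of the intended effect of the first two points (the main()
wrapper and the include path are assumptions; TurtleToken::normalizeRDFLiteral
is the function rewritten in src/parser/Tokenizer.h below):

#include <iostream>
#include "parser/Tokenizer.h"  // assumed include path for TurtleToken

int main() {
  // Numeric escapes are decoded to UTF-8 during the index build, and long
  // literals collapse to the canonical quoted form:
  //   """H\u00E4user"""@de  ->  "Häuser"@de
  std::cout << TurtleToken::normalizeRDFLiteral(R"("""H\u00E4user"""@de)")
            << std::endl;
  // IRI references pass through with their angle brackets intact:
  std::cout << TurtleToken::normalizeRDFLiteral("<http://example.org/x>")
            << std::endl;
}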
joka921 committed Apr 13, 2020
1 parent 0c0f5de commit 58106bc
Showing 15 changed files with 409 additions and 260 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
@@ -1,5 +1,5 @@
 {
 "num-triples-per-partial-vocab" : 40000,
 "parser-batch-size" : 1000,
-"ascii-prefixes-only":true
+"ascii-prefixes-only":false
 }
3 changes: 3 additions & 0 deletions src/TurtleParserMain.cpp
@@ -69,6 +69,9 @@ void writeNTDispatch(std::ostream& out, const string& fileFormat,
     writeNT<Tokenizer>(out, fileFormat, filename);
   } else if (regexEngine == "ctre") {
     LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
+    throw std::runtime_error(
+        "The ctre engine is currently disabled due to serious performance "
+        "problems");
     writeNT<TokenizerCtre>(out, fileFormat, filename);
   } else {
     LOG(ERROR)
4 changes: 3 additions & 1 deletion src/global/Constants.h
@@ -79,7 +79,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"regex engine for Tokenization. This means "
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";
"makes parsing faster and works e.g. for wikidata dumps\n"
"ALSO CURRENTLY THIS SETTING IS BROKEN (serious performance issues) AND "
"THUS FORBIDDEN";

static const std::string LOCALE_DEFAULT_LANG = "en";
static const std::string LOCALE_DEFAULT_COUNTRY = "US";
14 changes: 13 additions & 1 deletion src/index/Index.cpp
@@ -160,7 +160,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
   std::array<ItemMapManager, NUM_PARALLEL_ITEM_MAPS> itemArray;

   {
-    auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
+    auto p = ad_pipeline::setupParallelPipeline<1, 1, NUM_PARALLEL_ITEM_MAPS>(
         _parserBatchSize,
         // when called, returns an optional to the next triple. If
         // <linexPerPartial> triples were parsed, return std::nullopt. when
@@ -169,6 +169,15 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
         // as a first step in the parallel Pipeline.
         ParserBatcher(parser, linesPerPartial,
                       [&]() { parserExhausted = true; }),
+        // do all the unescaping from Sparql (ToDo<joka921>:: move this into
+        // its own pipeline within the parser
+        [this](Triple&& t) {
+          Triple res;
+          std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
+            return TurtleToken::normalizeRDFLiteral(s);
+          });
+          return res;
+        },
         // convert each triple to the internal representation (e.g. special
         // values for Numbers, externalized literals, etc.)
         [this](Triple&& t) {
@@ -1504,6 +1513,9 @@ void Index::initializeVocabularySettingsBuild() {
   if (v) {
     LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
     _onlyAsciiTurtlePrefixes = true;
+    throw std::runtime_error(
+        "the ascii-prefixes-only setting is forbidden due to performance "
+        "problems at the moment");
   } else {
     _onlyAsciiTurtlePrefixes = false;
   }
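For context, the added second template argument in setupParallelPipeline<1, 1,
NUM_PARALLEL_ITEM_MAPS> presumably gives the newly inserted unescaping stage
its own degree of parallelism (a single thread). Stripped of the pipeline
machinery, that stage amounts to the following sketch (assuming Triple is an
std::array<std::string, 3>; the in-place transform is a simplification of the
res copy above):

#include <algorithm>
#include <array>
#include <string>
#include "parser/Tokenizer.h"  // assumed include path for TurtleToken

using Triple = std::array<std::string, 3>;  // assumption about QLever's Triple

// Normalize all escape sequences in subject, predicate and object of one
// parsed triple before it is passed on to the vocabulary build.
Triple unescapeTriple(Triple t) {
  std::transform(t.begin(), t.end(), t.begin(), [](const std::string& s) {
    return TurtleToken::normalizeRDFLiteral(s);
  });
  return t;
}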
2 changes: 2 additions & 0 deletions src/parser/Tokenizer.cpp
@@ -98,6 +98,8 @@ const RE2& Tokenizer::idToRegex(const TokId reg) {
       return _tokens.PnameNS;
     case TokId::PnameLN:
       return _tokens.PnameLN;
+    case TokId::PnLocal:
+      return _tokens.PnLocal;
     case TokId::BlankNodeLabel:
       return _tokens.BlankNodeLabel;
     case TokId::WsMultiple:
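The new TokId::PnLocal case exposes the PN_LOCAL rule as a regex of its own,
so the local part of a PrefixedName can be matched separately. A hedged RE2
sketch with a toy pattern (the real pattern is built from PnLocalString in
Tokenizer.h and is far more permissive):

#include <re2/re2.h>
#include <iostream>

int main() {
  // Toy stand-in for PnLocal: letters and digits plus escaped punctuation
  // such as "\," (SPARQL's PN_LOCAL_ESC production).
  RE2 pnLocal(R"((?:[A-Za-z0-9]|\\[,;!.])+)");
  // The local part of rdfs:l\,abel from the commit message now matches:
  std::cout << RE2::FullMatch(R"(l\,abel)", pnLocal) << std::endl;  // 1
}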
91 changes: 70 additions & 21 deletions src/parser/Tokenizer.h
@@ -7,10 +7,10 @@
 #include <ctre/ctre.h>
 #include <gtest/gtest.h>
 #include <re2/re2.h>
+#include <unicode/ustream.h>
 #include <regex>
 #include "../util/Exception.h"
 #include "../util/Log.h"

 using re2::RE2;
 using namespace std::string_literals;

@@ -93,6 +93,7 @@ enum class TokId : int {
   Iriref,
   PnameNS,
   PnameLN,
+  PnLocal,
   BlankNodeLabel,
   WsMultiple,
   Anon,
@@ -215,6 +216,8 @@ struct TurtleTokenCtre {
       grp(cls(PnCharsUString + u8":0-9") + "|" + PlxString) +
       grp(u8"\\.*" + grp(TmpNoDot)) + "*";

+  static constexpr fixed_string PnLocal = grp(PnLocalString);
+
   static constexpr fixed_string PnameLNString =
       grp(PnameNSString) + grp(PnLocalString);

@@ -246,6 +249,16 @@
  * at runtime
  */
 struct TurtleToken {
+  /// turn a number of hex-chars like '00e4' into utf-8
+  static std::string unescapeUchar(std::string_view hex) {
+    UChar32 x;
+    std::stringstream sstream;
+    sstream << std::hex << hex;
+    sstream >> x;
+    std::string res;
+    icu::UnicodeString(x).toUTF8String(res);
+    return res;
+  }
   /**
    * @brief convert a RDF Literal to a unified form that is used inside QLever
    *
@@ -266,23 +279,42 @@
    * @param literal
    * @return
    */
-  static std::string normalizeRDFLiteral(std::string_view literal) {
-    std::string res = "\"";
-    auto lastQuot = literal.find_last_of("\"\'");
-    AD_CHECK(lastQuot != std::string_view::npos);
-    auto langtagOrDatatype = literal.substr(lastQuot + 1);
-    literal.remove_suffix(literal.size() - lastQuot - 1);
-    if (ad_utility::startsWith(literal, "\"\"\"") ||
-        ad_utility::startsWith(literal, "'''")) {
-      AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
-      literal.remove_prefix(3);
-      literal.remove_suffix(3);
-    } else {
-      AD_CHECK(ad_utility::startsWith(literal, "\"") ||
-               ad_utility::startsWith(literal, "'"));
-      AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
+  static std::string normalizeRDFLiteral(std::string_view origLiteral) {
+    auto literal = origLiteral;
+    std::string res;
+    char endDelimiter = '\0';
+    std::string_view langtagOrDatatype;
+    if (ad_utility::startsWith(literal, "<")) {
+      // this must be an <iriref>
+      if (!ad_utility::endsWith(literal, ">")) {
+        throw std::runtime_error("Error: Rdf Triple element "s + origLiteral +
+                                 "could not be normalized properly"s);
+      }
+      res = "<";
+      endDelimiter = '>';
+      literal.remove_prefix(1);
+      literal.remove_suffix(1);
+    } else {
+      res = "\"";
+      endDelimiter = '\"';
+      auto lastQuot = literal.find_last_of("\"\'");
+      if (lastQuot != std::string_view::npos) {
+        langtagOrDatatype = literal.substr(lastQuot + 1);
+        literal.remove_suffix(literal.size() - lastQuot - 1);
+      } else {
+      }
+      if (ad_utility::startsWith(literal, "\"\"\"") ||
+          ad_utility::startsWith(literal, "'''")) {
+        AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
+        literal.remove_prefix(3);
+        literal.remove_suffix(3);
+      } else {
+        AD_CHECK(ad_utility::startsWith(literal, "\"") ||
+                 ad_utility::startsWith(literal, "'"));
+        AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
       literal.remove_prefix(1);
       literal.remove_suffix(1);
     }
+    }
     auto pos = literal.find('\\');
     while (pos != literal.npos) {
@@ -313,6 +345,20 @@ struct TurtleToken {
         case '\\':
           res.push_back('\\');
           break;
+        case 'u': {
+          AD_CHECK(pos + 5 <= literal.size());
+          auto unesc = unescapeUchar(literal.substr(pos + 2, 4));
+          res.insert(res.end(), unesc.begin(), unesc.end());
+          literal.remove_prefix(4);
+          break;
+        }
+        case 'U': {
+          AD_CHECK(pos + 9 <= literal.size());
+          auto unesc = unescapeUchar(literal.substr(pos + 2, 8));
+          res.insert(res.end(), unesc.begin(), unesc.end());
+          literal.remove_prefix(8);
+          break;
+        }

         default:
           throw std::runtime_error(
@@ -323,7 +369,7 @@
       pos = literal.find('\\');
     }
     res.append(literal);
-    res.push_back('"');
+    res.push_back(endDelimiter);
     res.append(langtagOrDatatype);
     return res;
   }
@@ -421,6 +467,7 @@ struct TurtleToken {
         Iriref(grp(IrirefString)),
         PnameNS(grp(PnameNSString)),
         PnameLN(grp(PnameLNString)),
+        PnLocal(grp(PnLocalString)),
         BlankNodeLabel(grp(BlankNodeLabelString)),

         WsMultiple(grp(WsMultipleString)),
@@ -542,6 +589,7 @@ struct TurtleToken {

   const string PnameLNString = grp(PnameNSString) + grp(PnLocalString);
   const RE2 PnameLN;
+  const RE2 PnLocal;

   const string BlankNodeLabelString = u8"_:" + cls(PnCharsUString + u8"0-9") +
                                       grp("\\.*" + cls(PnCharsString)) + "*";
@@ -701,6 +749,8 @@ class TokenizerCtre {
       return F::template process<TurtleTokenCtre::PnameNS>(_data);
     } else if constexpr (id == TokId::PnameLN) {
       return F::template process<TurtleTokenCtre::PnameLN>(_data);
+    } else if constexpr (id == TokId::PnLocal) {
+      return F::template process<TurtleTokenCtre::PnLocal>(_data);
     } else if constexpr (id == TokId::BlankNodeLabel) {
       return F::template process<TurtleTokenCtre::BlankNodeLabel>(_data);
     } else {
@@ -907,11 +957,10 @@ class Tokenizer {
   void skipWhitespace() {
     auto v = view();
     auto pos = v.find_first_not_of("\x20\x09\x0D\x0A");
-    if (pos != string::npos) {
-      _data.remove_prefix(pos);
+    if (pos == string::npos) {
+      pos = _data.size();
     }
-    // auto success = skip(_tokens.WsMultiple);
-    // assert(success);
+    _data.remove_prefix(pos);
     return;
   }

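The skipWhitespace change above is what removes the misleading warning
mentioned in the commit message: on whitespace-only input, find_first_not_of
returns npos, so the old code removed nothing and the parser reported leftover
content. A self-contained sketch of the fixed logic (a free function standing
in for the member, same algorithm):

#include <cassert>
#include <string_view>

void skipWhitespace(std::string_view& data) {
  auto pos = data.find_first_not_of("\x20\x09\x0D\x0A");
  if (pos == std::string_view::npos) {
    // Only whitespace is left; consume it all instead of leaving it behind.
    pos = data.size();
  }
  data.remove_prefix(pos);
}

int main() {
  std::string_view rest = " \t\r\n";
  skipWhitespace(rest);
  assert(rest.empty());  // previously `rest` would have stayed non-empty
}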
