Correct Handling of Escape Sequences in the TurtleParser
- SPARQL also allows escape sequences in PrefixedNames such as rdfs:l\,abel.
  These were previously unsupported; this is now fixed.

- We now also transform escape sequences in SPARQL literals to their
  correct form during the index build (see the sketch after this list).

- Several changes to the Index class unit tests were necessary, because
  they used knowledge-base elements like a instead of <a>, which is no
  longer supported by any of the parsers.

- Disable the CTRE parser for now, since it becomes awfully slow with the
  fixes for the prefixed names. TODO: Maybe we want to reimplement the
  old and wrong behavior and make CTRE a general "WikidataUnsafe" parser.

- Get rid of a misleading warning in case of whitespace at the end of a TTL
  file. Previously a "parsing of ttl has failed, but there is still content
  left" warning was issued, although the remainder of the TTL input was only
  whitespace.

- Made the ad_utility::hash_set also use absl. With this we have completely
  removed the dependency on google::sparsehash and migrated to absl.
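A minimal sketch of the intended effect of the first two points (the main()
wrapper and the include path are assumptions; TurtleToken::normalizeRDFLiteral
is the function rewritten in src/parser/Tokenizer.h below):

#include <iostream>
#include "parser/Tokenizer.h"  // assumed include path for TurtleToken

int main() {
  // Numeric escapes are decoded to UTF-8 during the index build, and long
  // literals collapse to the canonical quoted form:
  //   """H\u00E4user"""@de  ->  "Häuser"@de
  std::cout << TurtleToken::normalizeRDFLiteral(R"("""H\u00E4user"""@de)")
            << std::endl;
  // IRI references pass through with their angle brackets intact:
  std::cout << TurtleToken::normalizeRDFLiteral("<http://example.org/x>")
            << std::endl;
}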
joka921 committed Apr 13, 2020
1 parent 0c0f5de commit 58106bc
Showing 15 changed files with 409 additions and 260 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
@@ -1,5 +1,5 @@
 {
 "num-triples-per-partial-vocab" : 40000,
 "parser-batch-size" : 1000,
-"ascii-prefixes-only":true
+"ascii-prefixes-only":false
 }
3 changes: 3 additions & 0 deletions src/TurtleParserMain.cpp
@@ -69,6 +69,9 @@ void writeNTDispatch(std::ostream& out, const string& fileFormat,
     writeNT<Tokenizer>(out, fileFormat, filename);
   } else if (regexEngine == "ctre") {
     LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
+    throw std::runtime_error(
+        "The ctre engine is currently disabled due to serious performance "
+        "problems");
     writeNT<TokenizerCtre>(out, fileFormat, filename);
   } else {
     LOG(ERROR)
4 changes: 3 additions & 1 deletion src/global/Constants.h
@@ -79,7 +79,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"regex engine for Tokenization. This means "
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";
"makes parsing faster and works e.g. for wikidata dumps\n"
"ALSO CURRENTLY THIS SETTING IS BROKEN (serious performance issues) AND "
"THUS FORBIDDEN";

static const std::string LOCALE_DEFAULT_LANG = "en";
static const std::string LOCALE_DEFAULT_COUNTRY = "US";
14 changes: 13 additions & 1 deletion src/index/Index.cpp
@@ -160,7 +160,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
   std::array<ItemMapManager, NUM_PARALLEL_ITEM_MAPS> itemArray;

   {
-    auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
+    auto p = ad_pipeline::setupParallelPipeline<1, 1, NUM_PARALLEL_ITEM_MAPS>(
         _parserBatchSize,
         // when called, returns an optional to the next triple. If
         // <linexPerPartial> triples were parsed, return std::nullopt. when
@@ -169,6 +169,15 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
         // as a first step in the parallel Pipeline.
         ParserBatcher(parser, linesPerPartial,
                       [&]() { parserExhausted = true; }),
+        // do all the unescaping from Sparql (ToDo<joka921>:: move this into
+        // its own pipeline within the parser
+        [this](Triple&& t) {
+          Triple res;
+          std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
+            return TurtleToken::normalizeRDFLiteral(s);
+          });
+          return res;
+        },
         // convert each triple to the internal representation (e.g. special
         // values for Numbers, externalized literals, etc.)
         [this](Triple&& t) {
@@ -1504,6 +1513,9 @@ void Index::initializeVocabularySettingsBuild() {
   if (v) {
     LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
     _onlyAsciiTurtlePrefixes = true;
+    throw std::runtime_error(
+        "the ascii-prefixes-only setting is forbidden due to performance "
+        "problems at the moment");
   } else {
     _onlyAsciiTurtlePrefixes = false;
   }
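For context, the added second template argument in setupParallelPipeline<1, 1,
NUM_PARALLEL_ITEM_MAPS> presumably gives the newly inserted unescaping stage
its own degree of parallelism (a single thread). Stripped of the pipeline
machinery, that stage amounts to the following sketch (assuming Triple is an
std::array<std::string, 3>; the in-place transform is a simplification of the
res copy above):

#include <algorithm>
#include <array>
#include <string>
#include "parser/Tokenizer.h"  // assumed include path for TurtleToken

using Triple = std::array<std::string, 3>;  // assumption about QLever's Triple

// Normalize all escape sequences in subject, predicate and object of one
// parsed triple before it is passed on to the vocabulary build.
Triple unescapeTriple(Triple t) {
  std::transform(t.begin(), t.end(), t.begin(), [](const std::string& s) {
    return TurtleToken::normalizeRDFLiteral(s);
  });
  return t;
}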
2 changes: 2 additions & 0 deletions src/parser/Tokenizer.cpp
@@ -98,6 +98,8 @@ const RE2& Tokenizer::idToRegex(const TokId reg) {
       return _tokens.PnameNS;
     case TokId::PnameLN:
       return _tokens.PnameLN;
+    case TokId::PnLocal:
+      return _tokens.PnLocal;
     case TokId::BlankNodeLabel:
       return _tokens.BlankNodeLabel;
     case TokId::WsMultiple:
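The new TokId::PnLocal case exposes the PN_LOCAL rule as a regex of its own,
so the local part of a PrefixedName can be matched separately. A hedged RE2
sketch with a toy pattern (the real pattern is built from PnLocalString in
Tokenizer.h and is far more permissive):

#include <re2/re2.h>
#include <iostream>

int main() {
  // Toy stand-in for PnLocal: letters and digits plus escaped punctuation
  // such as "\," (SPARQL's PN_LOCAL_ESC production).
  RE2 pnLocal(R"((?:[A-Za-z0-9]|\\[,;!.])+)");
  // The local part of rdfs:l\,abel from the commit message now matches:
  std::cout << RE2::FullMatch(R"(l\,abel)", pnLocal) << std::endl;  // 1
}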
91 changes: 70 additions & 21 deletions src/parser/Tokenizer.h
@@ -7,10 +7,10 @@
 #include <ctre/ctre.h>
 #include <gtest/gtest.h>
 #include <re2/re2.h>
+#include <unicode/ustream.h>
 #include <regex>
 #include "../util/Exception.h"
 #include "../util/Log.h"

 using re2::RE2;
 using namespace std::string_literals;

@@ -93,6 +93,7 @@ enum class TokId : int {
   Iriref,
   PnameNS,
   PnameLN,
+  PnLocal,
   BlankNodeLabel,
   WsMultiple,
   Anon,
@@ -215,6 +216,8 @@ struct TurtleTokenCtre {
       grp(cls(PnCharsUString + u8":0-9") + "|" + PlxString) +
       grp(u8"\\.*" + grp(TmpNoDot)) + "*";

+  static constexpr fixed_string PnLocal = grp(PnLocalString);
+
   static constexpr fixed_string PnameLNString =
       grp(PnameNSString) + grp(PnLocalString);

@@ -246,6 +249,16 @@
  * at runtime
  */
 struct TurtleToken {
+  /// turn a number of hex-chars like '00e4' into utf-8
+  static std::string unescapeUchar(std::string_view hex) {
+    UChar32 x;
+    std::stringstream sstream;
+    sstream << std::hex << hex;
+    sstream >> x;
+    std::string res;
+    icu::UnicodeString(x).toUTF8String(res);
+    return res;
+  }
   /**
    * @brief convert a RDF Literal to a unified form that is used inside QLever
    *
@@ -266,23 +279,42 @@
    * @param literal
    * @return
    */
-  static std::string normalizeRDFLiteral(std::string_view literal) {
-    std::string res = "\"";
-    auto lastQuot = literal.find_last_of("\"\'");
-    AD_CHECK(lastQuot != std::string_view::npos);
-    auto langtagOrDatatype = literal.substr(lastQuot + 1);
-    literal.remove_suffix(literal.size() - lastQuot - 1);
-    if (ad_utility::startsWith(literal, "\"\"\"") ||
-        ad_utility::startsWith(literal, "'''")) {
-      AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
-      literal.remove_prefix(3);
-      literal.remove_suffix(3);
-    } else {
-      AD_CHECK(ad_utility::startsWith(literal, "\"") ||
-               ad_utility::startsWith(literal, "'"));
-      AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
+  static std::string normalizeRDFLiteral(std::string_view origLiteral) {
+    auto literal = origLiteral;
+    std::string res;
+    char endDelimiter = '\0';
+    std::string_view langtagOrDatatype;
+    if (ad_utility::startsWith(literal, "<")) {
+      // this must be an <iriref>
+      if (!ad_utility::endsWith(literal, ">")) {
+        throw std::runtime_error("Error: Rdf Triple element "s + origLiteral +
+                                 "could not be normalized properly"s);
+      }
+      res = "<";
+      endDelimiter = '>';
+      literal.remove_prefix(1);
+      literal.remove_suffix(1);
+    } else {
+      res = "\"";
+      endDelimiter = '\"';
+      auto lastQuot = literal.find_last_of("\"\'");
+      if (lastQuot != std::string_view::npos) {
+        langtagOrDatatype = literal.substr(lastQuot + 1);
+        literal.remove_suffix(literal.size() - lastQuot - 1);
+      } else {
+      }
+      if (ad_utility::startsWith(literal, "\"\"\"") ||
+          ad_utility::startsWith(literal, "'''")) {
+        AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
+        literal.remove_prefix(3);
+        literal.remove_suffix(3);
+      } else {
+        AD_CHECK(ad_utility::startsWith(literal, "\"") ||
+                 ad_utility::startsWith(literal, "'"));
+        AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
       literal.remove_prefix(1);
       literal.remove_suffix(1);
     }
+    }
     auto pos = literal.find('\\');
     while (pos != literal.npos) {
@@ -313,6 +345,20 @@ struct TurtleToken {
         case '\\':
           res.push_back('\\');
           break;
+        case 'u': {
+          AD_CHECK(pos + 5 <= literal.size());
+          auto unesc = unescapeUchar(literal.substr(pos + 2, 4));
+          res.insert(res.end(), unesc.begin(), unesc.end());
+          literal.remove_prefix(4);
+          break;
+        }
+        case 'U': {
+          AD_CHECK(pos + 9 <= literal.size());
+          auto unesc = unescapeUchar(literal.substr(pos + 2, 8));
+          res.insert(res.end(), unesc.begin(), unesc.end());
+          literal.remove_prefix(8);
+          break;
+        }

         default:
           throw std::runtime_error(
@@ -323,7 +369,7 @@
       pos = literal.find('\\');
     }
     res.append(literal);
-    res.push_back('"');
+    res.push_back(endDelimiter);
     res.append(langtagOrDatatype);
     return res;
   }
@@ -421,6 +467,7 @@ struct TurtleToken {
         Iriref(grp(IrirefString)),
         PnameNS(grp(PnameNSString)),
         PnameLN(grp(PnameLNString)),
+        PnLocal(grp(PnLocalString)),
         BlankNodeLabel(grp(BlankNodeLabelString)),

         WsMultiple(grp(WsMultipleString)),
@@ -542,6 +589,7 @@ struct TurtleToken {

   const string PnameLNString = grp(PnameNSString) + grp(PnLocalString);
   const RE2 PnameLN;
+  const RE2 PnLocal;

   const string BlankNodeLabelString = u8"_:" + cls(PnCharsUString + u8"0-9") +
                                       grp("\\.*" + cls(PnCharsString)) + "*";
@@ -701,6 +749,8 @@ class TokenizerCtre {
       return F::template process<TurtleTokenCtre::PnameNS>(_data);
     } else if constexpr (id == TokId::PnameLN) {
       return F::template process<TurtleTokenCtre::PnameLN>(_data);
+    } else if constexpr (id == TokId::PnLocal) {
+      return F::template process<TurtleTokenCtre::PnLocal>(_data);
     } else if constexpr (id == TokId::BlankNodeLabel) {
       return F::template process<TurtleTokenCtre::BlankNodeLabel>(_data);
     } else {
@@ -907,11 +957,10 @@ class Tokenizer {
   void skipWhitespace() {
     auto v = view();
     auto pos = v.find_first_not_of("\x20\x09\x0D\x0A");
-    if (pos != string::npos) {
-      _data.remove_prefix(pos);
+    if (pos == string::npos) {
+      pos = _data.size();
     }
-    // auto success = skip(_tokens.WsMultiple);
-    // assert(success);
+    _data.remove_prefix(pos);
     return;
   }

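The skipWhitespace change above is what removes the misleading warning
mentioned in the commit message: on whitespace-only input, find_first_not_of
returns npos, so the old code removed nothing and the parser reported leftover
content. A self-contained sketch of the fixed logic (a free function standing
in for the member, same algorithm):

#include <cassert>
#include <string_view>

void skipWhitespace(std::string_view& data) {
  auto pos = data.find_first_not_of("\x20\x09\x0D\x0A");
  if (pos == std::string_view::npos) {
    // Only whitespace is left; consume it all instead of leaving it behind.
    pos = data.size();
  }
  data.remove_prefix(pos);
}

int main() {
  std::string_view rest = " \t\r\n";
  skipWhitespace(rest);
  assert(rest.empty());  // previously `rest` would have stayed non-empty
}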
