Skip to content

Commit

Permalink
Correct handling of escaped literals and IRIREFs during index build.
Browse files Browse the repository at this point in the history
- also apply the normalization of literals correctly during index build time
- Adapt the Index unit tests to "legal" knowledge bases
- Disable the CTRE parser for now, since it becomes awfully slow with the PnameNS and PnLocal changes for some reason.
- TODO: Maybe we want to re-enable the CTRE Parser with the old "wrong"
  behavior as a very fast way to parse Wikidata
  • Loading branch information
joka921 committed Apr 13, 2020
1 parent b460307 commit b8aabfd
Show file tree
Hide file tree
Showing 13 changed files with 280 additions and 180 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000,
"ascii-prefixes-only":true
"ascii-prefixes-only":false
}
3 changes: 3 additions & 0 deletions src/TurtleParserMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ void writeNTDispatch(std::ostream& out, const string& fileFormat,
writeNT<Tokenizer>(out, fileFormat, filename);
} else if (regexEngine == "ctre") {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
throw std::runtime_error(
"The ctre engine is currently disabled due to serious performance "
"problems");
writeNT<TokenizerCtre>(out, fileFormat, filename);
} else {
LOG(ERROR)
Expand Down
4 changes: 3 additions & 1 deletion src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"regex engine for Tokenization. This means "
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";
"makes parsing faster and works e.g. for wikidata dumps\n"
"ALSO CURRENTLY THIS SETTING IS BROKEN (serious performance issues) AND "
"THUS FORBIDDEN";

static const std::string LOCALE_DEFAULT_LANG = "en";
static const std::string LOCALE_DEFAULT_COUNTRY = "US";
Expand Down
14 changes: 13 additions & 1 deletion src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
std::array<ItemMapManager, NUM_PARALLEL_ITEM_MAPS> itemArray;

{
auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
auto p = ad_pipeline::setupParallelPipeline<1, 1, NUM_PARALLEL_ITEM_MAPS>(
_parserBatchSize,
// when called, returns an optional to the next triple. If
// <linexPerPartial> triples were parsed, return std::nullopt. when
Expand All @@ -169,6 +169,15 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
// as a first step in the parallel Pipeline.
ParserBatcher(parser, linesPerPartial,
[&]() { parserExhausted = true; }),
// do all the unescaping from Sparql (ToDo<joka921>:: move this into
// its own pipeline within the parser
[this](Triple&& t) {
Triple res;
std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
return TurtleToken::normalizeRDFLiteral(s);
});
return res;
},
// convert each triple to the internal representation (e.g. special
// values for Numbers, externalized literals, etc.)
[this](Triple&& t) {
Expand Down Expand Up @@ -1504,6 +1513,9 @@ void Index::initializeVocabularySettingsBuild() {
if (v) {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
_onlyAsciiTurtlePrefixes = true;
throw std::runtime_error(
"the ascii-prefixes-only setting is forbidden due to performance "
"problems at the moment");
} else {
_onlyAsciiTurtlePrefixes = false;
}
Expand Down
2 changes: 2 additions & 0 deletions src/parser/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ const RE2& Tokenizer::idToRegex(const TokId reg) {
return _tokens.PnameNS;
case TokId::PnameLN:
return _tokens.PnameLN;
case TokId::PnLocal:
return _tokens.PnLocal;
case TokId::BlankNodeLabel:
return _tokens.BlankNodeLabel;
case TokId::WsMultiple:
Expand Down
82 changes: 65 additions & 17 deletions src/parser/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
#include <ctre/ctre.h>
#include <gtest/gtest.h>
#include <re2/re2.h>
#include <unicode/ustream.h>
#include <regex>
#include "../util/Exception.h"
#include "../util/Log.h"

using re2::RE2;
using namespace std::string_literals;

Expand Down Expand Up @@ -93,6 +93,7 @@ enum class TokId : int {
Iriref,
PnameNS,
PnameLN,
PnLocal,
BlankNodeLabel,
WsMultiple,
Anon,
Expand Down Expand Up @@ -215,6 +216,8 @@ struct TurtleTokenCtre {
grp(cls(PnCharsUString + u8":0-9") + "|" + PlxString) +
grp(u8"\\.*" + grp(TmpNoDot)) + "*";

static constexpr fixed_string PnLocal = grp(PnLocalString);

static constexpr fixed_string PnameLNString =
grp(PnameNSString) + grp(PnLocalString);

Expand Down Expand Up @@ -246,6 +249,16 @@ struct TurtleTokenCtre {
* at runtime
*/
struct TurtleToken {
/// turn a number of hex-chars like '00e4' into utf-8
static std::string unescapeUchar(std::string_view hex) {
UChar32 x;
std::stringstream sstream;
sstream << std::hex << hex;
sstream >> x;
std::string res;
icu::UnicodeString(x).toUTF8String(res);
return res;
}
/**
* @brief convert a RDF Literal to a unified form that is used inside QLever
*
Expand All @@ -266,23 +279,42 @@ struct TurtleToken {
* @param literal
* @return
*/
static std::string normalizeRDFLiteral(std::string_view literal) {
std::string res = "\"";
auto lastQuot = literal.find_last_of("\"\'");
AD_CHECK(lastQuot != std::string_view::npos);
auto langtagOrDatatype = literal.substr(lastQuot + 1);
literal.remove_suffix(literal.size() - lastQuot - 1);
if (ad_utility::startsWith(literal, "\"\"\"") ||
ad_utility::startsWith(literal, "'''")) {
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
literal.remove_prefix(3);
literal.remove_suffix(3);
} else {
AD_CHECK(ad_utility::startsWith(literal, "\"") ||
ad_utility::startsWith(literal, "'"));
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
static std::string normalizeRDFLiteral(std::string_view origLiteral) {
auto literal = origLiteral;
std::string res;
char endDelimiter = '\0';
std::string_view langtagOrDatatype;
if (ad_utility::startsWith(literal, "<")) {
// this must be an <iriref>
if (!ad_utility::endsWith(literal, ">")) {
throw std::runtime_error("Error: Rdf Triple element "s + origLiteral +
"could not be normalized properly"s);
}
res = "<";
endDelimiter = '>';
literal.remove_prefix(1);
literal.remove_suffix(1);
} else {
res = "\"";
endDelimiter = '\"';
auto lastQuot = literal.find_last_of("\"\'");
if (lastQuot != std::string_view::npos) {
langtagOrDatatype = literal.substr(lastQuot + 1);
literal.remove_suffix(literal.size() - lastQuot - 1);
} else {
}
if (ad_utility::startsWith(literal, "\"\"\"") ||
ad_utility::startsWith(literal, "'''")) {
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 3)));
literal.remove_prefix(3);
literal.remove_suffix(3);
} else {
AD_CHECK(ad_utility::startsWith(literal, "\"") ||
ad_utility::startsWith(literal, "'"));
AD_CHECK(ad_utility::endsWith(literal, literal.substr(0, 1)));
literal.remove_prefix(1);
literal.remove_suffix(1);
}
}
auto pos = literal.find('\\');
while (pos != literal.npos) {
Expand Down Expand Up @@ -313,6 +345,20 @@ struct TurtleToken {
case '\\':
res.push_back('\\');
break;
case 'u': {
AD_CHECK(pos + 5 <= literal.size());
auto unesc = unescapeUchar(literal.substr(pos + 2, 4));
res.insert(res.end(), unesc.begin(), unesc.end());
literal.remove_prefix(4);
break;
}
case 'U': {
AD_CHECK(pos + 9 <= literal.size());
auto unesc = unescapeUchar(literal.substr(pos + 2, 8));
res.insert(res.end(), unesc.begin(), unesc.end());
literal.remove_prefix(8);
break;
}

default:
throw std::runtime_error(
Expand All @@ -323,7 +369,7 @@ struct TurtleToken {
pos = literal.find('\\');
}
res.append(literal);
res.push_back('"');
res.push_back(endDelimiter);
res.append(langtagOrDatatype);
return res;
}
Expand Down Expand Up @@ -703,6 +749,8 @@ class TokenizerCtre {
return F::template process<TurtleTokenCtre::PnameNS>(_data);
} else if constexpr (id == TokId::PnameLN) {
return F::template process<TurtleTokenCtre::PnameLN>(_data);
} else if constexpr (id == TokId::PnLocal) {
return F::template process<TurtleTokenCtre::PnLocal>(_data);
} else if constexpr (id == TokId::BlankNodeLabel) {
return F::template process<TurtleTokenCtre::BlankNodeLabel>(_data);
} else {
Expand Down
9 changes: 5 additions & 4 deletions src/parser/TurtleParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ bool TurtleParser<T>::prefixID() {
_lastParseResult.substr(1, _lastParseResult.size() - 2);
return true;
} else {
throw raise("prefixID");
raise("prefixID");
}
} else {
return false;
Expand Down Expand Up @@ -318,16 +318,17 @@ bool TurtleParser<T>::iri() {
// _____________________________________________________________________
template <class T>
bool TurtleParser<T>::prefixedName() {
if (!parseTerminal(tokens().PnameNS)) {
if (!parseTerminal<TokId::PnameNS>()) {
return false;
} else {
// this also includes a ":" which we do not need, hence the "-1"
_activePrefix = _lastParseResult.substr(0, _lastParseResult.size() - 1);
_lastParseResult = "";
}
_lastParseResult.clear();
parseTerminal<false>(tokens().PnLocal);
parseTerminal<TokId::PnLocal, false>();
_lastParseResult = '<' + expandPrefix(_activePrefix) + _lastParseResult + '>';
LOG(INFO) << "Parsed a prefixed name\n";
return true;
}

Expand Down Expand Up @@ -591,7 +592,7 @@ bool TurtleStreamParser<T>::getLine(std::array<string, 3>* triple) {
throw ex;

} else {
raise(
this->raise(
"Too many bytes parsed without finishing a turtle "
"statement");
}
Expand Down
2 changes: 1 addition & 1 deletion src/parser/TurtleParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class TurtleStringParser : public TurtleParser<Tokenizer_T> {

// _____________________________________________________________
size_t getParsePosition() const override {
return _tmpToParse.size() - _tok.data().size();
return _tmpToParse.size() - this->_tok.data().size();
}

void initialize(const string&) override {
Expand Down
6 changes: 3 additions & 3 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
add_executable(SparqlParserTest SparqlParserTest.cpp)
add_test(SparqlParserTest SparqlParserTest)
target_link_libraries(SparqlParserTest gtest_main parser ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(SparqlParserTest gtest_main parser ${CMAKE_THREAD_LIBS_INIT} ${ICU_LIBRARIES})

add_executable(StringUtilsTest StringUtilsTest.cpp)
add_test(StringUtilsTest StringUtilsTest)
Expand Down Expand Up @@ -96,7 +96,7 @@ target_link_libraries(UnionTest gtest_main engine ${CMAKE_THREAD_LIBS_INIT})

add_executable(TokenTest TokenTest.cpp)
add_test(TokenTest TokenTest)
target_link_libraries(TokenTest parser re2 gtest_main -pthread)
target_link_libraries(TokenTest parser re2 gtest_main -pthread ${ICU_LIBRARIES})

add_executable(TurtleParserTest TurtleParserTest.cpp)
add_test(TurtleParserTest TurtleParserTest)
Expand All @@ -116,7 +116,7 @@ target_link_libraries(TransitivePathTest engine gtest_main ${CMAKE_THREAD_LIBS_I

add_executable(SparqlLexerTest SparqlLexerTest.cpp)
add_test(SparqlLexerTest SparqlLexerTest)
target_link_libraries(SparqlLexerTest parser gtest_main ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(SparqlLexerTest parser gtest_main ${CMAKE_THREAD_LIBS_INIT} ${ICU_LIBRARIES})

add_executable(Utf8RegexTest Utf8RegexTest.cpp)
add_test(Utf8RegexTest Utf8RegexTest)
Expand Down

0 comments on commit b8aabfd

Please sign in to comment.