Correct handling of escaped literals and IRIREFs during the index build.
- Also apply the normalization of literals correctly at index build time (see the round-trip sketch after this list).
- Adapt the Index unit tests to "legal" knowledge bases

- Get rid of a misleading warning in the case of whitespace at the
  end of a TTL file.
  - Previously there was a "parsing of ttl has failed, but there is still content left"
    warning, although the remainder of the TTL input was only whitespace.
  - This was due to a bug in the parser's skipWhitespace() function, which failed if the
    input consisted of ONLY whitespace. This is now fixed (a reconstruction of the bug
    follows the list below).

- The case where a prefix was used with an empty local part (e.g. <a> wd: <b>) was
  broken before. Luckily there was a unit test, and this is now fixed (see the
  expansion example below).
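
The core invariant behind the first bullet is that escaping and unescaping must be
inverses: every vocabulary entry written in escaped form has to come back unchanged
after a round trip (the diffs below check exactly this with
TurtleToken::escapeRDFLiteral / TurtleToken::normalizeRDFLiteral). A minimal,
self-contained sketch of that invariant, with simplified stand-ins for the two
TurtleToken functions (the real ones implement the full Turtle ECHAR/UCHAR rules):

#include <cassert>
#include <iostream>
#include <stdexcept>
#include <string>

// Simplified stand-in for TurtleToken::escapeRDFLiteral: makes an entry
// newline-free so it can be stored one-per-line in the text vocabulary.
std::string escapeRDFLiteral(const std::string& in) {
  std::string out;
  for (char c : in) {
    switch (c) {
      case '\\': out += "\\\\"; break;
      case '\n': out += "\\n"; break;
      case '\t': out += "\\t"; break;
      default: out += c;
    }
  }
  return out;
}

// Simplified stand-in for TurtleToken::normalizeRDFLiteral: resolves the
// escape sequences again. Throws on a malformed escape.
std::string normalizeRDFLiteral(const std::string& in) {
  std::string out;
  for (size_t i = 0; i < in.size(); ++i) {
    if (in[i] != '\\') { out += in[i]; continue; }
    if (++i == in.size()) throw std::runtime_error("dangling escape");
    switch (in[i]) {
      case '\\': out += '\\'; break;
      case 'n': out += '\n'; break;
      case 't': out += '\t'; break;
      default: throw std::runtime_error("unknown escape");
    }
  }
  return out;
}

int main() {
  // The invariant the index build now checks for every vocabulary entry.
  std::string word = "\"multi\nline \\ literal\"@en";
  assert(normalizeRDFLiteral(escapeRDFLiteral(word)) == word);
  std::cout << "round trip ok\n";
}

The skipWhitespace() bug is easy to reconstruct under one assumption: the function
consumes leading whitespace and reports whether parsing can continue. A variant that
signals failure when nothing but whitespace remains produces exactly the spurious
"content left" warning; the fix is to treat an all-whitespace remainder as success.
A hypothetical reconstruction (the real function lives in the Turtle parser):

#include <cassert>
#include <cctype>
#include <string_view>

// Buggy variant: reports failure when the input is ONLY whitespace, making
// the caller believe unparsed content is left over.
bool skipWhitespaceBuggy(std::string_view& input) {
  size_t i = 0;
  while (i < input.size() && std::isspace(static_cast<unsigned char>(input[i]))) ++i;
  input.remove_prefix(i);
  return !input.empty();  // bug: empty remainder counts as failure
}

// Fixed variant: skipping whitespace always succeeds; an empty remainder
// simply means the file is fully parsed.
bool skipWhitespaceFixed(std::string_view& input) {
  size_t i = 0;
  while (i < input.size() && std::isspace(static_cast<unsigned char>(input[i]))) ++i;
  input.remove_prefix(i);
  return true;
}

int main() {
  std::string_view trailing = "\n  \t\n";  // whitespace at the end of a TTL file
  std::string_view copy = trailing;
  assert(!skipWhitespaceBuggy(copy));      // triggered the misleading warning
  copy = trailing;
  assert(skipWhitespaceFixed(copy) && copy.empty());
}

Finally, a prefixed name with an empty local part, as in the `<a> wd: <b>` example
above, is legal Turtle: `wd:` expands to the bare namespace IRI. A small
illustration (expandPrefix here is an ad-hoc helper, not the QLever function of the
same name):

#include <cassert>
#include <map>
#include <string>

// Illustrative helper: expand a (possibly empty) local part against a
// prefix map, as in the Turtle line `<a> wd: <b> .` from the commit message.
std::string expandPrefix(const std::map<std::string, std::string>& prefixes,
                         const std::string& prefixedName) {
  auto colon = prefixedName.find(':');
  const std::string base = prefixes.at(prefixedName.substr(0, colon));
  const std::string local = prefixedName.substr(colon + 1);  // may be empty
  return "<" + base + local + ">";
}

int main() {
  std::map<std::string, std::string> prefixes{
      {"wd", "http://www.wikidata.org/entity/"}};
  // An empty local part must yield the bare namespace IRI.
  assert(expandPrefix(prefixes, "wd:") == "<http://www.wikidata.org/entity/>");
  assert(expandPrefix(prefixes, "wd:Q42") == "<http://www.wikidata.org/entity/Q42>");
}
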
joka921 committed Mar 23, 2021
1 parent 211ded4 commit fcc127c
Showing 19 changed files with 654 additions and 327 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
@@ -1,5 +1,5 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000,
"ascii-prefixes-only":true
"ascii-prefixes-only":false
}
4 changes: 3 additions & 1 deletion src/global/Constants.h
@@ -79,7 +79,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"You explicitly requested the ascii-prefixes-only settings or the ctre "
"regex engine for Tokenization. This means "
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"the ascii range and that no escape sequences may be used in prefixed "
"names (e.g. rdfs:lab\\,el)."
" This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";

static const std::string LOCALE_DEFAULT_LANG = "en";
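
The extended warning above points at escape sequences in prefixed names such as
rdfs:lab\,el. In standard Turtle, PN_LOCAL_ESC lets characters like ',' appear in a
local name when backslash-escaped, and the escaped name denotes the IRI with the raw
character, here <http://www.w3.org/2000/01/rdf-schema#lab,el>. A tiny illustration
(unescapePnLocal is a hypothetical helper; the commit itself uses
TurtleToken::unescapePrefixedIri for this):

#include <cassert>
#include <string>

// Hypothetical helper showing what PN_LOCAL_ESC unescaping does:
// `rdfs:lab\,el` denotes <http://www.w3.org/2000/01/rdf-schema#lab,el>.
std::string unescapePnLocal(const std::string& local) {
  std::string out;
  for (size_t i = 0; i < local.size(); ++i) {
    if (local[i] == '\\' && i + 1 < local.size()) ++i;  // drop the backslash
    out += local[i];
  }
  return out;
}

int main() {
  assert(unescapePnLocal("lab\\,el") == "lab,el");
}
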
5 changes: 5 additions & 0 deletions src/index/ExternalVocabulary.cpp
@@ -6,6 +6,7 @@

#include <fstream>

#include "../parser/Tokenizer.h"
#include "../util/Log.h"

// _____________________________________________________________________________
@@ -74,6 +75,10 @@ void ExternalVocabulary<Comp>::buildFromTextFile(const string& textFileName,
_size = 0;
std::string word;
while (std::getline(infile, word)) {
// In the text file we had the escaped variants; here we can store the
// correct (unescaped) ones, since the binary format uses offsets
// instead of newline delimiters.
word = TurtleToken::normalizeRDFLiteral<false>(word);
offsets.push_back(currentOffset);
currentOffset += _file.write(word.data(), word.size());
_size++;
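
The comment in this hunk describes the asymmetry: the intermediate text file must
hold escaped, newline-free entries because it is read line by line, while the
external vocabulary stores raw bytes plus an offset table, so embedded newlines are
harmless. A minimal sketch of such offset-based storage (the struct and layout are
illustrative, not QLever's actual on-disk format):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Minimal sketch of offset-based storage: raw bytes plus an offset table.
// Entries may contain newlines because lookup never scans for delimiters.
struct BinaryVocab {
  std::string bytes;
  std::vector<size_t> offsets;  // start of each word; the next offset is its end

  void push(const std::string& word) {
    offsets.push_back(bytes.size());
    bytes += word;
  }
  std::string get(size_t i) const {
    size_t end = (i + 1 < offsets.size()) ? offsets[i + 1] : bytes.size();
    return bytes.substr(offsets[i], end - offsets[i]);
  }
};

int main() {
  BinaryVocab v;
  v.push("\"multi\nline\"");  // the raw, unescaped form is fine here
  v.push("<http://example.org/a>");
  assert(v.get(0) == "\"multi\nline\"");
  assert(v.get(1) == "<http://example.org/a>");
}
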
31 changes: 25 additions & 6 deletions src/index/Index.cpp
@@ -97,6 +97,7 @@ void Index::createFromFile(const string& filename) {
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
LOG(INFO) << "Finished writing permutations" << std::endl;
if (_vocabPrefixCompressed) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
@@ -107,7 +108,7 @@ void Index::createFromFile(const string& filename) {
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << '\n';
prefixFile << prefix << std::endl;
}
}
_configurationJson["prefixes"] = _vocabPrefixCompressed;
@@ -118,7 +119,7 @@ void Index::createFromFile(const string& filename) {
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
<< " to " << vocabFile << " set errno to " << errno
<< ". Terminating...\n";
<< ". Terminating..." << std::endl;
AD_CHECK(false);
}
writeConfiguration();
@@ -160,7 +161,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
std::array<ItemMapManager, NUM_PARALLEL_ITEM_MAPS> itemArray;

{
auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
auto p = ad_pipeline::setupParallelPipeline<1, 1, NUM_PARALLEL_ITEM_MAPS>(
_parserBatchSize,
// when called, returns an optional to the next triple. If
// <linesPerPartial> triples were parsed, return std::nullopt. when
@@ -169,6 +170,24 @@
// as a first step in the parallel Pipeline.
ParserBatcher(parser, linesPerPartial,
[&]() { parserExhausted = true; }),
// do all the unescaping from SPARQL (TODO<joka921>: move this into
// its own pipeline stage within the parser)
[this](Triple&& t) {
Triple res;
std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
auto res = TurtleToken::normalizeRDFLiteral(s);
try {
[[maybe_unused]] auto tmp = TurtleToken::normalizeRDFLiteral(
TurtleToken::escapeRDFLiteral(res));
} catch (...) {
LOG(ERROR) << "Vocabulary entry " + s +
" could not be (un)escaped properly"
<< std::endl;
}
return res;
});
return res;
},
// convert each triple to the internal representation (e.g. special
// values for Numbers, externalized literals, etc.)
[this](Triple&& t) {
@@ -433,7 +452,7 @@

out1.close();
out2.close();
LOG(INFO) << "Permutation done.\n";
LOG(INFO) << "Permutation done." << std::endl;
return std::make_pair(std::move(metaData1), std::move(metaData2));
}

@@ -493,12 +512,12 @@ void Index::createPermutationPair(
&(metaData.value().second));
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Writing MetaData for " << p1._readableName << " and "
<< p2._readableName << '\n';
<< p2._readableName << std::endl;
ad_utility::File f1(_onDiskBase + ".index" + p1._fileSuffix, "r+");
metaData.value().first.appendToFile(&f1);
ad_utility::File f2(_onDiskBase + ".index" + p2._fileSuffix, "r+");
metaData.value().second.appendToFile(&f2);
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Done" << std::endl;
}
}

25 changes: 17 additions & 8 deletions src/index/PrefixHeuristic.cpp
@@ -5,6 +5,7 @@
#include "./PrefixHeuristic.h"
#include <algorithm>
#include <fstream>
#include "../parser/Tokenizer.h"
#include "../util/Exception.h"
#include "../util/Log.h"
#include "../util/StringUtils.h"
@@ -41,7 +42,7 @@ TreeNode* TreeNode::insertAfter(string_view value) {
}

// if we have reached here, we have to add a new child
NodePtr newNode(new TreeNode(value));
NodePtr newNode = std::make_unique<TreeNode>(value);
newNode->_parent = this;

// find children of current node which have to become children of the new node
@@ -60,6 +61,13 @@ TreeNode* TreeNode::insertAfter(string_view value) {

// register the newly created node as a child of this node
_children.push_back(std::move(newNode));

for (const auto& c : _children) {
if (c.get() == nullptr) {
LOG(ERROR) << "Illegal nullptr child was found" << std::endl;
}
}

return _children.back().get();
}

@@ -176,9 +184,10 @@ std::vector<string> calculatePrefixes(const string& filename,
size_t totalSavings = 0;
size_t numWords = 0;

LOG(INFO) << "start reading words and building prefix tree...\n";
LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
// insert all prefix candidates into the tree
while (std::getline(ifs, nextWord)) {
nextWord = TurtleToken::normalizeRDFLiteral<false>(nextWord);
totalChars += nextWord.size();
// the longest common prefixes between two adjacent words are our candidates
// for compression
@@ -194,12 +203,12 @@

numWords++;
if (numWords % 10000000 == 0) {
LOG(INFO) << "words read: " << numWords << '\n';
LOG(INFO) << "words read: " << numWords << std::endl;
}
}

LOG(INFO) << "Finished building prefix tree!\n";
LOG(INFO) << "Start searching for maximal compressing prefixes\n";
LOG(INFO) << "Finished building prefix tree!" << std::endl;
LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
std::vector<string> res;
res.reserve(numPrefixes);
for (size_t i = 0; i < numPrefixes; ++i) {
@@ -209,7 +218,7 @@
}
totalSavings += p.first;
LOG(INFO) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << '\n';
<< " with number of bytes gained: " << p.first << std::endl;
res.push_back(std::move(p.second));
}
// if we always add an encoding we have calculated with a codelength of 0 so
@@ -218,9 +227,9 @@
totalSavings -= codelength * numWords;
}
double efficiency = static_cast<double>(totalSavings) / totalChars;
std::cout << "total number of bytes : " << totalChars << '\n';
std::cout << "total number of bytes : " << totalChars << std::endl;
std::cout << "total chars compressed : " << totalSavings << '\n';
std::cout << "percentage of chars compressed : " << efficiency << '\n';
std::cout << "percentage of chars compressed : " << efficiency << std::endl;
return res;
}
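
The loop above collects, for each pair of adjacent words in the sorted vocabulary,
their longest common prefix as a compression candidate; with the new
normalizeRDFLiteral<false> call the candidates are computed on the unescaped forms
that are actually stored. A compact sketch of the candidate generation (the greedy
selection over the prefix tree and the savings bookkeeping are omitted):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Longest common prefix of two strings.
std::string lcp(const std::string& a, const std::string& b) {
  auto ends = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
  return std::string(a.begin(), ends.first);
}

int main() {
  // Adjacent words in a sorted vocabulary; their pairwise LCPs are the
  // candidate prefixes for compression.
  std::vector<std::string> words{
      "<http://example.org/city/Berlin>", "<http://example.org/city/Bonn>",
      "<http://example.org/person/Ada>"};
  std::map<std::string, size_t> candidates;  // prefix -> occurrences
  for (size_t i = 1; i < words.size(); ++i) {
    std::string p = lcp(words[i - 1], words[i]);
    if (!p.empty()) ++candidates[p];
  }
  for (const auto& [prefix, count] : candidates) {
    std::cout << prefix << " (seen " << count << "x)\n";
  }
}
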

3 changes: 3 additions & 0 deletions src/index/PrefixHeuristic.h
@@ -55,11 +55,14 @@ using std::string_view;
class TreeNode {
private:
friend class Tree;

using NodePtr = std::unique_ptr<TreeNode>;

// Constructor
public:
explicit TreeNode(string_view value) : _value(value) {}

private:
// Recursive Insertion of value. If the value does not match _value we will
// automatically call insert on a node that is closer to the actual position
// of value in the Tree. Returns the node that was actually inserted
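
The constructor moves to a public section because of the change in insertAfter
above: `NodePtr(new TreeNode(value))` became `std::make_unique<TreeNode>(value)`,
and std::make_unique constructs the object itself, outside any friend context, so
it cannot reach a private constructor. In miniature:

#include <memory>
#include <string>
#include <string_view>

class Node {
 public:
  // Must be public: std::make_unique constructs the object itself and
  // has no access to private members of Node.
  explicit Node(std::string_view value) : _value(value) {}

 private:
  std::string _value;
};

int main() {
  auto n = std::make_unique<Node>("prefix");
  (void)n;
}
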
11 changes: 9 additions & 2 deletions src/index/VocabularyGeneratorImpl.h
@@ -153,7 +153,14 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff

// write the new word to the vocabulary
if (_lastWritten < EXTERNALIZED_LITERALS_PREFIX) {
_outfile << _lastWritten << '\n';
auto escaped = TurtleToken::escapeRDFLiteral(_lastWritten);
try {
auto restored = TurtleToken::normalizeRDFLiteral(escaped);
} catch (...) {
LOG(ERROR) << "Failure in the (un) escaping of vocabulary entry " + _lastWritten
<< std::endl;
}
_outfile << TurtleToken::escapeRDFLiteral(_lastWritten) << '\n';
} else {
// we have to strip the externalization character again
auto& c = _lastWritten[0];
Expand All @@ -169,7 +176,7 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff
"should never happen\n";
AD_CHECK(false)
}
_outfileExternal << _lastWritten << '\n';
_outfileExternal << TurtleToken::escapeRDFLiteral(_lastWritten) << '\n';
}

// write id to corresponding vec
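
At this stage the merged vocabulary is still a newline-delimited text file, which is
why every entry, internal or external, now goes through TurtleToken::escapeRDFLiteral
before being written: the escaped form is guaranteed newline-free. The
normalize-after-escape call above is a sanity check that the entry will survive the
round trip when it is read back and unescaped later (compare the round-trip sketch
near the top of this commit).
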
7 changes: 5 additions & 2 deletions src/index/VocabularyImpl.h
@@ -8,6 +8,7 @@
#include <fstream>
#include <iostream>

#include "../parser/Tokenizer.h"
#include "../util/File.h"
#include "../util/HashMap.h"
#include "../util/HashSet.h"
@@ -30,8 +31,10 @@ void Vocabulary<S, C>::readFromFile(const string& fileName,
if constexpr (_isCompressed) {
// when we read from file it means that all preprocessing has been done
// and the prefixes are already stripped in the file
_words.push_back(CompressedString::fromString(line));
auto str = expandPrefix(_words.back());
auto str = TurtleToken::normalizeRDFLiteral<false>(
expandPrefix(CompressedString::fromString(line)));

_words.push_back(compressPrefix(str));
if (!first) {
if (!(_caseComparator.compare(lastExpandedString, str,
SortLevel::TOTAL))) {
12 changes: 4 additions & 8 deletions src/parser/ParsedQuery.cpp
@@ -12,6 +12,7 @@
#include "../util/Conversions.h"
#include "../util/StringUtils.h"
#include "ParseException.h"
#include "Tokenizer.h"

using std::string;
using std::vector;
@@ -368,14 +369,9 @@ void ParsedQuery::expandPrefix(
if (i != string::npos && i >= from &&
prefixMap.count(item.substr(from, i - from)) > 0) {
string prefixUri = prefixMap.find(item.substr(from, i - from))->second;
if (from == 0) {
item = prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
'>';
} else {
item = item.substr(0, from) +
prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
'>';
}
item = item.substr(0, from) + prefixUri.substr(0, prefixUri.size() - 1) +
item.substr(i + 1) + '>';
item = TurtleToken::unescapePrefixedIri(item);
}
if (langtag) {
item =
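
Two things happen in this hunk: the `from == 0` special case collapses into the
general branch, because std::string::substr(0, 0) is just the empty string, and the
expanded IRI is additionally passed through TurtleToken::unescapePrefixedIri so that
escaped prefixed names survive expansion in queries as well. The substr fact in one
runnable check:

#include <cassert>
#include <string>

int main() {
  std::string item = "abc";
  // substr(0, from) with from == 0 is empty, so the former
  // `if (from == 0)` branch was redundant.
  assert(item.substr(0, 0).empty());
  assert(item.substr(0, 0) + "X" + item.substr(1) == "Xbc");
}
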
46 changes: 27 additions & 19 deletions src/parser/PropertyPathParser.cpp
@@ -48,6 +48,7 @@ std::vector<PropertyPathParser::Token> PropertyPathParser::tokenize(

size_t start = 0;
size_t pos = 0;
bool escaped = false;
while (pos < str.size()) {
char c = str[pos];
if (!VALID_CHARS[(uint8_t)c]) {
@@ -61,31 +62,38 @@
inside_iri = false;
}

if (!inside_iri && DELIMITER_CHARS[(uint8_t)str[pos]] &&
(pos != 0 || c != '?')) {
if (start != pos) {
// add the string up to but not including the new token
tokens.push_back({str.substr(start, pos - start), start});
if (!inside_iri && c == '\\') {
escaped = !escaped;
} else if (!inside_iri && DELIMITER_CHARS[(uint8_t)str[pos]] && escaped) {
escaped = false;
} else {
escaped = false;
if (!inside_iri && DELIMITER_CHARS[(uint8_t)str[pos]] &&
(pos != 0 || c != '?')) {
if (start != pos) {
// add the string up to but not including the new token
tokens.push_back({str.substr(start, pos - start), start});

start = pos;
}
while (pos < str.size() && DELIMITER_CHARS[(uint8_t)str[pos]]) {
pos++;
if (c == '*' && pos < str.size() && std::isdigit(str[pos])) {
// The * token has a number following it
start = pos;
}
while (pos < str.size() && DELIMITER_CHARS[(uint8_t)str[pos]]) {
pos++;
while (pos < str.size() && std::isdigit(str[pos])) {
if (c == '*' && pos < str.size() && std::isdigit(str[pos])) {
// The * token has a number following it
pos++;
while (pos < str.size() && std::isdigit(str[pos])) {
pos++;
}
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
} else {
// Add the token
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
}
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
} else {
// Add the token
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
}
continue;
}
continue;
}
pos++;
}
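
The reworked loop threads an `escaped` flag through the scan so that a
backslash-escaped delimiter is no longer treated as a path operator. A trimmed-down
sketch of the same idea, assuming the delimiter set is the usual SPARQL
property-path operators (unlike the real tokenizer it drops the backslashes instead
of keeping them, and it ignores IRIs, variables, and the '*<digits>' form):

#include <iostream>
#include <string>
#include <vector>

// Minimal escape-aware splitter for property-path-like strings: an
// unescaped delimiter ends the current token; `\<delim>` keeps the
// delimiter inside the token.
std::vector<std::string> tokenize(const std::string& str) {
  const std::string delims = "/|^*+?()";
  std::vector<std::string> tokens;
  std::string current;
  bool escaped = false;
  for (char c : str) {
    if (escaped) {  // previous char was a backslash
      current += c; // keep the delimiter as a normal character
      escaped = false;
    } else if (c == '\\') {
      escaped = true;
    } else if (delims.find(c) != std::string::npos) {
      if (!current.empty()) {
        tokens.push_back(current);
        current.clear();
      }
      tokens.push_back(std::string(1, c));  // the operator itself
    } else {
      current += c;
    }
  }
  if (!current.empty()) tokens.push_back(current);
  return tokens;
}

int main() {
  // "a\/b/c" -> tokens: "a/b", "/", "c"
  for (const auto& t : tokenize("a\\/b/c")) std::cout << t << '\n';
}
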
2 changes: 2 additions & 0 deletions src/parser/Tokenizer.cpp
@@ -98,6 +98,8 @@ const RE2& Tokenizer::idToRegex(const TokId reg) {
return _tokens.PnameNS;
case TokId::PnameLN:
return _tokens.PnameLN;
case TokId::PnLocal:
return _tokens.PnLocal;
case TokId::BlankNodeLabel:
return _tokens.BlankNodeLabel;
case TokId::WsMultiple:
