ad-freiburg · joka921 · May 8, 2021 · Feb 27, 2020 · Apr 26, 2021 · Apr 26, 2021
diff --git a/e2e/e2e-build-settings.json b/e2e/e2e-build-settings.json
@@ -1,5 +1,5 @@
 {
   "num-triples-per-partial-vocab" : 40000,
   "parser-batch-size" : 1000,
-  "ascii-prefixes-only":true
+  "ascii-prefixes-only":false
 }
diff --git a/e2e/scientists_queries.yaml b/e2e/scientists_queries.yaml
@@ -790,11 +790,11 @@ queries:
     checks:
       - num_rows: 579
       - num_cols: 2
-      - selected: ["?s", "?a"]
-      - contains_row: ["<Albert_Einstein>", "<Nobel_Prize_in_Physics>"]
-      - contains_row: ["<Albert_Fert>", "<Wolf_Prize_in_Physics>"]
-      - contains_row: ["<Albert_Overhauser>", "<National_Medal_of_Science_for_Physical_Science>"]
-      - contains_row: ["<Andre_Geim>", "<Nobel_Prize_in_Physics>"]
+      - selected: [ "?s", "?a" ]
+      - contains_row: [ "<Albert_Einstein>", "<Nobel_Prize_in_Physics>" ]
+      - contains_row: [ "<Albert_Fert>", "<Wolf_Prize_in_Physics>" ]
+      - contains_row: [ "<Albert_Overhauser>", "<National_Medal_of_Science_for_Physical_Science>" ]
+      - contains_row: [ "<Andre_Geim>", "<Nobel_Prize_in_Physics>" ]
 
   - query: bind-rename
     type: no-text

diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp
@@ -3,6 +3,7 @@
 //
 
 #include "Bind.h"
+
 #include "../util/Exception.h"
 #include "CallFixedSize.h"
 #include "QueryExecutionTree.h"

diff --git a/src/engine/SortPerformanceEstimator.h b/src/engine/SortPerformanceEstimator.h
@@ -31,8 +31,8 @@ class SortPerformanceEstimator {
 
   // Compute and return an Estimate for how long sorting an IdTable with the
   // specified number of rows and columns takes.
-  double estimatedSortTimeInSeconds(size_t numRows, size_t numCols) const
-      noexcept;
+  double estimatedSortTimeInSeconds(size_t numRows,
+                                    size_t numCols) const noexcept;
 
  private:
   // Set up all the estimates. Might take several minutes. This constructor is

diff --git a/src/global/Constants.h b/src/global/Constants.h
@@ -83,7 +83,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
     "You explicitly requested the ascii-prefixes-only settings or the ctre "
     "regex engine for Tokenization. This means "
     "that prefixes in the input Turtle may only use characters from "
-    "the ascii range. This is stricter than the Sparql standard but "
+    "the ascii range and that no escape sequences may be used in prefixed "
+    "names (e.g. rdfs:lab\\,el)."
+    " This is stricter than the Sparql standard but "
     "makes parsing faster and works e.g. for wikidata dumps\n";
 
 static const std::string LOCALE_DEFAULT_LANG = "en";

diff --git a/src/index/ExternalVocabulary.cpp b/src/index/ExternalVocabulary.cpp
@@ -6,6 +6,8 @@
 
 #include <fstream>
 
+#include "../parser/RdfEscaping.h"
+#include "../parser/Tokenizer.h"
 #include "../util/Log.h"
 
 // _____________________________________________________________________________
@@ -74,6 +76,10 @@ void ExternalVocabulary<Comp>::buildFromTextFile(const string& textFileName,
   _size = 0;
   std::string word;
   while (std::getline(infile, word)) {
+    // In the text file we stored the strings with escaped \n and
+    // \\ characters. We now store them in a binary format, where we can use the
+    // `actual` values.
+    word = RdfEscaping::unescapeNewlineAndBackslash(word);
     offsets.push_back(currentOffset);
     currentOffset += _file.write(word.data(), word.size());
     _size++;

diff --git a/src/index/Index.cpp b/src/index/Index.cpp
@@ -97,6 +97,7 @@ void Index::createFromFile(const string& filename) {
   string vocabFile = _onDiskBase + ".vocabulary";
   string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
   std::vector<string> prefixes;
+  LOG(INFO) << "Finished writing permutations" << std::endl;
   if (_vocabPrefixCompressed) {
     // we have to use the "normally" sorted vocabulary for the prefix
     // compression;
@@ -107,7 +108,7 @@ void Index::createFromFile(const string& filename) {
     std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
     AD_CHECK(prefixFile.is_open());
     for (const auto& prefix : prefixes) {
-      prefixFile << prefix << '\n';
+      prefixFile << prefix << std::endl;
     }
   }
   _configurationJson["prefixes"] = _vocabPrefixCompressed;
@@ -118,7 +119,7 @@ void Index::createFromFile(const string& filename) {
   if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
     LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
               << " to " << vocabFile << " set errno to " << errno
-              << ". Terminating...\n";
+              << ". Terminating..." << std::endl;
     AD_CHECK(false);
   }
   writeConfiguration();
@@ -163,7 +164,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
       auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
           _parserBatchSize,
           // when called, returns an optional to the next triple. If
-          // <linexPerPartial> triples were parsed, return std::nullopt. when
+          // `linesPerPartial` triples were parsed, return std::nullopt. when
           // the parser is unable to deliver triples, set parserExhausted to
           // true and return std::nullopt. this is exactly the behavior we need,
           // as a first step in the parallel Pipeline.
@@ -433,7 +434,7 @@ Index::createPermutationPairImpl(const string& fileName1,
 
   out1.close();
   out2.close();
-  LOG(INFO) << "Permutation done.\n";
+  LOG(INFO) << "Permutation done." << std::endl;
   return std::make_pair(std::move(metaData1), std::move(metaData2));
 }
 
@@ -493,12 +494,12 @@ void Index::createPermutationPair(
                            &(metaData.value().second));
     LOG(INFO) << "Done" << '\n';
     LOG(INFO) << "Writing MetaData for " << p1._readableName << " and "
-              << p2._readableName << '\n';
+              << p2._readableName << std::endl;
     ad_utility::File f1(_onDiskBase + ".index" + p1._fileSuffix, "r+");
     metaData.value().first.appendToFile(&f1);
     ad_utility::File f2(_onDiskBase + ".index" + p2._fileSuffix, "r+");
     metaData.value().second.appendToFile(&f2);
-    LOG(INFO) << "Done" << '\n';
+    LOG(INFO) << "Done" << std::endl;
   }
 }
 

diff --git a/src/index/Index.h b/src/index/Index.h
@@ -12,6 +12,7 @@
 #include <string>
 #include <stxxl/vector>
 #include <vector>
+
 #include "../engine/ResultTable.h"
 #include "../global/Pattern.h"
 #include "../parser/NTriplesParser.h"

diff --git a/src/index/PrefixHeuristic.cpp b/src/index/PrefixHeuristic.cpp
@@ -3,8 +3,12 @@
 // Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com)
 
 #include "./PrefixHeuristic.h"
+
 #include <algorithm>
 #include <fstream>
+
+#include "../parser/RdfEscaping.h"
+#include "../parser/Tokenizer.h"
 #include "../util/Exception.h"
 #include "../util/Log.h"
 #include "../util/StringUtils.h"
@@ -41,7 +45,7 @@ TreeNode* TreeNode::insertAfter(string_view value) {
   }
 
   // if we have reached here, we have to add a new child
-  NodePtr newNode(new TreeNode(value));
+  NodePtr newNode = std::make_unique<TreeNode>(value);
   newNode->_parent = this;
 
   // find children of current node which have to become children of the new node
@@ -60,6 +64,13 @@ TreeNode* TreeNode::insertAfter(string_view value) {
 
   // register the newly created node as a child of this node
   _children.push_back(std::move(newNode));
+
+  for (const auto& c : _children) {
+    if (c.get() == nullptr) {
+      LOG(ERROR) << "Illegal nullptr child was found" << std::endl;
+    }
+  }
+
   return _children.back().get();
 }
 
@@ -176,9 +187,10 @@ std::vector<string> calculatePrefixes(const string& filename,
   size_t totalSavings = 0;
   size_t numWords = 0;
 
-  LOG(INFO) << "start reading words and building prefix tree...\n";
+  LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
   // insert all prefix candidates into  the tree
   while (std::getline(ifs, nextWord)) {
+    nextWord = RdfEscaping::unescapeNewlineAndBackslash(nextWord);
     totalChars += nextWord.size();
     // the longest common prefixes between two adjacent words are our candidates
     // for compression
@@ -194,12 +206,12 @@ std::vector<string> calculatePrefixes(const string& filename,
 
     numWords++;
     if (numWords % 10000000 == 0) {
-      LOG(INFO) << "words read: " << numWords << '\n';
+      LOG(INFO) << "words read: " << numWords << std::endl;
     }
   }
 
-  LOG(INFO) << "Finished building prefix tree!\n";
-  LOG(INFO) << "Start searching for maximal compressing prefixes\n";
+  LOG(INFO) << "Finished building prefix tree!" << std::endl;
+  LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
   std::vector<string> res;
   res.reserve(numPrefixes);
   for (size_t i = 0; i < numPrefixes; ++i) {
@@ -209,7 +221,7 @@ std::vector<string> calculatePrefixes(const string& filename,
     }
     totalSavings += p.first;
     LOG(INFO) << "Found prefix " << p.second
-              << " with number of bytes gained: " << p.first << '\n';
+              << " with number of bytes gained: " << p.first << std::endl;
     res.push_back(std::move(p.second));
   }
   // if we always add an encoding we have calculated with a codelength of 0 so
@@ -218,9 +230,9 @@ std::vector<string> calculatePrefixes(const string& filename,
     totalSavings -= codelength * numWords;
   }
   double efficiency = static_cast<double>(totalSavings) / totalChars;
-  std::cout << "total number of bytes : " << totalChars << '\n';
+  std::cout << "total number of bytes : " << totalChars << std::endl;
   std::cout << "total chars compressed : " << totalSavings << '\n';
-  std::cout << "percentage of chars compressed : " << efficiency << '\n';
+  std::cout << "percentage of chars compressed : " << efficiency << std::endl;
   return res;
 }
 

diff --git a/src/index/PrefixHeuristic.h b/src/index/PrefixHeuristic.h
@@ -55,11 +55,14 @@ using std::string_view;
 class TreeNode {
  private:
   friend class Tree;
+
   using NodePtr = std::unique_ptr<TreeNode>;
 
   // Constructor
+ public:
   explicit TreeNode(string_view value) : _value(value) {}
 
+ private:
   // Recursive Insertion of value. If the value does not match _value we will
   // automatically call insert on a node that is closer to the actual position
   // of value in the Tree. Returns the node that was actually inserted

diff --git a/src/index/VocabularyGeneratorImpl.h b/src/index/VocabularyGeneratorImpl.h
@@ -153,7 +153,8 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff
 
       // write the new word to the vocabulary
       if (_lastWritten < EXTERNALIZED_LITERALS_PREFIX) {
-        _outfile << _lastWritten << '\n';
+        _outfile << RdfEscaping::escapeNewlineAndBackslash(_lastWritten)
+                 << '\n';
       } else {
         // we have to strip the externalization character again
         auto& c = _lastWritten[0];
@@ -169,7 +170,8 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff
                           "should never happen\n";
             AD_CHECK(false)
         }
-        _outfileExternal << _lastWritten << '\n';
+        _outfileExternal << RdfEscaping::escapeNewlineAndBackslash(_lastWritten)
+                         << '\n';
       }
 
       // write id to corresponding vec

diff --git a/src/index/VocabularyImpl.h b/src/index/VocabularyImpl.h
@@ -8,6 +8,8 @@
 #include <fstream>
 #include <iostream>
 
+#include "../parser/RdfEscaping.h"
+#include "../parser/Tokenizer.h"
 #include "../util/File.h"
 #include "../util/HashMap.h"
 #include "../util/HashSet.h"
@@ -30,8 +32,10 @@ void Vocabulary<S, C>::readFromFile(const string& fileName,
     if constexpr (_isCompressed) {
       // when we read from file it means that all preprocessing has been done
       // and the prefixes are already stripped in the file
-      _words.push_back(CompressedString::fromString(line));
-      auto str = expandPrefix(_words.back());
+      auto str = RdfEscaping::unescapeNewlineAndBackslash(
+          expandPrefix(CompressedString::fromString(line)));
+
+      _words.push_back(compressPrefix(str));
       if (!first) {
         if (!(_caseComparator.compare(lastExpandedString, str,
                                       SortLevel::TOTAL))) {

diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt
@@ -1,13 +1,13 @@
 add_library(parser
-              SparqlParser.h SparqlParser.cpp
-              ParsedQuery.h ParsedQuery.cpp
-              ParseException.h
-              TsvParser.h TsvParser.cpp
-              NTriplesParser.h NTriplesParser.cpp
-              TurtleParser.h TurtleParser.cpp
-              Tokenizer.h Tokenizer.cpp
-              ContextFileParser.cpp ContextFileParser.h
-              ParallelParseBuffer.h
-              PropertyPathParser.h PropertyPathParser.cpp
-              SparqlLexer.h SparqlLexer.cpp)
-target_link_libraries(parser re2 absl::flat_hash_map)
+        SparqlParser.h SparqlParser.cpp
+        ParsedQuery.h ParsedQuery.cpp
+        ParseException.h
+        TsvParser.h TsvParser.cpp
+        NTriplesParser.h NTriplesParser.cpp
+        TurtleParser.h TurtleParser.cpp
+        Tokenizer.h Tokenizer.cpp
+        ContextFileParser.cpp ContextFileParser.h
+        ParallelParseBuffer.h
+        PropertyPathParser.h PropertyPathParser.cpp
+        SparqlLexer.h SparqlLexer.cpp RdfEscaping.h)
+target_link_libraries(parser re2 ${ICU_LIBRARIES} absl::flat_hash_map)
diff --git a/src/parser/ParsedQuery.cpp b/src/parser/ParsedQuery.cpp
@@ -11,7 +11,9 @@
 
 #include "../util/Conversions.h"
 #include "../util/StringUtils.h"
+#include "./RdfEscaping.h"
 #include "ParseException.h"
+#include "Tokenizer.h"
 
 using std::string;
 using std::vector;
@@ -373,14 +375,9 @@ void ParsedQuery::expandPrefix(
     if (i != string::npos && i >= from &&
         prefixMap.count(item.substr(from, i - from)) > 0) {
       string prefixUri = prefixMap.find(item.substr(from, i - from))->second;
-      if (from == 0) {
-        item = prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
-               '>';
-      } else {
-        item = item.substr(0, from) +
-               prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
-               '>';
-      }
+      item = item.substr(0, from) + prefixUri.substr(0, prefixUri.size() - 1) +
+             item.substr(i + 1) + '>';
+      item = RdfEscaping::unescapePrefixedIri(item);
     }
     if (langtag) {
       item =