Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly handle Escapes in the Turtle input. #317

Merged
merged 19 commits into from
May 8, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000,
"ascii-prefixes-only":true
"ascii-prefixes-only":false
}
10 changes: 5 additions & 5 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -790,11 +790,11 @@ queries:
checks:
- num_rows: 579
- num_cols: 2
- selected: ["?s", "?a"]
- contains_row: ["<Albert_Einstein>", "<Nobel_Prize_in_Physics>"]
- contains_row: ["<Albert_Fert>", "<Wolf_Prize_in_Physics>"]
- contains_row: ["<Albert_Overhauser>", "<National_Medal_of_Science_for_Physical_Science>"]
- contains_row: ["<Andre_Geim>", "<Nobel_Prize_in_Physics>"]
- selected: [ "?s", "?a" ]
- contains_row: [ "<Albert_Einstein>", "<Nobel_Prize_in_Physics>" ]
- contains_row: [ "<Albert_Fert>", "<Wolf_Prize_in_Physics>" ]
- contains_row: [ "<Albert_Overhauser>", "<National_Medal_of_Science_for_Physical_Science>" ]
- contains_row: [ "<Andre_Geim>", "<Nobel_Prize_in_Physics>" ]

- query: bind-rename
type: no-text
Expand Down
1 change: 1 addition & 0 deletions src/engine/Bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//

#include "Bind.h"

#include "../util/Exception.h"
#include "CallFixedSize.h"
#include "QueryExecutionTree.h"
Expand Down
4 changes: 2 additions & 2 deletions src/engine/SortPerformanceEstimator.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class SortPerformanceEstimator {

// Compute and return an estimate for how long sorting an IdTable with the
// specified number of rows and columns takes.
double estimatedSortTimeInSeconds(size_t numRows, size_t numCols) const
noexcept;
double estimatedSortTimeInSeconds(size_t numRows,
size_t numCols) const noexcept;

private:
// Set up all the estimates. Might take several minutes. This constructor is
Expand Down
4 changes: 3 additions & 1 deletion src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"You explicitly requested the ascii-prefixes-only settings or the ctre "
"regex engine for Tokenization. This means "
joka921 marked this conversation as resolved.
Show resolved Hide resolved
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"the ascii range and that no escape sequences may be used in prefixed "
"names (e.g. rdfs:lab\\,el)."
" This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";

static const std::string LOCALE_DEFAULT_LANG = "en";
Expand Down
6 changes: 6 additions & 0 deletions src/index/ExternalVocabulary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <fstream>

#include "../parser/RdfEscaping.h"
#include "../parser/Tokenizer.h"
#include "../util/Log.h"

// _____________________________________________________________________________
Expand Down Expand Up @@ -74,6 +76,10 @@ void ExternalVocabulary<Comp>::buildFromTextFile(const string& textFileName,
_size = 0;
std::string word;
while (std::getline(infile, word)) {
// In the text file we stored the strings with escaped \n and
// \\ characters. We now store them in a binary format, where we can use the
// `actual` values.
joka921 marked this conversation as resolved.
Show resolved Hide resolved
word = RdfEscaping::unescapeNewlineAndBackslash(word);
offsets.push_back(currentOffset);
currentOffset += _file.write(word.data(), word.size());
_size++;
Expand Down
13 changes: 7 additions & 6 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ void Index::createFromFile(const string& filename) {
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
LOG(INFO) << "Finished writing permutations" << std::endl;
if (_vocabPrefixCompressed) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
Expand All @@ -107,7 +108,7 @@ void Index::createFromFile(const string& filename) {
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << '\n';
prefixFile << prefix << std::endl;
}
}
_configurationJson["prefixes"] = _vocabPrefixCompressed;
Expand All @@ -118,7 +119,7 @@ void Index::createFromFile(const string& filename) {
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
<< " to " << vocabFile << " set errno to " << errno
<< ". Terminating...\n";
<< ". Terminating..." << std::endl;
AD_CHECK(false);
}
writeConfiguration();
Expand Down Expand Up @@ -163,7 +164,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
_parserBatchSize,
// when called, returns an optional to the next triple. If
// <linexPerPartial> triples were parsed, return std::nullopt. when
// `linesPerPartial` triples were parsed, return std::nullopt. when
// the parser is unable to deliver triples, set parserExhausted to
// true and return std::nullopt. this is exactly the behavior we need,
// as a first step in the parallel Pipeline.
Expand Down Expand Up @@ -433,7 +434,7 @@ Index::createPermutationPairImpl(const string& fileName1,

out1.close();
out2.close();
LOG(INFO) << "Permutation done.\n";
LOG(INFO) << "Permutation done." << std::endl;
return std::make_pair(std::move(metaData1), std::move(metaData2));
}

Expand Down Expand Up @@ -493,12 +494,12 @@ void Index::createPermutationPair(
&(metaData.value().second));
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Writing MetaData for " << p1._readableName << " and "
<< p2._readableName << '\n';
<< p2._readableName << std::endl;
ad_utility::File f1(_onDiskBase + ".index" + p1._fileSuffix, "r+");
metaData.value().first.appendToFile(&f1);
ad_utility::File f2(_onDiskBase + ".index" + p2._fileSuffix, "r+");
metaData.value().second.appendToFile(&f2);
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Done" << std::endl;
}
}

Expand Down
1 change: 1 addition & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <string>
#include <stxxl/vector>
#include <vector>

#include "../engine/ResultTable.h"
#include "../global/Pattern.h"
#include "../parser/NTriplesParser.h"
Expand Down
28 changes: 20 additions & 8 deletions src/index/PrefixHeuristic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com)

#include "./PrefixHeuristic.h"

#include <algorithm>
#include <fstream>

#include "../parser/RdfEscaping.h"
#include "../parser/Tokenizer.h"
#include "../util/Exception.h"
#include "../util/Log.h"
#include "../util/StringUtils.h"
Expand Down Expand Up @@ -41,7 +45,7 @@ TreeNode* TreeNode::insertAfter(string_view value) {
}

// if we have reached here, we have to add a new child
NodePtr newNode(new TreeNode(value));
NodePtr newNode = std::make_unique<TreeNode>(value);
newNode->_parent = this;

// find children of current node which have to become children of the new node
Expand All @@ -60,6 +64,13 @@ TreeNode* TreeNode::insertAfter(string_view value) {

// register the newly created node as a child of this node
_children.push_back(std::move(newNode));

for (const auto& c : _children) {
if (c.get() == nullptr) {
LOG(ERROR) << "Illegal nullptr child was found" << std::endl;
}
}

joka921 marked this conversation as resolved.
Show resolved Hide resolved
return _children.back().get();
}

Expand Down Expand Up @@ -176,9 +187,10 @@ std::vector<string> calculatePrefixes(const string& filename,
size_t totalSavings = 0;
size_t numWords = 0;

LOG(INFO) << "start reading words and building prefix tree...\n";
LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
// insert all prefix candidates into the tree
while (std::getline(ifs, nextWord)) {
nextWord = RdfEscaping::unescapeNewlineAndBackslash(nextWord);
totalChars += nextWord.size();
// the longest common prefixes between two adjacent words are our candidates
// for compression
Expand All @@ -194,12 +206,12 @@ std::vector<string> calculatePrefixes(const string& filename,

numWords++;
if (numWords % 10000000 == 0) {
LOG(INFO) << "words read: " << numWords << '\n';
LOG(INFO) << "words read: " << numWords << std::endl;
}
}

LOG(INFO) << "Finished building prefix tree!\n";
LOG(INFO) << "Start searching for maximal compressing prefixes\n";
LOG(INFO) << "Finished building prefix tree!" << std::endl;
LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
std::vector<string> res;
res.reserve(numPrefixes);
for (size_t i = 0; i < numPrefixes; ++i) {
Expand All @@ -209,7 +221,7 @@ std::vector<string> calculatePrefixes(const string& filename,
}
totalSavings += p.first;
LOG(INFO) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << '\n';
<< " with number of bytes gained: " << p.first << std::endl;
res.push_back(std::move(p.second));
}
// if we always add an encoding we have calculated with a codelength of 0 so
Expand All @@ -218,9 +230,9 @@ std::vector<string> calculatePrefixes(const string& filename,
totalSavings -= codelength * numWords;
}
double efficiency = static_cast<double>(totalSavings) / totalChars;
std::cout << "total number of bytes : " << totalChars << '\n';
std::cout << "total number of bytes : " << totalChars << std::endl;
std::cout << "total chars compressed : " << totalSavings << '\n';
std::cout << "percentage of chars compressed : " << efficiency << '\n';
std::cout << "percentage of chars compressed : " << efficiency << std::endl;
return res;
}

Expand Down
3 changes: 3 additions & 0 deletions src/index/PrefixHeuristic.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ using std::string_view;
class TreeNode {
private:
friend class Tree;

using NodePtr = std::unique_ptr<TreeNode>;

// Constructor
public:
explicit TreeNode(string_view value) : _value(value) {}

private:
joka921 marked this conversation as resolved.
Show resolved Hide resolved
// Recursive Insertion of value. If the value does not match _value we will
// automatically call insert on a node that is closer to the actual position
// of value in the Tree. Returns the node that was actually inserted
Expand Down
6 changes: 4 additions & 2 deletions src/index/VocabularyGeneratorImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,8 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff

// write the new word to the vocabulary
if (_lastWritten < EXTERNALIZED_LITERALS_PREFIX) {
_outfile << _lastWritten << '\n';
_outfile << RdfEscaping::escapeNewlineAndBackslash(_lastWritten)
<< '\n';
} else {
// we have to strip the externalization character again
auto& c = _lastWritten[0];
Expand All @@ -169,7 +170,8 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff
"should never happen\n";
AD_CHECK(false)
}
_outfileExternal << _lastWritten << '\n';
_outfileExternal << RdfEscaping::escapeNewlineAndBackslash(_lastWritten)
<< '\n';
}

// write id to corresponding vec
Expand Down
8 changes: 6 additions & 2 deletions src/index/VocabularyImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <fstream>
#include <iostream>

#include "../parser/RdfEscaping.h"
#include "../parser/Tokenizer.h"
#include "../util/File.h"
#include "../util/HashMap.h"
#include "../util/HashSet.h"
Expand All @@ -30,8 +32,10 @@ void Vocabulary<S, C>::readFromFile(const string& fileName,
if constexpr (_isCompressed) {
// when we read from file it means that all preprocessing has been done
// and the prefixes are already stripped in the file
_words.push_back(CompressedString::fromString(line));
auto str = expandPrefix(_words.back());
auto str = RdfEscaping::unescapeNewlineAndBackslash(
expandPrefix(CompressedString::fromString(line)));

_words.push_back(compressPrefix(str));
if (!first) {
if (!(_caseComparator.compare(lastExpandedString, str,
SortLevel::TOTAL))) {
Expand Down
24 changes: 12 additions & 12 deletions src/parser/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
add_library(parser
SparqlParser.h SparqlParser.cpp
ParsedQuery.h ParsedQuery.cpp
ParseException.h
TsvParser.h TsvParser.cpp
NTriplesParser.h NTriplesParser.cpp
TurtleParser.h TurtleParser.cpp
Tokenizer.h Tokenizer.cpp
ContextFileParser.cpp ContextFileParser.h
ParallelParseBuffer.h
PropertyPathParser.h PropertyPathParser.cpp
SparqlLexer.h SparqlLexer.cpp)
target_link_libraries(parser re2 absl::flat_hash_map)
SparqlParser.h SparqlParser.cpp
ParsedQuery.h ParsedQuery.cpp
ParseException.h
TsvParser.h TsvParser.cpp
NTriplesParser.h NTriplesParser.cpp
TurtleParser.h TurtleParser.cpp
Tokenizer.h Tokenizer.cpp
ContextFileParser.cpp ContextFileParser.h
ParallelParseBuffer.h
PropertyPathParser.h PropertyPathParser.cpp
SparqlLexer.h SparqlLexer.cpp RdfEscaping.h)
target_link_libraries(parser re2 ${ICU_LIBRARIES} absl::flat_hash_map)
13 changes: 5 additions & 8 deletions src/parser/ParsedQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

#include "../util/Conversions.h"
#include "../util/StringUtils.h"
#include "./RdfEscaping.h"
#include "ParseException.h"
#include "Tokenizer.h"

using std::string;
using std::vector;
Expand Down Expand Up @@ -373,14 +375,9 @@ void ParsedQuery::expandPrefix(
if (i != string::npos && i >= from &&
prefixMap.count(item.substr(from, i - from)) > 0) {
joka921 marked this conversation as resolved.
Show resolved Hide resolved
string prefixUri = prefixMap.find(item.substr(from, i - from))->second;
joka921 marked this conversation as resolved.
Show resolved Hide resolved
if (from == 0) {
item = prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
'>';
} else {
item = item.substr(0, from) +
prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
'>';
}
item = item.substr(0, from) + prefixUri.substr(0, prefixUri.size() - 1) +
joka921 marked this conversation as resolved.
Show resolved Hide resolved
item.substr(i + 1) + '>';
item = RdfEscaping::unescapePrefixedIri(item);
}
if (langtag) {
item =
Expand Down