Correct handling of escaped literals and IRIREFs during the index build.
- Also apply the normalization of literals correctly at index build time (see the round-trip sketch after this list).
- Adapt the Index unit tests to "legal" knowledge bases

- Get rid of a misleading warning in the case of whitespace at the
  end of a TTL file.
  - Previously there was a "parsing of ttl has failed, but there is still content left"
    warning, although the remainder of the TTL input was only whitespace.
  - This was due to a bug in the parser's skipWhitespace() function, which failed if the
    input consisted of ONLY whitespace. This is now fixed (a reconstruction of the bug
    follows the list below).

- The case where a prefix was used with an empty local part (e.g. <a> wd: <b>) was
  broken before. Luckily there was a unit test, and this is now fixed (see the
  expansion example below).
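
The core invariant behind the first bullet is that escaping and unescaping must be
inverses: every vocabulary entry written in escaped form has to come back unchanged
after a round trip (the diffs below check exactly this with
TurtleToken::escapeRDFLiteral / TurtleToken::normalizeRDFLiteral). A minimal,
self-contained sketch of that invariant, with simplified stand-ins for the two
TurtleToken functions (the real ones implement the full Turtle ECHAR/UCHAR rules):

#include <cassert>
#include <iostream>
#include <stdexcept>
#include <string>

// Simplified stand-in for TurtleToken::escapeRDFLiteral: makes an entry
// newline-free so it can be stored one-per-line in the text vocabulary.
std::string escapeRDFLiteral(const std::string& in) {
  std::string out;
  for (char c : in) {
    switch (c) {
      case '\\': out += "\\\\"; break;
      case '\n': out += "\\n"; break;
      case '\t': out += "\\t"; break;
      default: out += c;
    }
  }
  return out;
}

// Simplified stand-in for TurtleToken::normalizeRDFLiteral: resolves the
// escape sequences again. Throws on a malformed escape.
std::string normalizeRDFLiteral(const std::string& in) {
  std::string out;
  for (size_t i = 0; i < in.size(); ++i) {
    if (in[i] != '\\') { out += in[i]; continue; }
    if (++i == in.size()) throw std::runtime_error("dangling escape");
    switch (in[i]) {
      case '\\': out += '\\'; break;
      case 'n': out += '\n'; break;
      case 't': out += '\t'; break;
      default: throw std::runtime_error("unknown escape");
    }
  }
  return out;
}

int main() {
  // The invariant the index build now checks for every vocabulary entry.
  std::string word = "\"multi\nline \\ literal\"@en";
  assert(normalizeRDFLiteral(escapeRDFLiteral(word)) == word);
  std::cout << "round trip ok\n";
}

The skipWhitespace() bug is easy to reconstruct under one assumption: the function
consumes leading whitespace and reports whether parsing can continue. A variant that
signals failure when nothing but whitespace remains produces exactly the spurious
"content left" warning; the fix is to treat an all-whitespace remainder as success.
A hypothetical reconstruction (the real function lives in the Turtle parser):

#include <cassert>
#include <cctype>
#include <string_view>

// Buggy variant: reports failure when the input is ONLY whitespace, making
// the caller believe unparsed content is left over.
bool skipWhitespaceBuggy(std::string_view& input) {
  size_t i = 0;
  while (i < input.size() && std::isspace(static_cast<unsigned char>(input[i]))) ++i;
  input.remove_prefix(i);
  return !input.empty();  // bug: empty remainder counts as failure
}

// Fixed variant: skipping whitespace always succeeds; an empty remainder
// simply means the file is fully parsed.
bool skipWhitespaceFixed(std::string_view& input) {
  size_t i = 0;
  while (i < input.size() && std::isspace(static_cast<unsigned char>(input[i]))) ++i;
  input.remove_prefix(i);
  return true;
}

int main() {
  std::string_view trailing = "\n  \t\n";  // whitespace at the end of a TTL file
  std::string_view copy = trailing;
  assert(!skipWhitespaceBuggy(copy));      // triggered the misleading warning
  copy = trailing;
  assert(skipWhitespaceFixed(copy) && copy.empty());
}

Finally, a prefixed name with an empty local part, as in the `<a> wd: <b>` example
above, is legal Turtle: `wd:` expands to the bare namespace IRI. A small
illustration (expandPrefix here is an ad-hoc helper, not the QLever function of the
same name):

#include <cassert>
#include <map>
#include <string>

// Illustrative helper: expand a (possibly empty) local part against a
// prefix map, as in the Turtle line `<a> wd: <b> .` from the commit message.
std::string expandPrefix(const std::map<std::string, std::string>& prefixes,
                         const std::string& prefixedName) {
  auto colon = prefixedName.find(':');
  const std::string base = prefixes.at(prefixedName.substr(0, colon));
  const std::string local = prefixedName.substr(colon + 1);  // may be empty
  return "<" + base + local + ">";
}

int main() {
  std::map<std::string, std::string> prefixes{
      {"wd", "http://www.wikidata.org/entity/"}};
  // An empty local part must yield the bare namespace IRI.
  assert(expandPrefix(prefixes, "wd:") == "<http://www.wikidata.org/entity/>");
  assert(expandPrefix(prefixes, "wd:Q42") == "<http://www.wikidata.org/entity/Q42>");
}
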
joka921 committed Mar 23, 2021
1 parent 211ded4 commit fcc127c
Showing 19 changed files with 654 additions and 327 deletions.
2 changes: 1 addition & 1 deletion e2e/e2e-build-settings.json
@@ -1,5 +1,5 @@
{
"num-triples-per-partial-vocab" : 40000,
"parser-batch-size" : 1000,
"ascii-prefixes-only":true
"ascii-prefixes-only":false
}
4 changes: 3 additions & 1 deletion src/global/Constants.h
@@ -79,7 +79,9 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
"You explicitly requested the ascii-prefixes-only settings or the ctre "
"regex engine for Tokenization. This means "
"that prefixes in the input Turtle may only use characters from "
"the ascii range. This is stricter than the Sparql standard but "
"the ascii range and that no escape sequences may be used in prefixed "
"names (e.g. rdfs:lab\\,el)."
" This is stricter than the Sparql standard but "
"makes parsing faster and works e.g. for wikidata dumps\n";

static const std::string LOCALE_DEFAULT_LANG = "en";
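
The extended warning above points at escape sequences in prefixed names such as
rdfs:lab\,el. In standard Turtle, PN_LOCAL_ESC lets characters like ',' appear in a
local name when backslash-escaped, and the escaped name denotes the IRI with the raw
character, here <http://www.w3.org/2000/01/rdf-schema#lab,el>. A tiny illustration
(unescapePnLocal is a hypothetical helper; the commit itself uses
TurtleToken::unescapePrefixedIri for this):

#include <cassert>
#include <string>

// Hypothetical helper showing what PN_LOCAL_ESC unescaping does:
// `rdfs:lab\,el` denotes <http://www.w3.org/2000/01/rdf-schema#lab,el>.
std::string unescapePnLocal(const std::string& local) {
  std::string out;
  for (size_t i = 0; i < local.size(); ++i) {
    if (local[i] == '\\' && i + 1 < local.size()) ++i;  // drop the backslash
    out += local[i];
  }
  return out;
}

int main() {
  assert(unescapePnLocal("lab\\,el") == "lab,el");
}
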
5 changes: 5 additions & 0 deletions src/index/ExternalVocabulary.cpp
@@ -6,6 +6,7 @@

#include <fstream>

#include "../parser/Tokenizer.h"
#include "../util/Log.h"

// _____________________________________________________________________________
@@ -74,6 +75,10 @@ void ExternalVocabulary<Comp>::buildFromTextFile(const string& textFileName,
_size = 0;
std::string word;
while (std::getline(infile, word)) {
// In the text file we had the escaped variants; here we can store the
// correct (unescaped) ones, since the binary format uses offsets
// instead of newline delimiters.
word = TurtleToken::normalizeRDFLiteral<false>(word);
offsets.push_back(currentOffset);
currentOffset += _file.write(word.data(), word.size());
_size++;
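
The comment in this hunk describes the asymmetry: the intermediate text file must
hold escaped, newline-free entries because it is read line by line, while the
external vocabulary stores raw bytes plus an offset table, so embedded newlines are
harmless. A minimal sketch of such offset-based storage (the struct and layout are
illustrative, not QLever's actual on-disk format):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Minimal sketch of offset-based storage: raw bytes plus an offset table.
// Entries may contain newlines because lookup never scans for delimiters.
struct BinaryVocab {
  std::string bytes;
  std::vector<size_t> offsets;  // start of each word; the next offset is its end

  void push(const std::string& word) {
    offsets.push_back(bytes.size());
    bytes += word;
  }
  std::string get(size_t i) const {
    size_t end = (i + 1 < offsets.size()) ? offsets[i + 1] : bytes.size();
    return bytes.substr(offsets[i], end - offsets[i]);
  }
};

int main() {
  BinaryVocab v;
  v.push("\"multi\nline\"");  // the raw, unescaped form is fine here
  v.push("<http://example.org/a>");
  assert(v.get(0) == "\"multi\nline\"");
  assert(v.get(1) == "<http://example.org/a>");
}
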
31 changes: 25 additions & 6 deletions src/index/Index.cpp
@@ -97,6 +97,7 @@ void Index::createFromFile(const string& filename) {
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
LOG(INFO) << "Finished writing permutations" << std::endl;
if (_vocabPrefixCompressed) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
@@ -107,7 +108,7 @@ void Index::createFromFile(const string& filename) {
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << '\n';
prefixFile << prefix << std::endl;
}
}
_configurationJson["prefixes"] = _vocabPrefixCompressed;
@@ -118,7 +119,7 @@ void Index::createFromFile(const string& filename) {
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
<< " to " << vocabFile << " set errno to " << errno
<< ". Terminating...\n";
<< ". Terminating..." << std::endl;
AD_CHECK(false);
}
writeConfiguration();
@@ -160,7 +161,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
std::array<ItemMapManager, NUM_PARALLEL_ITEM_MAPS> itemArray;

{
auto p = ad_pipeline::setupParallelPipeline<1, NUM_PARALLEL_ITEM_MAPS>(
auto p = ad_pipeline::setupParallelPipeline<1, 1, NUM_PARALLEL_ITEM_MAPS>(
_parserBatchSize,
// when called, returns an optional to the next triple. If
// <linesPerPartial> triples were parsed, return std::nullopt. when
@@ -169,6 +170,24 @@
// as a first step in the parallel Pipeline.
ParserBatcher(parser, linesPerPartial,
[&]() { parserExhausted = true; }),
// do all the unescaping from SPARQL (TODO<joka921>: move this into
// its own pipeline stage within the parser)
[this](Triple&& t) {
Triple res;
std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
auto res = TurtleToken::normalizeRDFLiteral(s);
try {
[[maybe_unused]] auto tmp = TurtleToken::normalizeRDFLiteral(
TurtleToken::escapeRDFLiteral(res));
} catch (...) {
LOG(ERROR) << "Vocabulary entry " + s +
" could not be (un)escaped properly"
<< std::endl;
}
return res;
});
return res;
},
// convert each triple to the internal representation (e.g. special
// values for Numbers, externalized literals, etc.)
[this](Triple&& t) {
@@ -433,7 +452,7 @@

out1.close();
out2.close();
LOG(INFO) << "Permutation done.\n";
LOG(INFO) << "Permutation done." << std::endl;
return std::make_pair(std::move(metaData1), std::move(metaData2));
}

@@ -493,12 +512,12 @@ void Index::createPermutationPair(
&(metaData.value().second));
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Writing MetaData for " << p1._readableName << " and "
<< p2._readableName << '\n';
<< p2._readableName << std::endl;
ad_utility::File f1(_onDiskBase + ".index" + p1._fileSuffix, "r+");
metaData.value().first.appendToFile(&f1);
ad_utility::File f2(_onDiskBase + ".index" + p2._fileSuffix, "r+");
metaData.value().second.appendToFile(&f2);
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Done" << std::endl;
}
}

25 changes: 17 additions & 8 deletions src/index/PrefixHeuristic.cpp
@@ -5,6 +5,7 @@
#include "./PrefixHeuristic.h"
#include <algorithm>
#include <fstream>
#include "../parser/Tokenizer.h"
#include "../util/Exception.h"
#include "../util/Log.h"
#include "../util/StringUtils.h"
@@ -41,7 +42,7 @@ TreeNode* TreeNode::insertAfter(string_view value) {
}

// if we have reached here, we have to add a new child
NodePtr newNode(new TreeNode(value));
NodePtr newNode = std::make_unique<TreeNode>(value);
newNode->_parent = this;

// find children of current node which have to become children of the new node
@@ -60,6 +61,13 @@ TreeNode* TreeNode::insertAfter(string_view value) {

// register the newly created node as a child of this node
_children.push_back(std::move(newNode));

for (const auto& c : _children) {
if (c.get() == nullptr) {
LOG(ERROR) << "Illegal nullptr child was found" << std::endl;
}
}

return _children.back().get();
}

@@ -176,9 +184,10 @@ std::vector<string> calculatePrefixes(const string& filename,
size_t totalSavings = 0;
size_t numWords = 0;

LOG(INFO) << "start reading words and building prefix tree...\n";
LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
// insert all prefix candidates into the tree
while (std::getline(ifs, nextWord)) {
nextWord = TurtleToken::normalizeRDFLiteral<false>(nextWord);
totalChars += nextWord.size();
// the longest common prefixes between two adjacent words are our candidates
// for compression
@@ -194,12 +203,12 @@

numWords++;
if (numWords % 10000000 == 0) {
LOG(INFO) << "words read: " << numWords << '\n';
LOG(INFO) << "words read: " << numWords << std::endl;
}
}

LOG(INFO) << "Finished building prefix tree!\n";
LOG(INFO) << "Start searching for maximal compressing prefixes\n";
LOG(INFO) << "Finished building prefix tree!" << std::endl;
LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
std::vector<string> res;
res.reserve(numPrefixes);
for (size_t i = 0; i < numPrefixes; ++i) {
@@ -209,7 +218,7 @@
}
totalSavings += p.first;
LOG(INFO) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << '\n';
<< " with number of bytes gained: " << p.first << std::endl;
res.push_back(std::move(p.second));
}
// if we always add an encoding we have calculated with a codelength of 0 so
@@ -218,9 +227,9 @@
totalSavings -= codelength * numWords;
}
double efficiency = static_cast<double>(totalSavings) / totalChars;
std::cout << "total number of bytes : " << totalChars << '\n';
std::cout << "total number of bytes : " << totalChars << std::endl;
std::cout << "total chars compressed : " << totalSavings << '\n';
std::cout << "percentage of chars compressed : " << efficiency << '\n';
std::cout << "percentage of chars compressed : " << efficiency << std::endl;
return res;
}
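
The loop above collects, for each pair of adjacent words in the sorted vocabulary,
their longest common prefix as a compression candidate; with the new
normalizeRDFLiteral<false> call the candidates are computed on the unescaped forms
that are actually stored. A compact sketch of the candidate generation (the greedy
selection over the prefix tree and the savings bookkeeping are omitted):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Longest common prefix of two strings.
std::string lcp(const std::string& a, const std::string& b) {
  auto ends = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
  return std::string(a.begin(), ends.first);
}

int main() {
  // Adjacent words in a sorted vocabulary; their pairwise LCPs are the
  // candidate prefixes for compression.
  std::vector<std::string> words{
      "<http://example.org/city/Berlin>", "<http://example.org/city/Bonn>",
      "<http://example.org/person/Ada>"};
  std::map<std::string, size_t> candidates;  // prefix -> occurrences
  for (size_t i = 1; i < words.size(); ++i) {
    std::string p = lcp(words[i - 1], words[i]);
    if (!p.empty()) ++candidates[p];
  }
  for (const auto& [prefix, count] : candidates) {
    std::cout << prefix << " (seen " << count << "x)\n";
  }
}
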

3 changes: 3 additions & 0 deletions src/index/PrefixHeuristic.h
@@ -55,11 +55,14 @@ using std::string_view;
class TreeNode {
private:
friend class Tree;

using NodePtr = std::unique_ptr<TreeNode>;

// Constructor
public:
explicit TreeNode(string_view value) : _value(value) {}

private:
// Recursive Insertion of value. If the value does not match _value we will
// automatically call insert on a node that is closer to the actual position
// of value in the Tree. Returns the node that was actually inserted
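
The constructor moves to a public section because of the change in insertAfter
above: `NodePtr(new TreeNode(value))` became `std::make_unique<TreeNode>(value)`,
and std::make_unique constructs the object itself, outside any friend context, so
it cannot reach a private constructor. In miniature:

#include <memory>
#include <string>
#include <string_view>

class Node {
 public:
  // Must be public: std::make_unique constructs the object itself and
  // has no access to private members of Node.
  explicit Node(std::string_view value) : _value(value) {}

 private:
  std::string _value;
};

int main() {
  auto n = std::make_unique<Node>("prefix");
  (void)n;
}
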
11 changes: 9 additions & 2 deletions src/index/VocabularyGeneratorImpl.h
@@ -153,7 +153,14 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff

// write the new word to the vocabulary
if (_lastWritten < EXTERNALIZED_LITERALS_PREFIX) {
_outfile << _lastWritten << '\n';
auto escaped = TurtleToken::escapeRDFLiteral(_lastWritten);
try {
auto restored = TurtleToken::normalizeRDFLiteral(escaped);
} catch (...) {
LOG(ERROR) << "Failure in the (un) escaping of vocabulary entry " + _lastWritten
<< std::endl;
}
_outfile << TurtleToken::escapeRDFLiteral(_lastWritten) << '\n';
} else {
// we have to strip the externalization character again
auto& c = _lastWritten[0];
Expand All @@ -169,7 +176,7 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff
"should never happen\n";
AD_CHECK(false)
}
_outfileExternal << _lastWritten << '\n';
_outfileExternal << TurtleToken::escapeRDFLiteral(_lastWritten) << '\n';
}

// write id to corresponding vec
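
At this stage the merged vocabulary is still a newline-delimited text file, which is
why every entry, internal or external, now goes through TurtleToken::escapeRDFLiteral
before being written: the escaped form is guaranteed newline-free. The
normalize-after-escape call above is a sanity check that the entry will survive the
round trip when it is read back and unescaped later (compare the round-trip sketch
near the top of this commit).
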
7 changes: 5 additions & 2 deletions src/index/VocabularyImpl.h
@@ -8,6 +8,7 @@
#include <fstream>
#include <iostream>

#include "../parser/Tokenizer.h"
#include "../util/File.h"
#include "../util/HashMap.h"
#include "../util/HashSet.h"
@@ -30,8 +31,10 @@ void Vocabulary<S, C>::readFromFile(const string& fileName,
if constexpr (_isCompressed) {
// when we read from file it means that all preprocessing has been done
// and the prefixes are already stripped in the file
_words.push_back(CompressedString::fromString(line));
auto str = expandPrefix(_words.back());
auto str = TurtleToken::normalizeRDFLiteral<false>(
expandPrefix(CompressedString::fromString(line)));

_words.push_back(compressPrefix(str));
if (!first) {
if (!(_caseComparator.compare(lastExpandedString, str,
SortLevel::TOTAL))) {
12 changes: 4 additions & 8 deletions src/parser/ParsedQuery.cpp
@@ -12,6 +12,7 @@
#include "../util/Conversions.h"
#include "../util/StringUtils.h"
#include "ParseException.h"
#include "Tokenizer.h"

using std::string;
using std::vector;
@@ -368,14 +369,9 @@ void ParsedQuery::expandPrefix(
if (i != string::npos && i >= from &&
prefixMap.count(item.substr(from, i - from)) > 0) {
string prefixUri = prefixMap.find(item.substr(from, i - from))->second;
if (from == 0) {
item = prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
'>';
} else {
item = item.substr(0, from) +
prefixUri.substr(0, prefixUri.size() - 1) + item.substr(i + 1) +
'>';
}
item = item.substr(0, from) + prefixUri.substr(0, prefixUri.size() - 1) +
item.substr(i + 1) + '>';
item = TurtleToken::unescapePrefixedIri(item);
}
if (langtag) {
item =
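
Two things happen in this hunk: the `from == 0` special case collapses into the
general branch, because std::string::substr(0, 0) is just the empty string, and the
expanded IRI is additionally passed through TurtleToken::unescapePrefixedIri so that
escaped prefixed names survive expansion in queries as well. The substr fact in one
runnable check:

#include <cassert>
#include <string>

int main() {
  std::string item = "abc";
  // substr(0, from) with from == 0 is empty, so the former
  // `if (from == 0)` branch was redundant.
  assert(item.substr(0, 0).empty());
  assert(item.substr(0, 0) + "X" + item.substr(1) == "Xbc");
}
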
46 changes: 27 additions & 19 deletions src/parser/PropertyPathParser.cpp
@@ -48,6 +48,7 @@ std::vector<PropertyPathParser::Token> PropertyPathParser::tokenize(

size_t start = 0;
size_t pos = 0;
bool escaped = false;
while (pos < str.size()) {
char c = str[pos];
if (!VALID_CHARS[(uint8_t)c]) {
@@ -61,31 +62,38 @@
inside_iri = false;
}

if (!inside_iri && DELIMITER_CHARS[(uint8_t)str[pos]] &&
(pos != 0 || c != '?')) {
if (start != pos) {
// add the string up to but not including the new token
tokens.push_back({str.substr(start, pos - start), start});
if (!inside_iri && c == '\\') {
escaped = !escaped;
} else if (!inside_iri && DELIMITER_CHARS[(uint8_t)str[pos]] && escaped) {
escaped = false;
} else {
escaped = false;
if (!inside_iri && DELIMITER_CHARS[(uint8_t)str[pos]] &&
(pos != 0 || c != '?')) {
if (start != pos) {
// add the string up to but not including the new token
tokens.push_back({str.substr(start, pos - start), start});

start = pos;
}
while (pos < str.size() && DELIMITER_CHARS[(uint8_t)str[pos]]) {
pos++;
if (c == '*' && pos < str.size() && std::isdigit(str[pos])) {
// The * token has a number following it
start = pos;
}
while (pos < str.size() && DELIMITER_CHARS[(uint8_t)str[pos]]) {
pos++;
while (pos < str.size() && std::isdigit(str[pos])) {
if (c == '*' && pos < str.size() && std::isdigit(str[pos])) {
// The * token has a number following it
pos++;
while (pos < str.size() && std::isdigit(str[pos])) {
pos++;
}
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
} else {
// Add the token
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
}
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
} else {
// Add the token
tokens.push_back({str.substr(start, pos - start), start});
start = pos;
}
continue;
}
continue;
}
pos++;
}
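
The reworked loop threads an `escaped` flag through the scan so that a
backslash-escaped delimiter is no longer treated as a path operator. A trimmed-down
sketch of the same idea, assuming the delimiter set is the usual SPARQL
property-path operators (unlike the real tokenizer it drops the backslashes instead
of keeping them, and it ignores IRIs, variables, and the '*<digits>' form):

#include <iostream>
#include <string>
#include <vector>

// Minimal escape-aware splitter for property-path-like strings: an
// unescaped delimiter ends the current token; `\<delim>` keeps the
// delimiter inside the token.
std::vector<std::string> tokenize(const std::string& str) {
  const std::string delims = "/|^*+?()";
  std::vector<std::string> tokens;
  std::string current;
  bool escaped = false;
  for (char c : str) {
    if (escaped) {  // previous char was a backslash
      current += c; // keep the delimiter as a normal character
      escaped = false;
    } else if (c == '\\') {
      escaped = true;
    } else if (delims.find(c) != std::string::npos) {
      if (!current.empty()) {
        tokens.push_back(current);
        current.clear();
      }
      tokens.push_back(std::string(1, c));  // the operator itself
    } else {
      current += c;
    }
  }
  if (!current.empty()) tokens.push_back(current);
  return tokens;
}

int main() {
  // "a\/b/c" -> tokens: "a/b", "/", "c"
  for (const auto& t : tokenize("a\\/b/c")) std::cout << t << '\n';
}
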
2 changes: 2 additions & 0 deletions src/parser/Tokenizer.cpp
@@ -98,6 +98,8 @@ const RE2& Tokenizer::idToRegex(const TokId reg) {
return _tokens.PnameNS;
case TokId::PnameLN:
return _tokens.PnameLN;
case TokId::PnLocal:
return _tokens.PnLocal;
case TokId::BlankNodeLabel:
return _tokens.BlankNodeLabel;
case TokId::WsMultiple:
