Skip to content

Commit

Permalink
Trying to track the (unescaping) failures already in the index Builder
Browse files Browse the repository at this point in the history
  • Loading branch information
joka921 committed Dec 15, 2020
1 parent f2bd0bd commit 312fa5c
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 15 deletions.
22 changes: 16 additions & 6 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ void Index::createFromFile(const string& filename) {
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
LOG(INFO) << "Finished writing permutations" << std::endl;
if (_vocabPrefixCompressed) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
Expand All @@ -107,7 +108,7 @@ void Index::createFromFile(const string& filename) {
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << '\n';
prefixFile << prefix << std::endl;
}
}
_configurationJson["prefixes"] = _vocabPrefixCompressed;
Expand All @@ -118,7 +119,7 @@ void Index::createFromFile(const string& filename) {
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
<< " to " << vocabFile << " set errno to " << errno
<< ". Terminating...\n";
<< ". Terminating..." << std::endl;
AD_CHECK(false);
}
writeConfiguration();
Expand Down Expand Up @@ -174,7 +175,16 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
[this](Triple&& t) {
Triple res;
std::transform(t.begin(), t.end(), res.begin(), [](const auto& s) {
return TurtleToken::normalizeRDFLiteral(s);
auto res = TurtleToken::normalizeRDFLiteral(s);
try {
[[maybe_unused]] auto tmp = TurtleToken::normalizeRDFLiteral(
TurtleToken::escapeRDFLiteral(res));
} catch (...) {
LOG(ERROR) << "Vocabulary entry " + s +
" could not be (un)escaped properly"
<< std::endl;
}
return res;
});
return res;
},
Expand Down Expand Up @@ -442,7 +452,7 @@ Index::createPermutationPairImpl(const string& fileName1,

out1.close();
out2.close();
LOG(INFO) << "Permutation done.\n";
LOG(INFO) << "Permutation done." << std::endl;
return std::make_pair(std::move(metaData1), std::move(metaData2));
}

Expand Down Expand Up @@ -502,12 +512,12 @@ void Index::createPermutationPair(
&(metaData.value().second));
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Writing MetaData for " << p1._readableName << " and "
<< p2._readableName << '\n';
<< p2._readableName << std::endl;
ad_utility::File f1(_onDiskBase + ".index" + p1._fileSuffix, "r+");
metaData.value().first.appendToFile(&f1);
ad_utility::File f2(_onDiskBase + ".index" + p2._fileSuffix, "r+");
metaData.value().second.appendToFile(&f2);
LOG(INFO) << "Done" << '\n';
LOG(INFO) << "Done" << std::endl;
}
}

Expand Down
25 changes: 17 additions & 8 deletions src/index/PrefixHeuristic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "./PrefixHeuristic.h"
#include <algorithm>
#include <fstream>
#include "../parser/Tokenizer.h"
#include "../util/Exception.h"
#include "../util/Log.h"
#include "../util/StringUtils.h"
Expand Down Expand Up @@ -41,7 +42,7 @@ TreeNode* TreeNode::insertAfter(string_view value) {
}

// if we have reached here, we have to add a new child
NodePtr newNode(new TreeNode(value));
NodePtr newNode = std::make_unique<TreeNode>(value);
newNode->_parent = this;

// find children of current node which have to become children of the new node
Expand All @@ -60,6 +61,13 @@ TreeNode* TreeNode::insertAfter(string_view value) {

// register the newly created node as a child of this node
_children.push_back(std::move(newNode));

for (const auto& c : _children) {
if (c.get() == nullptr) {
LOG(ERROR) << "Illegal nullptr child was found" << std::endl;
}
}

return _children.back().get();
}

Expand Down Expand Up @@ -176,9 +184,10 @@ std::vector<string> calculatePrefixes(const string& filename,
size_t totalSavings = 0;
size_t numWords = 0;

LOG(INFO) << "start reading words and building prefix tree...\n";
LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
// insert all prefix candidates into the tree
while (std::getline(ifs, nextWord)) {
nextWord = TurtleToken::normalizeRDFLiteral<false>(nextWord);
totalChars += nextWord.size();
// the longest common prefixes between two adjacent words are our candidates
// for compression
Expand All @@ -194,12 +203,12 @@ std::vector<string> calculatePrefixes(const string& filename,

numWords++;
if (numWords % 10000000 == 0) {
LOG(INFO) << "words read: " << numWords << '\n';
LOG(INFO) << "words read: " << numWords << std::endl;
}
}

LOG(INFO) << "Finished building prefix tree!\n";
LOG(INFO) << "Start searching for maximal compressing prefixes\n";
LOG(INFO) << "Finished building prefix tree!" << std::endl;
LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
std::vector<string> res;
res.reserve(numPrefixes);
for (size_t i = 0; i < numPrefixes; ++i) {
Expand All @@ -209,7 +218,7 @@ std::vector<string> calculatePrefixes(const string& filename,
}
totalSavings += p.first;
LOG(INFO) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << '\n';
<< " with number of bytes gained: " << p.first << std::endl;
res.push_back(std::move(p.second));
}
// if we always add an encoding we have calculated with a codelength of 0 so
Expand All @@ -218,9 +227,9 @@ std::vector<string> calculatePrefixes(const string& filename,
totalSavings -= codelength * numWords;
}
double efficiency = static_cast<double>(totalSavings) / totalChars;
std::cout << "total number of bytes : " << totalChars << '\n';
std::cout << "total number of bytes : " << totalChars << std::endl;
std::cout << "total chars compressed : " << totalSavings << '\n';
std::cout << "percentage of chars compressed : " << efficiency << '\n';
std::cout << "percentage of chars compressed : " << efficiency << std::endl;
return res;
}

Expand Down
3 changes: 3 additions & 0 deletions src/index/PrefixHeuristic.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ using std::string_view;
class TreeNode {
private:
friend class Tree;

using NodePtr = std::unique_ptr<TreeNode>;

// Constructor
public:
explicit TreeNode(string_view value) : _value(value) {}

private:
// Recursive Insertion of value. If the value does not match _value we will
// automatically call insert on a node that is closer to the actual position
// of value in the Tree. Returns the node that was actually inserted
Expand Down
7 changes: 7 additions & 0 deletions src/index/VocabularyGeneratorImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,13 @@ void VocabularyMerger::writeQueueWordsToIdVec(const std::vector<QueueWord>& buff

// write the new word to the vocabulary
if (_lastWritten < EXTERNALIZED_LITERALS_PREFIX) {
auto escaped = TurtleToken::escapeRDFLiteral(_lastWritten);
try {
auto restored = TurtleToken::normalizeRDFLiteral(escaped);
} catch (...) {
LOG(ERROR) << "Failure in the (un) escaping of vocabulary entry " + _lastWritten
<< std::endl;
}
_outfile << TurtleToken::escapeRDFLiteral(_lastWritten) << '\n';
} else {
// we have to strip the externalization character again
Expand Down
4 changes: 3 additions & 1 deletion src/parser/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,9 @@ struct TurtleToken {
res.append(langtagOrDatatype);
return res;
} catch (...) {
LOG(ERROR) << "Failed to unescape " + origLiteral + " an exception was thrown" << std::endl;
LOG(ERROR) << "Failed to unescape " + origLiteral +
" an exception was thrown"
<< std::endl;
throw;
}
}
Expand Down
4 changes: 4 additions & 0 deletions test/TokenTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -536,4 +536,8 @@ TEST(TokenizerTest, normalizeRDFLiteral) {
ASSERT_THROW(TurtleToken::normalizeRDFLiteral(unterminated),
std::runtime_error);
}

std::string lit = R"(",\")";
lit = R"(",")";
ASSERT_EQ(std::string("\",\\\\\""), TurtleToken::escapeRDFLiteral(lit));
}

0 comments on commit 312fa5c

Please sign in to comment.