First compress the vocabulary, then build the permutations (#541)
Also improved various log messages along the way.
joka921 committed Jan 13, 2022
1 parent 2e43053 commit 634d84a
Showing 6 changed files with 72 additions and 48 deletions.
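
In outline, the change moves vocabulary compression ahead of permutation building, so the compressed vocabulary and the configuration are already on disk before the long-running permutation phase starts. A hedged sketch of the reordered control flow, with the real steps stubbed out (names abbreviated from the diff below, not the actual signatures):

#include <string>

// Sketch only: each stub stands for the corresponding step in Index.cpp.
namespace sketch {
void compressAndRenameVocabulary() { /* prefixCompressFile + std::rename */ }
void writeConfiguration() { /* persist the configuration JSON */ }
void createPermutationPair(const std::string& /* pair */) { /* PSO/POS, ... */ }

void createFromFile() {
  // Before #541: build all permutations first, compress the vocabulary last.
  // After #541: compress first, so a failure while building permutations can
  // no longer leave the index without its final vocabulary and configuration.
  compressAndRenameVocabulary();
  writeConfiguration();  // written early, available if a permutation fails
  for (const char* pair : {"PSO/POS", "SPO/SOP", "OSP/OPS"}) {
    createPermutationPair(pair);
  }
  writeConfiguration();  // again, in case the permutations added information
}
}  // namespace sketch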
2 changes: 1 addition & 1 deletion src/TurtleParserMain.cpp
@@ -92,7 +92,7 @@ void writeNTDispatch(std::ostream& out, const string& fileFormat,
if (regexEngine == "re2") {
writeNT<Tokenizer>(out, fileFormat, filename);
} else if (regexEngine == "ctre") {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
writeNT<TokenizerCtre>(out, fileFormat, filename);
} else {
LOG(ERROR)
75 changes: 43 additions & 32 deletions src/index/Index.cpp
@@ -84,37 +84,31 @@ void Index::createFromFile(const string& filename) {
vocabData = createIdTriplesAndVocab<Parser>(filename);
}

// also perform unique for first permutation
createPermutationPair<IndexMetaDataHmapDispatcher>(&vocabData, _PSO, _POS,
true);
// also create Patterns after the Spo permutation if specified
createPermutationPair<IndexMetaDataMmapDispatcher>(&vocabData, _SPO, _SOP,
false, _usePatterns);
createPermutationPair<IndexMetaDataMmapDispatcher>(&vocabData, _OSP, _OPS);

// if we have no compression, this will also copy the whole vocabulary.
// If we have no compression, this will also copy the whole vocabulary.
// but since we expect compression to be the default case, this should not
// hurt
// hurt.
string vocabFile = _onDiskBase + ".vocabulary";
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
LOG(INFO) << "Finished writing permutations" << std::endl;
if (_vocabPrefixCompressed) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
// We have to use the "normally" sorted vocabulary for the prefix
// compression.
std::string vocabFileForPrefixCalculation =
_onDiskBase + TMP_BASENAME_COMPRESSION + ".vocabulary";
prefixes = calculatePrefixes(vocabFileForPrefixCalculation,
NUM_COMPRESSION_PREFIXES, 1, true);
deleteTemporaryFile(vocabFileForPrefixCalculation);
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << std::endl;
}
}
_configurationJson["prefixes"] = _vocabPrefixCompressed;
LOG(INFO) << "Writing compressed vocabulary to disk" << std::endl;
Vocabulary<CompressedString, TripleComponentComparator>::prefixCompressFile(
vocabFile, vocabFileTmp, prefixes);
LOG(INFO) << "Finished writing compressed vocabulary" << std::endl;

// TODO<joka921> maybe move this to its own function
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
@@ -123,10 +117,27 @@ void Index::createFromFile(const string& filename) {
<< ". Terminating..." << std::endl;
AD_CHECK(false);
}

// Write the configuration already at this point, so we have it available in
// case any of the permutations fail.
writeConfiguration();

// For the first permutation, perform a unique.
createPermutationPair<IndexMetaDataHmapDispatcher>(&vocabData, _PSO, _POS,
PerformUnique::True);
// After the SPO permutation, create patterns if so desired.
createPermutationPair<IndexMetaDataMmapDispatcher>(
&vocabData, _SPO, _SOP, PerformUnique::False, _usePatterns);
createPermutationPair<IndexMetaDataMmapDispatcher>(&vocabData, _OSP, _OPS);
LOG(INFO) << "Finished writing permutations" << std::endl;

// Dump the configuration again in case the permutations have added some
// information.
writeConfiguration();
LOG(INFO) << "Index build completed" << std::endl;
}

// explicit instantiations
// Explicit instantiations.
template void Index::createFromFile<TsvParser>(const string& filename);
template void Index::createFromFile<TurtleStreamParser<Tokenizer>>(
const string& filename);
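
prefixCompressFile itself is not part of this diff; as a rough illustration of what the call in the hunk above does, each selected prefix gets a short code and every word is stored as that code plus the remaining suffix. A minimal sketch, assuming one reserved code byte per prefix and a prefix list sorted by decreasing length (all names here are hypothetical):

#include <string>
#include <string_view>
#include <vector>

// Hypothetical sketch of per-word prefix compression. Assumes `prefixes` is
// sorted by decreasing length so the first match is the longest one, and
// that bytes >= 0x80 do not occur in the raw vocabulary data.
std::string compressWord(std::string_view word,
                         const std::vector<std::string>& prefixes) {
  for (size_t i = 0; i < prefixes.size(); ++i) {
    const std::string& prefix = prefixes[i];
    if (word.substr(0, prefix.size()) == prefix) {
      return std::string(1, static_cast<char>(0x80 + i)) +
             std::string(word.substr(prefix.size()));
    }
  }
  return std::string(word);  // no prefix matched: store the word as-is
}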
@@ -248,13 +259,14 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
LOG(INFO) << "Pass done." << endl;

if (_vocabPrefixCompressed) {
LOG(INFO) << "Merging temporary vocabulary for prefix compression";
LOG(INFO) << "Merging temporary vocabulary for prefix compression"
<< std::endl;
{
VocabularyMerger m;
m._ignoreExternalVocabulary = true;
m._noIdMapsAndIgnoreExternalVocab = true;
m.mergeVocabulary(_onDiskBase + TMP_BASENAME_COMPRESSION, numFiles,
std::less<>());
LOG(INFO) << "Finished merging additional Vocabulary.";
LOG(INFO) << "Finished merging additional vocabulary" << std::endl;
}
}

@@ -268,7 +280,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,

return v.mergeVocabulary(_onDiskBase, numFiles, sortPred);
}();
LOG(INFO) << "Finished Merging Vocabulary.\n";
LOG(INFO) << "Finished merging vocabulary\n";
VocabularyData res;
res.nofWords = mergeRes._numWordsTotal;
res.langPredLowerBound = mergeRes._langPredLowerBound;
@@ -301,10 +313,6 @@ void Index::convertPartialToGlobalIds(
// iterate over all partial vocabularies
for (size_t partialNum = 0; partialNum < actualLinesPerPartial.size();
partialNum++) {
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Corresponding number of statements in original knowledgeBase:"
<< linesPerPartial * partialNum << '\n';

std::string mmapFilename(_onDiskBase + PARTIAL_MMAP_IDS +
std::to_string(partialNum));
LOG(INFO) << "Reading IdMap from " << mmapFilename << " ...\n";
Expand Down Expand Up @@ -337,9 +345,12 @@ void Index::convertPartialToGlobalIds(
LOG(INFO) << "Lines processed: " << i << '\n';
}
}
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(DEBUG)
<< "Corresponding number of statements in original knowledge base: "
<< linesPerPartial * (partialNum + 1) << '\n';
}
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Pass done.\n";
LOG(INFO) << "Pass done\n";
}
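
For context on the function above: convertPartialToGlobalIds rewrites the locally assigned IDs in every triple to the IDs from the merged global vocabulary. A small in-memory analogue of that core step, with a plain std::unordered_map standing in for the memory-mapped IdMap (an assumption for the sketch):

#include <array>
#include <cstdint>
#include <unordered_map>
#include <vector>

using Id = uint64_t;
using IdMap = std::unordered_map<Id, Id>;  // stand-in for the mmap-based map

// Rewrite each component of every triple from its partial-vocabulary ID to
// the ID it received in the merged global vocabulary.
void convertTriples(std::vector<std::array<Id, 3>>& triples,
                    const IdMap& localToGlobal) {
  for (auto& triple : triples) {
    for (Id& id : triple) {
      id = localToGlobal.at(id);  // throws if a local ID is unmapped
    }
  }
}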

// _____________________________________________________________________________
@@ -473,13 +484,13 @@ Index::createPermutations(
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique) {
LOG(INFO) << "Sorting for " << p1._readableName << " permutation..."
PerformUnique performUnique) {
LOG(INFO) << "Sorting for " << p1._readableName << " permutation"
<< std::endl;
stxxl::sort(begin(*vec), end(*vec), p1._comp, STXXL_MEMORY_TO_USE);
LOG(INFO) << "Sort done." << std::endl;

if (performUnique) {
if (performUnique == PerformUnique::True) {
// this only has to be done for the first permutation (PSO)
LOG(INFO) << "Removing duplicate triples as these are not supported in RDF"
<< std::endl;
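
The duplicate removal itself is collapsed in this view; after the sort, equal triples are adjacent, so it boils down to std::unique plus erase. A small in-memory analogue (the real code operates on an stxxl::vector sorted by stxxl::sort):

#include <algorithm>
#include <array>
#include <cstdint>
#include <vector>

using Id = uint64_t;

// After sorting, equal triples are adjacent, so one std::unique pass removes
// the duplicates that RDF does not allow.
void sortAndUnique(std::vector<std::array<Id, 3>>& triples) {
  std::sort(triples.begin(), triples.end());
  triples.erase(std::unique(triples.begin(), triples.end()), triples.end());
}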
@@ -499,18 +510,18 @@ Index::createPermutations(
// ________________________________________________________________________
template <class MetaDataDispatcher, class Comparator1, class Comparator2>
void Index::createPermutationPair(
VocabularyData* vocabData,
VocabularyData* vocabularyData,
const PermutationImpl<Comparator1, typename MetaDataDispatcher::ReadType>&
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique, bool createPatternsAfterFirst) {
PerformUnique performUnique, bool createPatternsAfterFirst) {
auto metaData = createPermutations<MetaDataDispatcher>(
&(*vocabData->idTriples), p1, p2, performUnique);
&(*vocabularyData->idTriples), p1, p2, performUnique);
if (createPatternsAfterFirst) {
// the second permutation does not alter the original triple vector,
// so this does still work.
createPatterns(true, vocabData);
createPatterns(true, vocabularyData);
}
if (metaData) {
LOG(INFO) << "Exchanging Multiplicities for " << p1._readableName << " and "
@@ -1333,7 +1344,7 @@ void Index::initializeVocabularySettingsBuild() {
if constexpr (std::is_same_v<std::decay_t<Parser>, TurtleParserAuto>) {
bool v{j["ascii-prefixes-only"]};
if (v) {
LOG(WARN) << WARNING_ASCII_ONLY_PREFIXES;
LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
_onlyAsciiTurtlePrefixes = true;
} else {
_onlyAsciiTurtlePrefixes = false;
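
The bool v{j["ascii-prefixes-only"]} read above pulls a flag out of the build-settings JSON; the brace-initialization syntax matches nlohmann::json, which is an assumption in the following minimal analogue:

#include <iostream>
#include <nlohmann/json.hpp>

int main() {
  // Assumed shape of the index-build settings file.
  auto j = nlohmann::json::parse(R"({"ascii-prefixes-only": true})");
  bool onlyAsciiPrefixes{j["ascii-prefixes-only"]};
  std::cout << std::boolalpha << onlyAsciiPrefixes << std::endl;
}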
9 changes: 6 additions & 3 deletions src/index/Index.h
@@ -521,14 +521,17 @@ class Index {
// createPatternsAfterFirst is only valid when the pair is SPO-SOP because
// the SPO permutation is also needed for patterns (see usage in
// Index::createFromFile function)

enum class PerformUnique { True, False };
template <class MetaDataDispatcher, class Comparator1, class Comparator2>
void createPermutationPair(
VocabularyData* vec,
VocabularyData* vocabularyData,
const PermutationImpl<Comparator1, typename MetaDataDispatcher::ReadType>&
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique = false, bool createPatternsAfterFirst = false);
PerformUnique performUnique = PerformUnique::False,
bool createPatternsAfterFirst = false);

// The pairs of permutations are PSO-POS, OSP-OPS and SPO-SOP
// the multiplicity of column 1 in partner 1 of the pair is equal to the
@@ -556,7 +559,7 @@ class Index {
p1,
const PermutationImpl<Comparator2, typename MetaDataDispatcher::ReadType>&
p2,
bool performUnique);
PerformUnique performUnique);

/**
* @brief Creates the data required for the "pattern-trick" used for fast
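
The motivation for replacing the bool parameter with PerformUnique: at a call site, a bare true or false says nothing, and any integral value converts to bool silently. A toy illustration of the difference (not code from this repository):

#include <cstdio>

enum class PerformUnique { True, False };

void createPermutationPairSketch(PerformUnique performUnique) {
  std::puts(performUnique == PerformUnique::True
                ? "removing duplicate triples"
                : "keeping triples as-is");
}

int main() {
  // With a bool parameter this call would read
  // createPermutationPairSketch(true) and the meaning would be invisible;
  // the enum names the intent, and a stray integer no longer compiles.
  createPermutationPairSketch(PerformUnique::True);
}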
14 changes: 7 additions & 7 deletions src/index/PrefixHeuristic.cpp
@@ -182,7 +182,7 @@ std::vector<string> calculatePrefixes(const string& filename,
size_t totalSavings = 0;
size_t numWords = 0;

LOG(INFO) << "start reading words and building prefix tree..." << std::endl;
LOG(INFO) << "Start reading words and building prefix tree" << std::endl;
// insert all prefix candidates into the tree
while (std::getline(ifs, nextWord)) {
nextWord = RdfEscaping::unescapeNewlinesAndBackslashes(nextWord);
@@ -205,7 +205,7 @@ std::vector<string> calculatePrefixes(const string& filename,
}
}

LOG(INFO) << "Finished building prefix tree!" << std::endl;
LOG(INFO) << "Finished building prefix tree" << std::endl;
LOG(INFO) << "Start searching for maximal compressing prefixes" << std::endl;
std::vector<string> res;
res.reserve(numPrefixes);
@@ -215,8 +215,8 @@ std::vector<string> calculatePrefixes(const string& filename,
break;
}
totalSavings += p.first;
LOG(INFO) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << std::endl;
LOG(DEBUG) << "Found prefix " << p.second
<< " with number of bytes gained: " << p.first << std::endl;
res.push_back(std::move(p.second));
}
// if we always add an encoding we have calculated with a codelength of 0 so
@@ -225,9 +225,9 @@ std::vector<string> calculatePrefixes(const string& filename,
totalSavings -= codelength * numWords;
}
double efficiency = static_cast<double>(totalSavings) / totalChars;
std::cout << "total number of bytes : " << totalChars << std::endl;
std::cout << "total chars compressed : " << totalSavings << '\n';
std::cout << "percentage of chars compressed : " << efficiency << std::endl;
LOG(INFO) << "Total number of bytes : " << totalChars << std::endl;
LOG(INFO) << "Total chars compressed : " << totalSavings << '\n';
LOG(INFO) << "Percentage of chars compressed : " << efficiency << std::endl;
return res;
}
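
For reference, the efficiency figure logged above is simply saved characters over total characters. A toy illustration of the bookkeeping, ignoring the prefix tree and using made-up words:

#include <cstdio>
#include <string>
#include <string_view>
#include <vector>

// Toy version of the savings bookkeeping: each word that starts with a
// chosen prefix saves (prefix length - 1) characters, since one byte
// remains as the prefix code.
int main() {
  const std::vector<std::string> words = {"<http://example.org/a>",
                                          "<http://example.org/b>", "\"x\""};
  const std::string_view prefix = "<http://example.org/";
  size_t totalChars = 0, totalSavings = 0;
  for (const std::string& w : words) {
    totalChars += w.size();
    if (w.compare(0, prefix.size(), prefix) == 0) {
      totalSavings += prefix.size() - 1;
    }
  }
  std::printf("Percentage of chars compressed: %f\n",
              static_cast<double>(totalSavings) / totalChars);
}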

5 changes: 4 additions & 1 deletion src/index/VocabularyGenerator.h
@@ -28,7 +28,10 @@ using TripleVec = stxxl::vector<array<Id, 3>>;
*/
class VocabularyMerger {
public:
bool _ignoreExternalVocabulary = false;
// If this is set, then we will only output the internal vocabulary.
// This is useful for the prefix compression, where we don't need the
// external part of the vocabulary and the mapping from local to global IDs.
bool _noIdMapsAndIgnoreExternalVocab = false;
// result of a call to mergeVocabulary
struct VocMergeRes {
size_t _numWordsTotal; // that many distinct words were found (size of the
Expand Down
15 changes: 11 additions & 4 deletions src/index/VocabularyGeneratorImpl.h
@@ -40,8 +40,10 @@ VocabularyMerger::VocMergeRes VocabularyMerger::mergeVocabulary(

_outfile.open(basename + ".vocabulary");
AD_CHECK(_outfile.is_open());
_outfileExternal.open(basename + EXTERNAL_LITS_TEXT_FILE_NAME);
AD_CHECK(_outfileExternal.is_open());
if (!_noIdMapsAndIgnoreExternalVocab) {
_outfileExternal.open(basename + EXTERNAL_LITS_TEXT_FILE_NAME);
AD_CHECK(_outfileExternal.is_open());
}
std::vector<bool> endOfFile(numFiles, false);

// Priority queue for the k-way merge
@@ -52,7 +54,9 @@ VocabularyMerger::VocMergeRes VocabularyMerger::mergeVocabulary(
for (size_t i = 0; i < numFiles; i++) {
infiles.emplace_back(basename + PARTIAL_VOCAB_FILE_NAME +
std::to_string(i));
_idVecs.emplace_back(0, basename + PARTIAL_MMAP_IDS + std::to_string(i));
if (!_noIdMapsAndIgnoreExternalVocab) {
_idVecs.emplace_back(0, basename + PARTIAL_MMAP_IDS + std::to_string(i));
}
AD_CHECK(infiles.back().is_open());

// read the first entry of the vocabulary and add it to the queue
@@ -78,7 +82,7 @@ VocabularyMerger::VocMergeRes VocabularyMerger::mergeVocabulary(
while (!queue.empty()) {
// for the prefix compression vocabulary, we don't need the external
// vocabulary
if (_ignoreExternalVocabulary &&
if (_noIdMapsAndIgnoreExternalVocab &&
queue.top()._value >= EXTERNALIZED_LITERALS_PREFIX) {
break;
}
@@ -245,6 +249,9 @@ void VocabularyMerger::writeQueueWordsToIdVec(
// ____________________________________________________________________________________________________________
void VocabularyMerger::doActualWrite(
const std::vector<std::pair<size_t, std::pair<size_t, size_t>>>& buffer) {
if (_noIdMapsAndIgnoreExternalVocab) {
return;
}
for (const auto& [id, value] : buffer) {
_idVecs[id].push_back(value);
}
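
The merge loop further up (while (!queue.empty()) ...) is a classic k-way merge: a priority queue always yields the smallest current word across all partial vocabularies, and equal words arriving from different files are written only once. A compact self-contained analogue on in-memory word lists:

#include <functional>
#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

// Minimal analogue of the k-way merge in mergeVocabulary: repeatedly take
// the lexicographically smallest head among all partial vocabularies.
int main() {
  std::vector<std::vector<std::string>> partials = {
      {"a", "c"}, {"b", "c", "d"}, {"a", "e"}};
  using Entry = std::pair<std::string, size_t>;  // (word, source index)
  std::priority_queue<Entry, std::vector<Entry>, std::greater<>> queue;
  std::vector<size_t> pos(partials.size(), 0);
  for (size_t i = 0; i < partials.size(); ++i) {
    if (!partials[i].empty()) queue.emplace(partials[i][0], i);
  }
  std::string lastWritten;
  while (!queue.empty()) {
    auto [word, src] = queue.top();
    queue.pop();
    if (word != lastWritten) {  // equal words from different files: once
      std::cout << word << '\n';
      lastWritten = word;
    }
    if (++pos[src] < partials[src].size()) {
      queue.emplace(partials[src][pos[src]], src);
    }
  }
}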
