Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patterns exclude lang predicates #186

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
139 changes: 63 additions & 76 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ std::unique_ptr<Index::StxxlVec> Index::createIdTriplesAndVocab(
// first save the total number of words, this is needed to initialize the
// dense IndexMetaData variants
_totalVocabularySize = linesAndWords.nofWords;
// Save the lower and upper bound of language tagged predicates
// TODO(schnelle): These should either also be available when reading the
// Index from disk or replaced with local variables only available when
// building the index.
_langPredLowerBound = linesAndWords.langPredLowerBound;
_langPredUpperBound = linesAndWords.langPredUpperBound;
LOG(INFO) << "total size of vocabulary (internal and external) is "
<< _totalVocabularySize << std::endl;

Expand All @@ -51,17 +57,19 @@ std::unique_ptr<Index::StxxlVec> Index::createIdTriplesAndVocab(
// clear vocabulary to save ram (only information from partial binary files
// used from now on). This will preserve information about externalized
// Prefixes etc.
// TODO(schnelle): Since we don't use the Vocabulary anywhere until now
// this seems pointless
_vocab.clear();
convertPartialToGlobalIds<Parser>(*(linesAndWords.idTriples),
linesAndWords.actualPartialSizes,
NUM_TRIPLES_PER_PARTIAL_VOCAB);
convertPartialToGlobalIds(*linesAndWords.idTriples,
linesAndWords.actualPartialSizes,
NUM_TRIPLES_PER_PARTIAL_VOCAB);

if (!_keepTempFiles) {
// remove temporary files only used during index creation
LOG(INFO) << "Removing temporary files (partial vocabulary and external "
"text file...\n";

// TODO: using system and rm is not really elegant nor portable.
// TODO(all): using system and rm is not really elegant nor portable.
// use std::filesystem as soon as QLever is ported to C++17
string removeCommand1 =
"rm -- " + _onDiskBase + EXTERNAL_LITS_TEXT_FILE_NAME;
Expand Down Expand Up @@ -92,7 +100,6 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
StxxlVec& idTriples = *idTriplesPtr;

// also perform unique for first permutation

createPermutationPair<IndexMetaDataHmap>(&idTriples, Permutation::Pso,
Permutation::Pos, true);
if (allPermutations) {
Expand All @@ -102,7 +109,7 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
createPermutationPair<IndexMetaDataMmap>(&idTriples, Permutation::Osp,
Permutation::Ops);
} else if (_usePatterns) {
// vector is not yet sorted
// Not constructed with Spo, Sop, needs extra sort
createPatterns(false, &idTriples);
}
// move compression to end
Expand Down Expand Up @@ -132,7 +139,6 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
<< ". Terminating...\n";
AD_CHECK(false);
}
// also perform unique for first permutation
writeConfiguration();
}

Expand Down Expand Up @@ -244,21 +250,19 @@ VocabularyData Index::passFileForVocabulary(const string& filename,

LOG(INFO) << "Merging vocabulary\n";
VocabularyData res;
res.nofWords = mergeVocabulary(_onDiskBase, numFiles);
res.nofWords = mergeVocabulary(_onDiskBase, numFiles, &res.langPredLowerBound,
&res.langPredUpperBound);
res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);
LOG(INFO) << "Pass done.\n";
res.idTriples->size();
return res;
}

// _____________________________________________________________________________
template <class Parser>
void Index::convertPartialToGlobalIds(
StxxlVec& data, const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial) {
LOG(INFO) << "Updating Ids in stxxl vector to global Ids.\n";
array<string, 3> spo;

size_t i = 0;
// iterate over all partial vocabularies
Expand Down Expand Up @@ -318,7 +322,7 @@ std::optional<MetaData> Index::createPermutationImpl(const string& fileName,
LOG(WARN) << "Attempt to write an empty index!" << std::endl;
return std::nullopt;
}
ad_utility::File out(fileName.c_str(), "w");
ad_utility::File out(fileName, "w");
LOG(INFO) << "Creating an on-disk index permutation of " << vec.size()
<< " elements / facts." << std::endl;
// Iterate over the vector and identify relation boundaries
Expand Down Expand Up @@ -478,52 +482,46 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
Pattern pattern;

size_t patternIndex = 0;
Id currentRel;
currentRel = vec[0][0];
bool isValidPattern = true;
size_t numInvalidPatterns = 0;
Id currentSubj;
currentSubj = vec[0][0];
size_t numValidPatterns = 0;

for (StxxlVec::bufreader_type reader(vec); !reader.empty(); ++reader) {
if ((*reader)[0] != currentRel) {
currentRel = (*reader)[0];
if (isValidPattern) {
numValidPatterns++;
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
(*it).second++;
}
if ((*reader)[0] != currentSubj) {
currentSubj = (*reader)[0];
numValidPatterns++;
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
numInvalidPatterns++;
(*it).second++;
}
isValidPattern = true;
pattern.clear();
patternIndex = 0;
}
Id currentPred = (*reader)[1];
// Ignore @lang@<predicate> language tagged predicates
if (currentPred >= _langPredLowerBound &&
currentPred < _langPredUpperBound) {
continue;
}

// don't list predicates twice
if (patternIndex == 0 || pattern[patternIndex - 1] != ((*reader)[1])) {
pattern.push_back((*reader)[1]);
if (patternIndex == 0 || pattern[patternIndex - 1] != currentPred) {
pattern.push_back(currentPred);
patternIndex++;
}
}
// process the last entry
if (isValidPattern) {
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
(*it).second++;
}
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
(*it).second++;
}
LOG(INFO) << "Counted patterns and found " << patternCounts.size()
<< " distinct patterns." << std::endl;
LOG(INFO) << "Patterns where found for " << numValidPatterns << " entities."
<< std::endl;
LOG(INFO) << "Discarded the patterns of " << numInvalidPatterns
<< " entities"
" because they were too large."
LOG(INFO) << "Patterns were found for " << numValidPatterns << " entities."
<< std::endl;

// stores patterns sorted by their number of occurrences
Expand Down Expand Up @@ -614,20 +612,14 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
ad_utility::HashSet<Id> predicateHashSet;

pattern.clear();
currentRel = vec[0][0];
currentSubj = vec[0][0];
patternIndex = 0;
// Create the has-relation and has-pattern predicates
// Create the has-predicate and has-pattern predicates
for (StxxlVec::bufreader_type reader2(vec); !reader2.empty(); ++reader2) {
if ((*reader2)[0] != currentRel) {
if ((*reader2)[0] != currentSubj) {
// we have arrived at a new entity;
fullHasPredicateEntitiesDistinctSize++;
std::unordered_map<Pattern, Id>::iterator it;
if (isValidPattern) {
it = patternSet.find(pattern);
} else {
it = patternSet.end();
numInvalidEntities++;
}
auto it = patternSet.find(pattern);
// increase the haspredicate size here as every predicate is only
// listed once per entity (otherwise it would always be the same as
// vec.size()
Expand All @@ -641,12 +633,12 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
fullHasPredicatePredicatesDistinctSize++;
}
entityHasPredicate.push_back(
std::array<Id, 2>{currentRel, pattern[i]});
std::array<Id, 2>{currentSubj, pattern[i]});
}
} else {
numEntitiesWithPatterns++;
// The pattern does exist, add an entry to the has-pattern predicate
entityHasPattern.push_back(std::array<Id, 2>{currentRel, it->second});
entityHasPattern.push_back(std::array<Id, 2>{currentSubj, it->second});
if (!haveCountedPattern[it->second]) {
haveCountedPattern[it->second] = true;
// iterate over the pattern once to
Expand All @@ -659,30 +651,25 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
}
}
pattern.clear();
currentRel = (*reader2)[0];
currentSubj = (*reader2)[0];
patternIndex = 0;
isValidPattern = true;
}
// don't list predicates twice
if (patternIndex == 0 || pattern[patternIndex - 1] != ((*reader2)[1])) {
pattern.push_back((*reader2)[1]);
Id currentPred = (*reader2)[1];
if (patternIndex == 0 || pattern[patternIndex - 1] != currentPred) {
pattern.push_back(currentPred);
patternIndex++;
}
}
// process the last element
fullHasPredicateSize += pattern.size();
fullHasPredicateEntitiesDistinctSize++;
std::unordered_map<Pattern, Id>::iterator it;
if (isValidPattern) {
it = patternSet.find(pattern);
} else {
it = patternSet.end();
}
if (it == patternSet.end()) {
auto last = patternSet.find(pattern);
if (last == patternSet.end()) {
numEntitiesWithoutPatterns++;
// The pattern does not exist, use the has-predicate predicate instead
for (size_t i = 0; i < patternIndex; i++) {
entityHasPredicate.push_back(std::array<Id, 2>{currentRel, pattern[i]});
entityHasPredicate.push_back(std::array<Id, 2>{currentSubj, pattern[i]});
if (predicateHashSet.find(pattern[i]) == predicateHashSet.end()) {
predicateHashSet.insert(pattern[i]);
fullHasPredicatePredicatesDistinctSize++;
Expand All @@ -691,7 +678,7 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
} else {
numEntitiesWithPatterns++;
// The pattern does exist, add an entry to the has-pattern predicate
entityHasPattern.push_back(std::array<Id, 2>{currentRel, it->second});
entityHasPattern.push_back(std::array<Id, 2>{currentSubj, last->second});
for (size_t i = 0; i < patternIndex; i++) {
if (predicateHashSet.find(pattern[i]) == predicateHashSet.end()) {
predicateHashSet.insert(pattern[i]);
Expand Down Expand Up @@ -734,7 +721,7 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
<< fullHasPredicateMultiplicityPredicates << std::endl;

// Store all data in the file
ad_utility::File file(fileName.c_str(), "w");
ad_utility::File file(fileName, "w");

// Write a byte of ones to make it less likely that an unversioned file is
// read as a versioned one (unversioned files begin with the id of the lowest
Expand Down Expand Up @@ -981,7 +968,7 @@ void Index::createFromOnDiskIndex(const string& onDiskBase,
// Read the pattern info from the patterns file
std::string patternsFilePath = _onDiskBase + ".index.patterns";
ad_utility::File patternsFile;
patternsFile.open(patternsFilePath.c_str(), "r");
patternsFile.open(patternsFilePath, "r");
AD_CHECK(patternsFile.isOpen());
off_t off = 0;
unsigned char firstByte;
Expand Down Expand Up @@ -1073,19 +1060,19 @@ bool Index::ready() const { return _psoFile.isOpen() && _posFile.isOpen(); }
// _____________________________________________________________________________
void Index::openFileHandles() {
AD_CHECK(_onDiskBase.size() > 0);
_psoFile.open((_onDiskBase + ".index.pso").c_str(), "r");
_posFile.open((_onDiskBase + ".index.pos").c_str(), "r");
_psoFile.open((_onDiskBase + ".index.pso"), "r");
_posFile.open((_onDiskBase + ".index.pos"), "r");
if (ad_utility::File::exists(_onDiskBase + ".index.spo")) {
_spoFile.open((_onDiskBase + ".index.spo").c_str(), "r");
_spoFile.open((_onDiskBase + ".index.spo"), "r");
}
if (ad_utility::File::exists(_onDiskBase + ".index.sop")) {
_sopFile.open((_onDiskBase + ".index.sop").c_str(), "r");
_sopFile.open((_onDiskBase + ".index.sop"), "r");
}
if (ad_utility::File::exists(_onDiskBase + ".index.osp")) {
_ospFile.open((_onDiskBase + ".index.osp").c_str(), "r");
_ospFile.open((_onDiskBase + ".index.osp"), "r");
}
if (ad_utility::File::exists(_onDiskBase + ".index.ops")) {
_opsFile.open((_onDiskBase + ".index.ops").c_str(), "r");
_opsFile.open((_onDiskBase + ".index.ops"), "r");
}
AD_CHECK(_psoFile.isOpen());
AD_CHECK(_posFile.isOpen());
Expand Down Expand Up @@ -1548,7 +1535,7 @@ size_t Index::sizeEstimate(const string& sub, const string& pred,
// _____________________________________________________________________________
template <class T>
void Index::writeAsciiListFile(const string& filename, const T& ids) const {
std::ofstream f(filename.c_str());
std::ofstream f(filename);

for (size_t i = 0; i < ids.size(); ++i) {
f << ids[i] << ' ';
Expand Down
21 changes: 12 additions & 9 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ struct VocabularyData {
using StxxlVec = stxxl::vector<array<Id, 3>>;
// The total number of distinct words in the complete Vocabulary
size_t nofWords;
// Id lower and upper bound of @lang@<predicate> predicates
Id langPredLowerBound;
Id langPredUpperBound;
// The number of triples in the idTriples vec that each partial vocabulary is
// responsible for (depends on the number of additional language filter
// triples)
Expand Down Expand Up @@ -319,6 +322,8 @@ class Index {
bool hasAllPermutations() const { return _spoFile.isOpen(); }

private:
Id _langPredLowerBound;
Id _langPredUpperBound;
string _onDiskBase;
string _settingsFileName;
bool _onDiskLiterals = false;
Expand Down Expand Up @@ -380,7 +385,6 @@ class Index {
VocabularyData passFileForVocabulary(const string& ntFile,
size_t linesPerPartial = 100000000);

template <class Parser>
void convertPartialToGlobalIds(StxxlVec& data,
const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial);
Expand Down Expand Up @@ -444,14 +448,13 @@ class Index {
* @param fileName The name of the file in which the data should be stored
* @param vec The vectors of triples in spo order.
*/
static void createPatternsImpl(const string& fileName, const StxxlVec& vec,
CompactStringVector<Id, Id>& hasPredicate,
std::vector<PatternID>& hasPattern,
CompactStringVector<size_t, Id>& patterns,
double& fullHasPredicateMultiplicityEntities,
double& fullHasPredicateMultiplicityPredicates,
size_t& fullHasPredicateSize,
size_t maxNumPatterns);
void createPatternsImpl(const string& fileName, const StxxlVec& vec,
CompactStringVector<Id, Id>& hasPredicate,
std::vector<PatternID>& hasPattern,
CompactStringVector<size_t, Id>& patterns,
double& fullHasPredicateMultiplicityEntities,
double& fullHasPredicateMultiplicityPredicates,
size_t& fullHasPredicateSize, size_t maxNumPatterns);

// wrap the static function using the internal member variables
// the bool indicates whether the StxxlVec has to be sorted before the pattern
Expand Down
16 changes: 15 additions & 1 deletion src/index/VocabularyGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class QueueCompare {
};

// ___________________________________________________________________
size_t mergeVocabulary(const std::string& basename, size_t numFiles) {
size_t mergeVocabulary(const std::string& basename, size_t numFiles,
Id* langPredLowerBound, Id* langPredUpperBound) {
std::vector<std::fstream> infiles;

// we will store pairs of <partialId, globalId>
Expand Down Expand Up @@ -79,6 +80,9 @@ size_t mergeVocabulary(const std::string& basename, size_t numFiles) {
// the number of words we have written. This also is the global Id of the next
// word we see, unless it is equal to the previous word
size_t totalWritten = 0;
bool firstLangPredSeen = false;
*langPredLowerBound = 0;
*langPredUpperBound = 0;

// start k-way merge
while (!queue.empty()) {
Expand All @@ -100,6 +104,16 @@ size_t mergeVocabulary(const std::string& basename, size_t numFiles) {
// write id to corresponding vec
idVecs[top._partialFileId].push_back(
std::make_pair(top._partialWordId, totalWritten));

if (top._value.size() > 0 && top._value[0] == '@') {
// exclusive
*langPredUpperBound = totalWritten + 1;
if (!firstLangPredSeen) {
// inclusive
*langPredLowerBound = totalWritten;
firstLangPredSeen = true;
}
}
totalWritten++;
} else {
// this is a duplicate which already occured in another partial vocabulary
Expand Down