Skip to content

Commit

Permalink
Readded support for adding patterns to an existing index
Browse files Browse the repository at this point in the history
  • Loading branch information
floriankramer committed Dec 4, 2020
1 parent 58ad9b3 commit 945b7b8
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 56 deletions.
18 changes: 7 additions & 11 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,17 +522,13 @@ void Index::exchangeMultiplicities(MetaData* m1, MetaData* m2) {

// _____________________________________________________________________________
/**
 * @brief Builds the pattern data for an index that already exists.
 *
 * Determines the id range of the vocabulary entries that start with "@" and
 * hands it to the pattern index, which skips every predicate whose id lies in
 * [langPredLowerBound, langPredUpperBound).
 */
void Index::addPatternsToExistingIndex() {
  auto [langPredLowerBound, langPredUpperBound] = _vocab.prefix_range("@");

  // The predicate-local namespace has to be generated first:
  // createPatternsFromExistingIndex derives the width of the stored predicate
  // ids from the size of that namespace.
  _patternIndex.generatePredicateLocalNamespaceFromExistingIndex(
      langPredLowerBound, langPredUpperBound, _PSO._meta);

  // The patterns themselves are created from the SPO metadata and the SPO
  // file; _onDiskBase is the prefix of the patterns file name.
  _patternIndex.createPatternsFromExistingIndex(langPredLowerBound,
                                                langPredUpperBound, _SPO._meta,
                                                _SPO._file, _onDiskBase);
}

// _____________________________________________________________________________
Expand Down
6 changes: 3 additions & 3 deletions src/index/MetaDataIterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
template <typename MetaDataType>
class MetaDataIterator {
public:
MetaDataIterator(const MetaDataType& meta, ad_utility::File file)
MetaDataIterator(MetaDataType& meta, ad_utility::File file)
: meta_(meta),
_iterator(meta.data().begin()),
_buffer_offset(0),
Expand Down Expand Up @@ -37,11 +37,11 @@ class MetaDataIterator {
_buffer[_buffer_offset][1]};
}

bool empty() { return _iterator == meta_.data().end(); }
bool empty() { return _iterator == meta_.data().cend(); }

private:
void scanCurrentPos() {
const FullRelationMetaData& rmd = _iterator->second.get();
const FullRelationMetaData& rmd = _iterator->second;
_buffer.resize(rmd.getNofElements());
_file.read(_buffer.data(), rmd.getNofElements() * 2 * sizeof(Id),
rmd._startFullIndex);
Expand Down
168 changes: 133 additions & 35 deletions src/index/PatternIndex.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include "PatternIndex.h"

#include "IndexMetaData.h"
#include "MetaDataIterator.h"

const uint32_t PatternIndex::PATTERNS_FILE_VERSION = 1;

Expand All @@ -18,9 +17,72 @@ void PatternIndex::generatePredicateLocalNamespace(VocabularyData* vocabData) {
// This will be significantly smaller than the global namespace which
// also contains subjects and objects, and allows for shrinking
// the pattern trick data.
createPredicateIdsImpl<TripleVec::bufreader_type>(
&_predicate_local_to_global_ids, vocabData->langPredLowerBound,
vocabData->langPredUpperBound, *vocabData->idTriples);

// This is not inside a templated method as the PSO metadata is based upon
// HashMaps which need to be treated differently
TripleVec::bufreader_type reader(*vocabData->idTriples);
if (reader.empty()) {
LOG(WARN) << "Triple vector was empty, no patterns created" << std::endl;
return;
}

Id currentPred = ID_NO_VALUE;

Id langPredLowerBound = vocabData->langPredLowerBound;
Id langPredUpperBound = vocabData->langPredUpperBound;

// Iterate all triples in POS (or PSO) sorting order. Add every distinct
// non language predicate to the predicateIds vector, therefore assigning
// a predicate namespace id to it via its position in the vector.
for (; !reader.empty(); ++reader) {
Id predicate = (*reader)[1];
if (predicate != currentPred) {
currentPred = predicate;
if (predicate < langPredLowerBound || predicate >= langPredUpperBound) {
// The predicate is not a language predicate, add it to the ids
_predicate_local_to_global_ids.push_back(predicate);
}
}
}

// Compute the global to local mapping from the local to global mapping
_predicate_global_to_local_ids.reserve(_predicate_local_to_global_ids.size());
for (size_t i = 0; i < _predicate_local_to_global_ids.size(); ++i) {
_predicate_global_to_local_ids.try_emplace(
_predicate_local_to_global_ids[i], i);
}
}

// _____________________________________________________________________________
void PatternIndex::generatePredicateLocalNamespaceFromExistingIndex(
Id langPredLowerBound, Id langPredUpperBound,
IndexMetaDataHmap& meta_data) {
// This is not inside a templated method as the PSO metadata is based upon
// HashMaps which need to be treated differently

// Iterate the hash map mapping predicates to metadata
for (const auto& triple_it : meta_data.data()) {
Id predicate = triple_it.first;
if (predicate < langPredLowerBound || predicate >= langPredUpperBound) {
_predicate_local_to_global_ids.push_back(predicate);
}
}

// Sorting ensures that the namespaces generated during index creation and
// from an existing index are identical. Strictly speaking this is currently
// not required, but it is cheap (the number of predicates tends to be
// small) and prevents nasty bugs that would only appear when the namespace
// was generated from an existing index.
std::sort(_predicate_local_to_global_ids.begin(),
_predicate_local_to_global_ids.end());

// Compute the global to local mapping from the local to global mapping
_predicate_global_to_local_ids.reserve(_predicate_local_to_global_ids.size());
for (size_t i = 0; i < _predicate_local_to_global_ids.size(); ++i) {
_predicate_global_to_local_ids.try_emplace(
_predicate_local_to_global_ids[i], i);
}

// Compute the global to local mapping from the local to global mapping
_predicate_global_to_local_ids.reserve(_predicate_local_to_global_ids.size());
for (size_t i = 0; i < _predicate_local_to_global_ids.size(); ++i) {
Expand Down Expand Up @@ -122,6 +184,71 @@ void PatternIndex::createPatterns(VocabularyData* vocabData,
_initialized = true;
}

// _____________________________________________________________________________
/**
 * @brief Creates the pattern data from the metadata and file of an existing
 *        index and stores it in _pattern_container.
 *
 * The predicate-local namespace (_predicate_local_to_global_ids) must already
 * be filled, as its size selects the integer type used for predicate ids.
 *
 * @param langPredLowerBound Forwarded to createPatternsImpl.
 * @param langPredUpperBound Forwarded to createPatternsImpl.
 * @param meta_data Metadata forwarded to createPatternsImpl as a reader
 *                  argument.
 * @param file File forwarded to createPatternsImpl as a reader argument.
 * @param filename_base Prefix of the patterns file name; ".index.patterns" is
 *                      appended to it.
 * @throws ad_semsearch::Exception (BAD_INPUT) if the namespace size needs more
 *         than 8 bytes.
 */
void PatternIndex::createPatternsFromExistingIndex(
    Id langPredLowerBound, Id langPredUpperBound,
    IndexMetaDataMmapView& meta_data, ad_utility::File& file,
    const std::string& filename_base) {
  // Count the bytes needed to represent the *size* of the predicate-local
  // namespace. Note that this is the size, not the largest id, so exactly 256
  // predicates already select the two byte type.
  size_t num_bytes_predicate_id = 0;
  for (size_t remaining = _predicate_local_to_global_ids.size(); remaining > 0;
       remaining >>= 8) {
    ++num_bytes_predicate_id;
  }

  std::string patterns_file_name = filename_base + ".index.patterns";

  // Creates the patterns using the type of the (otherwise unused) tag value
  // as predicate id type and publishes the result in _pattern_container.
  // The steps are the same for every id width, only the type differs.
  auto createWithIdType = [&](auto idTypeTag) {
    using PredicateId = decltype(idTypeTag);
    auto pattern_data = std::make_shared<PatternContainerImpl<PredicateId>>();
    this->createPatternsImpl<PredicateId,
                             MetaDataIterator<IndexMetaDataMmapView>,
                             IndexMetaDataMmapView, ad_utility::File>(
        patterns_file_name, pattern_data, _predicate_local_to_global_ids,
        _predicate_global_to_local_ids, _fullHasPredicateMultiplicityEntities,
        _fullHasPredicateMultiplicityPredicates, _fullHasPredicateSize,
        _maxNumPatterns, langPredLowerBound, langPredUpperBound, meta_data,
        file);
    _pattern_container = pattern_data;
  };

  // Pick the smallest unsigned integer type that offers enough bytes.
  if (num_bytes_predicate_id <= 1) {
    createWithIdType(uint8_t{});
  } else if (num_bytes_predicate_id <= 2) {
    createWithIdType(uint16_t{});
  } else if (num_bytes_predicate_id <= 4) {
    createWithIdType(uint32_t{});
  } else if (num_bytes_predicate_id <= 8) {
    createWithIdType(uint64_t{});
  } else {
    // Cannot be reached as long as size_t is at most 64 bits wide.
    AD_THROW(ad_semsearch::Exception::BAD_INPUT,
             "The index contains more than 2**64 predicates.");
  }
  _initialized = true;
}

// _____________________________________________________________________________
void PatternIndex::loadPatternIndex(const std::string& filename_base) {
std::string patternsFilePath = filename_base + ".index.patterns";
Expand Down Expand Up @@ -191,7 +318,7 @@ void PatternIndex::createPatternsImpl(
double& fullHasPredicateMultiplicityPredicates,
size_t& fullHasPredicateSize, const size_t maxNumPatterns,
const Id langPredLowerBound, const Id langPredUpperBound,
const Args&... vecReaderArgs) {
Args&... vecReaderArgs) {
IndexMetaDataHmap meta;
typedef ad_utility::HashMap<Pattern<PredicateId>, size_t,
PatternHash<PredicateId>>
Expand Down Expand Up @@ -561,35 +688,6 @@ void PatternIndex::createPatternsImpl(
pattern_data->hasPredicate().build(hasPredicateTmp);
}

// _____________________________________________________________________________
template <typename VecReaderType, typename... Args>
void PatternIndex::createPredicateIdsImpl(std::vector<Id>* predicateIds,
const Id langPredLowerBound,
const Id langPredUpperBound,
const Args&... vecReaderArgs) {
VecReaderType reader(vecReaderArgs...);
if (reader.empty()) {
LOG(WARN) << "Triple vector was empty, no patterns created" << std::endl;
return;
}

Id currentPred = ID_NO_VALUE;

// Iterate all triples in POS (or PSO) sorting order. Add every distinct
// non language predicate to the predicateIds vector, therefore assigning
// a predicate namespace id to it via its position in the vector.
for (; !reader.empty(); ++reader) {
Id predicate = (*reader)[1];
if (predicate != currentPred) {
currentPred = predicate;
if (predicate < langPredLowerBound || predicate >= langPredUpperBound) {
// The predicate is not a language predicate, add it to the ids
predicateIds->push_back(predicate);
}
}
}
}

// _____________________________________________________________________________
template <typename PredicateId>
std::shared_ptr<PatternContainerImpl<PredicateId>>
Expand Down
19 changes: 12 additions & 7 deletions src/index/PatternIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <string>
#include "../global/Id.h"
#include "../util/HashMap.h"
#include "IndexMetaData.h"
#include "PatternContainer.h"
#include "Vocabulary.h"
#include "VocabularyData.h"
Expand Down Expand Up @@ -44,6 +45,12 @@ class PatternIndex {
void createPatterns(VocabularyData* vocabData,
const std::string& filename_base);

void createPatternsFromExistingIndex(Id langPredLowerBound,
Id langPredUpperBound,
IndexMetaDataMmapView& meta_data,
ad_utility::File& file,
const std::string& filename_base);

/**
* @brief Takes the triples sorted by PSO or POS and generates a new namespace
* that only contains predicates. This namespace is then used for storing
Expand All @@ -53,6 +60,10 @@ class PatternIndex {
*/
void generatePredicateLocalNamespace(VocabularyData* vocabData);

void generatePredicateLocalNamespaceFromExistingIndex(
Id langPredLowerBound, Id langPredUpperBound,
IndexMetaDataHmap& meta_data);

void loadPatternIndex(const std::string& filename_base);

friend class CreatePatternsFixture_createPatterns_Test;
Expand All @@ -77,13 +88,7 @@ class PatternIndex {
double& fullHasPredicateMultiplicityPredicates,
size_t& fullHasPredicateSize, const size_t maxNumPatterns,
const Id langPredLowerBound, const Id langPredUpperBound,
const Args&... vecReaderArgs);

template <typename VecReaderType, typename... Args>
void createPredicateIdsImpl(std::vector<Id>* predicateIds,
const Id langPredLowerBound,
const Id langPredUpperBound,
const Args&... vecReaderArgs);
Args&... vecReaderArgs);

void throwExceptionIfNotInitialized() const;

Expand Down

0 comments on commit 945b7b8

Please sign in to comment.