Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export index permutations in a deterministic way for comparison #468

Merged
merged 3 commits into from
Sep 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ target_link_libraries(TurtleParserMain parser ${CMAKE_THREAD_LIBS_INIT} absl::fl
add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})

# Command-line tool that loads one permutation of an index and dumps it to
# stdout in a deterministic order (see src/index/PermutationExporterMain.cpp).
add_executable(PermutationExporterMain src/index/PermutationExporterMain.cpp)
target_link_libraries(PermutationExporterMain index ${CMAKE_THREAD_LIBS_INIT})

#add_executable(TextFilterComparison src/experiments/TextFilterComparison.cpp)
#target_link_libraries (TextFilterComparison experiments)

6 changes: 6 additions & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,17 @@ class Index {
SortByOSP(), "OSP", ".osp", {2, 0, 1});

// Accessors for the six permutation members (_POS, _PSO, _SPO, _SOP, _OPS,
// _OSP). Each permutation has a const overload that returns a const
// reference and a non-const overload that returns a mutable reference. The
// mutable overloads are used e.g. by PermutationExporterMain, which calls
// `loadFromDisk` directly on a single permutation.
const auto& POS() const { return _POS; }
auto& POS() { return _POS; }
const auto& PSO() const { return _PSO; }
auto& PSO() { return _PSO; }
const auto& SPO() const { return _SPO; }
auto& SPO() { return _SPO; }
const auto& SOP() const { return _SOP; }
auto& SOP() { return _SOP; }
const auto& OPS() const { return _OPS; }
auto& OPS() { return _OPS; }
const auto& OSP() const { return _OSP; }
auto& OSP() { return _OSP; }

// Creates an index from a file. Parameter Parser must be able to split the
// file's format into triples.
Expand Down
80 changes: 80 additions & 0 deletions src/index/MetaDataHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ template <class M>
class MetaDataWrapperDense {
public:
using Iterator = VecWrapperImpl::Iterator<M>;
// The VecWrapperImpl::Iterator is actually const
using ConstIterator = VecWrapperImpl::Iterator<M>;
// The VecWrapperImpl::Iterator already iterates in order.
using ConstOrderedIterator = VecWrapperImpl::Iterator<M>;

using value_type = typename M::value_type;

// _________________________________________________________
Expand Down Expand Up @@ -147,12 +152,18 @@ class MetaDataWrapperDense {
return it;
}

// __________________________________________________________________________
// Start of the ordered iteration; simply forwards to `begin()`.
ConstOrderedIterator ordered_begin() const { return begin(); }

// __________________________________________________________________________
// Past-the-end iterator, built from the vector's size, its `cend()` iterator
// and a pointer to the vector.
Iterator cend() const { return Iterator(_vec.size(), _vec.cend(), &_vec); }

// __________________________________________________________________________
// Same as `cend()`, but built from the vector's `end()` iterator.
Iterator end() const { return Iterator(_vec.size(), _vec.end(), &_vec); }

// __________________________________________________________________________
// End of the ordered iteration; simply forwards to `end()`.
ConstOrderedIterator ordered_end() const { return end(); }

// ____________________________________________________________
void set(Id id, const value_type& value) {
if (id >= _vec.size()) {
Expand Down Expand Up @@ -206,6 +217,61 @@ class MetaDataWrapperHashMap {
using Iterator = typename hashMap::iterator;
using value_type = typename hashMap::mapped_type;

// A forward iterator over the underlying hashMap that visits the elements in
// ascending key order. The hashMap itself has no defined iteration order, so
// this iterator is used for deterministically exporting the underlying
// permutation.
//
// The iterator keeps a reference to the wrapper it was created from, so the
// wrapper must outlive the iterator. On construction it takes a sorted
// snapshot of all keys of the wrapper.
class ConstOrderedIterator {
  using key_type = typename hashMap::key_type;

  const MetaDataWrapperHashMap& wrapper_;
  // Sorted snapshot of the keys. Left empty for iterators that are created
  // at or past the end, because those are never dereferenced.
  std::vector<key_type> sortedKeys_;
  // Index into the sorted sequence of keys.
  size_t position_;

 public:
  // ________________________________________________________________________
  // Create an iterator for `wrapper` that points to the element with the
  // `position`-th smallest key. `position == wrapper.size()` yields the
  // past-the-end iterator.
  ConstOrderedIterator(const MetaDataWrapperHashMap& wrapper, size_t position)
      : wrapper_{wrapper}, position_{position} {
    // The past-the-end iterator is only used for comparisons, which look at
    // `position_` alone. Skip the copy and the O(n log n) sort of all keys
    // in that case.
    if (position_ >= wrapper_.size()) {
      return;
    }
    // Sort all the keys from the underlying hashMap and store them.
    sortedKeys_.reserve(wrapper_.size());
    for (const auto& [key, value] : wrapper_) {
      (void)value;  // Silence the warning about `value` being unused.
      sortedKeys_.push_back(key);
    }
    std::sort(sortedKeys_.begin(), sortedKeys_.end());
  }

  // ________________________________________________________________________
  // Return a reference to the (key, value) entry of the underlying hashMap
  // that belongs to the current key. Must not be called on a past-the-end
  // iterator.
  const auto& operator*() const {
    const auto& m = wrapper_.getUnderlyingHashMap();
    return *m.find(sortedKeys_[position_]);
  }

  // ________________________________________________________________________
  const auto* operator->() const {
    // Call operator* and return a pointer to the result. This is safe,
    // because the underlying hashMap ensures the lifetime of the returned
    // reference.
    return &(**this);
  }

  // ________________________________________________________________________
  // Pre-increment: advance to the next larger key.
  ConstOrderedIterator& operator++() {
    ++position_;
    return *this;
  }

  // ________________________________________________________________________
  // Post-increment: advance, but return a copy of the previous state.
  ConstOrderedIterator operator++(int) {
    auto cpy = *this;
    ++position_;
    return cpy;
  }

  // ________________________________________________________________________
  // Two iterators compare equal iff they are at the same position. Only
  // iterators that were obtained from the same wrapper may be compared.
  bool operator==(const ConstOrderedIterator& rhs) const {
    return position_ == rhs.position_;
  }

  // ________________________________________________________________________
  bool operator!=(const ConstOrderedIterator& rhs) const {
    return !(*this == rhs);
  }
};

// nothing to do here, since the default constructor of the hashMap does
// everything we want
explicit MetaDataWrapperHashMap() = default;
Expand All @@ -217,19 +283,33 @@ class MetaDataWrapperHashMap {

// __________________________________________________________________
// Const iterators to the first element in the hashMap's own iteration order.
ConstIterator cbegin() const { return _map.begin(); }
ConstIterator begin() const { return _map.begin(); }

// __________________________________________________________________
// Mutable iterator to the first element in the hashMap's own iteration order.
Iterator begin() { return _map.begin(); }

// _________________________________________________________________________
// Start of the iteration in ascending key order (position 0 of the sorted
// keys).
ConstOrderedIterator ordered_begin() const {
return ConstOrderedIterator{*this, 0};
}

// ____________________________________________________________
// Const past-the-end iterators of the hashMap.
ConstIterator cend() const { return _map.end(); }
ConstIterator end() const { return _map.end(); }

// ____________________________________________________________
// Mutable past-the-end iterator of the hashMap.
Iterator end() { return _map.end(); }

// _________________________________________________________________________
// End of the iteration in ascending key order (position `size()`).
ConstOrderedIterator ordered_end() const {
return ConstOrderedIterator{*this, size()};
}

// ____________________________________________________________
// Store `value` under the key `id` via the hashMap's operator[]. The value
// is taken by value and moved into the map.
void set(Id id, value_type value) { _map[id] = std::move(value); }

// Read-only access to the wrapped hashMap. Used by `ConstOrderedIterator` to
// look up the entry for a key.
const auto& getUnderlyingHashMap() const { return _map; }

// __________________________________________________________
const value_type& getAsserted(Id id) const {
auto it = _map.find(id);
Expand Down
19 changes: 15 additions & 4 deletions src/index/MetaDataIterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ class MetaDataIterator {
public:
MetaDataIterator(const MetaDataType& meta, ad_utility::File file)
: meta_(meta),
_iterator(meta.data().begin()),
_iterator(meta.data().ordered_begin()),
_endIterator(meta.data().ordered_end()),
_buffer_offset(0),
_file(file) {
scanCurrentPos();
Expand All @@ -26,6 +27,10 @@ class MetaDataIterator {
++_buffer_offset;
if (_buffer_offset >= _buffer.size()) {
++_iterator;
if (empty()) {
// don't do anything if we have already reached the end
return *this;
}
scanCurrentPos();
_buffer_offset = 0;
}
Expand All @@ -37,18 +42,24 @@ class MetaDataIterator {
_buffer[_buffer_offset][1]};
}

// True iff the iteration has reached the end, i.e. there is no further
// relation to read.
// NOTE(review): the diff view shows two variants of `empty()`. The first
// one (comparison against `meta_.data().end()`) appears to be the removed
// line, the second one (comparison against `_endIterator`) its replacement
// -- confirm against the merged file.
bool empty() { return _iterator == meta_.data().end(); }
bool empty() const { return _iterator == _endIterator; }

private:
// Read the relation that `_iterator` currently points to from `_file` into
// `_buffer`. The buffer is resized to the number of elements of the relation
// and filled with `getNofElements() * 2 * sizeof(Id)` bytes; the last
// argument of the read, `_startFullIndex`, is presumably the offset of the
// relation in the file.
void scanCurrentPos() {
// NOTE(review): the next line appears to be the pre-change declaration that
// the diff view still shows; the declaration after it supersedes it --
// confirm against the merged file.
const FullRelationMetaData& rmd = _iterator->second.get();
FullRelationMetaData rmd;
// Depending on the iterator type, the mapped value either exposes the
// metadata via `get()` or is the metadata itself.
if constexpr (requires { _iterator->second.get(); }) {
rmd = _iterator->second.get();
} else {
rmd = _iterator->second;
}
_buffer.resize(rmd.getNofElements());
_file.read(_buffer.data(), rmd.getNofElements() * 2 * sizeof(Id),
rmd._startFullIndex);
}

const MetaDataType& meta_;
typename MetaDataType::MapType::Iterator _iterator;
typename MetaDataType::MapType::ConstOrderedIterator _iterator;
const typename MetaDataType::MapType::ConstOrderedIterator _endIterator;

// This buffers the results of the scans we need to use to read the relations
std::vector<std::array<Id, 2>> _buffer;
Expand Down
76 changes: 76 additions & 0 deletions src/index/PermutationExporterMain.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2021, University of Freiburg, Chair of Algorithms and Data
// Structures. Author: Johannes Kalmbach <kalmbacj@cs.uni-freiburg.de>

#include "./Index.h"
#include "./MetaDataIterator.h"

/// Dump a certain permutation to stdout in a human-readable way as IDs, and
/// in deterministic order
template <typename Permutation>
void dumpToStdout(const Permutation& permutation) {
MetaDataIterator it{permutation._meta, permutation._file};
while (!it.empty()) {
auto triple = *it;
std::cout << triple[0] << " " << triple[1] << " " << triple[2] << std::endl;
++it;
}
}

/// Load a certain permutation from a certain index, and dump it to stdout
/// in a human-readable and deterministic way. This can be used for large
/// regression tests, when the index format or the index building procedure
/// changes.
/// Args: ./PermutationExporterMain <indexBasename> <permutation>
/// (<permutation> must be one of pso, pos, spo, sop, osp, ops
int main(int argc, char** argv) {
// Actual output goes to std::cout,output of LOG(...) to std::cerr
ad_utility::setGlobalLogginStream(&std::cerr);

if (argc != 3) {
LOG(ERROR) << "Usage: PermutationExporterMain <indexBasename> <permutation "
"to dump>"
<< std::endl;
return EXIT_FAILURE;
}

Index i;
std::string indexName{argv[1]};
std::string p{argv[2]};
if (p == "sop") {
i.SOP().loadFromDisk(indexName);
dumpToStdout(i.SOP());
return EXIT_SUCCESS;
}
if (p == "spo") {
i.SPO().loadFromDisk(indexName);
dumpToStdout(i.SPO());
return EXIT_SUCCESS;
}
if (p == "osp") {
i.OSP().loadFromDisk(indexName);
dumpToStdout(i.OSP());
return EXIT_SUCCESS;
}
if (p == "ops") {
i.OPS().loadFromDisk(indexName);
dumpToStdout(i.OPS());
return EXIT_SUCCESS;
}

if (p == "pos") {
i.POS().loadFromDisk(indexName);
dumpToStdout(i.POS());
return EXIT_SUCCESS;
}

if (p == "pso") {
i.PSO().loadFromDisk(indexName);
dumpToStdout(i.PSO());
return EXIT_SUCCESS;
}

LOG(ERROR)
<< "<permutation> must be one of pso, pos, spo, sop, osp, ops, but was \""
<< p << "\"" << std::endl;
return EXIT_FAILURE;
}