Skip to content

Commit

Permalink
A basic serialization framework
Browse files Browse the repository at this point in the history
- It has a symmetric interface, similar to boost::serialization etc. That
  way, typically only one function has to be written for a type to handle
  reading AND writing.
- It avoids the error-prone handling of manual calls to writing and reading
  of single bytes

- We currently use it for the writing of the IndexMetaData, but it can be
  used in several more places (e.g. the Vocabulary) in the future.
  • Loading branch information
joka921 committed Jun 29, 2021
1 parent 4308ec2 commit 8931b22
Show file tree
Hide file tree
Showing 18 changed files with 543 additions and 302 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(QLever C CXX)

# C/C++ Versions
set(CMAKE_C_STANDARD 11)
set (CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand Down
2 changes: 1 addition & 1 deletion src/index/Index.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ void Index::passContextFileIntoVector(const string& contextFile,
// this has to be repeated completely here because we have the possibility to
// only add a text index. In that case the Vocabulary has never been
// initialized before
_vocab = Vocabulary<CompressedString, TripleComponentComparator>();
_vocab = std::move(Vocabulary<CompressedString, TripleComponentComparator>());
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");
Expand Down
27 changes: 13 additions & 14 deletions src/index/IndexMetaData.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ const uint64_t MAGIC_NUMBER_SPARSE_META_DATA_VERSION =
// constants for meta data versions in case the format is changed again
constexpr uint64_t V_NO_VERSION = 0; // this is a dummy
constexpr uint64_t V_BLOCK_LIST_AND_STATISTICS = 1;
constexpr uint64_t V_SERIALIZATION_LIBRARY = 2;

// this always tags the current version
constexpr uint64_t V_CURRENT = V_BLOCK_LIST_AND_STATISTICS;
constexpr uint64_t V_CURRENT = V_SERIALIZATION_LIBRARY;

// Check index_layout.md for explanations (expected comments).
// Removed comments here so that not two places had to be kept up-to-date.
Expand Down Expand Up @@ -116,16 +117,15 @@ class IndexMetaData {
static const bool value = std::is_same<MetaWrapperMmap, T>::value ||
std::is_same<MetaWrapperMmapView, T>::value;
};
// compile time information whether this instatiation if MMapBased or not
// Compile-time information on whether this instantiation is mmap-based or not.
static constexpr bool _isMmapBased = IsMmapBased<MapType>::value;

// parse and get the version tag of this MetaData.
// Also verifies that it matches the MapType parameter
// No version tag will lead to version = V_NO_VERSION
// Also returns the number of bytes that the version info was stored in so
// that createFromByteBuffer can continue at the correct position.
VersionInfo parseMagicNumberAndVersioning(unsigned char* buf);
void createFromByteBuffer(unsigned char* buf);
// This magic number is written when serializing the IndexMetaData to a file.
// It is used to check whether this is a really old index that requires
// rebuilding.
static constexpr uint64_t MAGIC_NUMBER_FOR_SERIALIZATION =
_isMmapBased ? MAGIC_NUMBER_MMAP_META_DATA_VERSION
: MAGIC_NUMBER_SPARSE_META_DATA_VERSION;

// Write to a file that will be overwritten/created
void writeToFile(const std::string& filename) const;
Expand Down Expand Up @@ -188,11 +188,10 @@ class IndexMetaData {
friend IndexMetaDataHmap convertMmapMetaDataToHmap(
const IndexMetaDataMmap& mmap, bool verify);

// this way all instantations will be friends with each other,
// but this should not be an issue.
template <class U>
friend ad_utility::File& operator<<(ad_utility::File& f,
const IndexMetaData<U>& rmd);
// Symmetric serialization function for the ad_utility::serialization module.
template <class Serializer, typename MapType>
friend void serialize(Serializer& serializer,
IndexMetaData<MapType>& metaData);

size_t getNofBlocksForRelation(const Id relId) const;

Expand Down
250 changes: 63 additions & 187 deletions src/index/IndexMetaDataImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#pragma once

#include "../util/File.h"
#include "../util/Serializer/FileSerializer.h"
#include "../util/Serializer/SerializeHashMap.h"
#include "../util/Serializer/SerializeString.h"
#include "./IndexMetaData.h"
#include "./MetaDataHandler.h"

Expand All @@ -21,7 +24,8 @@ void IndexMetaData<MapType>::add(const FullRelationMetaData& rmd,

off_t afterExpected =
rmd.hasBlocks() ? bRmd._offsetAfter
: static_cast<off_t>(rmd._startFullIndex + rmd.getNofBytesForFulltextIndex());
: static_cast<off_t>(rmd._startFullIndex +
rmd.getNofBytesForFulltextIndex());
if (rmd.hasBlocks()) {
_blockData[rmd._relId] = bRmd;
}
Expand All @@ -36,82 +40,6 @@ off_t IndexMetaData<MapType>::getOffsetAfter() const {
return _offsetAfter;
}

// Restore this IndexMetaData from the raw byte buffer `buf`.
// This is a single template (not a specialization): the mmap-based and the
// non-mmap-based MapTypes are distinguished via `if constexpr (_isMmapBased)`.
// The buffer is consumed in this order: magic number and version (via
// parseMagicNumberAndVersioning), length and bytes of the name, number of
// relations, `_offsetAfter`, the relation meta data (layout depends on the
// MapType and the version), and, for versions >= V_BLOCK_LIST_AND_STATISTICS,
// the three statistics `_totalElements`, `_totalBytes` and `_totalBlocks`.
// NOTE(review): `readFromBuf<T>(&buf)` presumably advances `buf` past the value
// it reads (it receives the address of the pointer) -- confirm against its
// definition.
template <class MapType>
void IndexMetaData<MapType>::createFromByteBuffer(unsigned char* buf) {
// Read the magic number and the version, then skip the bytes they occupied.
auto v = parseMagicNumberAndVersioning(buf);
auto version = v._version;
_version = version;
buf += v._nOfBytes;

// The name is stored as its length followed by the raw characters.
size_t nameLength = readFromBuf<size_t>(&buf);
_name.assign(reinterpret_cast<char*>(buf), nameLength);
buf += nameLength;

size_t nofRelations = readFromBuf<size_t>(&buf);
if constexpr (_isMmapBased) {
_data.setSize(nofRelations);
}
_offsetAfter = readFromBuf<off_t>(&buf);

if constexpr (!_isMmapBased) {
// HashMap-based means that FullRMD and Blocks are all stored within the
// permutation file
for (size_t i = 0; i < nofRelations; ++i) {
FullRelationMetaData rmd;
rmd.createFromByteBuffer(buf);
buf += rmd.bytesRequired();
if (rmd.hasBlocks()) {
// The block meta data directly follows the full relation meta data.
BlockBasedRelationMetaData bRmd;
bRmd.createFromByteBuffer(buf);
buf += bRmd.bytesRequired();
add(rmd, bRmd);
} else {
add(rmd, BlockBasedRelationMetaData());
}
}
} else {
// MmapBased
if (version < V_BLOCK_LIST_AND_STATISTICS) {
// Old format: walk over all relations already present in `_data` and read
// block meta data for those that have blocks.
for (auto it = _data.cbegin(); it != _data.cend(); ++it) {
const FullRelationMetaData& rmd = (*it).second;
if (rmd.hasBlocks()) {
BlockBasedRelationMetaData bRmd;
bRmd.createFromByteBuffer(buf);
buf += bRmd.bytesRequired();
// we do not need to add the meta data since it is already in _data
// because of the persistent MMap file
add<true>(rmd, bRmd);
} else {
add<true>(rmd, BlockBasedRelationMetaData());
}
}
// NOTE(review): for this branch (mmap-based, version <
// V_BLOCK_LIST_AND_STATISTICS) calculateExpensiveStatistics() is called here
// and a second time at the end of this function -- confirm whether one of
// the two calls is redundant.
calculateExpensiveStatistics();
} else {
// version >= V_BLOCK_LIST_AND_STATISTICS, no need to touch Relations that
// don't have blocks
size_t numBlockData = readFromBuf<size_t>(&buf);
for (size_t i = 0; i < numBlockData; ++i) {
// Each entry is the relation Id followed by its block meta data.
Id id = readFromBuf<Id>(&buf);
BlockBasedRelationMetaData bRmd;
bRmd.createFromByteBuffer(buf);
buf += bRmd.bytesRequired();
// we do not need to add the meta data since it is already in _data
// because of the persistent MMap file
add<true>(_data.getAsserted(id), bRmd);
}
}
}
// Newer versions store the statistics explicitly; older ones require them to
// be recomputed.
if (version >= V_BLOCK_LIST_AND_STATISTICS) {
_totalElements = readFromBuf<size_t>(&buf);
_totalBytes = readFromBuf<size_t>(&buf);
_totalBlocks = readFromBuf<size_t>(&buf);
} else {
calculateExpensiveStatistics();
}
}
// _____________________________________________________________________________
template <class MapType>
const RelationMetaData IndexMetaData<MapType>::getRmd(Id relId) const {
Expand All @@ -129,49 +57,6 @@ bool IndexMetaData<MapType>::relationExists(Id relId) const {
return _data.count(relId) > 0;
}

// _____________________________________________________________________________
// Write the binary representation of `imd` to the file `f` and return `f`.
// The following items are written in this order:
//   1. a magic number (MAGIC_NUMBER_MMAP_META_DATA_VERSION for mmap-based
//      MapTypes, MAGIC_NUMBER_SPARSE_META_DATA_VERSION otherwise),
//   2. the version tag V_CURRENT,
//   3. the length of the name followed by the characters of the name,
//   4. the number of entries in `_data` and `_offsetAfter`,
//   5. the relation meta data: for non-mmap-based MapTypes every full relation
//      meta data, each directly followed by its block meta data if it has
//      blocks; for mmap-based MapTypes only the number of block meta data
//      entries followed by (id, block meta data) pairs,
//   6. `_totalElements`, `_totalBytes` and `_totalBlocks`.
template <class MapType>
ad_utility::File& operator<<(ad_utility::File& f,
                             const IndexMetaData<MapType>& imd) {
  // First write the magic number.
  if constexpr (IndexMetaData<MapType>::_isMmapBased) {
    f.write(&MAGIC_NUMBER_MMAP_META_DATA_VERSION,
            sizeof(MAGIC_NUMBER_MMAP_META_DATA_VERSION));
  } else {
    f.write(&MAGIC_NUMBER_SPARSE_META_DATA_VERSION,
            sizeof(MAGIC_NUMBER_SPARSE_META_DATA_VERSION));
  }
  // Write the version.
  f.write(&V_CURRENT, sizeof(V_CURRENT));

  // The name is stored as its length followed by the raw characters.
  size_t nameLength = imd._name.size();
  f.write(&nameLength, sizeof(nameLength));
  f.write(imd._name.data(), nameLength);

  size_t nofElements = imd._data.size();
  f.write(&nofElements, sizeof(nofElements));
  f.write(&imd._offsetAfter, sizeof(imd._offsetAfter));

  if constexpr (!IndexMetaData<MapType>::_isMmapBased) {
    for (auto it = imd._data.cbegin(); it != imd._data.cend(); ++it) {
      const auto el = *it;
      f << el.second;

      if (el.second.hasBlocks()) {
        // Every relation that claims to have blocks must have block meta data.
        auto itt = imd._blockData.find(el.second._relId);
        AD_CHECK(itt != imd._blockData.end());
        f << itt->second;
      }
    }
  } else {
    size_t numBlockData = imd._blockData.size();
    f.write(&numBlockData, sizeof(numBlockData));
    for (const auto& [id, blockData] : imd._blockData) {
      f.write(&id, sizeof(id));
      f << blockData;
    }
  }

  // Use the size of the members themselves, not the size of a pointer to them
  // (`sizeof(&member)`), so that the number of bytes written always matches
  // the type of the value being written.
  f.write(&imd._totalElements, sizeof(imd._totalElements));
  f.write(&imd._totalBytes, sizeof(imd._totalBytes));
  f.write(&imd._totalBlocks, sizeof(imd._totalBlocks));

  return f;
}

// ____________________________________________________________________________
template <class MapType>
void IndexMetaData<MapType>::writeToFile(const std::string& filename) const {
Expand All @@ -187,7 +72,9 @@ void IndexMetaData<MapType>::appendToFile(ad_utility::File* file) const {
AD_CHECK(file->isOpen());
file->seek(0, SEEK_END);
off_t startOfMeta = file->tell();
(*file) << *this;
ad_utility::serialization::FileWriteSerializer serializer{std::move(*file)};
serializer&(*this);
*file = std::move(serializer).moveFileOut();
file->write(&startOfMeta, sizeof(startOfMeta));
}

Expand All @@ -205,10 +92,13 @@ template <class MapType>
void IndexMetaData<MapType>::readFromFile(ad_utility::File* file) {
off_t metaFrom;
off_t metaTo = file->getLastOffset(&metaFrom);
unsigned char* buf = new unsigned char[metaTo - metaFrom];
file->read(buf, static_cast<size_t>(metaTo - metaFrom), metaFrom);
createFromByteBuffer(buf);
delete[] buf;
std::vector<char> buf(metaTo - metaFrom);
file->read(buf.data(), static_cast<size_t>(metaTo - metaFrom), metaFrom);

ad_utility::serialization::ByteBufferReadSerializer serializer{
std::move(buf)};

serializer&(*this);
}

// _____________________________________________________________________________
Expand All @@ -231,8 +121,10 @@ string IndexMetaData<MapType>::statistics() const {

os << "# Elements: " << _totalElements << '\n';
os << "# Blocks: " << _totalBlocks << "\n\n";
os << "Theoretical size of Id triples: " << _totalElements * 3 * sizeof(Id) << " bytes \n";
os << "Size of pair index: " << totalPairIndexBytes << " bytes \n";
os << "Theoretical size of Id triples: " << _totalElements * 3 * sizeof(Id)
<< " bytes \n";
os << "Size of pair index: " << totalPairIndexBytes
<< " bytes \n";
os << "Total Size: " << _totalBytes << " bytes \n";
os << "-------------------------------------------------------------------\n";
return os.str();
Expand All @@ -251,7 +143,8 @@ size_t IndexMetaData<MapType>::getNofBlocksForRelation(const Id id) const {

// _____________________________________________________________________________
template <class MapType>
size_t IndexMetaData<MapType>::getTotalBytesForRelation(const FullRelationMetaData& frmd) const {
size_t IndexMetaData<MapType>::getTotalBytesForRelation(
const FullRelationMetaData& frmd) const {
auto it = _blockData.find(frmd._relId);
if (it != _blockData.end()) {
return static_cast<size_t>(it->second._offsetAfter - frmd._startFullIndex);
Expand Down Expand Up @@ -280,67 +173,50 @@ void IndexMetaData<MapType>::calculateExpensiveStatistics() {
}
}

// ___________________________________________________________________
template <class MapType>
VersionInfo IndexMetaData<MapType>::parseMagicNumberAndVersioning(unsigned char* buf) {
uint64_t magicNumber = *reinterpret_cast<uint64_t*>(buf);
size_t nOfBytes = 0;
bool hasVersion = false;
if constexpr (!_isMmapBased) {
if (magicNumber == MAGIC_NUMBER_MMAP_META_DATA ||
magicNumber == MAGIC_NUMBER_MMAP_META_DATA_VERSION) {
throw WrongFormatException(
"ERROR: magic number of MetaData indicates that we are trying "
"to construct a hashMap based IndexMetaData from mmap-based meta "
"data. This is not valid."
"Please use ./MetaDataConverterMain"
"to convert old indices without rebuilding them (See README.md).\n");
AD_CHECK(false);
} else if (magicNumber == MAGIC_NUMBER_SPARSE_META_DATA) {
hasVersion = false;
nOfBytes = sizeof(uint64_t);
} else if (magicNumber == MAGIC_NUMBER_SPARSE_META_DATA_VERSION) {
hasVersion = true;
nOfBytes = sizeof(uint64_t);
} else {
// no magic number found
hasVersion = false;
nOfBytes = 0;
}
} else { // this _isMmapBased
if (magicNumber == MAGIC_NUMBER_MMAP_META_DATA) {
hasVersion = false;
nOfBytes = sizeof(uint64_t);
} else if (magicNumber == MAGIC_NUMBER_MMAP_META_DATA_VERSION) {
hasVersion = true;
nOfBytes = sizeof(uint64_t);
} else {
throw WrongFormatException("ERROR: No or wrong magic number found in persistent "
"mmap-based meta data. "
"Please use ./MetaDataConverterMain "
"to convert old indices without rebuilding them (See "
"README.md).Terminating...\n");
}
}
// ___________________________________________________________________________
template <typename Serializer, typename MapType>
void serialize(Serializer& serializer, IndexMetaData<MapType>& metaData) {
// The binary format of an IndexMetaData starts with an 8-byte magic number.
// After this magic number, an 8-byte version number follows. Both have to
// match.
using T = IndexMetaData<MapType>;
uint64_t magicNumber = T::MAGIC_NUMBER_FOR_SERIALIZATION;

VersionInfo res;
res._nOfBytes = nOfBytes;
if (!hasVersion) {
res._version = V_NO_VERSION;
} else {
res._version = *reinterpret_cast<uint64_t*>(buf + res._nOfBytes);
res._nOfBytes += sizeof(uint64_t);
serializer& magicNumber;

// This check might only become false, if we are reading from the serializer
if (magicNumber != T::MAGIC_NUMBER_FOR_SERIALIZATION) {
throw WrongFormatException(
"The binary format of this index is no longer supported by QLever. "
"Please rebuild the index.");
}
if (res._version < V_CURRENT) {
LOG(INFO) << "WARNING: your IndexMetaData seems to have an old format (version "
"tag < V_CURRENT). Please consider using ./MetaDataConverterMain to "
"benefit from improvements in the index structure.\n";

} else if (res._version > V_CURRENT) {
LOG(INFO) << "ERROR: version tag does not match any actual version (> "
"V_CURRENT). Your IndexMetaData is probably corrupted. "
"Terminating\n";
AD_CHECK(false);
serializer& metaData._version;
// This check might only become false, if we are reading from the serializer
if (metaData.getVersion() != V_CURRENT) {
throw WrongFormatException(
"The binary format of this index is no longer supported by QLever. "
"Please rebuild the index.");
}
return res;

// Serialize the rest of the data members
serializer& metaData._name;
serializer& metaData._data;
serializer& metaData._blockData;

serializer& metaData._offsetAfter;
serializer& metaData._totalElements;
serializer& metaData._totalBytes;
serializer& metaData._totalBlocks;
}

// This overload allows us to serialize from a const IndexMetaData& to a writing
// Serializer. This is ok, because then the serialize-function does not perform
// any non-const actions.
// TODO<C++20> using a requires clause we can actually enforce the const access
// in these functions.
template <typename Serializer, typename MapType>
void serialize(Serializer& serializer, const IndexMetaData<MapType>& metaData) {
static_assert(Serializer::IsWriteSerializer);
serialize(serializer, const_cast<IndexMetaData<MapType>&>(metaData));
}

0 comments on commit 8931b22

Please sign in to comment.