Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate prefix file, fixing #141 #143

Merged
merged 3 commits into from
Nov 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ static const int DEFAULT_NOF_DATE_YEAR_DIGITS = 19;

static const std::string MMAP_FILE_SUFFIX = ".meta-mmap";
static const std::string CONFIGURATION_FILE = ".meta-data.json";
static const std::string PREFIX_FILE = ".prefixes";

// Constants for the range of valid compression prefixes
// all ASCII-printable characters are left out.
Expand Down
2 changes: 1 addition & 1 deletion src/index/Index.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ void Index::passContextFileIntoVector(const string& contextFile,
// only add a text index. In that case the Vocabulary has never been
// initialized before
_vocab = Vocabulary<CompressedString>();
readConfigurationFile();
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");

Expand Down
30 changes: 22 additions & 8 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,16 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
std::vector<string> prefixes;
if (_vocabPrefixCompressed) {
prefixes = calculatePrefixes(vocabFile, NUM_COMPRESSION_PREFIXES, 1, true);
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << '\n';
}
}
_configurationJson["prefixes"] =
Vocabulary<CompressedString>::prefixCompressFile(vocabFile, vocabFileTmp,
prefixes);
_configurationJson["prefixes"] = _vocabPrefixCompressed;
Vocabulary<CompressedString>::prefixCompressFile(vocabFile, vocabFileTmp,
prefixes);

// TODO<joka921> maybe move this to its own function
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
Expand All @@ -119,7 +125,7 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
// vector is not yet sorted
createPatterns(false, &idTriples);
}
writeConfigurationFile();
writeConfiguration();
}

// explicit instantiations
Expand Down Expand Up @@ -896,7 +902,7 @@ void Index::writeNonFunctionalRelation(
void Index::createFromOnDiskIndex(const string& onDiskBase,
bool allPermutations) {
setOnDiskBase(onDiskBase);
readConfigurationFile();
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");
auto psoName = string(_onDiskBase + ".index.pso");
Expand Down Expand Up @@ -1736,14 +1742,14 @@ void Index::setPrefixCompression(bool compressed) {
}

// ____________________________________________________________________________
void Index::writeConfigurationFile() const {
void Index::writeConfiguration() const {
std::ofstream f(_onDiskBase + CONFIGURATION_FILE);
AD_CHECK(f.is_open());
f << _configurationJson;
}

// ___________________________________________________________________________
void Index::readConfigurationFile() {
void Index::readConfiguration() {
std::ifstream f(_onDiskBase + CONFIGURATION_FILE);
AD_CHECK(f.is_open());
f >> _configurationJson;
Expand All @@ -1753,7 +1759,15 @@ void Index::readConfigurationFile() {
}

if (_configurationJson.find("prefixes") != _configurationJson.end()) {
_vocab.initializeRestartPrefixes(_configurationJson["prefixes"]);
if (_configurationJson["prefixes"]) {
vector<string> prefixes;
std::ifstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (string prefix; std::getline(prefixFile, prefix);) {
prefixes.emplace_back(std::move(prefix));
}
_vocab.initializePrefixes(prefixes);
}
}

if (_configurationJson.find("prefixes-external") !=
Expand Down
5 changes: 2 additions & 3 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,9 +558,8 @@ class Index {
*/
void throwExceptionIfNoPatterns() const;

// TODO<joka921> better names
void writeConfigurationFile() const;
void readConfigurationFile();
void writeConfiguration() const;
void readConfiguration();

// initialize the index-build-time settings for the vocabulary
void initializeVocabularySettingsBuild();
Expand Down
41 changes: 38 additions & 3 deletions src/index/MetaDataConverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,42 @@ void CompressVocabAndCreateConfigurationFile(const string& indexPrefix) {
string confFilename = indexPrefix + CONFIGURATION_FILE;
string vocabFilename = indexPrefix + ".vocabulary";
if (ad_utility::File::exists(confFilename)) {
std::cout
<< "This index already has a configuration file, nothing to do here\n";
std::cout << "This index already has a configuration file, check if it\n"
"contains prefixes as internal list instead of in a separate\n"
".prefixes file\n";

std::ifstream confFile(indexPrefix + CONFIGURATION_FILE);
AD_CHECK(confFile.is_open());
json config;
confFile >> config;
if (config.find("prefixes") == config.end()) {
std::cout << "The configuration file " << confFilename
<< " is missing the \"prefixes\" field" << std::endl;
AD_CHECK(false);
}
auto prefixes = config["prefixes"];
if (prefixes.type() == json::value_t::boolean &&
ad_utility::File::exists(indexPrefix + PREFIX_FILE)) {
std::cout << "The index already uses a separate " << PREFIX_FILE
<< " file\n";
} else if (prefixes.type() == json::value_t::array) {
std::cout << "Converting to separate " << PREFIX_FILE << " file\n";
std::ofstream prefixFile(indexPrefix + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const string& prefix : prefixes) {
prefixFile << prefix << '\n';
}
niklas88 marked this conversation as resolved.
Show resolved Hide resolved
std::ofstream f(confFilename + ".converted");
AD_CHECK(f.is_open());
f << config;
notifyCreated(confFilename, true);
} else {
std::cout << "The configuration file " << confFilename
<< " has an unrecoverably broken \"prefixes\" field"
<< std::endl;
AD_CHECK(false);
}

} else {
std::cout << "This index does not have a configuration file. We have to "
"create it and also compress the vocabulary\n";
Expand All @@ -187,7 +221,8 @@ void CompressVocabAndCreateConfigurationFile(const string& indexPrefix) {
ad_utility::File::exists(indexPrefix + ".literals-index");
auto prefixes =
calculatePrefixes(vocabFilename, NUM_COMPRESSION_PREFIXES, 1);
j["prefixes"] = Vocabulary<CompressedString>::prefixCompressFile(
j["prefixes"] = prefixes;
Vocabulary<CompressedString>::prefixCompressFile(
vocabFilename, vocabFilename + ".converted", prefixes);
notifyCreated(vocabFilename, true);
std::ofstream f(confFilename);
Expand Down
30 changes: 4 additions & 26 deletions src/index/Vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
#include <google/sparse_hash_map>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include <nlohmann/json.hpp>
#include "../global/Constants.h"
#include "../global/Id.h"
#include "../util/Exception.h"
Expand All @@ -28,7 +28,6 @@

using std::string;
using std::vector;
using json = nlohmann::json;

template <class StringType>
struct AccessReturnTypeGetter {};
Expand Down Expand Up @@ -96,10 +95,6 @@ class PrefixComparator {
template <class StringType>
class Vocabulary {
public:
// TODO<joka921, niklas> It would be cleaner to put the enable_if into the
// class declaration but this would need a lot of code restructuring.
// Can I leave it like this, the compiler errors are as decent as they become
// with templates
template <
typename = std::enable_if_t<std::is_same_v<StringType, string> ||
std::is_same_v<StringType, CompressedString>>>
Expand Down Expand Up @@ -296,14 +291,13 @@ class Vocabulary {
CompressedString compressPrefix(const string& word) const;

// initialize compression with a list of prefixes
// can only be called on an empty array.
// The prefixes do not have to be in any specific order
//
// StringRange prefixes can be of any type where
// for (const string& el : prefixes) {}
// works
template <class StringRange, typename = std::enable_if_t<_isCompressed>>
void initializeNewPrefixes(const StringRange& prefixes);
void initializePrefixes(const StringRange& prefixes);

// set the list of prefixes for words which will become part of the
// externalized vocabulary. Good for entity names that normally don't appear
Expand All @@ -315,32 +309,16 @@ class Vocabulary {
template <class StringRange>
void initializeExternalizePrefixes(const StringRange& prefixes);

// ______________________________________________________
// set the prefixes used for compression
// These have to have the exact same format returned by
// getJsonForPrefixes (serialization of the compression information)
template <typename = std::enable_if_t<_isCompressed>>
void initializeRestartPrefixes(const json& j);

// needed by function prefixCompressFile
template <typename = std::enable_if_t<_isCompressed>>
json getJsonForPrefixes() const;

// Compress the file at path infile, write to file at outfile using the
// specified prefixes.
// Arguments:
// infile - path to original vocabulary, one word per line
// outfile - output path. Will be overwritten; also contains one word per
// line, in the same order as the infile
// prefixes - a list of prefixes which we will compress
//
// Returns: A json array with information about the prefixes,
// j[2]="ablab" means, that the prefix "ablab" was encoded by the
// byte \x02
template <typename = std::enable_if_t<_isCompressed>>
static std::array<std::string, NUM_COMPRESSION_PREFIXES> prefixCompressFile(
const string& infile, const string& outfile,
const vector<string>& prefixes);
static void prefixCompressFile(const string& infile, const string& outfile,
const vector<string>& prefixes);

private:
// Wraps std::lower_bound and returns an index instead of an iterator
Expand Down
67 changes: 11 additions & 56 deletions src/index/VocabularyImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,12 +222,13 @@ CompressedString Vocabulary<S>::compressPrefix(const string& word) const {
// _____________________________________________________________________________
template <class S>
template <class StringRange, typename>
void Vocabulary<S>::initializeNewPrefixes(const StringRange& j) {
void Vocabulary<S>::initializePrefixes(const StringRange& prefixes) {
for (auto& el : _prefixMap) {
el = "";
}
_prefixVec.clear();
unsigned char prefixIdx = 0;
for (const auto& fulltext : j) {
for (const auto& fulltext : prefixes) {
if (prefixIdx >= NUM_COMPRESSION_PREFIXES) {
LOG(INFO) << "More than " << NUM_COMPRESSION_PREFIXES
<< " prefixes have been specified. Skipping the rest\n";
Expand All @@ -237,6 +238,10 @@ void Vocabulary<S>::initializeNewPrefixes(const StringRange& j) {
_prefixVec.emplace_back(prefixIdx + MIN_COMPRESSION_PREFIX, fulltext);
prefixIdx++;
}
if (prefixIdx != NUM_COMPRESSION_PREFIXES) {
LOG(WARN) << "less than " << NUM_COMPRESSION_PREFIXES
<< " prefixes specified.";
}
// if longest strings come first we correctly handle overlapping prefixes
auto pred = [](const Prefix& a, const Prefix& b) {
return a._fulltext.size() > b._fulltext.size();
Expand All @@ -254,49 +259,6 @@ void Vocabulary<S>::initializeExternalizePrefixes(const StringRange& s) {
}
}

// _____________________________________________________________________________
template <class S>
template <typename>
void Vocabulary<S>::initializeRestartPrefixes(const json& j) {
for (auto& el : _prefixMap) {
el = "";
}
_prefixVec.clear();
uint8_t idx = 0;
for (const auto& p : j) {
if (idx >= NUM_COMPRESSION_PREFIXES) {
LOG(INFO) << "ERROR: configuration file contained more than "
<< NUM_COMPRESSION_PREFIXES << " prefixes. Terminating.\n";
AD_CHECK(false);
}
_prefixMap[idx] = p;
_prefixVec.emplace_back(idx + MIN_COMPRESSION_PREFIX, p);
idx++;
}
if (idx != NUM_COMPRESSION_PREFIXES) {
LOG(INFO) << "ERROR: configuration file contained less than "
<< NUM_COMPRESSION_PREFIXES
<< " prefixes. Setup of prefix compression is not possible. "
"Terminating.\n";
AD_CHECK(false);
}
auto pred = [](const Prefix& a, const Prefix& b) {
return a._fulltext.size() > b._fulltext.size();
};
std::sort(_prefixVec.begin(), _prefixVec.end(), pred);
}

// ____________________________________________________________________________
template <class S>
template <typename>
json Vocabulary<S>::getJsonForPrefixes() const {
json j = json::array();
for (const auto& p : _prefixMap) {
j.push_back(p);
}
return j;
}

// __________________________________________________________________________
template <class S>
bool PrefixComparator<S>::operator()(const CompressedString& lhsComp,
Expand Down Expand Up @@ -325,23 +287,16 @@ bool PrefixComparator<S>::operator()(const string& lhs,
// _____________________________________________________
template <class S>
template <typename>
std::array<std::string, NUM_COMPRESSION_PREFIXES>
Vocabulary<S>::prefixCompressFile(const string& infile, const string& outfile,
const vector<string>& prefixes) {
void Vocabulary<S>::prefixCompressFile(const string& infile,
const string& outfile,
const vector<string>& prefixes) {
std::ifstream in(infile);
std::ofstream out(outfile);
AD_CHECK(in.is_open() && out.is_open());
Vocabulary v;
v.initializeNewPrefixes(prefixes);
v.initializePrefixes(prefixes);
std::string word;
while (std::getline(in, word)) {
out << v.compressPrefix(word).toStringView() << '\n';
}
return v._prefixMap;
}

// explicit instantiations
/*
template class Vocabulary<string>;
template class Vocabulary<CompressedString>;
*/