Skip to content

Commit

Permalink
Use separate .prefixes file. Fixes #141
Browse files Browse the repository at this point in the history
The MetaDataConverter seems to break on trying to convert (i.e. detect
as fine) a current index. I don't think that has anything to do with the
new changes, but these are untested until I figure that out.
  • Loading branch information
niklas88 committed Oct 31, 2018
1 parent 0995c93 commit 038d77f
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 13 deletions.
1 change: 1 addition & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ static const int DEFAULT_NOF_DATE_YEAR_DIGITS = 19;

static const std::string MMAP_FILE_SUFFIX = ".meta-mmap";
static const std::string CONFIGURATION_FILE = ".meta-data.json";
static const std::string PREFIX_FILE = ".prefixes";

// Constants for the range of valid compression prefixes
// all ASCII-printable characters are left out.
Expand Down
2 changes: 1 addition & 1 deletion src/index/Index.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ void Index::passContextFileIntoVector(const string& contextFile,
// only add a text index. In that case the Vocabulary has never been
// initialized before
_vocab = Vocabulary<CompressedString>();
readConfigurationFile();
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");

Expand Down
26 changes: 20 additions & 6 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,16 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
std::vector<string> prefixes;
if (_vocabPrefixCompressed) {
prefixes = calculatePrefixes(vocabFile, NUM_COMPRESSION_PREFIXES, 1, true);
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
prefixFile << prefix << '\n';
}
}
_configurationJson["prefixes"] = prefixes;
_configurationJson["prefixes"] = _vocabPrefixCompressed;
Vocabulary<CompressedString>::prefixCompressFile(vocabFile, vocabFileTmp,
prefixes);

// TODO<joka921> maybe move this to its own function
if (std::rename(vocabFileTmp.c_str(), vocabFile.c_str())) {
LOG(INFO) << "Error: Rename the prefixed vocab file " << vocabFileTmp
Expand All @@ -119,7 +125,7 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
// vector is not yet sorted
createPatterns(false, &idTriples);
}
writeConfigurationFile();
writeConfiguration();
}

// explicit instantiations
Expand Down Expand Up @@ -896,7 +902,7 @@ void Index::writeNonFunctionalRelation(
void Index::createFromOnDiskIndex(const string& onDiskBase,
bool allPermutations) {
setOnDiskBase(onDiskBase);
readConfigurationFile();
readConfiguration();
_vocab.readFromFile(_onDiskBase + ".vocabulary",
_onDiskLiterals ? _onDiskBase + ".literals-index" : "");
auto psoName = string(_onDiskBase + ".index.pso");
Expand Down Expand Up @@ -1736,14 +1742,14 @@ void Index::setPrefixCompression(bool compressed) {
}

// ____________________________________________________________________________
void Index::writeConfigurationFile() const {
void Index::writeConfiguration() const {
std::ofstream f(_onDiskBase + CONFIGURATION_FILE);
AD_CHECK(f.is_open());
f << _configurationJson;
}

// ___________________________________________________________________________
void Index::readConfigurationFile() {
void Index::readConfiguration() {
std::ifstream f(_onDiskBase + CONFIGURATION_FILE);
AD_CHECK(f.is_open());
f >> _configurationJson;
Expand All @@ -1753,7 +1759,15 @@ void Index::readConfigurationFile() {
}

if (_configurationJson.find("prefixes") != _configurationJson.end()) {
_vocab.initializePrefixes(_configurationJson["prefixes"]);
if (_configurationJson["prefixes"]) {
vector<string> prefixes;
std::ifstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (string prefix; std::getline(prefixFile, prefix);) {
prefixes.emplace_back(std::move(prefix));
}
_vocab.initializePrefixes(prefixes);
}
}

if (_configurationJson.find("prefixes-external") !=
Expand Down
5 changes: 2 additions & 3 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,9 +558,8 @@ class Index {
*/
void throwExceptionIfNoPatterns() const;

// TODO<joka921> better names
void writeConfigurationFile() const;
void readConfigurationFile();
void writeConfiguration() const;
void readConfiguration();

// initialize the index-build-time settings for the vocabulary
void initializeVocabularySettingsBuild();
Expand Down
34 changes: 32 additions & 2 deletions src/index/MetaDataConverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,38 @@ void CompressVocabAndCreateConfigurationFile(const string& indexPrefix) {
string confFilename = indexPrefix + CONFIGURATION_FILE;
string vocabFilename = indexPrefix + ".vocabulary";
if (ad_utility::File::exists(confFilename)) {
std::cout
<< "This index already has a configuration file, nothing to do here\n";
std::cout << "This index already has a configuration file, check if it\n"
"contains prefixes as internal list instead of in a separate\n"
".prefixes file\n";

std::ifstream confFile(indexPrefix + CONFIGURATION_FILE);
AD_CHECK(confFile.is_open());
json config;
confFile >> config;
if (config.find("prefixes") == config.end()) {
std::cout << "The configuration file " << confFilename
<< " is missing the \"prefixes\" field" << std::endl;
AD_CHECK(false);
}
auto prefixes = config["prefixes"];
if (prefixes.type() == json::value_t::boolean &&
ad_utility::File::exists(indexPrefix + PREFIX_FILE)) {
std::cout << "The index already uses a separate " << PREFIX_FILE
<< " file\n";
} else if (prefixes.type() == json::value_t::array) {
std::cout << "Converting to separate " << PREFIX_FILE << " file\n";
std::ofstream prefixFile(indexPrefix + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const string& prefix : prefixes) {
prefixFile << prefix << '\n';
}
} else {
std::cout << "The configuration file " << confFilename
<< " has an unrecoverably broken \"prefixes\" field"
<< std::endl;
AD_CHECK(false);
}

} else {
std::cout << "This index does not have a configuration file. We have to "
"create it and also compress the vocabulary\n";
Expand Down
2 changes: 1 addition & 1 deletion src/index/Vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ class Vocabulary {
// prefixes - a list of prefixes which we will compress
template <typename = std::enable_if_t<_isCompressed>>
static void prefixCompressFile(const string& infile, const string& outfile,
const vector<string>& prefixes);
const vector<string>& prefixes);

private:
// Wraps std::lower_bound and returns an index instead of an iterator
Expand Down

0 comments on commit 038d77f

Please sign in to comment.