F.case insensitive label sorting #209

Closed
36 changes: 36 additions & 0 deletions src/engine/Filter.cpp
@@ -3,6 +3,7 @@
// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de)

#include "Filter.h"
#include <algorithm>
#include <optional>
#include <regex>
#include <sstream>
@@ -504,6 +505,41 @@ void Filter::computeResultFixedValue(
rhs_string = ad_utility::convertValueLiteralToIndexWord(rhs_string);
} else if (ad_utility::isNumeric(_rhs)) {
rhs_string = ad_utility::convertNumericToIndexWord(rhs_string);
} else {
if (getIndex().getVocab().getCaseInsensitiveOrdering()) {
// We have to move to the correct end of the
// "same letters but different case" range
// to make the filters work.
// TODO<kalmbach, schnelle>: thoroughly test this
// (End-To-End or unit tests? probably both but the unit tests
// would also be in an end-to-end fashion for those nested
// mechanisms).
switch (_type) {
case SparqlFilter::GE:
case SparqlFilter::LT: {
rhs_string = ad_utility::getUppercaseUtf8(rhs_string);
auto split = StringSortComparator::extractComparable(rhs_string);
if (split.isLiteral && !split.langtag.empty()) {
// get rid of possible langtags to move to the beginning of the
// range
rhs_string = '\"' + std::string(split.val) + '\"';
}
}

break;
case SparqlFilter::GT:
case SparqlFilter::LE: {
rhs_string = ad_utility::getLowercaseUtf8(rhs_string);
auto split2 = StringSortComparator::extractComparable(rhs_string);
if (split2.isLiteral) {
rhs_string =
'\"' + std::string(split2.val) + '\"' + "@" + char(127);
}
} break;
default:
break;
}
}
}
if (_type == SparqlFilter::EQ || _type == SparqlFilter::NE) {
if (!getIndex().getVocab().getId(_rhs, &rhs)) {
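The GE/LT vs. GT/LE split above is needed because, under a case-insensitive primary ordering, all casings of the same word form one contiguous block in the sorted vocabulary, and a filter boundary has to land on the correct end of that block. A minimal stand-alone sketch of the effect (toy ASCII comparator, hypothetical names, not QLever code):

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Toy case-insensitive "less than" that only looks at the primary level.
bool lessCI(const std::string& a, const std::string& b) {
  return std::lexicographical_compare(
      a.begin(), a.end(), b.begin(), b.end(),
      [](unsigned char x, unsigned char y) {
        return std::tolower(x) < std::tolower(y);
      });
}

int main() {
  // Vocabulary sorted case-insensitively: "FOO", "Foo" and "foo" compare equal.
  std::vector<std::string> vocab = {"bar", "FOO", "Foo", "foo", "quux"};
  std::sort(vocab.begin(), vocab.end(), lessCI);

  const std::string query = "foo";
  // FILTER(?x >= "foo") must start at the *beginning* of the "foo" block ...
  auto ge = std::lower_bound(vocab.begin(), vocab.end(), query, lessCI);
  // ... while FILTER(?x > "foo") must start directly *behind* that block.
  auto gt = std::upper_bound(vocab.begin(), vocab.end(), query, lessCI);

  std::cout << *ge << '\n';  // some casing of "foo" (start of the block)
  std::cout << *gt << '\n';  // "quux" (first word behind the block)
}
```

The patch reaches the same two positions indirectly, by rewriting rhs_string before the ordinary vocabulary lookup: uppercasing it for GE/LT, and lowercasing it and appending a langtag separator with a high sentinel byte for GT/LE.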
3 changes: 3 additions & 0 deletions src/index/ConstantsIndexCreation.h
@@ -50,3 +50,6 @@ static const size_t THRESHOLD_RELATION_CREATION = 2 << 20;
// ________________________________________________________________
static const std::string PARTIAL_VOCAB_FILE_NAME = ".partial-vocabulary";
static const std::string PARTIAL_MMAP_IDS = ".partial-ids-mmap";

// ________________________________________________________________
static const std::string TMP_BASENAME_COMPRESSION = ".tmp.compression_index";
117 changes: 88 additions & 29 deletions src/index/Index.cpp
@@ -82,7 +82,15 @@ void Index::createFromFile(const string& filename) {
string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
std::vector<string> prefixes;
if (_vocabPrefixCompressed) {
prefixes = calculatePrefixes(vocabFile, NUM_COMPRESSION_PREFIXES, 1, true);
string vocabFileForPrefixCalculation = vocabFile;
if (_vocab.getCaseInsensitiveOrdering()) {
// we have to use the "normally" sorted vocabulary for the prefix
// compression;
vocabFileForPrefixCalculation =
_onDiskBase + TMP_BASENAME_COMPRESSION + ".vocabulary";
}
prefixes = calculatePrefixes(vocabFileForPrefixCalculation,
NUM_COMPRESSION_PREFIXES, 1, true);
std::ofstream prefixFile(_onDiskBase + PREFIX_FILE);
AD_CHECK(prefixFile.is_open());
for (const auto& prefix : prefixes) {
@@ -165,18 +173,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
}
if (i % linesPerPartial == 0) {
LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
LOG(INFO) << "Actual number of Triples in this section (include "
"langfilter triples): "
<< actualCurrentPartialSize << '\n';
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabulary to " << partialFilename
<< std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
writePartialIdMapToBinaryFileForMerging(items, partialFilename);
LOG(INFO) << "Done\n";
writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
numFiles++;
// Save the information how many triples this partial vocabulary actually
// deals with we will use this later for mapping from partial to global
@@ -191,34 +188,45 @@
}
// deal with remainder
if (items.size() > 0) {
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Actual number of Triples in this section: "
<< actualCurrentPartialSize << '\n';
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabular to " << partialFilename
<< std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
writePartialIdMapToBinaryFileForMerging(items, partialFilename);
LOG(INFO) << "Done\n";
writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
numFiles++;
actualPartialSizes.push_back(actualCurrentPartialSize);
}
writer.finish();

std::future<void> tmpVocFut;
if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
LOG(INFO) << "Merging temporary vocabulary for prefix compression";
Id tmp1, tmp2;
auto f = [this, numFiles, &tmp1, &tmp2]() {
mergeVocabulary(_onDiskBase + TMP_BASENAME_COMPRESSION, numFiles, &tmp1,
&tmp2, StringSortComparator(false));
};
tmpVocFut = std::async(f);
LOG(INFO) << "Pass done.\n";
}

LOG(INFO) << "Merging vocabulary\n";
VocabularyData res;
res.nofWords = mergeVocabulary(_onDiskBase, numFiles, &res.langPredLowerBound,
&res.langPredUpperBound);
res.nofWords =
mergeVocabulary(_onDiskBase, numFiles, &res.langPredLowerBound,
&res.langPredUpperBound, _vocab.getCaseComparator());
res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);
LOG(INFO) << "Finished Merging Vocabulary.\n";

// if we had to create the additional vocabulary, wait for its completion
if (tmpVocFut.valid()) {
tmpVocFut.get();
LOG(INFO) << "Finished merging additional Vocabulary.";
}

for (size_t i = 0; i < numFiles; ++i) {
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(i);
deleteTemporaryFile(partialFilename);
}
res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);
LOG(INFO) << "Pass done.\n";

return res;
}
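The extra vocabulary written under TMP_BASENAME_COMPRESSION exists because the query vocabulary is now ordered case-insensitively, while calculatePrefixes is fed a byte-wise ("normally") sorted copy, presumably because adjacent entries then share longer byte-level prefixes. A small stand-alone illustration of that intuition (toy ASCII comparator, not QLever code):

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> words = {"<http://x/Berlin>", "<http://x/berlinale>",
                                    "<http://x/Bern>", "<http://x/bernstein>"};

  // Byte-wise order (the copy used for prefix calculation): adjacent entries
  // share long byte prefixes such as "<http://x/Ber".
  auto byteOrder = words;
  std::sort(byteOrder.begin(), byteOrder.end());

  // Case-insensitive primary order (what the index itself uses when
  // "ignore-case" is set): upper- and lowercase spellings interleave, so
  // adjacent entries share shorter byte-level prefixes.
  auto ciOrder = words;
  std::sort(ciOrder.begin(), ciOrder.end(),
            [](const std::string& a, const std::string& b) {
              return std::lexicographical_compare(
                  a.begin(), a.end(), b.begin(), b.end(),
                  [](unsigned char x, unsigned char y) {
                    return std::tolower(x) < std::tolower(y);
                  });
            });

  for (const auto& w : byteOrder) std::cout << w << '\n';
  std::cout << "---\n";
  for (const auto& w : ciOrder) std::cout << w << '\n';
}
```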

@@ -1269,6 +1277,10 @@ void Index::readConfiguration() {
_configurationJson["prefixes-external"]);
}

if (_configurationJson.count("ignore-case")) {
_vocab.setCaseInsensitiveOrdering(_configurationJson["ignore-case"]);
}

if (_configurationJson.find("languages-internal") !=
_configurationJson.end()) {
_vocab.initializeInternalizedLangs(
@@ -1311,6 +1323,11 @@ void Index::initializeVocabularySettingsBuild() {
_configurationJson["prefixes-external"] = j["prefixes-external"];
}

if (j.count("ignore-case")) {
_vocab.setCaseInsensitiveOrdering(j["ignore-case"]);
_configurationJson["ignore-case"] = j["ignore-case"];
}

if (j.find("languages-internal") != j.end()) {
_vocab.initializeInternalizedLangs(j["languages-internal"]);
_configurationJson["languages-internal"] = j["languages-internal"];
@@ -1329,3 +1346,45 @@ Id Index::assignNextId(Map* mapPtr, const string& key) {
return map[key];
}
}

// ___________________________________________________________________________
void Index::writeNextPartialVocabulary(
size_t numLines, size_t numFiles, size_t actualCurrentPartialSize,
const ad_utility::HashMap<string, Id>& items) {
LOG(INFO) << "Lines (from KB-file) processed: " << numLines << '\n';
LOG(INFO) << "Actual number of Triples in this section (include "
"langfilter triples): "
<< actualCurrentPartialSize << '\n';
std::future<void> fut1, fut2;
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabulary to " << partialFilename << std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
fut1 = std::async([this, &items, partialFilename]() {
writePartialIdMapToBinaryFileForMerging(items, partialFilename,
_vocab.getCaseComparator());
});

if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
// we also have to create the "ordinary" vocabulary order to make the
// prefix compression work
string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
PARTIAL_VOCAB_FILE_NAME +
std::to_string(numFiles);
LOG(INFO) << "writing partial temporary vocabulary to "
<< partialTmpFilename << std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
fut2 = std::async([&items, partialTmpFilename]() {
writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
StringSortComparator(false));
});
}
if (fut1.valid()) {
fut1.get();
}
if (fut2.valid()) {
fut2.get();
}
LOG(INFO) << "Done.";
}
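The new option travels through the settings JSON read by initializeVocabularySettingsBuild and is persisted into _configurationJson so that readConfiguration can restore it when the index is loaded. A settings file enabling the new ordering could look roughly like this; only "ignore-case" is the new key, and the other entry (and its value) is merely an illustrative example of an option the same function already handles:

```json
{
  "ignore-case": true,
  "languages-internal": ["en"]
}
```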
18 changes: 18 additions & 0 deletions src/index/Index.h
@@ -494,6 +494,24 @@ class Index {
VocabularyData passFileForVocabulary(const string& ntFile,
size_t linesPerPartial = 100000000);

/**
* @brief Everything that has to be done when we have seen all the triples
* that belong to one partial vocabulary, including the log output used inside
* passFileForVocabulary
*
* @param numLines How many lines from the KB file have already been parsed
* (only used for logging)
* @param numFiles How many partial vocabularies have been written before, i.e.
* the index of the partial vocabulary we are about to write
* @param actualCurrentPartialSize How many triples belong to this partition
* (including extra langfilter triples)
* @param items Contains our unsorted vocabulary. Maps words to their local
* ids within this vocabulary.
*/
void writeNextPartialVocabulary(size_t numLines, size_t numFiles,
size_t actualCurrentPartialSize,
const ad_utility::HashMap<string, Id>& items);

void convertPartialToGlobalIds(TripleVec& data,
const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial);
2 changes: 2 additions & 0 deletions src/index/IndexBuilderMain.cpp
@@ -207,6 +207,8 @@ int main(int argc, char** argv) {
cout << endl
<< "! ERROR in processing options (getopt returned '" << c
<< "' = 0x" << std::setbase(16) << c << ")" << endl
<< "Corresponding ascii option : -" << std::string(1, c) << endl
<< "This is either an unsupported option or there was an error"
<< endl;
exit(1);
}