Skip to content

Commit

Permalink
Addressed most fixes from Niklas
Browse files Browse the repository at this point in the history
- Mostly used the utf-8 tolowercase
- Currently we do not yet have the Umlauts at a useful position (this takes more effort).
- The whole Utf-8 business should be handled by proper libraries with iterators etc.
  • Loading branch information
joka921 committed Mar 19, 2019
1 parent 5cc1602 commit fb3ae82
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 92 deletions.
6 changes: 2 additions & 4 deletions src/engine/Filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,7 @@ void Filter::computeResultFixedValue(
switch (_type) {
case SparqlFilter::GE:
case SparqlFilter::LT: {
std::transform(rhs_string.begin(), rhs_string.end(),
rhs_string.begin(), ::toupper);
rhs_string = ad_utility::getUppercaseUtf8(rhs_string);
auto split = StringSortComparator::extractComparable(rhs_string);
if (split.isLiteral && !split.langtag.empty()) {
// get rid of possible langtags to move to the beginning of the
Expand All @@ -525,8 +524,7 @@ void Filter::computeResultFixedValue(
break;
case SparqlFilter::GT:
case SparqlFilter::LE: {
std::transform(rhs_string.begin(), rhs_string.end(),
rhs_string.begin(), ::tolower);
rhs_string = ad_utility::getLowercaseUtf8(rhs_string);
auto split2 = StringSortComparator::extractComparable(rhs_string);
if (split2.isLiteral) {
rhs_string =
Expand Down
107 changes: 44 additions & 63 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,43 +204,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
}
if (i % linesPerPartial == 0) {
std::future<void> fut1, fut2;
LOG(INFO) << "Lines (from KB-file) processed: " << i << '\n';
LOG(INFO) << "Actual number of Triples in this section (include "
"langfilter triples): "
<< actualCurrentPartialSize << '\n';
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabulary to " << partialFilename
<< std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
fut1 = std::async([this, &items, partialFilename]() {
writePartialIdMapToBinaryFileForMerging(items, partialFilename,
_vocab.getCaseComparator());
});

if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
// we also have to create the "ordinary" vocabulary order to make the
// prefix compression work
string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
PARTIAL_VOCAB_FILE_NAME +
std::to_string(numFiles);
LOG(INFO) << "writing partial temporary vocabulary to "
<< partialTmpFilename << std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
fut2 = std::async([&items, partialTmpFilename]() {
writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
StringSortComparator(false));
});
}
if (fut1.valid()) {
fut1.get();
}
if (fut2.valid()) {
fut2.get();
}
LOG(INFO) << "Done\n";
writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
numFiles++;
// Save the information how many triples this partial vocabulary actually
// deals with we will use this later for mapping from partial to global
Expand All @@ -255,32 +219,7 @@ VocabularyData Index::passFileForVocabulary(const string& filename,
}
// deal with remainder
if (items.size() > 0) {
LOG(INFO) << "Lines processed: " << i << '\n';
LOG(INFO) << "Actual number of Triples in this section: "
<< actualCurrentPartialSize << '\n';
string partialFilename =
_onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

LOG(INFO) << "writing partial vocabular to " << partialFilename
<< std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
writePartialIdMapToBinaryFileForMerging(items, partialFilename,
_vocab.getCaseComparator());
LOG(INFO) << "Done\n";

if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
// we also have to create the "ordinary" vocabulary order to make the
// prefix compression work
string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
PARTIAL_VOCAB_FILE_NAME +
std::to_string(numFiles);
LOG(INFO) << "writing partial temporary vocabulary to "
<< partialTmpFilename << std::endl;
LOG(INFO) << "it contains " << items.size() << " elements\n";
writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
StringSortComparator(false));
LOG(INFO) << "Done\n";
}
writeNextPartialVocabulary(i, numFiles, actualCurrentPartialSize, items);
numFiles++;
actualPartialSizes.push_back(actualCurrentPartialSize);
}
Expand Down Expand Up @@ -1915,3 +1854,45 @@ Id Index::assignNextId(Map* mapPtr, const string& key) {
return map[key];
}
}

// ___________________________________________________________________________
// Writes the partial vocabulary `items` to disk and logs the progress.
//
// numLines                 - number of KB lines parsed so far (only logged).
// numFiles                 - index of this partial vocabulary; used as the
//                            suffix of the file name(s) that are written.
// actualCurrentPartialSize - number of triples in this partition, including
//                            langfilter triples (only logged).
// items                    - the words of this partial vocabulary and their
//                            ids.
//
// The main file is written using the vocabulary's case comparator. If the
// vocabulary is prefix compressed and uses case-insensitive ordering, a
// second, temporary file is written using StringSortComparator(false).
// Both writes are started via std::async (default launch policy, so they may
// run concurrently) and are awaited before this function returns; therefore
// capturing `items` by reference in the tasks is safe.
void Index::writeNextPartialVocabulary(
    size_t numLines, size_t numFiles, size_t actualCurrentPartialSize,
    const ad_utility::HashMap<string, Id>& items) {
  LOG(INFO) << "Lines (from KB-file) processed: " << numLines << '\n';
  LOG(INFO) << "Actual number of Triples in this section (include "
               "langfilter triples): "
            << actualCurrentPartialSize << '\n';

  string partialFilename =
      _onDiskBase + PARTIAL_VOCAB_FILE_NAME + std::to_string(numFiles);

  LOG(INFO) << "writing partial vocabulary to " << partialFilename << std::endl;
  LOG(INFO) << "it contains " << items.size() << " elements\n";
  // The file name is captured by value so the task owns its own copy.
  std::future<void> fut1 = std::async([this, &items, partialFilename]() {
    writePartialIdMapToBinaryFileForMerging(items, partialFilename,
                                            _vocab.getCaseComparator());
  });

  // Stays invalid unless the additional temporary vocabulary is required.
  std::future<void> fut2;
  if (_vocabPrefixCompressed && _vocab.getCaseInsensitiveOrdering()) {
    // we also have to create the "ordinary" vocabulary order to make the
    // prefix compression work
    string partialTmpFilename = _onDiskBase + TMP_BASENAME_COMPRESSION +
                                PARTIAL_VOCAB_FILE_NAME +
                                std::to_string(numFiles);
    LOG(INFO) << "writing partial temporary vocabulary to "
              << partialTmpFilename << std::endl;
    LOG(INFO) << "it contains " << items.size() << " elements\n";
    fut2 = std::async([&items, partialTmpFilename]() {
      writePartialIdMapToBinaryFileForMerging(items, partialTmpFilename,
                                              StringSortComparator(false));
    });
  }

  // fut1 is always valid here (std::async returns a valid future), so it can
  // be awaited unconditionally; get() also rethrows any exception of the task.
  fut1.get();
  if (fut2.valid()) {
    fut2.get();
  }
  // Terminate the line like all other log statements in this function, so
  // that the next log message does not end up on the same line.
  LOG(INFO) << "Done.\n";
}
18 changes: 18 additions & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,24 @@ class Index {
VocabularyData passFileForVocabulary(const string& ntFile,
size_t linesPerPartial = 100000000);

/**
* @brief Write one partial vocabulary to disk and log the progress. Called
* from passFileForVocabulary once all the triples that belong to one partial
* vocabulary have been seen.
*
* The vocabulary is written using the vocabulary's case comparator. If the
* vocabulary is prefix compressed and uses case-insensitive ordering, an
* additional temporary file is written using StringSortComparator(false)
* (needed to make the prefix compression work). The function returns only
* after the write(s) have finished.
*
* @param numLines How many lines from the KB have already been parsed (only
* used for logging)
* @param numFiles How many partial vocabularies have been written before,
* i.e. the index of the partial vocabulary that is going to be written (used
* as the suffix of the file name)
* @param actualCurrentPartialSize How many triples belong to this partition,
* including extra langfilter triples (only used for logging)
* @param items Contains our unsorted vocabulary. Maps words to their local
* ids within this vocabulary.
*/
void writeNextPartialVocabulary(size_t numLines, size_t numFiles,
size_t actualCurrentPartialSize,
const ad_utility::HashMap<string, Id>& items);

void convertPartialToGlobalIds(TripleVec& data,
const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial);
Expand Down
27 changes: 11 additions & 16 deletions src/index/Vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,10 @@ class StringSortComparator {

bool getIgnoreCase() const { return _ignoreCase; }

bool operator()(const std::string& a, const std::string& b) const {
bool operator()(std::string_view a, std::string_view b) const {
if (!_ignoreCase) {
return a < b;
} else {
// TODO<Johannes>
// BUG<Johannes>
// We have to make sure that the literals are all in contiguous space to
// make this work.
// TODO<Johannes> Ideally we want to have this also when doing
// case-insensitive compare, but it currently breaks the prefix
// compression (there we really need ordering by correct bytes)
Expand All @@ -135,7 +131,7 @@ class StringSortComparator {
std::string_view langtag;
};

static SplitVal extractComparable(const std::string& a) {
static SplitVal extractComparable(std::string_view a) {
std::string_view res = a;
bool isLiteral = false;
std::string_view langtag;
Expand All @@ -154,31 +150,30 @@ class StringSortComparator {
}

static bool caseInsensitiveCompare(const SplitVal& a, const SplitVal& b) {
const auto result =
std::mismatch(a.val.cbegin(), a.val.cend(), b.val.cbegin(),
b.val.cend(), [](const auto& lhs, const auto& rhs) {
return tolower(lhs) == tolower(rhs);
});
if (result.second == b.val.end()) {
if (result.first == a.val.end()) {
auto aLower = ad_utility::getLowercaseUtf8(a.val);
auto bLower = ad_utility::getLowercaseUtf8(b.val);
const auto result = std::mismatch(aLower.cbegin(), aLower.cend(),
bLower.cbegin(), bLower.cend());
if (result.second == bLower.end()) {
if (result.first == aLower.end()) {
// In case a and b are equal wrt case-insensitivity we sort by the
// language tag. If this also matches we return the actual order of the
// innter string value. Thus we have a unique ordering that makes life
// inner string value. Thus we have a unique ordering that makes life
// easier.
return a.langtag != b.langtag ? a.langtag < b.langtag : a.val < b.val;
}
// b is a prefix of a, thus a is strictly "bigger"
return false;
}

if (result.first == a.val.end()) {
if (result.first == aLower.end()) {
// a is a prefix of b
return true;
}

// neither string is a prefix of the other, look at the first mismatching
// character. If we have reached here, both iterators are safe to dereference.
return tolower(*result.first) < tolower(*result.second);
return *result.first < *result.second;
}
bool _ignoreCase;
};
Expand Down
12 changes: 9 additions & 3 deletions src/index/VocabularyImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,16 @@ bool PrefixComparator<S>::operator()(const string& lhs,
template <class S>
bool PrefixComparator<S>::operator()(const string& lhs,
const string& rhs) const {
// TODO<joka921> use string_view for the substrings
// We cannot use string_views as parameters because they would unfortunately
// lead to ambiguous overloads (even though the CompressedString overload
// wouldn't work).
return _vocab->getCaseComparator()(
lhs.size() > _prefixLength ? lhs.substr(0, _prefixLength) : lhs,
rhs.size() > _prefixLength ? rhs.substr(0, _prefixLength) : rhs);
lhs.size() > _prefixLength
? std::string_view(lhs).substr(0, _prefixLength)
: lhs,
rhs.size() > _prefixLength
? std::string_view(rhs).substr(0, _prefixLength)
: rhs);
}

// _____________________________________________________
Expand Down
8 changes: 4 additions & 4 deletions src/util/StringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ inline string getLowercase(const string& orig);

inline string getUppercase(const string& orig);

inline string getLowercaseUtf8(const string& orig);
inline string getLowercaseUtf8(std::string_view orig);

inline string getUppercaseUtf8(const string& orig);
inline string getUppercaseUtf8(std::string_view orig);

inline string firstCharToUpperUtf8(const string& orig);

Expand Down Expand Up @@ -234,7 +234,7 @@ string getUppercase(const string& orig) {
}

// ____________________________________________________________________________
string getLowercaseUtf8(const string& orig) {
string getLowercaseUtf8(std::string_view orig) {
string retVal;
retVal.reserve(orig.size());
std::mbstate_t state = std::mbstate_t();
Expand Down Expand Up @@ -263,7 +263,7 @@ string getLowercaseUtf8(const string& orig) {
}

// ____________________________________________________________________________
string getUppercaseUtf8(const string& orig) {
string getUppercaseUtf8(std::string_view orig) {
string retVal;
retVal.reserve(orig.size());
std::mbstate_t state = std::mbstate_t();
Expand Down
7 changes: 5 additions & 2 deletions test/VocabularyTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,11 @@ TEST(VocabularyTest, StringSortComparator) {
ASSERT_FALSE(comp("alpha", "ALPHA"));
ASSERT_TRUE(comp("ALPHA", "alpha"));

// TODO: check what to do about these cases
// ASSERT_TRUE(comp("\"Hannibal\"@en", "\"Hannibal Hamlin\"@en"));
ASSERT_TRUE(comp("\"Hannibal\"@en", "\"Hannibal Hamlin\"@en"));
ASSERT_TRUE(comp("\"Hannibal\"@af", "\"Hannibal\"@en"));
ASSERT_TRUE(comp("\"HAnnibal\"@en", "\"Hannibal\"@en"));

// TODO<joka921>: test cases for UTF-8

// something is not smaller than itself
ASSERT_FALSE(comp("beta", "beta"));
Expand Down

0 comments on commit fb3ae82

Please sign in to comment.