Skip to content

Commit

Permalink
Fix externalization bug in text index (#534)
Browse files Browse the repository at this point in the history
Words in the text index should never be externalized. The bug was that they were considered as externalizable at query time, when they looked like a literal, for example "...@...
  • Loading branch information
joka921 committed Jan 11, 2022
1 parent f3c8713 commit e1bac82
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 11 deletions.
14 changes: 7 additions & 7 deletions src/index/Index.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ void Index::addTextFromOnDiskIndex() {

// _____________________________________________________________________________
size_t Index::passContextFileForVocabulary(string const& contextFile) {
// We have to have a configuration by now. Rereading it is necessary,
// because we might only add an text index to the already existing KB index.
// In this case we also need the correct locale settings etc.
readConfiguration();
LOG(INFO) << "Making pass over ContextFile " << contextFile
<< " for vocabulary." << std::endl;
ContextFileParser::Line line;
Expand Down Expand Up @@ -174,16 +178,12 @@ void Index::passContextFileIntoVector(const string& contextFile,
} else {
++nofWordPostings;
Id wid;
#ifndef NDEBUG
bool ret = _textVocab.getId(line._word, &wid);
if (!ret) {
LOG(INFO) << "ERROR: word " << line._word
<< "not found in textVocab. Terminating\n";
LOG(ERROR) << "ERROR: word \"" << line._word << "\" "
<< "not found in textVocab. Terminating\n";
AD_CHECK(false);
}
assert(ret);
#else
_textVocab.getId(line._word, &wid);
#endif
wordsInContext[wid] += line._score;
}
++i;
Expand Down
15 changes: 12 additions & 3 deletions src/index/Vocabulary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,19 @@ bool Vocabulary<S, C>::isExternalizedLiteral(const string& word) {
// _____________________________________________________________________________
template <class S, class C>
bool Vocabulary<S, C>::shouldBeExternalized(const string& word) const {
if (!isLiteral(word)) {
return shouldEntityBeExternalized(word);
// TODO<joka921> Completely refactor the Vocabulary on the different
// Types, it is a mess.

// If the string is not compressed, this means that this is a text vocabulary
// and thus doesn't support externalization.
if constexpr (std::is_same_v<S, CompressedString>) {
if (!isLiteral(word)) {
return shouldEntityBeExternalized(word);
} else {
return shouldLiteralBeExternalized(word);
}
} else {
return shouldLiteralBeExternalized(word);
return false;
}
}

Expand Down
5 changes: 4 additions & 1 deletion test/StringSortComparatorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ TEST(StringSortComparatorTest, TripleComponentComparatorTotal) {

// ______________________________________________________________________________________________
TEST(StringSortComparatorTest, SimpleStringComparator) {
SimpleStringComparator comp("en", "US", false);
SimpleStringComparator comp("en", "US", true);

// strange casings must not affect order
ASSERT_TRUE(comp("ALPHA", "beta"));
Expand All @@ -177,4 +177,7 @@ TEST(StringSortComparatorTest, SimpleStringComparator) {

// something is not smaller thant itself
ASSERT_FALSE(comp("beta", "beta"));

ASSERT_TRUE(comp("\"@u2", "@u2"));
ASSERT_FALSE(comp("@u2", "\"@u2"));
}

0 comments on commit e1bac82

Please sign in to comment.