Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup #2138

Merged
merged 1 commit into from
Dec 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ Bug fixes
* LUCENE-9365: FuzzyQuery was missing matches when prefix length was equal to the term length
(Mark Harwood, Mike Drob)

* LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup.
(Yilun Cui)

Other

* LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ public boolean advanceExact(int target) throws IOException {
for (int i = 0; i < normsArr.length; i++) {
boolean found = normsArr[i].advanceExact(target);
assert found;
normValue += weightArr[i] * LENGTH_TABLE[(byte) normsArr[i].longValue()];
normValue += weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
}
current = SmallFloat.intToByte4(Math.round(normValue));
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,36 @@
*/
package org.apache.lucene.sandbox.search;

import java.io.IOException;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.search.BM25FQuery;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;

public class TestBM25FQuery extends LuceneTestCase {
public void testInvalid() {
BM25FQuery.Builder builder = new BM25FQuery.Builder();
Expand Down Expand Up @@ -176,4 +180,80 @@ public void testAgainstCopyField() throws IOException {
w.close();
dir.close();
}

public void testDocWithNegativeNorms() throws IOException {
  // Regression test for LUCENE-9635: a norm encoded from a negative byte must
  // not produce a negative index into the decoding length table.
  Directory dir = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig();
  config.setSimilarity(new NegativeNormSimilarity());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);

  String queryString = "foo";

  // Both fields must contain tokens that match the query string "foo".
  Document d = new Document();
  d.add(new TextField("f", "foo", Store.NO));
  d.add(new TextField("g", "foo baz", Store.NO));
  writer.addDocument(d);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  BM25FQuery.Builder builder = new BM25FQuery.Builder();
  builder.addField("f");
  builder.addField("g");
  builder.addTerm(new BytesRef(queryString));
  TopDocs hits = searcher.search(builder.build(), 10);
  // The single indexed doc must still be matched despite the negative norm.
  CheckHits.checkDocIds("queried docs do not match", new int[] {0}, hits.scoreDocs);

  reader.close();
  writer.close();
  dir.close();
}

public void testMultipleDocsNegativeNorms() throws IOException {
  // Like testDocWithNegativeNorms, but with two docs so that relative ranking
  // under negative encoded norms is also exercised.
  Directory dir = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig();
  config.setSimilarity(new NegativeNormSimilarity());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);

  String queryString = "foo";

  Document first = new Document();
  first.add(new TextField("f", "foo", Store.NO));
  first.add(new TextField("g", "foo baz", Store.NO));
  writer.addDocument(first);

  Document second = new Document();
  // Add another match on the query string to the second doc.
  second.add(new TextField("f", "foo is foo", Store.NO));
  second.add(new TextField("g", "foo baz", Store.NO));
  writer.addDocument(second);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  BM25FQuery.Builder builder = new BM25FQuery.Builder();
  builder.addField("f");
  builder.addField("g");
  builder.addTerm(new BytesRef(queryString));
  TopDocs hits = searcher.search(builder.build(), 10);
  // doc1 is returned ahead of doc0 since its term frequency is higher.
  CheckHits.checkDocIds("queried docs do not match", new int[] {1, 0}, hits.scoreDocs);

  reader.close();
  writer.close();
  dir.close();
}

/**
 * Similarity that always writes the most negative encodable norm byte (-128),
 * while delegating actual scoring to a stock BM25 implementation. Used to
 * reproduce negative-norm decoding issues (LUCENE-9635).
 */
private static final class NegativeNormSimilarity extends Similarity {
  @Override
  public long computeNorm(FieldInvertState state) {
    // Force the norm byte to the minimum value regardless of field statistics.
    return -128;
  }

  @Override
  public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    final Similarity delegate = new BM25Similarity();
    return delegate.scorer(boost, collectionStats, termStats);
  }
}
}