From e568046ea39908d840bb1499fab50d326fe59216 Mon Sep 17 00:00:00 2001
From: zacharymorn
Date: Thu, 18 Nov 2021 21:36:38 -0800
Subject: [PATCH 1/2] LUCENE-10236: Update field-weight used in CombinedFieldQuery scoring calculation (#444)

(cherry picked from commit 07ee3ba83a4c9f3abc24bf9d3fbb3c3102c4a102)
---
 Changes relative to the upstream commit: numHits in the new test is clamped to >= 1, and
 explanatory comments were added. The "index" blob ids below were not recomputed.

 .../sandbox/search/CombinedFieldQuery.java    |  2 +-
 .../search/MultiNormsLeafSimScorer.java       |  9 ++
 .../search/TestCombinedFieldQuery.java        | 81 +++++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
index d3187a0896e..fccd6ce3eca 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
@@ -418,7 +418,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
       }
 
       MultiNormsLeafSimScorer scoringSimScorer =
-          new MultiNormsLeafSimScorer(simWeight, context.reader(), fields, true);
+          new MultiNormsLeafSimScorer(simWeight, context.reader(), fieldAndWeights.values(), true);
       LeafSimScorer nonScoringSimScorer =
           new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false);
       // we use termscorers + disjunction as an impl detail
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java
index ba1d69a8b16..ebc98df31af 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java
@@ -21,8 +21,10 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
+import java.util.Set;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.search.Explanation;
@@ -61,7 +63,14 @@ final class MultiNormsLeafSimScorer {
     if (needsScores) {
       final List<NumericDocValues> normsList = new ArrayList<>();
       final List<Float> weightList = new ArrayList<>();
+      final Set<String> duplicateCheckingSet = new HashSet<>();
       for (FieldAndWeight field : normFields) {
+        // The set is used only by this assert, so running add() inside it is intentional.
+        assert duplicateCheckingSet.add(field.field)
+            : "There is a duplicated field ["
+                + field.field
+                + "] used to construct MultiNormsLeafSimScorer";
+
         NumericDocValues norms = reader.getNormValues(field.field);
         if (norms != null) {
           normsList.add(norms);
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
index bfadc6a681a..3a510d19a7f 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
@@ -16,6 +16,10 @@
  */
 package org.apache.lucene.sandbox.search;
 
+import static com.carrotsearch.randomizedtesting.RandomizedTest.atMost;
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
+
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import java.io.IOException;
 import java.util.Arrays;
@@ -165,6 +169,83 @@ public void testSameScore() throws IOException {
     dir.close();
   }
 
+  // Smoke test for LUCENE-10236 with no explicit assertions: it passes if the search completes.
+  // With assertions enabled, MultiNormsLeafSimScorer fails it when handed the same field twice.
+  public void testScoringWithMultipleFieldTermsMatch() throws IOException {
+    int numMatchDoc = randomIntBetween(100, 500);
+    // atMost(100) may return 0, but TopScoreDocCollector.create requires numHits > 0.
+    int numHits = Math.max(1, atMost(100));
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    // adding potentially matching doc
+    for (int i = 0; i < numMatchDoc; i++) {
+      Document doc = new Document();
+
+      int freqA = random().nextInt(20) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+
+      freqA = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqA; j++) {
+          doc.add(new TextField("a", "foo" + j, Store.NO));
+        }
+      }
+
+      freqA = random().nextInt(20) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "zoo", Store.NO));
+      }
+
+      int freqB = random().nextInt(20) + 1;
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "zoo", Store.NO));
+      }
+
+      freqB = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqB; j++) {
+          doc.add(new TextField("b", "zoo" + j, Store.NO));
+        }
+      }
+
+      int freqC = random().nextInt(20) + 1;
+      for (int j = 0; j < freqC; j++) {
+        doc.add(new TextField("c", "bla" + j, Store.NO));
+      }
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(similarity);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .addTerm(new BytesRef("foo"))
+            .addTerm(new BytesRef("zoo"))
+            .build();
+
+    TopScoreDocCollector completeCollector =
+        TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE);
+    searcher.search(query, completeCollector);
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   public void testNormsDisabled() throws IOException {
     Directory dir = newDirectory();
     Similarity similarity = randomCompatibleSimilarity();

From 00d59820e38c0f0379c2d358d0dcaad858836e24 Mon Sep 17 00:00:00 2001
From: Zach Chen
Date: Wed, 5 Jan 2022 19:23:18 -0800
Subject: [PATCH 2/2] Add change entry

---
 lucene/CHANGES.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index cbc79c8c42f..35b8b60813a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -130,6 +130,9 @@ Bug Fixes
 * LUCENE-10352: Fixed ctor argument checks: JapaneseKatakanaStemFilter,
   DoubleMetaphoneFilter (Uwe Schindler, Robert Muir)
 
+* LUCENE-10236: Stop duplicating norms when scoring in CombinedFieldQuery.
+  (Zach Chen, Jim Ferenczi, Julie Tibshirani)
+
 Other
 ---------------------
 