LUCENE-8116: SimScorer now only takes a frequency and a norm as per-d…

…ocument scoring factors.
apache · Jan 4, 2018 · 8fd7ead · 8fd7ead
1 parent 8836fda
commit 8fd7ead
Show file tree

Hide file tree

Showing 68 changed files with 606 additions and 1,096 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -32,6 +32,9 @@ API Changes
 * LUCENE-8012: Explanation now takes Number rather than float (Alan Woodward, 
   Robert Muir)
 
+* LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document
+  scoring factors. (Adrien Grand)
+
 Changes in Runtime Behavior
 
 * LUCENE-7837: Indices that were created before the previous major version
@@ -46,6 +49,9 @@ Changes in Runtime Behavior
 * LUCENE-7996: FunctionQuery and FunctionScoreQuery now return a score of 0
   when the function produces a negative value. (Adrien Grand)
 
+* LUCENE-8116: Similarities now score fields that omit norms as if the norm was
+  1. This might change score values on fields that omit norms. (Adrien Grand)
+
 Improvements
 
 * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
@@ -213,7 +213,7 @@ public String toString() {
         ", classFieldName='" + classFieldName + '\'' +
         ", k=" + k +
         ", query=" + query +
-        ", similarity=" + indexSearcher.getSimilarity(true) +
+        ", similarity=" + indexSearcher.getSimilarity() +
         '}';
   }
 }
diff --git a/.../classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java b/.../classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
@@ -251,7 +251,7 @@ public String toString() {
         ", classFieldName='" + classFieldName + '\'' +
         ", k=" + k +
         ", query=" + query +
-        ", similarity=" + indexSearcher.getSimilarity(true) +
+        ", similarity=" + indexSearcher.getSimilarity() +
         '}';
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
@@ -48,7 +48,7 @@ final class BooleanWeight extends Weight {
     super(query);
     this.query = query;
     this.scoreMode = scoreMode;
-    this.similarity = searcher.getSimilarity(scoreMode.needsScores());
+    this.similarity = searcher.getSimilarity();
     weights = new ArrayList<>();
     for (BooleanClause c : query) {
       Weight w = searcher.createWeight(c.getQuery(), c.isScoring() ? scoreMode : ScoreMode.COMPLETE_NO_SCORES, boost);

diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -22,7 +22,6 @@
 import java.util.List;
 
 import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.search.similarities.Similarity;
 
 final class ExactPhraseScorer extends Scorer {
 
@@ -42,13 +41,13 @@ public PostingsAndPosition(PostingsEnum postings, int offset) {
 
   private int freq;
 
-  private final Similarity.SimScorer docScorer;
+  private final LeafSimScorer docScorer;
   private final boolean needsScores, needsTotalHitCount;
   private float matchCost;
   private float minCompetitiveScore;
 
   ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
-                    Similarity.SimScorer docScorer, ScoreMode scoreMode,
+                    LeafSimScorer docScorer, ScoreMode scoreMode,
                     float matchCost) throws IOException {
     super(weight);
     this.docScorer = docScorer;
@@ -123,7 +122,7 @@ public float score() throws IOException {
 
   @Override
   public float maxScore() {
-    return docScorer.maxScore(Integer.MAX_VALUE);
+    return docScorer.maxScore();
   }
 
   /** Advance the given pos enum to the first doc on or after {@code target}.

diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -32,7 +32,6 @@
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReaderContext;
 import org.apache.lucene.index.IndexWriter;
@@ -75,36 +74,6 @@
  */
 public class IndexSearcher {
 
-  /** A search-time {@link Similarity} that does not make use of scoring factors
-   *  and may be used when scores are not needed. */
-  private static final Similarity NON_SCORING_SIMILARITY = new Similarity() {
-
-    @Override
-    public long computeNorm(FieldInvertState state) {
-      throw new UnsupportedOperationException("This Similarity may only be used for searching, not indexing");
-    }
-
-    @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new SimWeight() {};
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return new SimScorer() {
-        @Override
-        public float score(int doc, float freq) {
-          return 0f;
-        }
-        @Override
-        public float maxScore(float maxFreq) {
-          return 0f;
-        }
-      };
-    }
-
-  };
-
   private static QueryCache DEFAULT_QUERY_CACHE;
   private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy();
   static {
@@ -136,7 +105,7 @@ public float maxScore(float maxFreq) {
    * Expert: returns a default Similarity instance.
    * In general, this method is only called to initialize searchers and writers.
    * User code and query implementations should respect
-   * {@link IndexSearcher#getSimilarity(boolean)}.
+   * {@link IndexSearcher#getSimilarity()}.
    * @lucene.internal
    */
   public static Similarity getDefaultSimilarity() {
@@ -329,15 +298,11 @@ public void setSimilarity(Similarity similarity) {
     this.similarity = similarity;
   }
 
-  /** Expert: Get the {@link Similarity} to use to compute scores. When
-   *  {@code needsScores} is {@code false}, this method will return a simple
-   *  {@link Similarity} that does not leverage scoring factors such as norms.
-   *  When {@code needsScores} is {@code true}, this returns the
+  /** Expert: Get the {@link Similarity} to use to compute scores. This returns the
    *  {@link Similarity} that has been set through {@link #setSimilarity(Similarity)}
-   *  or the {@link #getDefaultSimilarity()} default {@link Similarity} if none
-   *  has been set explicitly. */
-  public Similarity getSimilarity(boolean needsScores) {
-    return needsScores ? similarity : NON_SCORING_SIMILARITY;
+   *  or the default {@link Similarity} if none has been set explicitly. */
+  public Similarity getSimilarity() {
+    return similarity;
   }
 
   /**

diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java b/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+/**
+ * {@link SimScorer} on a specific {@link LeafReader}.
+ */
+public final class LeafSimScorer {
+
+  private final SimScorer scorer;
+  private final NumericDocValues norms;
+  private final float maxScore;
+
+  /**
+   * Sole constructor: Score documents of {@code reader} with {@code scorer}.
+   */
+  public LeafSimScorer(SimScorer scorer, LeafReader reader, boolean needsScores, float maxFreq) throws IOException {
+    this.scorer = scorer;
+    norms = needsScores ? reader.getNormValues(scorer.getField()) : null;
+    maxScore = scorer.maxScore(maxFreq);
+  }
+
+  private long getNormValue(int doc) throws IOException {
+    if (norms != null) {
+      boolean found = norms.advanceExact(doc);
+      assert found;
+      return norms.longValue();
+    } else {
+      return 1L; // default norm
+    }
+  }
+
+  /** Score the provided document assuming the given term document frequency.
+   *  This method must be called on non-decreasing sequences of doc ids.
+   *  @see SimScorer#score(float, long) */
+  public float score(int doc, float freq) throws IOException {
+    return scorer.score(freq, getNormValue(doc));
+  }
+
+  /** Explain the score for the provided document assuming the given term document frequency.
+   *  This method must be called on non-decreasing sequences of doc ids.
+   *  @see SimScorer#explain(Explanation, long) */
+  public Explanation explain(int doc, Explanation freqExpl) throws IOException {
+    return scorer.explain(freqExpl, getNormValue(doc));
+  }
+
+  /**
+   * Return an upper bound of the score.
+   * @see SimScorer#maxScore(float)
+   */
+  public float maxScore() {
+    return maxScore;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -18,19 +18,26 @@
 
 
 import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
 
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
@@ -183,15 +190,15 @@ public int[] getPositions() {
 
   private class MultiPhraseWeight extends Weight {
     private final Similarity similarity;
-    private final Similarity.SimWeight stats;
+    private final Similarity.SimScorer stats;
     private final Map<Term,TermContext> termContexts = new HashMap<>();
     private final ScoreMode scoreMode;
 
     public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
       super(MultiPhraseQuery.this);
       this.scoreMode = scoreMode;
-      this.similarity = searcher.getSimilarity(scoreMode.needsScores());
+      this.similarity = searcher.getSimilarity();
       final IndexReaderContext context = searcher.getTopReaderContext();
 
       // compute idf
@@ -212,7 +219,7 @@ public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boos
       if (allTermStats.isEmpty()) {
         stats = null; // none of the terms were found, we won't use sim at all
       } else {
-        stats = similarity.computeWeight(
+        stats = similarity.scorer(
           boost,
           searcher.collectionStatistics(field),
           allTermStats.toArray(new TermStatistics[allTermStats.size()]));
@@ -282,11 +289,11 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
 
       if (slop == 0) {
         return new ExactPhraseScorer(this, postingsFreqs,
-                                      similarity.simScorer(stats, context),
+                                      new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
                                       scoreMode, totalMatchCost);
       } else {
         return new SloppyPhraseScorer(this, postingsFreqs, slop,
-                                        similarity.simScorer(stats, context),
+                                        new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY),
                                         scoreMode.needsScores(), totalMatchCost);
       }
     }
@@ -303,7 +310,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio
         int newDoc = scorer.iterator().advance(doc);
         if (newDoc == doc) {
           float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
-          SimScorer docScorer = similarity.simScorer(stats, context);
+          LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY);
           Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
           Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
           return Explanation.match(

diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -37,7 +37,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
@@ -352,7 +351,7 @@ public boolean equals(Object obj) {
 
   private class PhraseWeight extends Weight {
     private final Similarity similarity;
-    private final Similarity.SimWeight stats;
+    private final Similarity.SimScorer stats;
     private final ScoreMode scoreMode;
     private transient TermContext states[];
 
@@ -366,7 +365,7 @@ public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
         throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
       }
       this.scoreMode = scoreMode;
-      this.similarity = searcher.getSimilarity(scoreMode.needsScores());
+      this.similarity = searcher.getSimilarity();
       final IndexReaderContext context = searcher.getTopReaderContext();
       states = new TermContext[terms.length];
       TermStatistics termStats[] = new TermStatistics[terms.length];
@@ -380,7 +379,7 @@ public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
         }
       }
       if (termUpTo > 0) {
-        stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
+        stats = similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
       } else {
         stats = null; // no terms at all, we won't use similarity
       }
@@ -433,11 +432,11 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
 
       if (slop == 0) {  // optimize exact case
         return new ExactPhraseScorer(this, postingsFreqs,
-                                      similarity.simScorer(stats, context),
+                                      new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
                                       scoreMode, totalMatchCost);
       } else {
         return new SloppyPhraseScorer(this, postingsFreqs, slop,
-                                        similarity.simScorer(stats, context),
+                                        new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY),
                                         scoreMode.needsScores(), totalMatchCost);
       }
     }
@@ -459,7 +458,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio
         int newDoc = scorer.iterator().advance(doc);
         if (newDoc == doc) {
           float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
-          SimScorer docScorer = similarity.simScorer(stats, context);
+          LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY);
           Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
           Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
           return Explanation.match(