Skip to content

Commit

Permalink
LUCENE-8116: SimScorer now only takes a frequency and a norm as per-d…
Browse files Browse the repository at this point in the history
…ocument scoring factors.
  • Loading branch information
jpountz committed Jan 4, 2018
1 parent 8836fda commit 8fd7ead
Show file tree
Hide file tree
Showing 68 changed files with 606 additions and 1,096 deletions.
6 changes: 6 additions & 0 deletions lucene/CHANGES.txt
Expand Up @@ -32,6 +32,9 @@ API Changes
* LUCENE-8012: Explanation now takes Number rather than float (Alan Woodward,
Robert Muir)

* LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document
scoring factors. (Adrien Grand)

Changes in Runtime Behavior

* LUCENE-7837: Indices that were created before the previous major version
Expand All @@ -46,6 +49,9 @@ Changes in Runtime Behavior
* LUCENE-7996: FunctionQuery and FunctionScoreQuery now return a score of 0
when the function produces a negative value. (Adrien Grand)

* LUCENE-8116: Similarities now score fields that omit norms as if the norm was
1. This might change score values on fields that omit norms. (Adrien Grand)

Improvements

* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
Expand Down
Expand Up @@ -213,7 +213,7 @@ public String toString() {
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
", similarity=" + indexSearcher.getSimilarity(true) +
", similarity=" + indexSearcher.getSimilarity() +
'}';
}
}
Expand Up @@ -251,7 +251,7 @@ public String toString() {
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
", similarity=" + indexSearcher.getSimilarity(true) +
", similarity=" + indexSearcher.getSimilarity() +
'}';
}
}
Expand Up @@ -48,7 +48,7 @@ final class BooleanWeight extends Weight {
super(query);
this.query = query;
this.scoreMode = scoreMode;
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
this.similarity = searcher.getSimilarity();
weights = new ArrayList<>();
for (BooleanClause c : query) {
Weight w = searcher.createWeight(c.getQuery(), c.isScoring() ? scoreMode : ScoreMode.COMPLETE_NO_SCORES, boost);
Expand Down
Expand Up @@ -22,7 +22,6 @@
import java.util.List;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity;

final class ExactPhraseScorer extends Scorer {

Expand All @@ -42,13 +41,13 @@ public PostingsAndPosition(PostingsEnum postings, int offset) {

private int freq;

private final Similarity.SimScorer docScorer;
private final LeafSimScorer docScorer;
private final boolean needsScores, needsTotalHitCount;
private float matchCost;
private float minCompetitiveScore;

ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity.SimScorer docScorer, ScoreMode scoreMode,
LeafSimScorer docScorer, ScoreMode scoreMode,
float matchCost) throws IOException {
super(weight);
this.docScorer = docScorer;
Expand Down Expand Up @@ -123,7 +122,7 @@ public float score() throws IOException {

@Override
public float maxScore() {
return docScorer.maxScore(Integer.MAX_VALUE);
return docScorer.maxScore();
}

/** Advance the given pos enum to the first doc on or after {@code target}.
Expand Down
45 changes: 5 additions & 40 deletions lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
Expand Up @@ -32,7 +32,6 @@

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.IndexWriter;
Expand Down Expand Up @@ -75,36 +74,6 @@
*/
public class IndexSearcher {

/** A search-time {@link Similarity} that does not make use of scoring factors
* and may be used when scores are not needed. */
private static final Similarity NON_SCORING_SIMILARITY = new Similarity() {

@Override
public long computeNorm(FieldInvertState state) {
throw new UnsupportedOperationException("This Similarity may only be used for searching, not indexing");
}

@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new SimWeight() {};
}

@Override
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
return new SimScorer() {
@Override
public float score(int doc, float freq) {
return 0f;
}
@Override
public float maxScore(float maxFreq) {
return 0f;
}
};
}

};

private static QueryCache DEFAULT_QUERY_CACHE;
private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy();
static {
Expand Down Expand Up @@ -136,7 +105,7 @@ public float maxScore(float maxFreq) {
* Expert: returns a default Similarity instance.
* In general, this method is only called to initialize searchers and writers.
* User code and query implementations should respect
* {@link IndexSearcher#getSimilarity(boolean)}.
* {@link IndexSearcher#getSimilarity()}.
* @lucene.internal
*/
public static Similarity getDefaultSimilarity() {
Expand Down Expand Up @@ -329,15 +298,11 @@ public void setSimilarity(Similarity similarity) {
this.similarity = similarity;
}

/** Expert: Get the {@link Similarity} to use to compute scores. When
* {@code needsScores} is {@code false}, this method will return a simple
* {@link Similarity} that does not leverage scoring factors such as norms.
* When {@code needsScores} is {@code true}, this returns the
/** Expert: Get the {@link Similarity} to use to compute scores. This returns the
* {@link Similarity} that has been set through {@link #setSimilarity(Similarity)}
* or the {@link #getDefaultSimilarity()} default {@link Similarity} if none
* has been set explicitly. */
public Similarity getSimilarity(boolean needsScores) {
return needsScores ? similarity : NON_SCORING_SIMILARITY;
* or the default {@link Similarity} if none has been set explicitly. */
public Similarity getSimilarity() {
return similarity;
}

/**
Expand Down
74 changes: 74 additions & 0 deletions lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java
@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

/**
 * Binds a {@link SimScorer} to one {@link LeafReader} so that callers can score
 * by document id: the per-document norm is looked up here and handed to the
 * wrapped scorer together with the frequency.
 */
public final class LeafSimScorer {

  /** Scorer that receives the frequency and the resolved norm. */
  private final SimScorer scorer;

  /** Norms of the scored field, or {@code null} when no norms are consulted. */
  private final NumericDocValues norms;

  /** Upper bound computed once at construction time from {@code maxFreq}. */
  private final float maxScore;

  /**
   * Sole constructor: Score documents of {@code reader} with {@code scorer}.
   *
   * @param scorer      the scorer to delegate to
   * @param reader      the leaf whose norms are read
   * @param needsScores when {@code false}, norms are not loaded at all and every
   *                    document is treated as having the default norm
   * @param maxFreq     maximum frequency passed to {@link SimScorer#maxScore(float)}
   * @throws IOException if reading the norms fails
   */
  public LeafSimScorer(SimScorer scorer, LeafReader reader, boolean needsScores, float maxFreq) throws IOException {
    this.scorer = scorer;
    if (needsScores) {
      this.norms = reader.getNormValues(scorer.getField());
    } else {
      this.norms = null;
    }
    this.maxScore = scorer.maxScore(maxFreq);
  }

  /**
   * Resolve the norm of {@code doc}, falling back to {@code 1} when there are
   * no norms to read from.
   */
  private long normFor(int doc) throws IOException {
    if (norms == null) {
      return 1L; // default norm
    }
    // The iterator must be positioned even when assertions are disabled, so the
    // call is kept outside of the assert statement.
    final boolean positioned = norms.advanceExact(doc);
    assert positioned;
    return norms.longValue();
  }

  /** Score the provided document assuming the given term document frequency.
   *  This method must be called on non-decreasing sequences of doc ids.
   *  @see SimScorer#score(float, long) */
  public float score(int doc, float freq) throws IOException {
    final long norm = normFor(doc);
    return scorer.score(freq, norm);
  }

  /** Explain the score for the provided document assuming the given term document frequency.
   *  This method must be called on non-decreasing sequences of doc ids.
   *  @see SimScorer#explain(Explanation, long) */
  public Explanation explain(int doc, Explanation freqExpl) throws IOException {
    final long norm = normFor(doc);
    return scorer.explain(freqExpl, norm);
  }

  /**
   * Return an upper bound of the score, as computed when this instance was created.
   * @see SimScorer#maxScore(float)
   */
  public float maxScore() {
    return maxScore;
  }
}
29 changes: 18 additions & 11 deletions lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
Expand Up @@ -18,19 +18,26 @@


import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -183,15 +190,15 @@ public int[] getPositions() {

private class MultiPhraseWeight extends Weight {
private final Similarity similarity;
private final Similarity.SimWeight stats;
private final Similarity.SimScorer stats;
private final Map<Term,TermContext> termContexts = new HashMap<>();
private final ScoreMode scoreMode;

public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
super(MultiPhraseQuery.this);
this.scoreMode = scoreMode;
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
this.similarity = searcher.getSimilarity();
final IndexReaderContext context = searcher.getTopReaderContext();

// compute idf
Expand All @@ -212,7 +219,7 @@ public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boos
if (allTermStats.isEmpty()) {
stats = null; // none of the terms were found, we won't use sim at all
} else {
stats = similarity.computeWeight(
stats = similarity.scorer(
boost,
searcher.collectionStatistics(field),
allTermStats.toArray(new TermStatistics[allTermStats.size()]));
Expand Down Expand Up @@ -282,11 +289,11 @@ public Scorer scorer(LeafReaderContext context) throws IOException {

if (slop == 0) {
return new ExactPhraseScorer(this, postingsFreqs,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
scoreMode, totalMatchCost);
} else {
return new SloppyPhraseScorer(this, postingsFreqs, slop,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY),
scoreMode.needsScores(), totalMatchCost);
}
}
Expand All @@ -303,7 +310,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
SimScorer docScorer = similarity.simScorer(stats, context);
LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
Expand Down
13 changes: 6 additions & 7 deletions lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
Expand Up @@ -37,7 +37,6 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

Expand Down Expand Up @@ -352,7 +351,7 @@ public boolean equals(Object obj) {

private class PhraseWeight extends Weight {
private final Similarity similarity;
private final Similarity.SimWeight stats;
private final Similarity.SimScorer stats;
private final ScoreMode scoreMode;
private transient TermContext states[];

Expand All @@ -366,7 +365,7 @@ public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
}
this.scoreMode = scoreMode;
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
this.similarity = searcher.getSimilarity();
final IndexReaderContext context = searcher.getTopReaderContext();
states = new TermContext[terms.length];
TermStatistics termStats[] = new TermStatistics[terms.length];
Expand All @@ -380,7 +379,7 @@ public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
}
}
if (termUpTo > 0) {
stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
stats = similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
} else {
stats = null; // no terms at all, we won't use similarity
}
Expand Down Expand Up @@ -433,11 +432,11 @@ public Scorer scorer(LeafReaderContext context) throws IOException {

if (slop == 0) { // optimize exact case
return new ExactPhraseScorer(this, postingsFreqs,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
scoreMode, totalMatchCost);
} else {
return new SloppyPhraseScorer(this, postingsFreqs, slop,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY),
scoreMode.needsScores(), totalMatchCost);
}
}
Expand All @@ -459,7 +458,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
SimScorer docScorer = similarity.simScorer(stats, context);
LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
Expand Down

0 comments on commit 8fd7ead

Please sign in to comment.