From 857281967c5f36b904e180e918b4af0f2a18b48f Mon Sep 17 00:00:00 2001 From: Paul Elschot Date: Wed, 8 Mar 2017 23:24:11 +0100 Subject: [PATCH] LUCENE-7580 of 8 Mar 2017. Resolves a conflict with recent simplification of NearSpanUnordered. Includes recent SpanSynonymQuery. --- .../lucene/search/DisiPriorityQueue.java | 9 + .../apache/lucene/search/SynonymQuery.java | 65 +- .../search/similarities/BM25Similarity.java | 1 + .../similarities/ClassicSimilarity.java | 1 + .../search/similarities/Similarity.java | 60 +- .../spans/AsSingleTermSpansDocScorer.java | 171 +++++ .../search/spans/ConjunctionNearSpans.java | 38 + .../spans/ConjunctionNearSpansDocScorer.java | 105 +++ .../search/spans/DisjunctionNearSpans.java | 122 ++++ .../spans/DisjunctionNearSpansDocScorer.java | 51 ++ .../lucene/search/spans/DisjunctionSpans.java | 259 +++++++ .../spans/DisjunctionSpansDocScorer.java | 101 +++ .../lucene/search/spans/NearSpansOrdered.java | 17 +- .../search/spans/NearSpansUnordered.java | 23 +- .../lucene/search/spans/SpanNearQuery.java | 76 +- .../lucene/search/spans/SpanNotQuery.java | 2 +- .../lucene/search/spans/SpanOrQuery.java | 292 ++------ .../search/spans/SpanPositionQueue.java | 9 + .../lucene/search/spans/SpanSynonymQuery.java | 191 +++++ .../lucene/search/spans/SpanTermQuery.java | 2 +- .../org/apache/lucene/search/spans/Spans.java | 2 + .../lucene/search/spans/SpansDocScorer.java | 58 ++ .../lucene/search/spans/SpansTreeQuery.java | 328 +++++++++ .../lucene/search/spans/SpansTreeScorer.java | 202 ++++++ .../lucene/search/spans/SynonymSpans.java | 54 ++ .../search/spans/SynonymSpansDocScorer.java | 62 ++ .../apache/lucene/search/spans/TermSpans.java | 2 + .../search/spans/TermSpansDocScorer.java | 46 ++ .../org/apache/lucene/util/PriorityQueue.java | 8 + .../spans/TestSpanSearchEquivalence.java | 222 +++--- .../search/spans/TestSpanSynonymQuery.java | 238 ++++++ .../search/spans/TestSpansTreeQuery.java | 679 ++++++++++++++++++ 
.../lucene/queryparser/xml/CoreParser.java | 4 + .../xml/builders/SpanSynonymBuilder.java | 70 ++ .../lucene/queryparser/xml/SpanQuery.xml | 1 + 35 files changed, 3178 insertions(+), 393 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java create mode 100644 lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java 
b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java index 0692a7b914e8..e1dcbbb74d76 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java @@ -105,6 +105,15 @@ public DisiWrapper add(DisiWrapper entry) { return heap[0]; } + /** The total {@link DocIdSetIterator#cost()} of the iterators in the queue */ + public long totalCost() { + long res = 0; + for (int i = 0; i < size; i++) { + res += heap[i].cost; + } + return res; + } + public DisiWrapper pop() { final DisiWrapper[] heap = this.heap; final DisiWrapper result = heap[0]; diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index c718dc9ed761..ecca69a0a2ed 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -29,11 +29,14 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermContext; import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity.SimScorer; +import org.apache.lucene.search.MatchNoDocsQuery; + /** * A query that treats multiple terms as synonyms. @@ -45,7 +48,8 @@ */ public final class SynonymQuery extends Query { private final Term terms[]; - + private final String field; + /** * Creates a new SynonymQuery, matching any of the supplied terms. *

@@ -62,16 +66,23 @@ public SynonymQuery(Term... terms) { throw new IllegalArgumentException("Synonyms must be across the same field"); } } + this.field = field; if (terms.length > BooleanQuery.getMaxClauseCount()) { throw new BooleanQuery.TooManyClauses(); } Arrays.sort(this.terms); } + /** The terms to be treated as synonyms. */ public List getTerms() { return Collections.unmodifiableList(Arrays.asList(terms)); } - + + /** The field of the terms. */ + public String getField() { + return field; + } + @Override public String toString(String field) { StringBuilder builder = new StringBuilder("Synonym("); @@ -101,7 +112,7 @@ public boolean equals(Object other) { public Query rewrite(IndexReader reader) throws IOException { // optimize zero and single term cases if (terms.length == 0) { - return new BooleanQuery.Builder().build(); + return new MatchNoDocsQuery(); } if (terms.length == 1) { return new TermQuery(terms[0]); @@ -122,8 +133,8 @@ public Weight createWeight(IndexSearcher searcher, boolean needsScores, float bo return searcher.rewrite(bq.build()).createWeight(searcher, needsScores, boost); } } - - class SynonymWeight extends Weight { + + public class SynonymWeight extends Weight { private final TermContext termContexts[]; private final Similarity similarity; private final Similarity.SimWeight simWeight; @@ -183,18 +194,40 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio return Explanation.noMatch("no matching term"); } + /** + * Expert: Return a SimScorer for this context. + * Public only for use in the spans package. + * @param context the LeafReaderContext + * @return a SimScorer for the given context + * @throws IOException on error + */ + public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException { + return similarity.simScorer(simWeight, context); + } + + /** + * Expert: Return a TermContext array in the same order as the terms. + * Public only for use in the spans package, do not modify. 
+ */ + public TermContext[] getTermContexts() { + return termContexts; + } + @Override public Scorer scorer(LeafReaderContext context) throws IOException { - Similarity.SimScorer simScorer = similarity.simScorer(simWeight, context); + Similarity.SimScorer simScorer = getSimScorer(context); // we use termscorers + disjunction as an impl detail List subScorers = new ArrayList<>(); - for (int i = 0; i < terms.length; i++) { - TermState state = termContexts[i].get(context.ord); - if (state != null) { - TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator(); - termsEnum.seekExact(terms[i].bytes(), state); - PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); - subScorers.add(new TermScorer(this, postings, simScorer)); + Terms fieldTerms = context.reader().terms(field); + if (fieldTerms != null) { + TermsEnum termsEnum = fieldTerms.iterator(); + for (int i = 0; i < terms.length; i++) { + TermState state = termContexts[i].get(context.ord); + if (state != null) { + termsEnum.seekExact(terms[i].bytes(), state); + PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); + subScorers.add(new TermScorer(this, postings, simScorer)); + } } } if (subScorers.isEmpty()) { @@ -207,10 +240,10 @@ public Scorer scorer(LeafReaderContext context) throws IOException { } } } - + static class SynonymScorer extends DisjunctionScorer { private final Similarity.SimScorer similarity; - + SynonymScorer(Similarity.SimScorer similarity, Weight weight, List subScorers) { super(weight, subScorers, true); this.similarity = similarity; @@ -220,7 +253,7 @@ static class SynonymScorer extends DisjunctionScorer { protected float score(DisiWrapper topList) throws IOException { return similarity.score(topList.doc, tf(topList)); } - + /** combines TF of all subs. 
*/ final int tf(DisiWrapper topList) throws IOException { int tf = 0; diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 74978fd4dd92..3a823fc8f675 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -75,6 +75,7 @@ protected float idf(long docFreq, long docCount) { /** Implemented as 1 / (distance + 1). */ protected float sloppyFreq(int distance) { + assert distance <= Integer.MAX_VALUE - 1; return 1.0f / (distance + 1); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java index f56575f0d4c2..f02fe94465f1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java @@ -114,6 +114,7 @@ public float tf(float freq) { /** Implemented as 1 / (distance + 1). */ @Override public float sloppyFreq(int distance) { + assert distance <= Integer.MAX_VALUE - 1; return 1.0f / (distance + 1); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java index 7f0f27c5c5df..fdf87994386c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java @@ -32,36 +32,36 @@ import java.io.IOException; import java.util.Collections; -/** +/** * Similarity defines the components of Lucene scoring. *

* Expert: Scoring API. *

- * This is a low-level API, you should only extend this API if you want to implement - * an information retrieval model. If you are instead looking for a convenient way + * This is a low-level API, you should only extend this API if you want to implement + * an information retrieval model. If you are instead looking for a convenient way * to alter Lucene's scoring, consider extending a higher-level implementation - * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or + * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or * just tweaking the default implementation: {@link BM25Similarity}. *

* Similarity determines how Lucene weights terms, and Lucene interacts with - * this class at both index-time and + * this class at both index-time and * query-time. *

* Indexing Time * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing - * the Similarity implementation to set a per-document value for the field that will + * the Similarity implementation to set a per-document value for the field that will * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption - * about what is in this norm, but it is most useful for encoding length normalization + * about what is in this norm, but it is most useful for encoding length normalization * information. *

* Implementations should carefully consider how the normalization is encoded: while * Lucene's {@link BM25Similarity} encodes a combination of index-time boost - * and length normalization information with {@link SmallFloat} into a single byte, this + * and length normalization information with {@link SmallFloat} into a single byte, this * might not be suitable for all purposes. *

- * Many formulas require the use of average document length, which can be computed via a - * combination of {@link CollectionStatistics#sumTotalTermFreq()} and - * {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()}, + * Many formulas require the use of average document length, which can be computed via a + * combination of {@link CollectionStatistics#sumTotalTermFreq()} and + * {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()}, * depending upon whether the average should reflect field sparsity. *

* Additional scoring factors can be stored in named @@ -71,7 +71,7 @@ * Finally, using index-time boosts (either via folding into the normalization byte or * via DocValues), is an inefficient way to boost the scores of different fields if the * boost will be the same for every document, instead the Similarity can simply take a constant - * boost parameter C, and {@link PerFieldSimilarityWrapper} can return different + * boost parameter C, and {@link PerFieldSimilarityWrapper} can return different * instances with different boosts depending upon field name. *

* Query time @@ -79,16 +79,16 @@ *

    *
  1. The {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} method is called a single time, * allowing the implementation to compute any statistics (such as IDF, average document length, etc) - * across the entire collection. The {@link TermStatistics} and {@link CollectionStatistics} passed in + * across the entire collection. The {@link TermStatistics} and {@link CollectionStatistics} passed in * already contain all of the raw statistics involved, so a Similarity can freely use any combination - * of statistics without causing any additional I/O. Lucene makes no assumption about what is + * of statistics without causing any additional I/O. Lucene makes no assumption about what is * stored in the returned {@link Similarity.SimWeight} object. *
  2. For each segment in the index, the Query creates a {@link #simScorer(SimWeight, org.apache.lucene.index.LeafReaderContext)} * The score() method is called for each matching document. *
*

* Explanations - * When {@link IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the Similarity's DocScorer for an + * When {@link IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the Similarity's DocScorer for an * explanation of how it computed its score. The query passes in a the document id and an explanation of how the frequency * was computed. * @@ -97,13 +97,13 @@ * @lucene.experimental */ public abstract class Similarity { - + /** - * Sole constructor. (For invocation by subclass + * Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ public Similarity() {} - + /** * Computes the normalization value for a field, given the accumulated * state of term processing for this field (see {@link FieldInvertState}). @@ -111,9 +111,9 @@ public Similarity() {} *

Matches in longer fields are less precise, so implementations of this * method usually set smaller values when state.getLength() is large, * and larger values when state.getLength() is small. - * + * * @lucene.experimental - * + * * @param state current processing state for this field * @return computed norm value */ @@ -138,19 +138,19 @@ public abstract SimWeight computeWeight(float boost, * @throws IOException if there is a low-level I/O error */ public abstract SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException; - + /** * API for scoring "sloppy" queries such as {@link TermQuery}, * {@link SpanQuery}, and {@link PhraseQuery}. *

- * Frequencies are floating-point values: an approximate - * within-document frequency adjusted for "sloppiness" by + * Frequencies may be floating-point values to allow an approximate + * within-document frequency adjusted for "sloppiness" by * {@link SimScorer#computeSlopFactor(int)}. */ public static abstract class SimScorer { - + /** - * Sole constructor. (For invocation by subclass + * Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ public SimScorer() {} @@ -165,10 +165,10 @@ public SimScorer() {} /** Computes the amount of a sloppy phrase match, based on an edit distance. */ public abstract float computeSlopFactor(int distance); - + /** Calculate a scoring factor based on the data in the payload. */ public abstract float computePayloadFactor(int doc, int start, int end, BytesRef payload); - + /** * Explain the score for a single document * @param doc document id within the inverted index segment @@ -182,16 +182,16 @@ public Explanation explain(int doc, Explanation freq) throws IOException { Collections.singleton(freq)); } } - + /** Stores the weight for a query across the indexed collection. This abstract * implementation is empty; descendants of {@code Similarity} should * subclass {@code SimWeight} and define the statistics they require in the * subclass. Examples include idf, average field length, etc. */ public static abstract class SimWeight { - + /** - * Sole constructor. (For invocation by subclass + * Sole constructor. (For invocation by subclass * constructors, typically implicit.) 
*/ public SimWeight() {} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java new file mode 100644 index 000000000000..7e7c73165293 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.Set; +import static java.util.Arrays.sort; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; +import static org.apache.lucene.util.ArrayUtil.oversize; + + +/** + * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public abstract class AsSingleTermSpansDocScorer +extends SpansDocScorer { + + protected final SimScorer simScorer; + protected final double nonMatchWeight; + + protected int tf; + protected int matchTF; + protected int lastRecordedPosition; + protected double[] occSlops; + + protected final int INIT_SLOPS_SIZE = 2; // CHECKME: use average term frequency? + + /** + * @param simScorer Scores the term occurrences. 
+ * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences. + */ + public AsSingleTermSpansDocScorer(SimScorer simScorer, double nonMatchWeight) { + this.simScorer = simScorer; + this.nonMatchWeight = nonMatchWeight; + assert nonMatchWeight >= 0 : ("nonMatchWeight="+ nonMatchWeight); + this.occSlops = new double[INIT_SLOPS_SIZE]; + } + + /** The total number of occurrences of the term in the current document. + */ + public abstract int termFreqInDoc() throws IOException; + + @Override + public void beginDoc(int doc) throws IOException { + super.beginDoc(doc); + + matchTF = 0; + lastRecordedPosition = -1; + // currentDoc = docID(); // only for asserts + + tf = termFreqInDoc(); + assert tf >= 1; + if (occSlops.length < tf) { + occSlops = new double[oversize(tf, Double.BYTES)]; + } + } + + @Override + public void extractSpansDocScorersAtDoc(Set> spansDocScorersAtDoc) { + spansDocScorersAtDoc.add(this); + } + + + /** Record a matching term occurrence and record its slopFactor and position. + * When this is called more than once for a document, the position should not decrease. + * Keep the largest slop factor when the position has not changed. + */ + @Override + public void recordMatch(double slopFactor, int position) { + assert slopFactor >= 0; + assert position != Spans.NO_MORE_POSITIONS; + if (position < lastRecordedPosition) { + throw new AssertionError("position=" + position + " is before lastRecordedPosition=" + lastRecordedPosition); + // in case this becomes normal, record all positions and slopFactors and take maximum slopFactor later. 
+ } + if (lastRecordedPosition < position) { + occSlops[matchTF] = slopFactor; + matchTF += 1; + assert matchTF <= tf; + lastRecordedPosition = position; + } else { + assert lastRecordedPosition == position; + assert matchTF >= 1; + if (slopFactor > occSlops[matchTF-1]) { + occSlops[matchTF-1] = slopFactor; + } + } + } + + @Override + public int docMatchFreq() { + return matchTF; + } + + /** Compute the document score for the term. + *
+ * For each matching occurrence determine the score contribution + * and use the given slop factors in decreasing order as weights + * on this contribution. + *
+ * Use the nonMatchWeight as the weight for the score contribution + * of the non matching occurrences. + *
+ * For this it is assumed that {@link SimScorer#score(int, float)} provides + * a diminishing (at least non increasing) + * score contribution for each extra term occurrence. + *
+ * Return the sum of these weighted contributions over all term occurrences. + *

+ * The implementation is not optimal, especially when there are many + * matching occurrences with the same slop factors. + *

+ * Aside: The purpose of using the given slop factors in decreasing order + * is to provide scoring consistency + * between span near queries that only differ in the maximum allowed slop. + * This consistency requires that any extra match increases the score of the document, + * even when an extra match has a bigger slop and corresponding lower slop factor. + * It is not known whether such scoring consistency is always achieved. + *
+ * Sorting the slop factors could be avoided if an actual score + * of each single term occurrence was available. + * In that case the given slop factor could be used as a weight on that score. + * Perhaps it is possible to estimate an actual score for a single term + * occurrence from the distances to other occurrences of the same term. + */ + @Override + public double docScore() throws IOException { + double docScore = 0; + + double cumulMatchTFScore = 0; + + if (matchTF > 0) { + sort(occSlops, 0, matchTF); + assert occSlops[0] >= nonMatchWeight; // smallest matching slop factor must not be below nonMatchWeight + + for (int matchOcc = 1; matchOcc <= matchTF; matchOcc++) { + double prev = cumulMatchTFScore; + cumulMatchTFScore = simScorer.score(currentDoc, (float) (matchOcc)); + double matchTFScore = cumulMatchTFScore - prev; // matchTFScore should not increase + // use occurrence slop factors in decreasing order: + docScore += matchTFScore * occSlops[matchTF - matchOcc]; + } + } + + if (matchTF < tf) { // non matching occurrences + double tfScore = simScorer.score(currentDoc, (float) tf); + double nonMatchingFreqScore = tfScore - cumulMatchTFScore; + double nonMatchScore = nonMatchingFreqScore * nonMatchWeight; + docScore += nonMatchScore; + } + + return docScore; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java new file mode 100644 index 000000000000..f31b86b4a4b2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.util.List; + +import org.apache.lucene.search.similarities.Similarity; + +/** + * Spans that are all present within a given slop. + * + * @lucene.experimental + */ +public abstract class ConjunctionNearSpans extends ConjunctionSpans { + protected final Similarity.SimScorer simScorer; + + public ConjunctionNearSpans(List subSpans, Similarity.SimScorer simScorer) { + super(subSpans); + this.simScorer = simScorer; + } + + /** Compute the slop of the current match. */ + public abstract int currentSlop(); +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java new file mode 100644 index 000000000000..e47c5bf7a360 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Set; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +/** + * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public class ConjunctionNearSpansDocScorer extends SpansDocScorer { + protected final SimScorer simScorer; + protected final Spans[] subSpansArray; + protected final ArrayList> subSpansDocScorers; + protected final ConjunctionNearSpans nearSpans; + + /** Create a ConjunctionNearSpansDocScorer for a ConjunctionNearSpans and its subspans. + * For the subspans use {@link SpansTreeScorer#createSpansDocScorer}. 
+ */ + public ConjunctionNearSpansDocScorer( + SpansTreeScorer spansTreeScorer, + ConjunctionNearSpans nearSpans) + { + this.nearSpans = nearSpans; + this.simScorer = nearSpans.simScorer; + this.subSpansArray = nearSpans.getSubSpans(); + this.subSpansDocScorers = new ArrayList<>(subSpansArray.length); + for (Spans subSpans : subSpansArray) { + SpansDocScorer spansDocScorer = spansTreeScorer.createSpansDocScorer(subSpans); + subSpansDocScorers.add(spansDocScorer); + } + } + + @Override + public void beginDoc(int doc) throws IOException { + super.beginDoc(doc); + for (SpansDocScorer spansDocScorer : subSpansDocScorers) { + spansDocScorer.beginDoc(doc); + } + } + + @Override + public void extractSpansDocScorersAtDoc(Set> spansDocScorersAtDoc) { + for (SpansDocScorer spansDocScorer : subSpansDocScorers) { + spansDocScorer.extractSpansDocScorersAtDoc(spansDocScorersAtDoc); + } + } + + + /** Record a matching occurrence for all subspans. + * Use a slop factor that is the product of the given slopFactor + * and the slop factor of {@link ConjunctionNearSpans#currentSlop}. + */ + @Override + public void recordMatch(double slopFactor, int position) { + int slop = Integer.max(nearSpans.currentSlop(), 0); // avoid infinite localSlopFactor for negative slop + double localSlopFactor = simScorer.computeSlopFactor(slop); + double nestedSlopFactor = slopFactor * localSlopFactor; + for (int i = 0; i < subSpansArray.length; i++) { + Spans subSpans = subSpansArray[i]; + assert subSpans.startPosition() >= position; + SpansDocScorer spansDocScorer = subSpansDocScorers.get(i); + spansDocScorer.recordMatch(nestedSlopFactor, subSpans.startPosition()); + } + } + + /** Return the sum of the matching frequencies of the subspans. */ + @Override + public int docMatchFreq() { + int freq = 0; + for (SpansDocScorer spansDocScorer : subSpansDocScorers) { + freq += spansDocScorer.docMatchFreq(); + } + return freq; + } + + /** Return the sum of document scores of the subspans. 
*/ + @Override + public double docScore() throws IOException { + double score = 0; + for (SpansDocScorer spansDocScorer : subSpansDocScorers) { + score += spansDocScorer.docScore(); + } + return score; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java new file mode 100644 index 000000000000..681cfc6e75f2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.util.List; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +/** + * A DisjunctionSpans that also provides a slop for each match. + * + * See also {@link SpanOrQuery#SpanOrQuery(int, SpanQuery...)}. + * + * @lucene.experimental + */ +public class DisjunctionNearSpans extends DisjunctionSpans { + protected final int maxDistance; + protected final SimScorer simScorer; + + /** Construct a DisjunctionNearSpans. + * @param spanOrQuery The query that provides the subSpans. + * @param subSpans Over which the disjunction is to be taken. 
+ * @param maxDistance The maximum distance to be returned as the current match slop. + * @param simScorer For computing the slop factor from the slop. + */ + public DisjunctionNearSpans( + SpanOrQuery spanOrQuery, + List subSpans, + int maxDistance, + SimScorer simScorer) + { + super(spanOrQuery, subSpans); + this.maxDistance = maxDistance; + this.simScorer = simScorer; + } + + int currentSlop; + int lastDoc = -1; + + Spans prevFirstSpans; + int prevFirstSpansEndPosition; + int lastDifferentSpansEndPosition; + + + /** + * Compute the minimum slop between the currently matching + * sub spans and the previous and next matching other sub spans. + * When this slop is bigger than maxDistance + * or no other matching spans is available, return maxDistance. + *
+ * The slop is computed from the end of a spans to the beginning + * of the following different one. When this is negative, zero is used. + *
+ * When this method is used in a document, it must be called once at each match + * in the document. + *
+ * See also {@link DisjunctionNearSpansDocScorer}. + */ + public int currentSlop() { + Spans firstSpans = byPositionQueue.top(); + assert firstSpans.startPosition() != NO_MORE_POSITIONS; // at a disjunction match + + int currentDoc = docID(); + if (lastDoc != currentDoc) { // at first match in currentDoc + lastDoc = currentDoc; + prevFirstSpans = null; + lastDifferentSpansEndPosition = -1; + } + + int firstSpansEndPosition = firstSpans.endPosition(); // avoid calling more than once below, no spans is moved here. + + int slopBefore; + if (prevFirstSpans == null) { // at first match in currentDoc + slopBefore = maxDistance; + } else if (prevFirstSpans == firstSpans) { // sequence of same subspans. + if (lastDifferentSpansEndPosition == -1) { // initial sequence of same subspans + slopBefore = maxDistance; + } else { // later sequence of same subspans + slopBefore = Math.max(0, firstSpans.startPosition() - lastDifferentSpansEndPosition); + slopBefore = Math.min(slopBefore, maxDistance); + } + } else { // first spans is different from previous spans + slopBefore = Math.max(0, firstSpans.startPosition() - prevFirstSpansEndPosition); + slopBefore = Math.min(slopBefore, maxDistance); + lastDifferentSpansEndPosition = prevFirstSpansEndPosition; + } + prevFirstSpans = firstSpans; + prevFirstSpansEndPosition = firstSpansEndPosition; + + int slopAfter; + if (byPositionQueue.size() == 1) { // no other spans at this document + slopAfter = maxDistance; + } else { + Spans secondSpans = byPositionQueue.subTop(); + assert secondSpans != null; // byPositionQueue.size() >= 2 + assert secondSpans != firstSpans; + if (secondSpans.startPosition() == NO_MORE_POSITIONS) { // second exhausted in current doc + slopAfter = maxDistance; + } else { + slopAfter = Math.max(0, secondSpans.startPosition() - firstSpansEndPosition); + slopAfter = Math.min(slopAfter, maxDistance); + } + } + + currentSlop = Math.min(slopBefore, slopAfter); + return currentSlop; + } +} diff --git 
a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java new file mode 100644 index 000000000000..6b37c7dc43f9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; + + +/** + * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public class DisjunctionNearSpansDocScorer + extends DisjunctionSpansDocScorer { + protected final SimScorer simScorer; + + public DisjunctionNearSpansDocScorer( + SpansTreeScorer spansTreeScorer, + DisjunctionNearSpans orNearSpans) + { + super(spansTreeScorer, orNearSpans); + this.simScorer = orNearSpans.simScorer; + } + + /** Record a match for the subspans at the first position. + * Use a slop factor that is the product of the given slopFactor + * and the slop factor of {@link DisjunctionNearSpans#currentSlop}. 
+ */ + @Override + public void recordMatch(double slopFactor, int position) { + int slop = orSpans.currentSlop(); + double localSlopFactor = simScorer.computeSlopFactor(slop); + double nestedSlopFactor = slopFactor * localSlopFactor; + super.recordMatch(nestedSlopFactor, position); + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java new file mode 100644 index 000000000000..77a006dd079e --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.search.DisiPriorityQueue; +import org.apache.lucene.search.DisiWrapper; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.DisjunctionDISIApproximation; + + +/** + * A spans that merges given spans. 
+ * + * @lucene.experimental + */ +public class DisjunctionSpans extends Spans { + protected final SpanQuery spanQuery; + protected final List subSpans; + protected final DisiPriorityQueue byDocQueue; + protected final SpanPositionQueue byPositionQueue; + protected Spans topPositionSpans; + protected final long totalCost; + + /** Construct a DisjunctionSpans. + * @param spanQuery The query that provides the subSpans. + * @param subSpans Over which the disjunction is to be taken. + */ + public DisjunctionSpans(SpanQuery spanQuery, List subSpans) { + this.spanQuery = spanQuery; // for toString() only + this.subSpans = subSpans; + byDocQueue = new DisiPriorityQueue(subSpans.size()); + for (Spans spans : subSpans) { + byDocQueue.add(new DisiWrapper(spans)); + } + totalCost = byDocQueue.totalCost(); + byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 + topPositionSpans = null; + } + + + /** For {@link DisjunctionSpansDocScorer}. */ + public List subSpans() { + return subSpans; + } + + /** For {@link DisjunctionSpansDocScorer}. */ + public void extractSubSpansAtCurrentDoc(List spansList) { + byPositionQueue.extractSpansList(spansList); + } + + /** For {@link DisjunctionSpansDocScorer}. 
*/ + public Spans getCurrentPositionSpans() { + return byPositionQueue.top(); + } + + public SpanQuery getSpanQuery() { + return spanQuery; + } + + @Override + public int nextDoc() throws IOException { + topPositionSpans = null; + DisiWrapper topDocSpans = byDocQueue.top(); + int currentDoc = topDocSpans.doc; + do { + topDocSpans.doc = topDocSpans.iterator.nextDoc(); + topDocSpans = byDocQueue.updateTop(); + } while (topDocSpans.doc == currentDoc); + return topDocSpans.doc; + } + + @Override + public int advance(int target) throws IOException { + topPositionSpans = null; + DisiWrapper topDocSpans = byDocQueue.top(); + do { + topDocSpans.doc = topDocSpans.iterator.advance(target); + topDocSpans = byDocQueue.updateTop(); + } while (topDocSpans.doc < target); + return topDocSpans.doc; + } + + @Override + public int docID() { + DisiWrapper topDocSpans = byDocQueue.top(); + return topDocSpans.doc; + } + + @Override + public TwoPhaseIterator asTwoPhaseIterator() { + float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() + long sumApproxCost = 0; + + for (DisiWrapper w : byDocQueue) { + if (w.twoPhaseView != null) { + long costWeight = (w.cost <= 1) ? 1 : w.cost; + sumMatchCost += w.twoPhaseView.matchCost() * costWeight; + sumApproxCost += costWeight; + } + } + + if (sumApproxCost == 0) { // no sub spans supports approximations + computePositionsCost(); + return null; + } + + final float matchCost = sumMatchCost / sumApproxCost; + + return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { + @Override + public boolean matches() throws IOException { + return twoPhaseCurrentDocMatches(); + } + + @Override + public float matchCost() { + return matchCost; + } + }; + } + + float positionsCost = -1; + + void computePositionsCost() { + float sumPositionsCost = 0; + long sumCost = 0; + for (DisiWrapper w : byDocQueue) { + long costWeight = (w.cost <= 1) ? 
1 : w.cost; + sumPositionsCost += w.spans.positionsCost() * costWeight; + sumCost += costWeight; + } + positionsCost = sumPositionsCost / sumCost; + } + + @Override + public float positionsCost() { + // This may be called when asTwoPhaseIterator returned null, + // which happens when none of the sub spans supports approximations. + assert positionsCost > 0; + return positionsCost; + } + + int lastDocTwoPhaseMatched = -1; + + boolean twoPhaseCurrentDocMatches() throws IOException { + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); + // remove the head of the list as long as it does not match + final int currentDoc = listAtCurrentDoc.doc; + while (listAtCurrentDoc.twoPhaseView != null) { + if (listAtCurrentDoc.twoPhaseView.matches()) { + // use this spans for positions at current doc: + listAtCurrentDoc.lastApproxMatchDoc = currentDoc; + break; + } + // do not use this spans for positions at current doc: + listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; + listAtCurrentDoc = listAtCurrentDoc.next; + if (listAtCurrentDoc == null) { + return false; + } + } + lastDocTwoPhaseMatched = currentDoc; + topPositionSpans = null; + return true; + } + + void fillPositionQueue() throws IOException { // called at first nextStartPosition + assert byPositionQueue.size() == 0; + // add all matching Spans at current doc to byPositionQueue + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); + while (listAtCurrentDoc != null) { + Spans spansAtDoc = listAtCurrentDoc.spans; + if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation + if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation + if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false + spansAtDoc = null; + } else { + if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { + if (!listAtCurrentDoc.twoPhaseView.matches()) { + spansAtDoc = null; + } + } + } + } + } + + if (spansAtDoc != null) { + assert 
spansAtDoc.docID() == listAtCurrentDoc.doc; + assert spansAtDoc.startPosition() == -1; + spansAtDoc.nextStartPosition(); + assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; + byPositionQueue.add(spansAtDoc); + } + listAtCurrentDoc = listAtCurrentDoc.next; + } + assert byPositionQueue.size() > 0; + } + + @Override + public int nextStartPosition() throws IOException { + if (topPositionSpans == null) { + byPositionQueue.clear(); + fillPositionQueue(); // fills byPositionQueue at first position + topPositionSpans = byPositionQueue.top(); + } else { + topPositionSpans.nextStartPosition(); + topPositionSpans = byPositionQueue.updateTop(); + } + return topPositionSpans.startPosition(); + } + + @Override + public int startPosition() { + return topPositionSpans == null ? -1 : topPositionSpans.startPosition(); + } + + @Override + public int endPosition() { + return topPositionSpans == null ? -1 : topPositionSpans.endPosition(); + } + + @Override + public int width() { + return topPositionSpans.width(); + } + + @Override + public void collect(SpanCollector collector) throws IOException { + if (topPositionSpans != null) + topPositionSpans.collect(collector); + } + + @Override + public String toString() { + return "DisjunctionSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); + } + + @Override + public long cost() { + return totalCost; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java new file mode 100644 index 000000000000..acf3561f0890 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; + +/** + * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public class DisjunctionSpansDocScorer + extends SpansDocScorer { + protected final ArrayList subSpansAtDoc; + protected final HashSet> spansDocScorersAtDoc; + protected DisjunctionSpansT orSpans; + + /** Create a DisjunctionSpansDocScorer for a DisjunctionSpans and its subspans. + * For the subspans use {@link SpansTreeScorer#createSpansDocScorer}. 
+ */ + public DisjunctionSpansDocScorer( + SpansTreeScorer spansTreeScorer, + DisjunctionSpansT orSpans) + { + this.orSpans = orSpans; + List subSpans = orSpans.subSpans(); + for (Spans spans : subSpans) { + spansTreeScorer.createSpansDocScorer(spans); + } + this.subSpansAtDoc = new ArrayList<>(subSpans.size()); + this.spansDocScorersAtDoc = new HashSet<>(); + } + + @Override + public void beginDoc(int doc) throws IOException { + subSpansAtDoc.clear(); + orSpans.extractSubSpansAtCurrentDoc(subSpansAtDoc); + assert subSpansAtDoc.size() > 0 : "empty subSpansAtDoc docID=" + docID(); + spansDocScorersAtDoc.clear(); + for (Spans subSpans : subSpansAtDoc) { + assert subSpans.docID() == doc; + subSpans.spansDocScorer.extractSpansDocScorersAtDoc(spansDocScorersAtDoc); + } + for (SpansDocScorer spansDocScorer : spansDocScorersAtDoc) { + spansDocScorer.beginDoc(doc); + } + } + + @Override + public void extractSpansDocScorersAtDoc(Set> spansDocScorersAtDoc) { + spansDocScorersAtDoc.addAll(this.spansDocScorersAtDoc); + } + + + /** Record a match with the given slop factor for the subspans at the first position. */ + @Override + public void recordMatch(double slopFactor, int position) { + Spans firstPosSpans = orSpans.getCurrentPositionSpans(); + assert subSpansAtDoc.contains(firstPosSpans); + assert firstPosSpans.startPosition() == position; + firstPosSpans.spansDocScorer.recordMatch(slopFactor, position); + } + + /** Return the sum of the matching frequencies of the subspans. */ + @Override + public int docMatchFreq() { + int freq = 0; + for (SpansDocScorer spansDocScorer : spansDocScorersAtDoc) { + freq += spansDocScorer.docMatchFreq(); + } + return freq; + } + + /** Return the sum of document scores of the subspans. 
*/ + @Override + public double docScore() throws IOException { + double score = 0; + for (SpansDocScorer spansDocScorer : spansDocScorersAtDoc) { + score += spansDocScorer.docScore(); + } + return score; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java index f4054738d2c9..257999e3ea17 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java @@ -20,6 +20,8 @@ import java.io.IOException; import java.util.List; +import org.apache.lucene.search.similarities.Similarity; + /** * A Spans that is formed from the ordered subspans of a SpanNearQuery * where the subspans do not overlap and have a maximum slop between them. @@ -42,7 +44,7 @@ * Expert: * Only public for subclassing. Most implementations should not need this class */ -public class NearSpansOrdered extends ConjunctionSpans { +public class NearSpansOrdered extends ConjunctionNearSpans { protected int matchStart = -1; protected int matchEnd = -1; @@ -50,8 +52,12 @@ public class NearSpansOrdered extends ConjunctionSpans { private final int allowedSlop; - public NearSpansOrdered(int allowedSlop, List subSpans) throws IOException { - super(subSpans); + public NearSpansOrdered( + int allowedSlop, + List subSpans, + Similarity.SimScorer simScorer) throws IOException + { + super(subSpans, simScorer); this.atFirstInCurrentDoc = true; // -1 startPosition/endPosition also at doc -1 this.allowedSlop = allowedSlop; } @@ -143,6 +149,11 @@ public int width() { return matchWidth; } + @Override + public int currentSlop() { + return matchWidth; + } + @Override public void collect(SpanCollector collector) throws IOException { for (Spans span : subSpans) { diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java 
b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java index 2a69395c52c2..aa4830ed341c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java @@ -20,6 +20,7 @@ import java.util.List; import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.search.similarities.Similarity; /** * Similar to {@link NearSpansOrdered}, but for the unordered case. @@ -27,15 +28,17 @@ * Expert: * Only public for subclassing. Most implementations should not need this class */ -public class NearSpansUnordered extends ConjunctionSpans { +public class NearSpansUnordered extends ConjunctionNearSpans { private final int allowedSlop; private SpanTotalLengthEndPositionWindow spanWindow; - public NearSpansUnordered(int allowedSlop, List subSpans) - throws IOException { - super(subSpans); - + public NearSpansUnordered( + int allowedSlop, + List subSpans, + Similarity.SimScorer simScorer) throws IOException + { + super(subSpans, simScorer); this.allowedSlop = allowedSlop; this.spanWindow = new SpanTotalLengthEndPositionWindow(); } @@ -89,13 +92,21 @@ boolean nextPosition() throws IOException { updateTop(); return true; } + + int currentSlop() { + return maxEndPosition - top().startPosition() - totalSpanLength; + } boolean atMatch() { - boolean res = (maxEndPosition - top().startPosition() - totalSpanLength) <= allowedSlop; + boolean res = currentSlop() <= allowedSlop; return res; } } + @Override + public int currentSlop() { + return spanWindow.currentSlop(); + } /** Check whether two Spans in the same document are ordered with possible overlap. 
* @return true iff spans1 starts before spans2 diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java index 7958f4758b0b..75d14fefc7fa 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java @@ -48,6 +48,7 @@ public static class Builder { private final String field; private final List clauses = new LinkedList<>(); private int slop; + private int nonMatchSlop = -1; /** * Construct a new builder @@ -87,11 +88,21 @@ public Builder setSlop(int slop) { return this; } + /** + * Set the non match slop for this query + */ + public Builder setNonMatchSlop(int nonMatchSlop) { + this.nonMatchSlop = nonMatchSlop; + return this; + } + /** * Build the query */ public SpanNearQuery build() { - return new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered); + return (nonMatchSlop == -1) + ? new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered) + : new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered, nonMatchSlop); } } @@ -113,9 +124,21 @@ public static Builder newUnorderedNearQuery(String field) { protected List clauses; protected int slop; protected boolean inOrder; + protected int nonMatchSlop; protected String field; + /** + * Construct a SpanNearQuery. + * See {@link SpanNearQuery#SpanNearQuery(SpanQuery[], int, boolean, int)} + * for the first three parameters. + * This will use Integer.MAX_VALUE-1 for the non matching slop. + */ + public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) { + // Integer.MAX_VALUE causes overflow in sloppyFreq which adds 1. + this(clausesIn, slop, inOrder, Integer.MAX_VALUE-1); + } + /** Construct a SpanNearQuery. Matches spans matching a span from each * clause, with up to slop total unmatched positions between * them. 
@@ -124,10 +147,30 @@ public static Builder newUnorderedNearQuery(String field) { *
When inOrder is false, the spans from each clause * need not be ordered and may overlap. * @param clausesIn the clauses to find near each other, in the same field, at least 2. - * @param slop The slop value + * @param slop The allowed slop. This should be non negative and at most Integer.MAX_VALUE-1. * @param inOrder true if order is important + * @param nonMatchSlop + * The distance for determining the slop factor to be used for non matching + * occurrences. This is used for scoring by {@link SpansTreeQuery}, and it + * should not be smaller than slop. + *
+ * Smaller values of nonMatchSlop will increase the + * score contribution of non matching occurrences + * via {@link org.apache.lucene.search.similarities.Similarity.SimScorer#computeSlopFactor}. + *
+ * Smaller values may lead to a scoring inconsistency between two span near queries + * that only differ in the allowed slop. + * For example, consider query A with a smaller allowed slop and query B with a larger one. + * For query B there can be more matches, and these should increase the score of B + * when compared to the score of A. + * For each extra match at B, the non matching score for query A should be lower than + * the matching score for query B. + *
+ * To have consistent scoring between two such queries, choose + * a non matching scoring distance that is larger than the largest allowed distance, + * and provide that to both queries. */ - public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) { + public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder, int nonMatchSlop) { this.clauses = new ArrayList<>(clausesIn.length); for (SpanQuery clause : clausesIn) { if (this.field == null) { // check field @@ -137,8 +180,14 @@ public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) { } this.clauses.add(clause); } - this.slop = slop; + if (nonMatchSlop != -1) { + if (nonMatchSlop < slop) { + throw new IllegalArgumentException("nonMatchSlop < slop: " + nonMatchSlop + " < " + slop); + } + } this.inOrder = inOrder; + this.slop = slop; + this.nonMatchSlop = nonMatchSlop; } /** Return the clauses whose spans are matched. */ @@ -152,6 +201,9 @@ public SpanQuery[] getClauses() { /** Return true if matches are required to be in-order.*/ public boolean isInOrder() { return inOrder; } + /** Return the slop used for scoring non matching occurrences. */ + public int getNonMatchSlop() { return nonMatchSlop; } + @Override public String getField() { return field; } @@ -171,6 +223,8 @@ public String toString(String field) { buffer.append(slop); buffer.append(", "); buffer.append(inOrder); + buffer.append(", "); + buffer.append(nonMatchSlop); buffer.append(")"); return buffer.toString(); } @@ -179,7 +233,7 @@ public String toString(String field) { public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { List subWeights = new ArrayList<>(); for (SpanQuery q : clauses) { - subWeights.add(q.createWeight(searcher, false, boost)); + subWeights.add(q.createWeight(searcher, needsScores, boost)); } return new SpanNearWeight(subWeights, searcher, needsScores ? 
getTermContexts(subWeights) : null, boost); } @@ -219,8 +273,8 @@ public Spans getSpans(final LeafReaderContext context, Postings requiredPostings } // all NearSpans require at least two subSpans - return (!inOrder) ? new NearSpansUnordered(slop, subSpans) - : new NearSpansOrdered(slop, subSpans); + return (!inOrder) ? new NearSpansUnordered(slop, subSpans, getSimScorer(context)) + : new NearSpansOrdered(slop, subSpans, getSimScorer(context)); } @Override @@ -258,10 +312,11 @@ public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } - + private boolean equalsTo(SpanNearQuery other) { - return inOrder == other.inOrder && + return inOrder == other.inOrder && slop == other.slop && + nonMatchSlop == other.nonMatchSlop && clauses.equals(other.clauses); } @@ -270,6 +325,7 @@ public int hashCode() { int result = classHash(); result ^= clauses.hashCode(); result += slop; + result ^= 4 * nonMatchSlop; int fac = 1 + (inOrder ? 8 : 4); return fac * result; } @@ -326,7 +382,7 @@ public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } - + private boolean equalsTo(SpanGapQuery other) { return width == other.width && field.equals(other.field); diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java index 00bcc4c1ac78..74413193d405 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java @@ -98,7 +98,7 @@ public String toString(String field) { @Override public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { - SpanWeight includeWeight = include.createWeight(searcher, false, boost); + SpanWeight includeWeight = include.createWeight(searcher, needsScores, boost); SpanWeight excludeWeight = exclude.createWeight(searcher, false, boost); return 
new SpanNotWeight(searcher, needsScores ? getTermContexts(includeWeight, excludeWeight) : null, includeWeight, excludeWeight, boost); diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java index 15abc7ddb27e..264f1c892466 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.search.spans; - import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; @@ -28,25 +27,46 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; -import org.apache.lucene.search.DisiPriorityQueue; -import org.apache.lucene.search.DisiWrapper; -import org.apache.lucene.search.DisjunctionDISIApproximation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; -import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.similarities.Similarity.SimScorer; /** Matches the union of its clauses. */ public final class SpanOrQuery extends SpanQuery { private List clauses; private String field; + private final int maxDistance; /** Construct a SpanOrQuery merging the provided clauses. * All clauses must have the same field. */ public SpanOrQuery(SpanQuery... clauses) { this.clauses = new ArrayList<>(clauses.length); + this.maxDistance = -1; + for (SpanQuery seq : clauses) { + addClause(seq); + } + } + + /** Construct a SpanOrQuery merging the provided clauses + * with the scoring depending on the distances between the successive clauses. + * All clauses must have the same field. + * The non negative maxDistance is used for scoring the successive occurrences + * of the different clauses. 
When the actual distance is larger than this, or when + * no other clause is present, maxDistance determines the slop factor. + * Otherwise each clause occurrence is scored with a slop factor determined + * by the minimum distance to the occurrence of another clause. + *
+ * This scoring works only when wrapped in a {@link SpansTreeQuery}. + */ + public SpanOrQuery(int maxDistance, SpanQuery... clauses) { + this.clauses = new ArrayList<>(clauses.length); + this.maxDistance = maxDistance; + if (maxDistance < 0) { + throw new IllegalArgumentException("maxDistance must be non negative: " + maxDistance); + } for (SpanQuery seq : clauses) { addClause(seq); } @@ -67,6 +87,11 @@ public SpanQuery[] getClauses() { return clauses.toArray(new SpanQuery[clauses.size()]); } + /** Return the maximum distance used to determine a slop factor for a clause occurrence. + * When no maximum distance was given, -1 is returned. + */ + public int getMaxDistance() { return maxDistance; } + @Override public String getField() { return field; } @@ -89,7 +114,13 @@ public Query rewrite(IndexReader reader) throws IOException { @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); - buffer.append("spanOr(["); + buffer.append("spanOr("); + if (maxDistance != -1) { + buffer.append("maxDistance="); + buffer.append(maxDistance); + buffer.append(", "); + } + buffer.append("["); Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = i.next(); @@ -104,31 +135,47 @@ public String toString(String field) { @Override public boolean equals(Object other) { - return sameClassAs(other) && - clauses.equals(((SpanOrQuery) other).clauses); + return sameClassAs(other) + && maxDistance == ((SpanOrQuery) other).maxDistance + && clauses.equals(((SpanOrQuery) other).clauses); } @Override public int hashCode() { - return classHash() ^ clauses.hashCode(); + return classHash() ^ clauses.hashCode() ^ (7 * maxDistance); } @Override public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { List subWeights = new ArrayList<>(clauses.size()); for (SpanQuery q : clauses) { - subWeights.add(q.createWeight(searcher, false, boost)); + subWeights.add(q.createWeight(searcher, needsScores, 
boost)); } - return new SpanOrWeight(searcher, needsScores ? getTermContexts(subWeights) : null, subWeights, boost); + return new SpanOrWeight(searcher, + needsScores ? getTermContexts(subWeights) : null, + subWeights, + needsScores, + boost); } public class SpanOrWeight extends SpanWeight { final List subWeights; - - public SpanOrWeight(IndexSearcher searcher, Map terms, List subWeights, float boost) throws IOException { + final IndexSearcher searcher; + final boolean needsScores; + final float boost; + + public SpanOrWeight(IndexSearcher searcher, + Map terms, + List subWeights, + boolean needsScores, + float boost) throws IOException + { super(SpanOrQuery.this, searcher, terms, boost); this.subWeights = subWeights; + this.searcher = searcher; + this.needsScores = needsScores; + this.boost = boost; } @Override @@ -151,222 +198,35 @@ public Spans getSpans(final LeafReaderContext context, Postings requiredPostings ArrayList subSpans = new ArrayList<>(clauses.size()); + SpanWeight lastSpanWeight = null; for (SpanWeight w : subWeights) { Spans spans = w.getSpans(context, requiredPostings); if (spans != null) { subSpans.add(spans); + lastSpanWeight = w; } } if (subSpans.size() == 0) { return null; } else if (subSpans.size() == 1) { - return subSpans.get(0); + if (maxDistance == -1) { + return subSpans.get(0); + } else { // only weigh by slop factor of maxDistance + SimScorer simScorer = getSimScorer(context); + float maxDistanceSlop = simScorer.computeSlopFactor(maxDistance); + SpanQuery subQuery = (SpanQuery) lastSpanWeight.getQuery(); + return subQuery.createWeight(searcher, needsScores, (boost * maxDistanceSlop)) + .getSpans(context, requiredPostings); + } } - - DisiPriorityQueue byDocQueue = new DisiPriorityQueue(subSpans.size()); - for (Spans spans : subSpans) { - byDocQueue.add(new DisiWrapper(spans)); + if (maxDistance == -1) { + return new DisjunctionSpans(SpanOrQuery.this, subSpans); + } else { + SimScorer simScorer = getSimScorer(context); + return new 
DisjunctionNearSpans(SpanOrQuery.this, subSpans, maxDistance, simScorer); } - - SpanPositionQueue byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 - - return new Spans() { - Spans topPositionSpans = null; - - @Override - public int nextDoc() throws IOException { - topPositionSpans = null; - DisiWrapper topDocSpans = byDocQueue.top(); - int currentDoc = topDocSpans.doc; - do { - topDocSpans.doc = topDocSpans.iterator.nextDoc(); - topDocSpans = byDocQueue.updateTop(); - } while (topDocSpans.doc == currentDoc); - return topDocSpans.doc; - } - - @Override - public int advance(int target) throws IOException { - topPositionSpans = null; - DisiWrapper topDocSpans = byDocQueue.top(); - do { - topDocSpans.doc = topDocSpans.iterator.advance(target); - topDocSpans = byDocQueue.updateTop(); - } while (topDocSpans.doc < target); - return topDocSpans.doc; - } - - @Override - public int docID() { - DisiWrapper topDocSpans = byDocQueue.top(); - return topDocSpans.doc; - } - - @Override - public TwoPhaseIterator asTwoPhaseIterator() { - float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() - long sumApproxCost = 0; - - for (DisiWrapper w : byDocQueue) { - if (w.twoPhaseView != null) { - long costWeight = (w.cost <= 1) ? 
1 : w.cost; - sumMatchCost += w.twoPhaseView.matchCost() * costWeight; - sumApproxCost += costWeight; - } - } - - if (sumApproxCost == 0) { // no sub spans supports approximations - computePositionsCost(); - return null; - } - - final float matchCost = sumMatchCost / sumApproxCost; - - return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { - @Override - public boolean matches() throws IOException { - return twoPhaseCurrentDocMatches(); - } - - @Override - public float matchCost() { - return matchCost; - } - }; - } - - float positionsCost = -1; - - void computePositionsCost() { - float sumPositionsCost = 0; - long sumCost = 0; - for (DisiWrapper w : byDocQueue) { - long costWeight = (w.cost <= 1) ? 1 : w.cost; - sumPositionsCost += w.spans.positionsCost() * costWeight; - sumCost += costWeight; - } - positionsCost = sumPositionsCost / sumCost; - } - - @Override - public float positionsCost() { - // This may be called when asTwoPhaseIterator returned null, - // which happens when none of the sub spans supports approximations. 
- assert positionsCost > 0; - return positionsCost; - } - - int lastDocTwoPhaseMatched = -1; - - boolean twoPhaseCurrentDocMatches() throws IOException { - DisiWrapper listAtCurrentDoc = byDocQueue.topList(); - // remove the head of the list as long as it does not match - final int currentDoc = listAtCurrentDoc.doc; - while (listAtCurrentDoc.twoPhaseView != null) { - if (listAtCurrentDoc.twoPhaseView.matches()) { - // use this spans for positions at current doc: - listAtCurrentDoc.lastApproxMatchDoc = currentDoc; - break; - } - // do not use this spans for positions at current doc: - listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; - listAtCurrentDoc = listAtCurrentDoc.next; - if (listAtCurrentDoc == null) { - return false; - } - } - lastDocTwoPhaseMatched = currentDoc; - topPositionSpans = null; - return true; - } - - void fillPositionQueue() throws IOException { // called at first nextStartPosition - assert byPositionQueue.size() == 0; - // add all matching Spans at current doc to byPositionQueue - DisiWrapper listAtCurrentDoc = byDocQueue.topList(); - while (listAtCurrentDoc != null) { - Spans spansAtDoc = listAtCurrentDoc.spans; - if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation - if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation - if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false - spansAtDoc = null; - } else { - if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { - if (!listAtCurrentDoc.twoPhaseView.matches()) { - spansAtDoc = null; - } - } - } - } - } - - if (spansAtDoc != null) { - assert spansAtDoc.docID() == listAtCurrentDoc.doc; - assert spansAtDoc.startPosition() == -1; - spansAtDoc.nextStartPosition(); - assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; - byPositionQueue.add(spansAtDoc); - } - listAtCurrentDoc = listAtCurrentDoc.next; - } - assert byPositionQueue.size() > 0; - } - - @Override - public int 
nextStartPosition() throws IOException { - if (topPositionSpans == null) { - byPositionQueue.clear(); - fillPositionQueue(); // fills byPositionQueue at first position - topPositionSpans = byPositionQueue.top(); - } else { - topPositionSpans.nextStartPosition(); - topPositionSpans = byPositionQueue.updateTop(); - } - return topPositionSpans.startPosition(); - } - - @Override - public int startPosition() { - return topPositionSpans == null ? -1 : topPositionSpans.startPosition(); - } - - @Override - public int endPosition() { - return topPositionSpans == null ? -1 : topPositionSpans.endPosition(); - } - - @Override - public int width() { - return topPositionSpans.width(); - } - - @Override - public void collect(SpanCollector collector) throws IOException { - if (topPositionSpans != null) - topPositionSpans.collect(collector); - } - - @Override - public String toString() { - return "spanOr(" + SpanOrQuery.this + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); - } - - long cost = -1; - - @Override - public long cost() { - if (cost == -1) { - cost = 0; - for (Spans spans : subSpans) { - cost += spans.cost(); - } - } - return cost; - } - }; } } - } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java index 2d2bd16c5551..22bdb17b0066 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.search.spans; +import java.util.List; +import java.util.Iterator; import org.apache.lucene.util.PriorityQueue; @@ -31,5 +33,12 @@ protected boolean lessThan(Spans s1, Spans s2) { : (start1 == start2) ? 
s1.endPosition() < s2.endPosition() : false; } + + void extractSpansList(List spansList) { + Iterator spansIter = iterator(); + while (spansIter.hasNext()) { + spansList.add(spansIter.next()); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java new file mode 100644 index 000000000000..e4287f5cf027 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SynonymQuery; +import org.apache.lucene.search.SynonymQuery.SynonymWeight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.MatchNoDocsQuery; + +/** + * A SpanQuery that treats terms as synonyms. + *

+ * For scoring purposes, this query tries to score the terms as if you + * had indexed them as one term: it will match any of the terms while + * using the same scoring as {@link SynonymQuery}, as far as possible. + */ +public final class SpanSynonymQuery extends SpanQuery { + final SynonymQuery synonymQuery; + final List terms; + + /** + * Creates a new SpanSynonymQuery, matching any of the supplied terms. + *

+ * The terms must all have the same field. + */ + public SpanSynonymQuery(Term... terms) { + this.synonymQuery = new SynonymQuery(terms); + this.terms = synonymQuery.getTerms(); + } + + @Override + public String getField() { + return synonymQuery.getField(); + } + + public List getTerms() { + return terms; + } + + @Override + public String toString(String field) { + StringBuilder builder = new StringBuilder("SpanSynonym("); + builder.append(synonymQuery.toString(field)); + builder.append(")"); + return builder.toString(); + } + + @Override + public int hashCode() { + return 31 * classHash() - synonymQuery.hashCode(); + } + + @Override + public boolean equals(Object other) { + return sameClassAs(other) && + synonymQuery.equals(((SpanSynonymQuery) other).synonymQuery); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + // optimize zero and single term cases + int numTerms = terms.size(); + if (numTerms == 0) { + return new MatchNoDocsQuery(); + } + if (numTerms == 1) { + return new SpanTermQuery(terms.get(0)); + } + return this; + } + + /** The returned SpanWeight does not support {@link SpanWeight#explain}. */ + @Override + public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + if (needsScores) { + SynonymWeight synonymWeight = (SynonymWeight) + synonymQuery.createWeight(searcher, needsScores, boost); + return new SpanSynonymWeight(searcher, boost, synonymWeight); + } + else { // scores not needed, use SpanOrQuery without scoring. 
+ SpanTermQuery[] clauses = new SpanTermQuery[terms.size()]; + int i = 0; + for (Term term : terms) { + clauses[i++] = new SpanTermQuery(term); + } + return new SpanOrQuery(clauses).createWeight(searcher, needsScores, boost); + } + } + + class SpanSynonymWeight extends SpanWeight { + final SynonymWeight synonymWeight; + + SpanSynonymWeight( + IndexSearcher searcher, + float boost, + SynonymWeight synonymWeight) + throws IOException { + super(SpanSynonymQuery.this, searcher, null, boost); // null: no term context map + this.synonymWeight = synonymWeight; + } + + @Override + public void extractTerms(Set termSet) { + for (Term t : terms) { + termSet.add(t); + } + } + + @Override + public void extractTermContexts(Map termContextbyTerm) { + TermContext[] termContexts = synonymWeight.getTermContexts(); + int i = 0; + for (Term term : terms) { + TermContext termContext = termContexts[i++]; + termContextbyTerm.put(term, termContext); + } + } + + @Override + public Explanation explain(LeafReaderContext context, int doc) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public SimScorer getSimScorer(LeafReaderContext context) throws IOException { + return synonymWeight.getSimScorer(context); + } + + @Override + public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) + throws IOException { + SimScorer simScorer = getSimScorer(context); + final String field = getField(); + Terms fieldTerms = context.reader().terms(field); + List termSpans = new ArrayList<>(terms.size()); + if (fieldTerms != null) { + TermsEnum termsEnum = fieldTerms.iterator(); + TermContext[] termContexts = synonymWeight.getTermContexts(); + int i = 0; + for (Term term : terms) { + TermContext termContext = termContexts[i++]; // in term order + TermState termState = termContext.get(context.ord); + if (termState != null) { + termsEnum.seekExact(term.bytes(), termState); + PostingsEnum postings = termsEnum.postings(null, PostingsEnum.POSITIONS); + 
float positionsCost = SpanTermQuery.termPositionsCost(termsEnum) + * SpanTermQuery.PHRASE_TO_SPAN_TERM_POSITIONS_COST; + termSpans.add(new TermSpans(simScorer, postings, term, positionsCost)); + } + } + } + + return (termSpans.size() == 0) ? null + : (termSpans.size() == 1) ? termSpans.get(0) + : new SynonymSpans(SpanSynonymQuery.this, termSpans, simScorer); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java index 3e13be7ecb11..b74ac9dc629b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java @@ -126,7 +126,7 @@ public Spans getSpans(final LeafReaderContext context, Postings requiredPostings * the relative cost of dealing with the term positions * when using a SpanNearQuery instead of a PhraseQuery. */ - private static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; + static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java index 785770838c8f..73dd083afdee 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java @@ -116,4 +116,6 @@ protected void doStartCurrentDoc() throws IOException {} */ protected void doCurrentSpans() throws IOException {} + /** For {@link SpansTreeQuery}. 
*/ + SpansDocScorer spansDocScorer; } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java new file mode 100644 index 000000000000..f780eeb5fd23 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.Set; + +/** + * Record span matches in a document and compute a document score. + *
+ * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public abstract class SpansDocScorer { + protected int currentDoc; + + /** + * Create a SpansDocScorer + */ + public SpansDocScorer() { + currentDoc = -1; + } + + /** The document for which matches are recorded. */ + public int docID() { return currentDoc; } + + /** Called before the first match of the spans is to be recorded for the document. */ + public void beginDoc(int doc) throws IOException { + currentDoc = doc; + } + + /** Provide the SpansDocScorers at the current document. */ + public abstract void extractSpansDocScorersAtDoc(Set> spansDocScorersAtDoc); + + /** Record a match with its slop factor at the given position. */ + public abstract void recordMatch(double slopFactor, int position); + + /** Return the matching frequency of the last {@link #beginDoc} document. */ + public abstract int docMatchFreq(); + + /** Return the score of the last {@link #beginDoc} document. */ + public abstract double docScore() throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java new file mode 100644 index 000000000000..2a73e3889bc0 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.Objects; +import java.util.Set; +import java.util.ArrayList; + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; + +import org.apache.lucene.search.Query; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Explanation; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +/** Wrapper class for scoring span queries via matching term occurrences. + * + * @lucene.experimental + */ +public class SpansTreeQuery extends Query { + + final SpanQuery spanQuery; + final int TOP_LEVEL_SLOP = 0; + + /** Wrap a span query to score via its matching term occurrences. + *
+ * For more details on scoring see {@link SpansTreeScorer#createSpansDocScorer}. + * + * @param spanQuery This can be any nested combination of + * {@link org.apache.lucene.search.spans.SpanNearQuery}, + * {@link org.apache.lucene.search.spans.SpanOrQuery}, + * {@link org.apache.lucene.search.spans.SpanSynonymQuery}, + * {@link org.apache.lucene.search.spans.SpanTermQuery}, + * {@link org.apache.lucene.search.spans.SpanBoostQuery}, + * {@link org.apache.lucene.search.spans.SpanNotQuery}, + * {@link org.apache.lucene.search.spans.SpanFirstQuery}, + * {@link org.apache.lucene.search.spans.SpanContainingQuery} and + * {@link org.apache.lucene.search.spans.SpanWithinQuery}. + */ + public SpansTreeQuery(SpanQuery spanQuery) { + this.spanQuery = Objects.requireNonNull(spanQuery); + } + + /** Wrap the span (subqueries of a) query in a SpansTreeQuery. + *
+ * A {@link SpanQuery} will be wrapped in a {@link SpansTreeQuery#SpansTreeQuery}. + * For {@link BooleanQuery}, {@link DisjunctionMaxQuery} and {@link BoostQuery}, + * the subqueries/subquery will be wrapped recursively. + * Otherwise the given query is returned. + *
+ * No double wrapping will be done because + * a {@link SpansTreeQuery} is not a {@link SpanQuery}. + */ + public static Query wrap(Query query) { + if (query instanceof SpanQuery) { + return new SpansTreeQuery((SpanQuery)query); + } + if (query instanceof BooleanQuery) { + return wrapBooleanQuery((BooleanQuery)query); + } + if (query instanceof DisjunctionMaxQuery) { + return wrapDMQ((DisjunctionMaxQuery)query); + } + if (query instanceof BoostQuery) { + Query subQuery = ((BoostQuery)query).getQuery(); + Query wrappedSubQuery = wrap(subQuery); + if (wrappedSubQuery == subQuery) { + return query; + } + float boost = ((BoostQuery)query).getBoost(); + return new BoostQuery(wrappedSubQuery, boost); + } + return query; + } + + static BooleanQuery wrapBooleanQuery(BooleanQuery blq) { + ArrayList wrappedClauses = new ArrayList<>(); + boolean wrapped = false; + for (BooleanClause clause : blq.clauses()) { + Query subQuery = clause.getQuery(); + Query wrappedSubQuery = wrap(subQuery); + if (wrappedSubQuery != subQuery) { + wrapped = true; + wrappedClauses.add(new BooleanClause(wrappedSubQuery, clause.getOccur())); + } + else { + wrappedClauses.add(clause); + } + } + if (! wrapped) { + return blq; + } + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (BooleanClause clause : wrappedClauses) { + builder.add(clause); + } + return builder.build(); + } + + static DisjunctionMaxQuery wrapDMQ(DisjunctionMaxQuery dmq) { + ArrayList wrappedDisjuncts = new ArrayList<>(); + boolean wrapped = false; + for (Query disjunct : dmq.getDisjuncts()) { + Query wrappedDisjunct = wrap(disjunct); + if (wrappedDisjunct != disjunct) { + wrapped = true; + wrappedDisjuncts.add(wrappedDisjunct); + } + else { + wrappedDisjuncts.add(disjunct); + } + } + if (! wrapped) { + return dmq; + } + float tbm = dmq.getTieBreakerMultiplier(); + return new DisjunctionMaxQuery(wrappedDisjuncts, tbm); + } + + + /** Wrap a given query by {@link #wrap(Query)} after it was rewritten. 
+ */ + public static Query wrapAfterRewrite(Query query) { + return new Query() { + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query rewritten = query.rewrite(reader); + Query wrapped = wrap(rewritten); + return wrapped; + } + + @Override + public boolean equals(Object other) { + return this == other; + } + + @Override + public int hashCode() { + return query.hashCode() ^ SpansTreeQuery.class.hashCode(); + } + + @Override + public String toString(String field) { + return "SpansTreeQuery.wrapAfterRewrite: " + query.toString(field); + } + }; + } + + /** The wrapped SpanQuery */ + public SpanQuery getSpanQuery() { return spanQuery; } + + @Override + public int hashCode() { + return getClass().hashCode() - spanQuery.hashCode(); + } + + @Override + public boolean equals(Object other) { + return sameClassAs(other) && + equalsTo(getClass().cast(other)); + } + + private boolean equalsTo(SpansTreeQuery other) { + return spanQuery.equals(other.spanQuery); + } + + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + buffer.append("SpansTreeQuery("); + buffer.append(spanQuery.toString(field)); + buffer.append(")"); + return buffer.toString(); + } + + /** Return a weight for scoring by matching term occurrences. + *
{@link Weight#explain} is not supported on the result. + */ + @Override + public SpansTreeWeight createWeight( + IndexSearcher searcher, + boolean needsScores, + float boost) + throws IOException + { + return new SpansTreeWeight(searcher, needsScores, boost); + } + + public class SpansTreeWeight extends Weight { + final SpanWeight spanWeight; + + public SpansTreeWeight( + IndexSearcher searcher, + boolean needsScores, + float boost) + throws IOException + { + super(SpansTreeQuery.this); + this.spanWeight = spanQuery.createWeight(searcher, needsScores, boost); + } + + /** Throws an UnsupportedOperationException. */ + @Override + public Explanation explain(LeafReaderContext context, int doc) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void extractTerms(Set terms) { + spanWeight.extractTerms(terms); + } + + /** Compute a minimal slop factor from the maximum possible slops that can occur + * in a SpanQuery for nested SpanNearQueries and for nested SpanOrQueries with distance. + * This supports the queries mentioned at {@link SpansTreeScorer#createSpansDocScorer}. + *

+ * This uses the maximum slops from {@link SpanOrQuery#getMaxDistance()} and + * {@link SpanNearQuery#getNonMatchSlop()}. + *

+ * This assumes that slop factors are multiplied in + * {@link ConjunctionNearSpansDocScorer#recordMatch} and in + * {@link DisjunctionNearSpansDocScorer#recordMatch} + */ + public double minSlopFactor(SpanQuery spanQuery, SimScorer simScorer, double slopFactor) { + assert slopFactor >= 0; + if (spanQuery instanceof SpanTermQuery) { + return slopFactor; + } + if (spanQuery instanceof SpanSynonymQuery) { + return slopFactor; + } + if (spanQuery instanceof SpanNotQuery) { + return minSlopFactor(((SpanNotQuery)spanQuery).getInclude(), simScorer, slopFactor); + } + if (spanQuery instanceof SpanPositionCheckQuery) { + return minSlopFactor(((SpanFirstQuery)spanQuery).getMatch(), simScorer, slopFactor); + } + if (spanQuery instanceof SpanContainingQuery) { + return minSlopFactor(((SpanContainingQuery)spanQuery).getBig(), simScorer, slopFactor); + } + if (spanQuery instanceof SpanWithinQuery) { + return minSlopFactor(((SpanWithinQuery)spanQuery).getLittle(), simScorer, slopFactor); + } + if (spanQuery instanceof SpanBoostQuery) { + return minSlopFactor(((SpanBoostQuery)spanQuery).getQuery(), simScorer, slopFactor); + } + + SpanQuery[] clauses = null; + int maxAllowedSlop = -1; + + if (spanQuery instanceof SpanOrQuery) { + SpanOrQuery spanOrQuery = (SpanOrQuery)spanQuery; + clauses = spanOrQuery.getClauses(); + maxAllowedSlop = spanOrQuery.getMaxDistance(); + if (maxAllowedSlop == -1) { + return minSlopFactorClauses(clauses, simScorer, slopFactor); + } + } + else if (spanQuery instanceof SpanNearQuery) { + SpanNearQuery spanNearQuery = (SpanNearQuery) spanQuery; + clauses = spanNearQuery.getClauses(); + maxAllowedSlop = spanNearQuery.getNonMatchSlop(); + } + + if (clauses == null) { + throw new IllegalArgumentException("Not implemented for SpanQuery class: " + + spanQuery.getClass().getName()); + } + + assert maxAllowedSlop >= 0; + double localSlopFactor = simScorer.computeSlopFactor(maxAllowedSlop); + assert localSlopFactor >= 0; + // assumed multiplication: + return 
minSlopFactorClauses(clauses, simScorer, slopFactor * localSlopFactor); + } + + /** Helper for {@link #minSlopFactor} */ + public double minSlopFactorClauses(SpanQuery[] clauses, SimScorer simScorer, double slopFactor) { + assert slopFactor >= 0; + assert clauses.length >= 1; + double res = Double.MAX_VALUE; + for (SpanQuery clause : clauses) { + double minSlopFacClause = minSlopFactor(clause, simScorer, slopFactor); + res = Double.min(res, minSlopFacClause); + } + return res; + } + + /** Provide a SpansTreeScorer that has the result of {@link #minSlopFactor} + * as the weight for non matching terms. + */ + @Override + public SpansTreeScorer scorer(LeafReaderContext context) throws IOException { + final Spans spans = spanWeight.getSpans(context, SpanWeight.Postings.POSITIONS); + if (spans == null) { + return null; + } + SimScorer topLevelScorer = spanWeight.getSimScorer(context); + double topLevelSlopFactor = topLevelScorer.computeSlopFactor(TOP_LEVEL_SLOP); + double nonMatchWeight = minSlopFactor(spanQuery, topLevelScorer, topLevelSlopFactor); + + return new SpansTreeScorer(this, spans, topLevelSlopFactor, nonMatchWeight); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java new file mode 100644 index 000000000000..14ad5fa1425c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.Objects; +import java.util.HashMap; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.TwoPhaseIterator; + +/** + * A Scorer for (nested) spans. + * This associates the spans with a {@link SpansDocScorer} and uses its score values. + *

+ * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public class SpansTreeScorer extends Scorer { + + protected final Spans spans; + protected final double topLevelSlopFactor; + protected final double nonMatchWeight; + protected final HashMap> spansDocScorerByQuery; + protected final SpansDocScorer spansDocScorer; + + protected int lastScoredDoc = -1; + + public SpansTreeScorer(Weight weight, Spans spans, double topLevelSlopFactor, double nonMatchWeight) { + super(weight); + this.spans = Objects.requireNonNull(spans); + this.topLevelSlopFactor = topLevelSlopFactor; + this.nonMatchWeight = nonMatchWeight; + this.spansDocScorerByQuery = new HashMap<>(); + this.spansDocScorer = createSpansDocScorer(spans); + } + + @Override + public int docID() { + return spans.docID(); + } + + @Override + public DocIdSetIterator iterator() { + return spans; + } + + @Override + public TwoPhaseIterator twoPhaseIterator() { + return spans.asTwoPhaseIterator(); + } + + /** + * Provide the SpansDocScorer that will be used by {@link #score} and {@link #freq}. + *
+ * Override this to provide support for span queries for which the spans are not supported here. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
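+   * <table summary="The SpansDocScorer provided for each supported Spans class">
+   * <tr><th>For {@link Spans}:</th><th>normally from {@link SpanQuery}:</th><th>return:</th></tr>
+   * <tr><td>{@link TermSpans}</td><td>{@link SpanTermQuery}</td><td>{@link TermSpansDocScorer}</td></tr>
+   * <tr><td>{@link DisjunctionNearSpans}</td><td>{@link SpanOrQuery#SpanOrQuery(int,SpanQuery...)}</td><td>{@link DisjunctionNearSpansDocScorer}</td></tr>
+   * <tr><td>{@link DisjunctionSpans}</td><td>{@link SpanOrQuery#SpanOrQuery(SpanQuery...)}</td><td>{@link DisjunctionSpansDocScorer}</td></tr>
+   * <tr><td>{@link SynonymSpans}</td><td>{@link SpanSynonymQuery#SpanSynonymQuery(Term...)}</td><td>{@link SynonymSpansDocScorer}</td></tr>
+   * <tr><td>{@link ConjunctionNearSpans}</td><td>{@link SpanNearQuery}</td><td>{@link ConjunctionNearSpansDocScorer}</td></tr>
+   * <tr><td>{@link FilterSpans}</td><td>{@link SpanNotQuery}, {@link SpanFirstQuery}</td><td>recursively use {@link FilterSpans#in}</td></tr>
+   * <tr><td>{@link ContainSpans}</td><td>{@link SpanContainingQuery}, {@link SpanWithinQuery}</td><td>recursively use {@link ContainSpans#sourceSpans}</td></tr>
+   * </table>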
+ * For a term that is present more than once via TermSpans a single SpansDocScorer will be provided. + * For a synonym set of terms that is present more than once via SynonymSpans a single SpansDocScorer will be provided. + */ + public SpansDocScorer createSpansDocScorer(Spans spans) { + SpansDocScorer spansDocScorer = null; + if (spans instanceof TermSpans) { + TermSpans termSpans = (TermSpans) spans; + SpanQuery stq = new SpanTermQuery(termSpans.term); + spansDocScorer = spansDocScorerByQuery.get(stq); + if (spansDocScorer == null) { + spansDocScorer = new TermSpansDocScorer(termSpans, nonMatchWeight); + spansDocScorerByQuery.put(stq, spansDocScorer); + } + } + else if (spans instanceof DisjunctionNearSpans) { + spansDocScorer = new DisjunctionNearSpansDocScorer(this, (DisjunctionNearSpans) spans); + } + else if (spans instanceof SynonymSpans) { + SynonymSpans synSpans = (SynonymSpans) spans; + SpanQuery ssq = synSpans.getSpanQuery(); + spansDocScorer = spansDocScorerByQuery.get(ssq); + if (spansDocScorer == null) { + spansDocScorer = new SynonymSpansDocScorer(synSpans, nonMatchWeight); + spansDocScorerByQuery.put(ssq, spansDocScorer); + } + } + else if (spans instanceof DisjunctionSpans) { + spansDocScorer = new DisjunctionSpansDocScorer<>(this, (DisjunctionSpans) spans); + } + else if (spans instanceof ConjunctionNearSpans) { + spansDocScorer = new ConjunctionNearSpansDocScorer(this, (ConjunctionNearSpans) spans); + } + else if (spans instanceof FilterSpans) { + spansDocScorer = createSpansDocScorer(((FilterSpans) spans).in); + } + else if (spans instanceof ContainSpans) { + spansDocScorer = createSpansDocScorer(((ContainSpans) spans).sourceSpans); + } + if (spansDocScorer == null) { + throw new IllegalArgumentException("Not implemented for Spans class: " + + spans.getClass().getName()); + } + spans.spansDocScorer = spansDocScorer; + return spansDocScorer; + } + + /** + * Record the span matches in the current document. + *

+ * This will be called at most once per document. + */ + protected void recordMatchesCurrentDoc() throws IOException { + int startPos = spans.nextStartPosition(); + assert startPos != Spans.NO_MORE_POSITIONS; + spansDocScorer.beginDoc(spans.docID()); + do { + spansDocScorer.recordMatch(topLevelSlopFactor, spans.startPosition()); + startPos = spans.nextStartPosition(); + } while (startPos != Spans.NO_MORE_POSITIONS); + } + + /** + * Ensure recordMatchesCurrentDoc is called, if not already called for the current doc. + */ + public void ensureMatchesRecorded() throws IOException { + int currentDoc = docID(); + if (lastScoredDoc != currentDoc) { + recordMatchesCurrentDoc(); + lastScoredDoc = currentDoc; + } + } + + /** Score the current document. + * See {@link #createSpansDocScorer} and {@link SpansDocScorer#docScore}. + */ + @Override + public final float score() throws IOException { + ensureMatchesRecorded(); + return (float) spansDocScorer.docScore(); + } + + /** Return the total matching frequency of the current document. + * See {@link #createSpansDocScorer} and {@link SpansDocScorer#docMatchFreq}. + */ + @Override + public final int freq() throws IOException { + ensureMatchesRecorded(); + return spansDocScorer.docMatchFreq(); + } + + public String toString() { + return "SpansTreeScorer(" + spansDocScorer + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java new file mode 100644 index 000000000000..b15d2444a44b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.util.List; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.Similarity.SimScorer; + + +/** + * A spans for merging and equal scoring of given spans. + * This does not provide score values. + * + * @lucene.experimental + */ +public class SynonymSpans extends DisjunctionSpans { + SimScorer simScorer; + List terms; + + /** Construct a SynonymSpans. + * @param spanSynQuery The query that provides the subSpans. + * @param subSpans Over which the disjunction is to be taken. + * @param simScorer To be used for scoring. 
+ */ + public SynonymSpans(SpanSynonymQuery spanSynQuery, List subSpans, SimScorer simScorer) { + super(spanSynQuery, subSpans); + this.simScorer = simScorer; + this.terms = spanSynQuery.getTerms(); + } + + public List getTerms() { + return terms; + } + + @Override + public String toString() { + return "SynonymSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java new file mode 100644 index 000000000000..666aa176f951 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.ArrayList; + +/** + * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public class SynonymSpansDocScorer + extends AsSingleTermSpansDocScorer { + + protected final ArrayList subSpansAtDoc; + protected final SynonymSpans synSpans; + + /** + * @param synSpans Provides matching synonym occurrences. 
+ * This should only contain TermSpans. + * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences. + */ + public SynonymSpansDocScorer(SynonymSpans synSpans, double nonMatchWeight) { + super(synSpans.simScorer, nonMatchWeight); + this.synSpans = synSpans; + this.subSpansAtDoc = new ArrayList<>(synSpans.subSpans().size()); + } + + @Override + public int termFreqInDoc() throws IOException { + int freq = 0; + for (Spans subSpans : subSpansAtDoc) { + freq += ((TermSpans)subSpans).getPostings().freq(); + } + return freq; + } + + @Override + public void beginDoc(int doc) throws IOException { + subSpansAtDoc.clear(); + synSpans.extractSubSpansAtCurrentDoc(subSpansAtDoc); + assert subSpansAtDoc.size() > 0 : "empty subSpansAtDoc docID=" + docID(); + assert subSpansAtDoc.get(0).docID() == doc; + super.beginDoc(doc); // calls termFreqInDoc. + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java index f1e1aed65570..6b0bb4796f41 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java @@ -32,6 +32,7 @@ public class TermSpans extends Spans { protected final PostingsEnum postings; protected final Term term; + protected final Similarity.SimScorer simScorer; protected int doc; protected int freq; protected int count; @@ -41,6 +42,7 @@ public class TermSpans extends Spans { public TermSpans(Similarity.SimScorer scorer, PostingsEnum postings, Term term, float positionsCost) { + this.simScorer = scorer; this.postings = Objects.requireNonNull(postings); this.term = Objects.requireNonNull(term); this.doc = -1; diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java new file mode 100644 index 000000000000..3474ed8fa86b --- /dev/null +++ 
b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; + +import org.apache.lucene.index.PostingsEnum; + + +/** + * For {@link SpansTreeQuery}. Public for extension. + * + * @lucene.experimental + */ +public class TermSpansDocScorer extends AsSingleTermSpansDocScorer { + + protected final PostingsEnum postings; + + /** + * @param termSpans Provides matching term occurrences. + * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences. 
+ */ + public TermSpansDocScorer(TermSpans termSpans, double nonMatchWeight) { + super(termSpans.simScorer, nonMatchWeight); + this.postings = termSpans.getPostings(); + } + + @Override + public int termFreqInDoc() throws IOException { + return postings.freq(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java index 83ac613b676a..83f61c1e0205 100644 --- a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java @@ -173,6 +173,14 @@ public final T top() { return heap[1]; } + /** Returns the second least element of the PriorityQueue in constant time. */ + public final T subTop() { + if (size == 2) { + return heap[2]; + } + return lessThan(heap[2], heap[3]) ? heap[2] : heap[3]; + } + /** Removes and returns the least element of the PriorityQueue in log(size) time. */ public final T pop() { diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java index 7d7fbe47ef06..e3b4d24b74ea 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java @@ -31,9 +31,11 @@ * Basic equivalence tests for span queries */ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { - + + final int MAX_SLOP = Integer.MAX_VALUE-1; // avoid distance+1 overflow in computeSlopFactor + // TODO: we could go a little crazy for a lot of these, - // but these are just simple minimal cases in case something + // but these are just simple minimal cases in case something // goes horribly wrong. Put more intense tests elsewhere. 
/** SpanTermQuery(A) = TermQuery(A) */ @@ -41,21 +43,21 @@ public void testSpanTermVersusTerm() throws Exception { Term t1 = randomTerm(); assertSameScores(new TermQuery(t1), spanQuery(new SpanTermQuery(t1))); } - + /** SpanOrQuery(A) = SpanTermQuery(A) */ public void testSpanOrVersusTerm() throws Exception { Term t1 = randomTerm(); SpanQuery term = spanQuery(new SpanTermQuery(t1)); assertSameSet(spanQuery(new SpanOrQuery(term)), term); } - + /** SpanOrQuery(A, A) = SpanTermQuery(A) */ public void testSpanOrDoubleVersusTerm() throws Exception { Term t1 = randomTerm(); SpanQuery term = spanQuery(new SpanTermQuery(t1)); assertSameSet(spanQuery(new SpanOrQuery(term, term)), term); } - + /** SpanOrQuery(A, B) = (A B) */ public void testSpanOrVersusBooleanTerm() throws Exception { Term t1 = randomTerm(); @@ -66,20 +68,20 @@ public void testSpanOrVersusBooleanTerm() throws Exception { SpanQuery q2 = spanQuery(new SpanOrQuery(spanQuery(new SpanTermQuery(t1)), spanQuery(new SpanTermQuery(t2)))); assertSameSet(q1.build(), q2); } - + /** SpanOrQuery(SpanNearQuery[A B], SpanNearQuery[C D]) = (SpanNearQuery[A B], SpanNearQuery[C D]) */ public void testSpanOrVersusBooleanNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); Term t3 = randomTerm(); Term t4 = randomTerm(); - SpanQuery near1 = spanQuery(new SpanNearQuery(new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery near1 = spanQuery(new SpanNearQuery(new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }, 10, random().nextBoolean())); - SpanQuery near2 = spanQuery(new SpanNearQuery(new SpanQuery[] { - spanQuery(new SpanTermQuery(t3)), - spanQuery(new SpanTermQuery(t4)) + SpanQuery near2 = spanQuery(new SpanNearQuery(new SpanQuery[] { + spanQuery(new SpanTermQuery(t3)), + spanQuery(new SpanTermQuery(t4)) }, 10, random().nextBoolean())); BooleanQuery.Builder q1 = new BooleanQuery.Builder(); q1.add(near1, Occur.SHOULD); @@ 
-87,70 +89,70 @@ public void testSpanOrVersusBooleanNear() throws Exception { SpanQuery q2 = spanQuery(new SpanOrQuery(near1, near2)); assertSameSet(q1.build(), q2); } - + /** SpanNotQuery(A, B) ⊆ SpanTermQuery(A) */ public void testSpanNotVersusSpanTerm() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - assertSubsetOf(spanQuery(new SpanNotQuery(spanQuery(new SpanTermQuery(t1)), spanQuery(new SpanTermQuery(t2)))), + assertSubsetOf(spanQuery(new SpanNotQuery(spanQuery(new SpanTermQuery(t1)), spanQuery(new SpanTermQuery(t2)))), spanQuery(new SpanTermQuery(t1))); } - + /** SpanNotQuery(A, [B C]) ⊆ SpanTermQuery(A) */ public void testSpanNotNearVersusSpanTerm() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); Term t3 = randomTerm(); - SpanQuery near = spanQuery(new SpanNearQuery(new SpanQuery[] { - spanQuery(new SpanTermQuery(t2)), - spanQuery(new SpanTermQuery(t3)) + SpanQuery near = spanQuery(new SpanNearQuery(new SpanQuery[] { + spanQuery(new SpanTermQuery(t2)), + spanQuery(new SpanTermQuery(t3)) }, 10, random().nextBoolean())); assertSubsetOf(spanQuery(new SpanNotQuery(spanQuery(new SpanTermQuery(t1)), near)), spanQuery(new SpanTermQuery(t1))); } - + /** SpanNotQuery([A B], C) ⊆ SpanNearQuery([A B]) */ public void testSpanNotVersusSpanNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); Term t3 = randomTerm(); - SpanQuery near = spanQuery(new SpanNearQuery(new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery near = spanQuery(new SpanNearQuery(new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }, 10, random().nextBoolean())); assertSubsetOf(spanQuery(new SpanNotQuery(near, spanQuery(new SpanTermQuery(t3)))), near); } - + /** SpanNotQuery([A B], [C D]) ⊆ SpanNearQuery([A B]) */ public void testSpanNotNearVersusSpanNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); Term t3 = randomTerm(); Term t4 = 
randomTerm(); - SpanQuery near1 = spanQuery(new SpanNearQuery(new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery near1 = spanQuery(new SpanNearQuery(new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }, 10, random().nextBoolean())); - SpanQuery near2 = spanQuery(new SpanNearQuery(new SpanQuery[] { - spanQuery(new SpanTermQuery(t3)), - spanQuery(new SpanTermQuery(t4)) + SpanQuery near2 = spanQuery(new SpanNearQuery(new SpanQuery[] { + spanQuery(new SpanTermQuery(t3)), + spanQuery(new SpanTermQuery(t4)) }, 10, random().nextBoolean())); assertSubsetOf(spanQuery(new SpanNotQuery(near1, near2)), near1); } - + /** SpanFirstQuery(A, 10) ⊆ SpanTermQuery(A) */ public void testSpanFirstVersusSpanTerm() throws Exception { Term t1 = randomTerm(); - assertSubsetOf(spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), 10)), + assertSubsetOf(spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), 10)), spanQuery(new SpanTermQuery(t1))); } - + /** SpanNearQuery([A, B], 0, true) = "A B" */ public void testSpanNearVersusPhrase() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, 0, true)); PhraseQuery q2 = new PhraseQuery(t1.field(), t1.bytes(), t2.bytes()); @@ -160,55 +162,55 @@ public void testSpanNearVersusPhrase() throws Exception { assertSameScores(q1, q2); } } - + /** SpanNearQuery([A, B], ∞, false) = +A +B */ public void testSpanNearVersusBooleanAnd() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + 
spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; - SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, Integer.MAX_VALUE, false)); + SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, MAX_SLOP, false)); BooleanQuery.Builder q2 = new BooleanQuery.Builder(); q2.add(new TermQuery(t1), Occur.MUST); q2.add(new TermQuery(t2), Occur.MUST); assertSameSet(q1, q2.build()); } - + /** SpanNearQuery([A B], 0, false) ⊆ SpanNearQuery([A B], 1, false) */ public void testSpanNearVersusSloppySpanNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, 0, false)); SpanQuery q2 = spanQuery(new SpanNearQuery(subquery, 1, false)); assertSubsetOf(q1, q2); } - + /** SpanNearQuery([A B], 3, true) ⊆ SpanNearQuery([A B], 3, false) */ public void testSpanNearInOrderVersusOutOfOrder() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, 3, true)); SpanQuery q2 = spanQuery(new SpanNearQuery(subquery, 3, false)); assertSubsetOf(q1, q2); } - + /** SpanNearQuery([A B], N, false) ⊆ SpanNearQuery([A B], N+1, false) */ public void testSpanNearIncreasingSloppiness() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; for (int i = 0; i < 10; i++) { 
SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, i, false)); @@ -216,16 +218,16 @@ public void testSpanNearIncreasingSloppiness() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanNearQuery([A B C], N, false) ⊆ SpanNearQuery([A B C], N+1, false) */ public void testSpanNearIncreasingSloppiness3() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); Term t3 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)), - spanQuery(new SpanTermQuery(t3)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)), + spanQuery(new SpanTermQuery(t3)) }; for (int i = 0; i < 10; i++) { SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, i, false)); @@ -233,14 +235,14 @@ public void testSpanNearIncreasingSloppiness3() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanNearQuery([A B], N, true) ⊆ SpanNearQuery([A B], N+1, true) */ public void testSpanNearIncreasingOrderedSloppiness() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; for (int i = 0; i < 10; i++) { SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, i, false)); @@ -248,16 +250,16 @@ public void testSpanNearIncreasingOrderedSloppiness() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanNearQuery([A B C], N, true) ⊆ SpanNearQuery([A B C], N+1, true) */ public void testSpanNearIncreasingOrderedSloppiness3() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); Term t3 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)), - spanQuery(new SpanTermQuery(t3)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new 
SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)), + spanQuery(new SpanTermQuery(t3)) }; for (int i = 0; i < 10; i++) { SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, i, true)); @@ -265,7 +267,7 @@ public void testSpanNearIncreasingOrderedSloppiness3() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanPositionRangeQuery(A, M, N) ⊆ TermQuery(A) */ public void testSpanRangeTerm() throws Exception { Term t1 = randomTerm(); @@ -277,7 +279,7 @@ public void testSpanRangeTerm() throws Exception { } } } - + /** SpanPositionRangeQuery(A, M, N) ⊆ SpanFirstQuery(A, M, N+1) */ public void testSpanRangeTermIncreasingEnd() throws Exception { Term t1 = randomTerm(); @@ -289,22 +291,22 @@ public void testSpanRangeTermIncreasingEnd() throws Exception { } } } - + /** SpanPositionRangeQuery(A, 0, ∞) = TermQuery(A) */ public void testSpanRangeTermEverything() throws Exception { Term t1 = randomTerm(); - Query q1 = spanQuery(new SpanPositionRangeQuery(spanQuery(new SpanTermQuery(t1)), 0, Integer.MAX_VALUE)); + Query q1 = spanQuery(new SpanPositionRangeQuery(spanQuery(new SpanTermQuery(t1)), 0, MAX_SLOP)); Query q2 = new TermQuery(t1); assertSameSet(q1, q2); } - + /** SpanPositionRangeQuery([A B], M, N) ⊆ SpanNearQuery([A B]) */ public void testSpanRangeNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); for (int i = 0; i < 5; i++) { @@ -315,14 +317,14 @@ public void testSpanRangeNear() throws Exception { } } } - + /** SpanPositionRangeQuery([A B], M, N) ⊆ SpanFirstQuery([A B], M, N+1) */ public void testSpanRangeNearIncreasingEnd() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - 
spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); for (int i = 0; i < 5; i++) { @@ -333,21 +335,21 @@ public void testSpanRangeNearIncreasingEnd() throws Exception { } } } - + /** SpanPositionRangeQuery([A B], ∞) = SpanNearQuery([A B]) */ public void testSpanRangeNearEverything() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); - Query q1 = spanQuery(new SpanPositionRangeQuery(nearQuery, 0, Integer.MAX_VALUE)); + Query q1 = spanQuery(new SpanPositionRangeQuery(nearQuery, 0, MAX_SLOP)); Query q2 = nearQuery; assertSameSet(q1, q2); } - + /** SpanFirstQuery(A, N) ⊆ TermQuery(A) */ public void testSpanFirstTerm() throws Exception { Term t1 = randomTerm(); @@ -357,7 +359,7 @@ public void testSpanFirstTerm() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanFirstQuery(A, N) ⊆ SpanFirstQuery(A, N+1) */ public void testSpanFirstTermIncreasing() throws Exception { Term t1 = randomTerm(); @@ -367,22 +369,22 @@ public void testSpanFirstTermIncreasing() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanFirstQuery(A, ∞) = TermQuery(A) */ public void testSpanFirstTermEverything() throws Exception { Term t1 = randomTerm(); - Query q1 = spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), Integer.MAX_VALUE)); + Query q1 = spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), MAX_SLOP)); Query q2 = new TermQuery(t1); assertSameSet(q1, q2); } - + /** SpanFirstQuery([A B], N) ⊆ SpanNearQuery([A B]) */ public void 
testSpanFirstNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); for (int i = 0; i < 10; i++) { @@ -391,14 +393,14 @@ public void testSpanFirstNear() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanFirstQuery([A B], N) ⊆ SpanFirstQuery([A B], N+1) */ public void testSpanFirstNearIncreasing() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); for (int i = 0; i < 10; i++) { @@ -407,47 +409,47 @@ public void testSpanFirstNearIncreasing() throws Exception { assertSubsetOf(q1, q2); } } - + /** SpanFirstQuery([A B], ∞) = SpanNearQuery([A B]) */ public void testSpanFirstNearEverything() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); - Query q1 = spanQuery(new SpanFirstQuery(nearQuery, Integer.MAX_VALUE)); + Query q1 = spanQuery(new SpanFirstQuery(nearQuery, MAX_SLOP)); Query q2 = nearQuery; assertSameSet(q1, q2); } - + /** SpanWithinQuery(A, B) ⊆ SpanNearQuery(A) */ public void testSpanWithinVsNear() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - 
spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); - + Term t3 = randomTerm(); SpanQuery termQuery = spanQuery(new SpanTermQuery(t3)); Query q1 = spanQuery(new SpanWithinQuery(nearQuery, termQuery)); assertSubsetOf(q1, termQuery); } - + /** SpanWithinQuery(A, B) = SpanContainingQuery(A, B) */ public void testSpanWithinVsContaining() throws Exception { Term t1 = randomTerm(); Term t2 = randomTerm(); - SpanQuery subquery[] = new SpanQuery[] { - spanQuery(new SpanTermQuery(t1)), - spanQuery(new SpanTermQuery(t2)) + SpanQuery subquery[] = new SpanQuery[] { + spanQuery(new SpanTermQuery(t1)), + spanQuery(new SpanTermQuery(t2)) }; SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); - + Term t3 = randomTerm(); SpanQuery termQuery = spanQuery(new SpanTermQuery(t3)); Query q1 = spanQuery(new SpanWithinQuery(nearQuery, termQuery)); diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java new file mode 100644 index 000000000000..5f4b8eb23634 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopScoreDocCollector; + +import org.apache.lucene.search.SynonymQuery; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import junit.framework.Assert; + + +public class TestSpanSynonymQuery extends LuceneTestCase { + static IndexSearcher searcher; + static IndexReader reader; + static Directory directory; + + static final int MAX_TEST_DOC = 32; + + @BeforeClass + public static void beforeClass() throws Exception { + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100)) + .setMergePolicy(newLogMergePolicy())); + for (int i = 0; 
i < MAX_TEST_DOC; i++) { + Document doc = new Document(); + String text; + if (i < (MAX_TEST_DOC-1)) { + text = English.intToEnglish(i); + if ((i % 5) == 0) { // add some multiple occurrences of the same term(s) + text += " " + text; + } + } else { // last doc, for testing distances > 1, and repeating occurrrences of wb + text = "az a b c d e wa wb wb wc az"; + } + doc.add(newTextField("field", text, Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcher = new IndexSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + searcher = null; + directory = null; + } + + final String FIELD_NAME = "field"; + + + Term lcnTerm(String term) { + return new Term(FIELD_NAME, term); + } + + Term[] lcnTerms(String... terms) { + Term[] lcnTrms = new Term[terms.length]; + for (int i = 0; i < terms.length; i++) { + lcnTrms[i] = lcnTerm(terms[i]); + } + return lcnTrms; + } + + TermQuery termQuery(String term) { + return new TermQuery(lcnTerm(term)); + } + + SpanTermQuery spanTermQuery(String term) { + return new SpanTermQuery(lcnTerm(term)); + } + + SpanTermQuery[] spanTermQueries(String... terms) { + SpanTermQuery[] stqs = new SpanTermQuery[terms.length]; + for (int i = 0; i < terms.length; i++) { + stqs[i] = spanTermQuery(terms[i]); + } + return stqs; + } + + SpanSynonymQuery spanSynonymQuery(String... terms) { + return new SpanSynonymQuery(lcnTerms(terms)); + } + + SynonymQuery synonymQuery(String... 
terms) { + return new SynonymQuery(lcnTerms(terms)); + } + + void sortByDoc(ScoreDoc[] scoreDocs) { + Arrays.sort(scoreDocs, new Comparator() { + @Override + public int compare(ScoreDoc sd1, ScoreDoc sd2) { + return sd1.doc - sd2.doc; + } + }); + } + + ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException { + TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC); + searcher.search(query, collector); + return collector.topDocs().scoreDocs; + } + + int[] docsFromHits(ScoreDoc[] hits) throws Exception { + int[] docs = new int[hits.length]; + for (int i = 0; i < hits.length; i++) { + docs[i] = hits[i].doc; + } + return docs; + } + + void showQueryResults(String message, Query q, ScoreDoc[] hits) { + System.out.println(message + " results from query " + q); + for (ScoreDoc hit : hits) { + System.out.println("doc=" + hit.doc + ", score=" + hit.score); + } + } + + void checkEqualScores(Query qexp, Query qact) throws Exception { + ScoreDoc[] expHits = search(searcher, qexp); + + int[] expDocs = docsFromHits(expHits); + //showQueryResults("checkEqualScores expected", qexp, expHits); + + ScoreDoc[] actHits = search(searcher, qact); + //showQueryResults("checkEqualScores actual", qact, actHits); + + CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs); + } + + void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception { + ScoreDoc[] expHits = search(searcher, qexp); + //showQueryResults("checkScoresInRange expected", qexp, expHits); + + ScoreDoc[] actHits = search(searcher, qact); + //showQueryResults("checkScoresInRange actual", qact, actHits); + + if (expHits.length != actHits.length) { + Assert.fail("Unequal lengths: expHits="+expHits.length+",actHits="+actHits.length); + } + + sortByDoc(expHits); + sortByDoc(actHits); + for (int i = 0; i < expHits.length; i++) { + if (expHits[i].doc != actHits[i].doc) + { + Assert.fail("At index " + i + + ": expHits[i].doc=" + expHits[i].doc + + " != 
actHits[i].doc=" + actHits[i].doc); + } + + if ( (expHits[i].score * maxFac < actHits[i].score) + || (expHits[i].score * minFac > actHits[i].score)) + { + Assert.fail("At index " + i + + ", expHits[i].doc=" + expHits[i].doc + + ", score not in expected range: " + (expHits[i].score * minFac) + + " <= " + actHits[i].score + + " <= " + (expHits[i].score * maxFac)); + } + } + } + + void checkSingleTerm(String term) throws Exception { + TermQuery tq = termQuery(term); + SpanTermQuery stq = spanTermQuery(term); + SpanSynonymQuery ssq = spanSynonymQuery(term); + + checkEqualScores(tq, stq); + checkEqualScores(tq, ssq); + } + + public void testSingleZero() throws Exception { + checkSingleTerm("zero"); + } + + SpanOrQuery spanOrQuery(String... terms) { + return new SpanOrQuery(spanTermQueries(terms)); + } + + void checkOrTerms(String... terms) throws Exception { + assertTrue(terms.length >= 1); + SpanOrQuery soq = spanOrQuery(terms); + SpanSynonymQuery ssq = spanSynonymQuery(terms); + checkScoresInRange(soq, ssq, 0.7f, 0.3f); + + SynonymQuery sq = synonymQuery(terms); + checkEqualScores(sq, ssq); + } + + public void testOrTwoTermsNoDocOverlap() throws Exception { + checkOrTerms("zero", "one"); + } + + public void testOrTwoTermsDocOverlap() throws Exception { + checkOrTerms("twenty", "one"); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java new file mode 100644 index 000000000000..0e09bf741410 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java @@ -0,0 +1,679 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.SynonymQuery; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import junit.framework.Assert; + + +public class TestSpansTreeQuery extends LuceneTestCase { + static IndexSearcher searcherClassic; + static IndexSearcher searcherBM25; + static IndexReader reader; + static Directory directory; + + 
static final int MAX_TEST_DOC = 33; + + @BeforeClass + public static void beforeClass() throws Exception { + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100)) + .setMergePolicy(newLogMergePolicy())); + for (int i = 0; i < MAX_TEST_DOC; i++) { + Document doc = new Document(); + String text; + if (i < (MAX_TEST_DOC-1)) { + text = English.intToEnglish(i); + if ((i % 5) == 0) { // add some multiple occurrences of the same term(s) + text += " " + text; + } + } else { // last doc, for testing distances > 1, and repeating occurrences of wb + text = "az a b c d e wa wb wb wc az"; + } + doc.add(newTextField("field", text, Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcherClassic = new IndexSearcher(reader); + searcherClassic.setSimilarity(new ClassicSimilarity()); + searcherBM25 = new IndexSearcher(reader); + searcherBM25.setSimilarity(new BM25Similarity()); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + searcherClassic = null; + searcherBM25 = null; + reader = null; + directory = null; + } + + final String FIELD_NAME = "field"; + + Term lcnTerm(String term) { + return new Term(FIELD_NAME, term); + } + + Term[] lcnTerms(String... 
terms) { + Term[] lcnTrms = new Term[terms.length]; + for (int i = 0; i < terms.length; i++) { + lcnTrms[i] = lcnTerm(terms[i]); + } + return lcnTrms; + } + + + TermQuery termQuery(String term) { + return new TermQuery(lcnTerm(term)); + } + + SpanTermQuery spanTermQuery(String term) { + return new SpanTermQuery(lcnTerm(term)); + } + + ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException { + TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC); + searcher.search(query, collector); + return collector.topDocs().scoreDocs; + } + + int[] docsFromHits(ScoreDoc[] hits) throws Exception { + int[] docs = new int[hits.length]; + for (int i = 0; i < hits.length; i++) { + docs[i] = hits[i].doc; + } + return docs; + } + + void checkEqualDocOrder(Query qexp, Query qact) throws Exception { + ScoreDoc[] expHits = search(searcherBM25, qexp); + ScoreDoc[] actHits = search(searcherBM25, qact); + assertEquals("same nr of hits", expHits.length, actHits.length); + for (int i = 0; i < expHits.length; i++) { + assertEquals("same doc at rank " + i, expHits[i].doc, actHits[i].doc); + } + } + + void showQueryResults(String message, Query q, ScoreDoc[] hits) { + System.out.println(message + " results from query " + q); + for (ScoreDoc hit : hits) { + System.out.println("doc=" + hit.doc + ", score=" + hit.score); + } + } + + void checkEqualScores(Query qexp, Query qact) throws Exception { + ScoreDoc[] expHits = search(searcherBM25, qexp); + int[] expDocs = docsFromHits(expHits); + //showQueryResults("expected BM25", qexp, expHits); + + ScoreDoc[] actHits = search(searcherBM25, qact); + //showQueryResults("actual BM25", qact, actHits); + + CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs); + + expHits = search(searcherClassic, qexp); + expDocs = docsFromHits(expHits); + //showQueryResults("expected Classic", qexp, expHits); + + actHits = search(searcherClassic, qact); + //showQueryResults("actual Classic", qact, actHits); + 
CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs); + } + + void checkSpanTerm(String term) throws Exception { + TermQuery tq = termQuery(term); + SpanTermQuery stq = spanTermQuery(term); + + checkEqualScores(tq, stq); // test SpanScorer + + checkEqualScores(tq, SpansTreeQuery.wrap(stq)); // test SpanTreeScorer + } + + public void testSpanTermZero() throws Exception { + checkSpanTerm("zero"); + } + + public void testSpanTermSeven() throws Exception { + checkSpanTerm("seven"); + } + + public void testSpanTermFive() throws Exception { + checkSpanTerm("five"); + } + + SpanTermQuery[] spanTermQueries(String... terms) { + SpanTermQuery[] stqs = new SpanTermQuery[terms.length]; + for (int i = 0; i < terms.length; i++) { + stqs[i] = spanTermQuery(terms[i]); + } + return stqs; + } + + SpanOrQuery spanOrQuery(String... terms) { + return new SpanOrQuery(spanTermQueries(terms)); + } + + SpanOrQuery spanOrNearQuery(int maxDistance, String... terms) { + return new SpanOrQuery(maxDistance, spanTermQueries(terms)); + } + + BooleanQuery booleanOrQuery(String... terms) { + BooleanQuery.Builder bqb = new BooleanQuery.Builder(); + for (int i = 0; i < terms.length; i++) { + bqb.add(termQuery(terms[i]), BooleanClause.Occur.SHOULD); + } + return bqb.build(); + } + + void checkSpanOrTerms(String... 
terms) throws Exception { + assertTrue(terms.length >= 1); + Query boq = SpansTreeQuery.wrap(booleanOrQuery(terms)); + assertTrue(boq instanceof BooleanQuery); // test SpansTreeQuery.wrap + assertTrue(((BooleanQuery)boq).clauses().get(terms.length-1).getQuery() instanceof TermQuery); // test SpansTreeQuery.wrap + SpanOrQuery soq = spanOrQuery(terms); + Query sptroq = SpansTreeQuery.wrap(soq); + //checkEqualDocOrder(boq, sptroq); + //checkEqualScores(boq, soq); // test SpanScorer for OR over terms, fails + checkEqualScores(boq, sptroq); // test SpanTreeScorer for OR over terms + } + + public void testSpanOrOneTerm1() throws Exception { + checkSpanOrTerms("zero"); + } + + public void testSpanOrOneTerm2() throws Exception { + checkSpanOrTerms("thirty"); + } + + public void testSpanOrTwoTerms() throws Exception { + checkSpanOrTerms("zero", "thirty"); + } + + public void testSpanOrTwoCooccurringTerms() throws Exception { + checkSpanOrTerms("twenty", "five"); + } + + public void testSpanOrMoreTerms() throws Exception { + checkSpanOrTerms( + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "twenty", + "thirty" + ); + } + + void checkSameHighestScoringDocAndScore(Query exp, Query act) throws Exception { + ScoreDoc[] expHits = search(searcherBM25, exp); + int[] expDocs = docsFromHits(expHits); + //showQueryResults("checkSameHighestScoringDocAndScore expected BM25", exp, expHits); + + ScoreDoc[] actHits = search(searcherBM25, act); + //showQueryResults("checkSameHighestScoringDocAndScore actual BM25", act, actHits); + + final float scoreTolerance = 1.0e-6f; // from CheckHits.java + + assertEquals("highest scoring docs the same", expHits[0].doc, actHits[0].doc); + assertTrue("equal scores", Math.abs(expHits[0].score - actHits[0].score) <= scoreTolerance); + } + + void checkSameHighestScoringDocAndScoreRange(Query exp, Query act, float maxFac, float minFac) throws Exception { + ScoreDoc[] expHits = search(searcherBM25, exp); + int[] expDocs = 
docsFromHits(expHits); + //showQueryResults("checkSameHighestScoringDocAndScore expected BM25", exp, expHits); + + ScoreDoc[] actHits = search(searcherBM25, act); + //showQueryResults("checkSameHighestScoringDocAndScore actual BM25", act, actHits); + + final float scoreTolerance = 1.0e-6f; // from CheckHits.java + + assertTrue("at least one expected hit", expHits.length >= 1); + assertTrue("at least one actual hit", actHits.length >= 1); + + int actDoc = 0; // order may differ when top scores are equal + while ((actDoc + 1 < actHits.length) // bound the actDoc+1 look-ahead so a missing doc fails the assertion below instead of throwing + && (actHits[actDoc].doc != expHits[0].doc) + && (Math.abs(actHits[0].score - actHits[actDoc+1].score) < scoreTolerance) ) { // only skip over hits tied with the top score + actDoc++; + } + assertEquals("highest scoring docs the same", expHits[0].doc, actHits[actDoc].doc); + if ( (expHits[0].score * maxFac < actHits[actDoc].score) + || (expHits[0].score * minFac > actHits[actDoc].score)) + { + Assert.fail("For highest scoring doc" + + ", expHits[0].doc=" + expHits[0].doc + + ", score not in expected range: " + (expHits[0].score * minFac) + + " <= " + actHits[actDoc].score + + " <= " + (expHits[0].score * maxFac)); + } + } + + public void testSpanAdjacentAllTermsInDocUnordered() throws Exception { + /* On "twenty five twenty five" + * unordered "twenty five" should score the same as "twenty" OR "five" + */ + String t1 = "twenty"; + String t2 = "five"; + SpanNearQuery snq = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + .setSlop(0) + .build(); + BooleanQuery boq = booleanOrQuery(t1, t2); + + checkSameHighestScoringDocAndScore(boq, SpansTreeQuery.wrap(snq)); + } + + public void testSpanAdjacentAllTermsInDocOrdered1() throws Exception { + /* On "twenty five twenty five" + * ordered "twenty five" should score the same as "twenty" OR "five" + */ + String t1 = "twenty"; + String t2 = "five"; + SpanNearQuery snq = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + 
.setSlop(0) + .build(); + BooleanQuery boq = booleanOrQuery(t1, t2); + + checkSameHighestScoringDocAndScore(boq, SpansTreeQuery.wrap(snq)); + } + + public void testSpanAdjacentAllTermsInDocOrdered2() throws Exception { + /* On "twenty five twenty five" + * ordered "five twenty" should score less, but more than half of "twenty" OR "five" + */ + String t1 = "five"; + String t2 = "twenty"; + SpanNearQuery snq = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + .setSlop(0) + .build(); + BooleanQuery.Builder bqb = new BooleanQuery.Builder(); + bqb.add(termQuery(t1), BooleanClause.Occur.SHOULD); + bqb.add(termQuery(t2), BooleanClause.Occur.SHOULD); + BooleanQuery boq = bqb.build(); + + checkSameHighestScoringDocAndScoreRange(boq, SpansTreeQuery.wrap(snq), 0.7f, 0.5f); + } + + public void testSpanMoreDistanceLessScore() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + SpanNearQuery snq2 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + .setSlop(2) + .build(); + SpanNearQuery snq3 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t3)) + .setSlop(2) + .build(); + + checkSameHighestScoringDocAndScoreRange(SpansTreeQuery.wrap(snq2), SpansTreeQuery.wrap(snq3), + 0.50f, 0.49f); + } + + Query sptrSimpleUnorderedNested(String t1a, String t1b, String t2, int slop) { + SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1a)) + .addClause(spanTermQuery(t1b)) + .setSlop(slop) + .build(); + + SpanNearQuery snqn = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(snq1) + .addClause(spanTermQuery(t2)) + .setSlop(slop) + .build(); + + return SpansTreeQuery.wrap(snqn); + } + + public void testSpanNestedMoreDistanceLessScore() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + String t4 = "d"; + 
String t5 = "e"; + Query sptrq1 = sptrSimpleUnorderedNested(t1, t2, t4, 2); + Query sptrq2 = sptrSimpleUnorderedNested(t1, t3, t5, 2); + + checkSameHighestScoringDocAndScoreRange(sptrq1, sptrq2, 0.7f, 0.6f); + } + + public void testNonMatchingPresentTermScore() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + + SpanOrQuery soq = new SpanOrQuery(spanTermQuery(t1), spanTermQuery(t2)); + + SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(soq) + .addClause(spanTermQuery(t3)) + .setSlop(0) + .setNonMatchSlop(3) + .build(); // t1 is present but does not match. + + SpanNearQuery snq2 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(soq) + .addClause(spanTermQuery(t3)) + .setSlop(0) + .setNonMatchSlop(4) // t1 scores lower than in snq1 + .build(); // t1 is present but does not match. + + SpansTreeQuery sptrnq1 = new SpansTreeQuery(snq1); + SpansTreeQuery sptrnq2 = new SpansTreeQuery(snq2); + + checkSameHighestScoringDocAndScoreRange(sptrnq1, sptrnq2, 0.98f, 0.9f); + } + + public void testSpanNot() throws Exception { + /* On "twenty five twenty five" + * "twenty" not preceded by "five", and followed by "five", + * should score less, but more than half of "twenty five" + */ + String t1 = "five"; + String t2 = "twenty"; + SpanNotQuery sntq = new SpanNotQuery( spanTermQuery(t2), spanTermQuery(t1), 1, 0); + + SpanNearQuery snrq1 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(sntq) + .addClause(spanTermQuery(t1)) + .setSlop(0) + .build(); + + SpanNearQuery snrq2 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t2)) + .addClause(spanTermQuery(t1)) + .setSlop(0) + .build(); + + Query sptrnrq1 = SpansTreeQuery.wrap(snrq1); + Query sptrnrq2 = SpansTreeQuery.wrap(snrq2); + + checkSameHighestScoringDocAndScoreRange(sptrnrq2, sptrnrq1, 0.8f, 0.5f); + } + + public void testSpanBoost() throws Exception { + String term = "zero"; + SpanTermQuery stq = spanTermQuery(term); + 
SpanBoostQuery sbq = new SpanBoostQuery(stq, 1.1f); + + checkSameHighestScoringDocAndScoreRange(sbq, stq, 0.92f, 0.90f); + checkSameHighestScoringDocAndScoreRange(SpansTreeQuery.wrap(sbq), stq, 0.92f, 0.90f); + } + + public void testSpanOrNearZeroDistance() throws Exception { + String t1 = "a"; + String t2 = "b"; + BooleanQuery boq = booleanOrQuery(t1, t2); + SpanOrQuery sonq = spanOrNearQuery(0, t1, t2); + checkEqualScores(boq, SpansTreeQuery.wrap(sonq)); + } + + public void testSpanOrNearMoreDistanceLessScore() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + Query stq1 = SpansTreeQuery.wrap(spanOrNearQuery(4, t1, t2)); + Query stq2 = SpansTreeQuery.wrap(spanOrNearQuery(4, t1, t3)); + checkSameHighestScoringDocAndScoreRange(stq1, stq2, 0.5f, 0.4f); + } + + public void testSpanOrNearThreeSubqueries() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + BooleanQuery boq = booleanOrQuery(t1, t2, t3); + SpanOrQuery sonq = spanOrNearQuery(0, t3, t2, t1); + checkEqualScores(boq, SpansTreeQuery.wrap(sonq)); + } + + public void testSpanOrNearNonMatchingSubQuery() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + String t5 = "e"; + SpanOrQuery sonq1 = spanOrNearQuery(1, t3, t2, t1); + SpanOrQuery sonq2 = spanOrNearQuery(1, t5, t2, t1); + checkSameHighestScoringDocAndScoreRange( + SpansTreeQuery.wrap(sonq1), + SpansTreeQuery.wrap(sonq2), + 0.9f, 0.8f); + } + + public void testSpanOrNearSinglePresentSubquery() throws Exception { + String t1 = "a"; + String t2 = "h"; + SpanQuery q1 = spanTermQuery(t1); + SpanOrQuery q2 = spanOrNearQuery(1, t2, t1); + checkSameHighestScoringDocAndScoreRange( + SpansTreeQuery.wrap(q1), + SpansTreeQuery.wrap(q2), + 0.51f, 0.49f); + } + + public void testSpanOrNearRepeatingOccurrences1() throws Exception { + String t1 = "wa"; + String t2 = "wb"; + BooleanQuery boq = booleanOrQuery(t1, t2); + SpanOrQuery sonq = spanOrNearQuery(3, t2, t1); + 
checkSameHighestScoringDocAndScoreRange( + boq, + SpansTreeQuery.wrap(sonq), + 0.9f, 0.8f); + } + + public void testSpanOrNearRepeatingOccurrences2() throws Exception { + String t1 = "wb"; + String t2 = "wc"; + BooleanQuery boq = booleanOrQuery(t1, t2); + SpanOrQuery sonq = spanOrNearQuery(3, t2, t1); + checkSameHighestScoringDocAndScoreRange( + boq, + SpansTreeQuery.wrap(sonq), + 0.9f, 0.8f); + } + + public void testIncreasingScoreExtraMatchLowSlopFactor() throws Exception { + String t1 = "az"; // near and far from a + String t2 = "a"; + SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + .setSlop(0) // does not match far + .setNonMatchSlop(20) // for consistent non match scoring + .build(); + SpanNearQuery snq2 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + .setSlop(8) // also matches far + .setNonMatchSlop(20) // for consistent non match scoring + .build(); + checkSameHighestScoringDocAndScoreRange( + SpansTreeQuery.wrap(snq2), + SpansTreeQuery.wrap(snq1), + 0.98f, 0.9f); + } + + SynonymQuery synonymQuery(String... terms) { + return new SynonymQuery(lcnTerms(terms)); + } + + SpanSynonymQuery spanSynonymQuery(String... 
terms) { + return new SpanSynonymQuery(lcnTerms(terms)); + } + + void sortByDoc(ScoreDoc[] scoreDocs) { // sorts the hits in place by ascending doc id + Arrays.sort(scoreDocs, new Comparator<ScoreDoc>() { // typed: a raw Comparator cannot be overridden with compare(ScoreDoc, ScoreDoc) + @Override + public int compare(ScoreDoc sd1, ScoreDoc sd2) { + return Integer.compare(sd1.doc, sd2.doc); // compare instead of subtracting ints + } + }); + } + + void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception { + ScoreDoc[] expHits = search(searcherBM25, qexp); + //showQueryResults("checkScoresInRange expected", qexp, expHits); + + ScoreDoc[] actHits = search(searcherBM25, qact); + //showQueryResults("checkScoresInRange actual", qact, actHits); + + if (expHits.length != actHits.length) { + Assert.fail("Unequal lengths: expHits="+expHits.length+",actHits="+actHits.length); + } + + sortByDoc(expHits); + sortByDoc(actHits); + for (int i = 0; i < expHits.length; i++) { + if (expHits[i].doc != actHits[i].doc) + { + Assert.fail("At index " + i + + ": expHits[i].doc=" + expHits[i].doc + + " != actHits[i].doc=" + actHits[i].doc); + } + + if ( (expHits[i].score * maxFac < actHits[i].score) + || (expHits[i].score * minFac > actHits[i].score)) + { + Assert.fail("At index " + i + + ", expHits[i].doc=" + expHits[i].doc + + ", score not in expected range: " + (expHits[i].score * minFac) + + " <= " + actHits[i].score + + " <= " + (expHits[i].score * maxFac)); + } + } + } + + void checkSynTerms(String... 
terms) throws Exception { + assertTrue(terms.length >= 1); + SpanOrQuery soq = spanOrQuery(terms); + SpanSynonymQuery ssq = spanSynonymQuery(terms); + checkScoresInRange(SpansTreeQuery.wrap(soq), SpansTreeQuery.wrap(ssq), 1.0f, 0.425f); + + SynonymQuery sq = synonymQuery(terms); + checkEqualScores(SpansTreeQuery.wrap(sq), SpansTreeQuery.wrap(ssq)); + } + + public void testSynTwoTermsNoDocOverlap() throws Exception { + checkSynTerms("zero", "one"); + } + + public void testSynTwoTermsDocOverlap() throws Exception { + checkSynTerms("twenty", "one"); + } + + public void testSynNearOrNear() throws Exception { + // twenty occurs 10 times + // thirty occurs 2 times + SpanSynonymQuery ssq2030 = spanSynonymQuery("twenty", "thirty"); + SpanOrQuery soq2030 = spanOrQuery("twenty", "thirty"); + SpanTermQuery stq1 = spanTermQuery("one"); + + SpanNearQuery synNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(ssq2030) + .addClause(stq1) + .setSlop(0) + .build(); + SpanNearQuery orNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) + .addClause(soq2030) + .addClause(stq1) + .setSlop(0) + .build(); + + checkSameHighestScoringDocAndScoreRange( + SpansTreeQuery.wrap(orNear), + SpansTreeQuery.wrap(synNear), + 0.80f, 0.70f); + } + + public void testRecurringTerms() throws Exception { + String t1 = "a"; + String t2 = "b"; + String t3 = "c"; + + SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t2)) + .setSlop(0) + .build(); + + SpanNearQuery snq2 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t2)) + .addClause(spanTermQuery(t3)) + .setSlop(0) + .build(); + + SpanNearQuery snq3 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) + .addClause(spanTermQuery(t1)) + .addClause(spanTermQuery(t3)) + .setSlop(1) + .build(); + + SpanOrQuery soq = new SpanOrQuery(snq1, snq2, snq3); // should score as bag of words + + BooleanQuery boq = booleanOrQuery(t1, t2, t3); // bag of 
words + + checkSameHighestScoringDocAndScore(boq, SpansTreeQuery.wrap(soq)); + } + +} diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java index 8637c4a6fcd2..2ac8d4f34af5 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java @@ -109,6 +109,10 @@ protected CoreParser(String defaultField, Analyzer analyzer, QueryParser parser) SpanNotBuilder snot = new SpanNotBuilder(spanFactory); spanFactory.addBuilder("SpanNot", snot); queryFactory.addBuilder("SpanNot", snot); + + SpanSynonymBuilder ssyn = new SpanSynonymBuilder(analyzer); + spanFactory.addBuilder("SpanSynonym", ssyn); + queryFactory.addBuilder("SpanSynonym", ssyn); } public Query parse(InputStream xmlStream) throws ParserException { diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java new file mode 100644 index 000000000000..ba0330784ddb --- /dev/null +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.xml.builders; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spans.SpanBoostQuery; +import org.apache.lucene.search.spans.SpanSynonymQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.queryparser.xml.DOMUtils; +import org.apache.lucene.queryparser.xml.ParserException; +import org.w3c.dom.Element; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Builder that analyzes the text into a {@link SpanSynonymQuery} + */ +public class SpanSynonymBuilder extends SpanBuilderBase { + + private final Analyzer analyzer; + + public SpanSynonymBuilder(Analyzer analyzer) { + this.analyzer = analyzer; + } + + @Override + public SpanQuery getSpanQuery(Element e) throws ParserException { + String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName"); + String value = DOMUtils.getNonBlankTextOrFail(e); + + List<Term> termsList = new ArrayList<>(); // typed so that toArray(new Term[..]) yields a Term[] + + try (TokenStream ts = analyzer.tokenStream(fieldName, value)) { + TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + Term t = new Term(fieldName, BytesRef.deepCopyOf(termAtt.getBytesRef())); + termsList.add(t); + } + ts.end(); + SpanSynonymQuery ssyn = new 
SpanSynonymQuery(termsList.toArray(new Term[termsList.size()])); + float boost = DOMUtils.getAttribute(e, "boost", 1.0f); + return new SpanBoostQuery(ssyn, boost); + } + catch (IOException ioe) { + throw new ParserException("IOException parsing value:" + value, ioe); // keep the IOException as the cause + } + } + +} diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml index fc37d96e6901..21cc74fa6a0b 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml @@ -38,6 +38,7 @@ fire burn + go goes going went gone