From 676c13c0c70e3f344ad6fb430eb5868270be83aa Mon Sep 17 00:00:00 2001 From: Paul Elschot Date: Wed, 8 Mar 2017 23:10:40 +0100 Subject: [PATCH] LUCENE-7615 of 8 March 2017. Adds support for SpanSynonymQuery in xml queryparser. --- .../lucene/search/DisiPriorityQueue.java | 9 + .../apache/lucene/search/SynonymQuery.java | 65 +++-- .../lucene/search/spans/DisjunctionSpans.java | 239 ++++++++++++++++++ .../lucene/search/spans/SpanOrQuery.java | 225 +---------------- .../lucene/search/spans/SpanSynonymQuery.java | 187 ++++++++++++++ .../lucene/search/spans/SpanTermQuery.java | 2 +- .../lucene/search/spans/SynonymSpans.java | 47 ++++ .../search/spans/TestSpanSynonymQuery.java | 238 +++++++++++++++++ .../lucene/queryparser/xml/CoreParser.java | 4 + .../xml/builders/SpanSynonymBuilder.java | 70 +++++ .../lucene/queryparser/xml/SpanQuery.xml | 1 + 11 files changed, 858 insertions(+), 229 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java create mode 100644 lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java index 0692a7b914e8..e1dcbbb74d76 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java @@ -105,6 +105,15 @@ public DisiWrapper add(DisiWrapper entry) { return heap[0]; } + /** The total {@link DocIdSetIterator#cost()} of the iterators in the queue */ + public long totalCost() { + long res = 0; + for (int i = 0; i < size; i++) { + res += 
heap[i].cost; + } + return res; + } + public DisiWrapper pop() { final DisiWrapper[] heap = this.heap; final DisiWrapper result = heap[0]; diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index c718dc9ed761..ecca69a0a2ed 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -29,11 +29,14 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermContext; import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity.SimScorer; +import org.apache.lucene.search.MatchNoDocsQuery; + /** * A query that treats multiple terms as synonyms. @@ -45,7 +48,8 @@ */ public final class SynonymQuery extends Query { private final Term terms[]; - + private final String field; + /** * Creates a new SynonymQuery, matching any of the supplied terms. *
<p>
@@ -62,16 +66,23 @@ public SynonymQuery(Term... terms) { throw new IllegalArgumentException("Synonyms must be across the same field"); } } + this.field = field; if (terms.length > BooleanQuery.getMaxClauseCount()) { throw new BooleanQuery.TooManyClauses(); } Arrays.sort(this.terms); } + /** The terms to be treated as synonyms. */ public List getTerms() { return Collections.unmodifiableList(Arrays.asList(terms)); } - + + /** The field of the terms. */ + public String getField() { + return field; + } + @Override public String toString(String field) { StringBuilder builder = new StringBuilder("Synonym("); @@ -101,7 +112,7 @@ public boolean equals(Object other) { public Query rewrite(IndexReader reader) throws IOException { // optimize zero and single term cases if (terms.length == 0) { - return new BooleanQuery.Builder().build(); + return new MatchNoDocsQuery(); } if (terms.length == 1) { return new TermQuery(terms[0]); @@ -122,8 +133,8 @@ public Weight createWeight(IndexSearcher searcher, boolean needsScores, float bo return searcher.rewrite(bq.build()).createWeight(searcher, needsScores, boost); } } - - class SynonymWeight extends Weight { + + public class SynonymWeight extends Weight { private final TermContext termContexts[]; private final Similarity similarity; private final Similarity.SimWeight simWeight; @@ -183,18 +194,40 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio return Explanation.noMatch("no matching term"); } + /** + * Expert: Return a SimScorer for this context. + * Public only for use in the spans package. + * @param context the LeafReaderContext + * @return a SimWeight + * @throws IOException on error + */ + public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException { + return similarity.simScorer(simWeight, context); + } + + /** + * Expert: Return a TermContext array in the same order as the terms. + * Public only for use in the spans package, do not modify. 
+ */ + public TermContext[] getTermContexts() { + return termContexts; + } + @Override public Scorer scorer(LeafReaderContext context) throws IOException { - Similarity.SimScorer simScorer = similarity.simScorer(simWeight, context); + Similarity.SimScorer simScorer = getSimScorer(context); // we use termscorers + disjunction as an impl detail List subScorers = new ArrayList<>(); - for (int i = 0; i < terms.length; i++) { - TermState state = termContexts[i].get(context.ord); - if (state != null) { - TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator(); - termsEnum.seekExact(terms[i].bytes(), state); - PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); - subScorers.add(new TermScorer(this, postings, simScorer)); + Terms fieldTerms = context.reader().terms(field); + if (fieldTerms != null) { + TermsEnum termsEnum = fieldTerms.iterator(); + for (int i = 0; i < terms.length; i++) { + TermState state = termContexts[i].get(context.ord); + if (state != null) { + termsEnum.seekExact(terms[i].bytes(), state); + PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); + subScorers.add(new TermScorer(this, postings, simScorer)); + } } } if (subScorers.isEmpty()) { @@ -207,10 +240,10 @@ public Scorer scorer(LeafReaderContext context) throws IOException { } } } - + static class SynonymScorer extends DisjunctionScorer { private final Similarity.SimScorer similarity; - + SynonymScorer(Similarity.SimScorer similarity, Weight weight, List subScorers) { super(weight, subScorers, true); this.similarity = similarity; @@ -220,7 +253,7 @@ static class SynonymScorer extends DisjunctionScorer { protected float score(DisiWrapper topList) throws IOException { return similarity.score(topList.doc, tf(topList)); } - + /** combines TF of all subs. 
*/ final int tf(DisiWrapper topList) throws IOException { int tf = 0; diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java new file mode 100644 index 000000000000..c6abdde08198 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.search.DisiPriorityQueue; +import org.apache.lucene.search.DisiWrapper; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.DisjunctionDISIApproximation; + + +/** + * A spans that merges given spans. + * + * @lucene.experimental + */ +public class DisjunctionSpans extends Spans { + protected final SpanQuery spanQuery; + protected final List subSpans; + protected final DisiPriorityQueue byDocQueue; + protected final SpanPositionQueue byPositionQueue; + protected Spans topPositionSpans; + protected final long totalCost; + + /** Construct a DisjunctionSpans. + * @param spanQuery The query that provides the subSpans. 
+ * @param subSpans Over which the disjunction is to be taken. + */ + public DisjunctionSpans(SpanQuery spanQuery, List subSpans) { + this.spanQuery = spanQuery; // for toString() only + this.subSpans = subSpans; + byDocQueue = new DisiPriorityQueue(subSpans.size()); + for (Spans spans : subSpans) { + byDocQueue.add(new DisiWrapper(spans)); + } + totalCost = byDocQueue.totalCost(); + byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 + topPositionSpans = null; + } + + @Override + public int nextDoc() throws IOException { + topPositionSpans = null; + DisiWrapper topDocSpans = byDocQueue.top(); + int currentDoc = topDocSpans.doc; + do { + topDocSpans.doc = topDocSpans.iterator.nextDoc(); + topDocSpans = byDocQueue.updateTop(); + } while (topDocSpans.doc == currentDoc); + return topDocSpans.doc; + } + + @Override + public int advance(int target) throws IOException { + topPositionSpans = null; + DisiWrapper topDocSpans = byDocQueue.top(); + do { + topDocSpans.doc = topDocSpans.iterator.advance(target); + topDocSpans = byDocQueue.updateTop(); + } while (topDocSpans.doc < target); + return topDocSpans.doc; + } + + @Override + public int docID() { + DisiWrapper topDocSpans = byDocQueue.top(); + return topDocSpans.doc; + } + + @Override + public TwoPhaseIterator asTwoPhaseIterator() { + float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() + long sumApproxCost = 0; + + for (DisiWrapper w : byDocQueue) { + if (w.twoPhaseView != null) { + long costWeight = (w.cost <= 1) ? 
1 : w.cost; + sumMatchCost += w.twoPhaseView.matchCost() * costWeight; + sumApproxCost += costWeight; + } + } + + if (sumApproxCost == 0) { // no sub spans supports approximations + computePositionsCost(); + return null; + } + + final float matchCost = sumMatchCost / sumApproxCost; + + return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { + @Override + public boolean matches() throws IOException { + return twoPhaseCurrentDocMatches(); + } + + @Override + public float matchCost() { + return matchCost; + } + }; + } + + float positionsCost = -1; + + void computePositionsCost() { + float sumPositionsCost = 0; + long sumCost = 0; + for (DisiWrapper w : byDocQueue) { + long costWeight = (w.cost <= 1) ? 1 : w.cost; + sumPositionsCost += w.spans.positionsCost() * costWeight; + sumCost += costWeight; + } + positionsCost = sumPositionsCost / sumCost; + } + + @Override + public float positionsCost() { + // This may be called when asTwoPhaseIterator returned null, + // which happens when none of the sub spans supports approximations. 
+ assert positionsCost > 0; + return positionsCost; + } + + int lastDocTwoPhaseMatched = -1; + + boolean twoPhaseCurrentDocMatches() throws IOException { + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); + // remove the head of the list as long as it does not match + final int currentDoc = listAtCurrentDoc.doc; + while (listAtCurrentDoc.twoPhaseView != null) { + if (listAtCurrentDoc.twoPhaseView.matches()) { + // use this spans for positions at current doc: + listAtCurrentDoc.lastApproxMatchDoc = currentDoc; + break; + } + // do not use this spans for positions at current doc: + listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; + listAtCurrentDoc = listAtCurrentDoc.next; + if (listAtCurrentDoc == null) { + return false; + } + } + lastDocTwoPhaseMatched = currentDoc; + topPositionSpans = null; + return true; + } + + void fillPositionQueue() throws IOException { // called at first nextStartPosition + assert byPositionQueue.size() == 0; + // add all matching Spans at current doc to byPositionQueue + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); + while (listAtCurrentDoc != null) { + Spans spansAtDoc = listAtCurrentDoc.spans; + if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation + if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation + if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false + spansAtDoc = null; + } else { + if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { + if (!listAtCurrentDoc.twoPhaseView.matches()) { + spansAtDoc = null; + } + } + } + } + } + + if (spansAtDoc != null) { + assert spansAtDoc.docID() == listAtCurrentDoc.doc; + assert spansAtDoc.startPosition() == -1; + spansAtDoc.nextStartPosition(); + assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; + byPositionQueue.add(spansAtDoc); + } + listAtCurrentDoc = listAtCurrentDoc.next; + } + assert byPositionQueue.size() > 0; + } + + @Override + public int 
nextStartPosition() throws IOException { + if (topPositionSpans == null) { + byPositionQueue.clear(); + fillPositionQueue(); // fills byPositionQueue at first position + topPositionSpans = byPositionQueue.top(); + } else { + topPositionSpans.nextStartPosition(); + topPositionSpans = byPositionQueue.updateTop(); + } + return topPositionSpans.startPosition(); + } + + @Override + public int startPosition() { + return topPositionSpans == null ? -1 : topPositionSpans.startPosition(); + } + + @Override + public int endPosition() { + return topPositionSpans == null ? -1 : topPositionSpans.endPosition(); + } + + @Override + public int width() { + return topPositionSpans.width(); + } + + @Override + public void collect(SpanCollector collector) throws IOException { + if (topPositionSpans != null) + topPositionSpans.collect(collector); + } + + @Override + public String toString() { + return "DisjunctionSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); + } + + @Override + public long cost() { + return totalCost; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java index 15abc7ddb27e..c2996cc7820a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.search.spans; - import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; @@ -28,13 +27,8 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; -import org.apache.lucene.search.DisiPriorityQueue; -import org.apache.lucene.search.DisiWrapper; -import org.apache.lucene.search.DisjunctionDISIApproximation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; -import org.apache.lucene.search.TwoPhaseIterator; - 
/** Matches the union of its clauses. */ @@ -117,16 +111,25 @@ public int hashCode() { public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { List subWeights = new ArrayList<>(clauses.size()); for (SpanQuery q : clauses) { - subWeights.add(q.createWeight(searcher, false, boost)); + subWeights.add(q.createWeight(searcher, needsScores, boost)); } - return new SpanOrWeight(searcher, needsScores ? getTermContexts(subWeights) : null, subWeights, boost); + return new SpanOrWeight(searcher, + needsScores ? getTermContexts(subWeights) : null, + subWeights, + needsScores, + boost); } public class SpanOrWeight extends SpanWeight { final List subWeights; - public SpanOrWeight(IndexSearcher searcher, Map terms, List subWeights, float boost) throws IOException { + public SpanOrWeight(IndexSearcher searcher, + Map terms, + List subWeights, + boolean needsScores, + float boost) throws IOException + { super(SpanOrQuery.this, searcher, terms, boost); this.subWeights = subWeights; } @@ -163,210 +166,8 @@ public Spans getSpans(final LeafReaderContext context, Postings requiredPostings } else if (subSpans.size() == 1) { return subSpans.get(0); } - - DisiPriorityQueue byDocQueue = new DisiPriorityQueue(subSpans.size()); - for (Spans spans : subSpans) { - byDocQueue.add(new DisiWrapper(spans)); - } - - SpanPositionQueue byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 - - return new Spans() { - Spans topPositionSpans = null; - - @Override - public int nextDoc() throws IOException { - topPositionSpans = null; - DisiWrapper topDocSpans = byDocQueue.top(); - int currentDoc = topDocSpans.doc; - do { - topDocSpans.doc = topDocSpans.iterator.nextDoc(); - topDocSpans = byDocQueue.updateTop(); - } while (topDocSpans.doc == currentDoc); - return topDocSpans.doc; - } - - @Override - public int advance(int target) throws IOException { - topPositionSpans = null; - DisiWrapper topDocSpans = byDocQueue.top(); - do { 
- topDocSpans.doc = topDocSpans.iterator.advance(target); - topDocSpans = byDocQueue.updateTop(); - } while (topDocSpans.doc < target); - return topDocSpans.doc; - } - - @Override - public int docID() { - DisiWrapper topDocSpans = byDocQueue.top(); - return topDocSpans.doc; - } - - @Override - public TwoPhaseIterator asTwoPhaseIterator() { - float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() - long sumApproxCost = 0; - - for (DisiWrapper w : byDocQueue) { - if (w.twoPhaseView != null) { - long costWeight = (w.cost <= 1) ? 1 : w.cost; - sumMatchCost += w.twoPhaseView.matchCost() * costWeight; - sumApproxCost += costWeight; - } - } - - if (sumApproxCost == 0) { // no sub spans supports approximations - computePositionsCost(); - return null; - } - - final float matchCost = sumMatchCost / sumApproxCost; - - return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { - @Override - public boolean matches() throws IOException { - return twoPhaseCurrentDocMatches(); - } - - @Override - public float matchCost() { - return matchCost; - } - }; - } - - float positionsCost = -1; - - void computePositionsCost() { - float sumPositionsCost = 0; - long sumCost = 0; - for (DisiWrapper w : byDocQueue) { - long costWeight = (w.cost <= 1) ? 1 : w.cost; - sumPositionsCost += w.spans.positionsCost() * costWeight; - sumCost += costWeight; - } - positionsCost = sumPositionsCost / sumCost; - } - - @Override - public float positionsCost() { - // This may be called when asTwoPhaseIterator returned null, - // which happens when none of the sub spans supports approximations. 
- assert positionsCost > 0; - return positionsCost; - } - - int lastDocTwoPhaseMatched = -1; - - boolean twoPhaseCurrentDocMatches() throws IOException { - DisiWrapper listAtCurrentDoc = byDocQueue.topList(); - // remove the head of the list as long as it does not match - final int currentDoc = listAtCurrentDoc.doc; - while (listAtCurrentDoc.twoPhaseView != null) { - if (listAtCurrentDoc.twoPhaseView.matches()) { - // use this spans for positions at current doc: - listAtCurrentDoc.lastApproxMatchDoc = currentDoc; - break; - } - // do not use this spans for positions at current doc: - listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; - listAtCurrentDoc = listAtCurrentDoc.next; - if (listAtCurrentDoc == null) { - return false; - } - } - lastDocTwoPhaseMatched = currentDoc; - topPositionSpans = null; - return true; - } - - void fillPositionQueue() throws IOException { // called at first nextStartPosition - assert byPositionQueue.size() == 0; - // add all matching Spans at current doc to byPositionQueue - DisiWrapper listAtCurrentDoc = byDocQueue.topList(); - while (listAtCurrentDoc != null) { - Spans spansAtDoc = listAtCurrentDoc.spans; - if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation - if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation - if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false - spansAtDoc = null; - } else { - if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { - if (!listAtCurrentDoc.twoPhaseView.matches()) { - spansAtDoc = null; - } - } - } - } - } - - if (spansAtDoc != null) { - assert spansAtDoc.docID() == listAtCurrentDoc.doc; - assert spansAtDoc.startPosition() == -1; - spansAtDoc.nextStartPosition(); - assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; - byPositionQueue.add(spansAtDoc); - } - listAtCurrentDoc = listAtCurrentDoc.next; - } - assert byPositionQueue.size() > 0; - } - - @Override - public int 
nextStartPosition() throws IOException { - if (topPositionSpans == null) { - byPositionQueue.clear(); - fillPositionQueue(); // fills byPositionQueue at first position - topPositionSpans = byPositionQueue.top(); - } else { - topPositionSpans.nextStartPosition(); - topPositionSpans = byPositionQueue.updateTop(); - } - return topPositionSpans.startPosition(); - } - - @Override - public int startPosition() { - return topPositionSpans == null ? -1 : topPositionSpans.startPosition(); - } - - @Override - public int endPosition() { - return topPositionSpans == null ? -1 : topPositionSpans.endPosition(); - } - - @Override - public int width() { - return topPositionSpans.width(); - } - - @Override - public void collect(SpanCollector collector) throws IOException { - if (topPositionSpans != null) - topPositionSpans.collect(collector); - } - - @Override - public String toString() { - return "spanOr(" + SpanOrQuery.this + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); - } - - long cost = -1; - - @Override - public long cost() { - if (cost == -1) { - cost = 0; - for (Spans spans : subSpans) { - cost += spans.cost(); - } - } - return cost; - } - }; + return new DisjunctionSpans(SpanOrQuery.this, subSpans); } } - } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java new file mode 100644 index 000000000000..fb57ddbbbd3a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; + +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SynonymQuery; +import org.apache.lucene.search.SynonymQuery.SynonymWeight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.MatchNoDocsQuery; + +/** + * A SpanQuery that treats terms as synonyms. + *
<p>
+ * For scoring purposes, this query tries to score the terms as if you + * had indexed them as one term: it will match any of the terms while + * using the same scoring as {@link SynonymQuery}, as far as possible. + */ +public final class SpanSynonymQuery extends SpanQuery { + final SynonymQuery synonymQuery; + final List terms; + + /** + * Creates a new SpanSynonymQuery, matching any of the supplied terms. + *
<p>
+   * The terms must all have the same field.
+   */
+  public SpanSynonymQuery(Term... terms) {
+    this.synonymQuery = new SynonymQuery(terms);
+    this.terms = synonymQuery.getTerms();
+  }
+
+  /** Returns the field reported by the wrapped {@link SynonymQuery}. */
+  @Override
+  public String getField() {
+    return synonymQuery.getField();
+  }
+
+  /** Returns {@code SpanSynonym(...)} wrapped around the string form of the wrapped {@link SynonymQuery}. */
+  @Override
+  public String toString(String field) {
+    return "SpanSynonym(" + synonymQuery.toString(field) + ")";
+  }
+
+  /** Combines the class hash with the hash of the wrapped {@link SynonymQuery}. */
+  @Override
+  public int hashCode() {
+    return 31 * classHash() - synonymQuery.hashCode();
+  }
+
+  /** Two instances are equal when they have the same class and equal wrapped {@link SynonymQuery}s. */
+  @Override
+  public boolean equals(Object other) {
+    return sameClassAs(other) &&
+           synonymQuery.equals(((SpanSynonymQuery) other).synonymQuery);
+  }
+
+  /**
+   * Simplifies the degenerate cases: no terms rewrites to a {@link MatchNoDocsQuery},
+   * a single term rewrites to a {@link SpanTermQuery}, anything else is returned as is.
+   */
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    // optimize zero and single term cases
+    int numTerms = terms.size();
+    if (numTerms == 0) {
+      return new MatchNoDocsQuery();
+    }
+    if (numTerms == 1) {
+      return new SpanTermQuery(terms.get(0));
+    }
+    return this;
+  }
+
+  /**
+   * Creates the weight for this query.
+   * <p>
+   * When scores are needed, a {@link SpanSynonymWeight} is returned that takes its scoring
+   * from the weight of the wrapped {@link SynonymQuery}; that weight does not support
+   * {@link SpanWeight#explain}.
+   * When scores are not needed, the weight of a {@link SpanOrQuery} over one
+   * {@link SpanTermQuery} per term is returned instead.
+   */
+  @Override
+  public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
+    if (needsScores) {
+      // NOTE(review): assumes SynonymQuery.createWeight returns a SynonymWeight whenever
+      // scores are needed; the cast fails otherwise.
+      SynonymWeight synonymWeight = (SynonymWeight)
+          synonymQuery.createWeight(searcher, needsScores, boost);
+      return new SpanSynonymWeight(searcher, boost, synonymWeight);
+    }
+    // scores not needed, use SpanOrQuery without scoring.
+    SpanTermQuery[] clauses = new SpanTermQuery[terms.size()];
+    int i = 0;
+    for (Term term : terms) {
+      clauses[i++] = new SpanTermQuery(term);
+    }
+    return new SpanOrQuery(clauses).createWeight(searcher, needsScores, boost);
+  }
+
+  /**
+   * Span weight that delegates scoring to a {@link SynonymWeight} and provides the
+   * spans of all terms of the enclosing query that occur in a segment.
+   */
+  class SpanSynonymWeight extends SpanWeight {
+    final SynonymWeight synonymWeight;
+
+    SpanSynonymWeight(
+        IndexSearcher searcher,
+        float boost,
+        SynonymWeight synonymWeight)
+        throws IOException {
+      super(SpanSynonymQuery.this, searcher, null, boost); // null: no term context map
+      this.synonymWeight = synonymWeight;
+    }
+
+    /** Adds every term of the enclosing query to the given set. */
+    @Override
+    public void extractTerms(Set<Term> termSet) {
+      for (Term t : terms) {
+        termSet.add(t);
+      }
+    }
+
+    /**
+     * Maps every term of the enclosing query to its term context.
+     * NOTE(review): pairs terms and contexts by position, so this assumes
+     * {@code synonymWeight.getTermContexts()} is in the same order as the terms.
+     */
+    @Override
+    public void extractTermContexts(Map<Term, TermContext> termContextbyTerm) {
+      TermContext[] termContexts = synonymWeight.getTermContexts();
+      int i = 0;
+      for (Term term : terms) {
+        TermContext termContext = termContexts[i++];
+        termContextbyTerm.put(term, termContext);
+      }
+    }
+
+    /**
+     * Not supported.
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    /** Returns the {@link SimScorer} of the wrapped {@link SynonymWeight}. */
+    @Override
+    public SimScorer getSimScorer(LeafReaderContext context) throws IOException {
+      return synonymWeight.getSimScorer(context);
+    }
+
+    /**
+     * Returns the spans of the terms that have a term state in the given segment.
+     *
+     * @param context the segment to get the spans for
+     * @param requiredPostings the postings information the caller needs from the spans
+     * @return {@code null} when no term has a term state in the segment, the single
+     *         {@link TermSpans} when exactly one has, and otherwise a
+     *         {@link SynonymSpans} over all of them
+     */
+    @Override
+    public Spans getSpans(final LeafReaderContext context, Postings requiredPostings)
+        throws IOException {
+      SimScorer simScorer = getSimScorer(context);
+      final String field = getField();
+      Terms fieldTerms = context.reader().terms(field);
+      List<Spans> termSpans = new ArrayList<>(terms.size());
+      if (fieldTerms != null) {
+        TermsEnum termsEnum = fieldTerms.iterator();
+        TermContext[] termContexts = synonymWeight.getTermContexts();
+        int i = 0;
+        for (Term term : terms) {
+          TermContext termContext = termContexts[i++]; // in term order
+          TermState termState = termContext.get(context.ord);
+          if (termState != null) {
+            termsEnum.seekExact(term.bytes(), termState);
+            // Request what the caller asked for (positions, payloads or offsets) instead of
+            // always positions only, the same way SpanTermQuery does.
+            PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings());
+            float positionsCost = SpanTermQuery.termPositionsCost(termsEnum)
+                                * SpanTermQuery.PHRASE_TO_SPAN_TERM_POSITIONS_COST;
+            termSpans.add(new TermSpans(simScorer, postings, term, positionsCost));
+          }
+        }
+      }
+
+      if (termSpans.isEmpty()) {
+        return null;
+      }
+      if (termSpans.size() == 1) {
+        return termSpans.get(0);
+      }
+      return new SynonymSpans(SpanSynonymQuery.this, termSpans, simScorer);
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java index 3e13be7ecb11..b74ac9dc629b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java @@ -126,7 +126,7 @@ public Spans getSpans(final LeafReaderContext context, Postings requiredPostings * the relative cost of dealing with the term positions * when using a SpanNearQuery instead of a PhraseQuery. */ - private static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; + static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java new file mode 100644 index 000000000000..fdbf676e0e01 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.util.List;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+
+/**
+ * A {@link Spans} that merges the given sub-spans as a disjunction,
+ * so that they can all be scored equally with a single {@link SimScorer}.
+ * This does not provide score values.
+ *
+ * @lucene.experimental
+ */
+public class SynonymSpans extends DisjunctionSpans {
+  /** The scorer given at construction; this class only keeps the reference. */
+  final SimScorer simScorer;
+
+  /** Construct a SynonymSpans.
+   * @param spanQuery The query that provides the subSpans.
+   * @param subSpans  Over which the disjunction is to be taken.
+   * @param simScorer To be used for scoring.
+   */
+  public SynonymSpans(SpanQuery spanQuery, List<Spans> subSpans, SimScorer simScorer) {
+    super(spanQuery, subSpans);
+    this.simScorer = simScorer;
+  }
+
+  /** Shows the query together with the current document and the current start and end positions. */
+  @Override
+  public String toString() {
+    return "SynonymSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition();
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java new file mode 100644 index 000000000000..5f4b8eb23634 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+import org.apache.lucene.search.SynonymQuery;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import junit.framework.Assert;
+
+
+/**
+ * Compares the hits and scores of {@link SpanSynonymQuery} against {@link TermQuery},
+ * {@link SpanOrQuery} and {@link SynonymQuery} over the same terms, on a small index
+ * of English number names plus one document with repeated terms.
+ */
+public class TestSpanSynonymQuery extends LuceneTestCase {
+  static IndexSearcher searcher;
+  static IndexReader reader;
+  static Directory directory;
+
+  static final int MAX_TEST_DOC = 32;
+
+  /** The single field that is indexed and queried. */
+  static final String FIELD_NAME = "field";
+
+  /**
+   * Indexes {@link #MAX_TEST_DOC} documents: the English name of the document number,
+   * doubled for every fifth document, and a fixed text for the last document.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
+            .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100))
+            .setMergePolicy(newLogMergePolicy()));
+    for (int i = 0; i < MAX_TEST_DOC; i++) {
+      Document doc = new Document();
+      String text;
+      if (i < (MAX_TEST_DOC-1)) {
+        text = English.intToEnglish(i);
+        if ((i % 5) == 0) { // add some multiple occurrences of the same term(s)
+          text += " " + text;
+        }
+      } else { // last doc, for testing distances > 1, and repeating occurrences of wb
+        text = "az a b c d e wa wb wb wc az";
+      }
+      doc.add(newTextField(FIELD_NAME, text, Field.Store.YES));
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    searcher = new IndexSearcher(reader);
+    writer.close();
+  }
+
+  /** Closes the reader and the directory and drops all static references to them. */
+  @AfterClass
+  public static void afterClass() throws Exception {
+    reader.close();
+    directory.close();
+    searcher = null;
+    reader = null; // do not keep the closed reader reachable from a static field
+    directory = null;
+  }
+
+  /** Returns a term in {@link #FIELD_NAME} with the given text. */
+  Term lcnTerm(String term) {
+    return new Term(FIELD_NAME, term);
+  }
+
+  /** Returns one term in {@link #FIELD_NAME} per given text, in the given order. */
+  Term[] lcnTerms(String... terms) {
+    Term[] lcnTrms = new Term[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      lcnTrms[i] = lcnTerm(terms[i]);
+    }
+    return lcnTrms;
+  }
+
+  TermQuery termQuery(String term) {
+    return new TermQuery(lcnTerm(term));
+  }
+
+  SpanTermQuery spanTermQuery(String term) {
+    return new SpanTermQuery(lcnTerm(term));
+  }
+
+  /** Returns one {@link SpanTermQuery} per given text, in the given order. */
+  SpanTermQuery[] spanTermQueries(String... terms) {
+    SpanTermQuery[] stqs = new SpanTermQuery[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      stqs[i] = spanTermQuery(terms[i]);
+    }
+    return stqs;
+  }
+
+  SpanSynonymQuery spanSynonymQuery(String... terms) {
+    return new SpanSynonymQuery(lcnTerms(terms));
+  }
+
+  SynonymQuery synonymQuery(String... terms) {
+    return new SynonymQuery(lcnTerms(terms));
+  }
+
+  /** Sorts the given hits in place by increasing document number. */
+  void sortByDoc(ScoreDoc[] scoreDocs) {
+    Arrays.sort(scoreDocs, new Comparator<ScoreDoc>() {
+      @Override
+      public int compare(ScoreDoc sd1, ScoreDoc sd2) {
+        return Integer.compare(sd1.doc, sd2.doc);
+      }
+    });
+  }
+
+  /** Collects at most {@link #MAX_TEST_DOC} top scoring hits of the query. */
+  ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException {
+    TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC);
+    searcher.search(query, collector);
+    return collector.topDocs().scoreDocs;
+  }
+
+  /** Returns the document numbers of the given hits, in the order of the hits. */
+  int[] docsFromHits(ScoreDoc[] hits) throws Exception {
+    int[] docs = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docs[i] = hits[i].doc;
+    }
+    return docs;
+  }
+
+  /** Debugging aid: prints the document and score of each hit. */
+  void showQueryResults(String message, Query q, ScoreDoc[] hits) {
+    System.out.println(message + " results from query " + q);
+    for (ScoreDoc hit : hits) {
+      System.out.println("doc=" + hit.doc + ", score=" + hit.score);
+    }
+  }
+
+  /** Checks the hits of {@code qact} against the hits of {@code qexp} with {@link CheckHits#checkHitsQuery}. */
+  void checkEqualScores(Query qexp, Query qact) throws Exception {
+    ScoreDoc[] expHits = search(searcher, qexp);
+
+    int[] expDocs = docsFromHits(expHits);
+    //showQueryResults("checkEqualScores expected", qexp, expHits);
+
+    ScoreDoc[] actHits = search(searcher, qact);
+    //showQueryResults("checkEqualScores actual", qact, actHits);
+
+    CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs);
+  }
+
+  /**
+   * Checks that both queries hit the same documents and that, for each document, the
+   * score of {@code qact} lies between {@code minFac} and {@code maxFac} times the
+   * score of {@code qexp}.
+   */
+  void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception {
+    ScoreDoc[] expHits = search(searcher, qexp);
+    //showQueryResults("checkScoresInRange expected", qexp, expHits);
+
+    ScoreDoc[] actHits = search(searcher, qact);
+    //showQueryResults("checkScoresInRange actual", qact, actHits);
+
+    if (expHits.length != actHits.length) {
+      Assert.fail("Unequal lengths: expHits="+expHits.length+",actHits="+actHits.length);
+    }
+
+    // Compare per document, independent of the score order of the two result lists.
+    sortByDoc(expHits);
+    sortByDoc(actHits);
+    for (int i = 0; i < expHits.length; i++) {
+      if (expHits[i].doc != actHits[i].doc) {
+        Assert.fail("At index " + i
+            + ": expHits[i].doc=" + expHits[i].doc
+            + " != actHits[i].doc=" + actHits[i].doc);
+      }
+
+      if ( (expHits[i].score * maxFac < actHits[i].score)
+        || (expHits[i].score * minFac > actHits[i].score)) {
+        Assert.fail("At index " + i
+            + ", expHits[i].doc=" + expHits[i].doc
+            + ", score not in expected range: " + (expHits[i].score * minFac)
+            + " <= " + actHits[i].score
+            + " <= " + (expHits[i].score * maxFac));
+      }
+    }
+  }
+
+  /** A single term must score the same as a term query, a span term query and a span synonym query. */
+  void checkSingleTerm(String term) throws Exception {
+    TermQuery tq = termQuery(term);
+    SpanTermQuery stq = spanTermQuery(term);
+    SpanSynonymQuery ssq = spanSynonymQuery(term);
+
+    checkEqualScores(tq, stq);
+    checkEqualScores(tq, ssq);
+  }
+
+  public void testSingleZero() throws Exception {
+    checkSingleTerm("zero");
+  }
+
+  SpanOrQuery spanOrQuery(String... terms) {
+    return new SpanOrQuery(spanTermQueries(terms));
+  }
+
+  /**
+   * The span synonym query must score within 0.3 to 0.7 times the span or query
+   * and the same as the synonym query, all over the same terms.
+   */
+  void checkOrTerms(String... terms) throws Exception {
+    assertTrue(terms.length >= 1);
+    SpanOrQuery soq = spanOrQuery(terms);
+    SpanSynonymQuery ssq = spanSynonymQuery(terms);
+    checkScoresInRange(soq, ssq, 0.7f, 0.3f);
+
+    SynonymQuery sq = synonymQuery(terms);
+    checkEqualScores(sq, ssq);
+  }
+
+  public void testOrTwoTermsNoDocOverlap() throws Exception {
+    checkOrTerms("zero", "one");
+  }
+
+  public void testOrTwoTermsDocOverlap() throws Exception {
+    checkOrTerms("twenty", "one");
+  }
+}
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java index 8637c4a6fcd2..2ac8d4f34af5 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CoreParser.java @@ -109,6 +109,10 @@ protected CoreParser(String defaultField, Analyzer analyzer, QueryParser parser) SpanNotBuilder snot = new SpanNotBuilder(spanFactory); spanFactory.addBuilder("SpanNot", snot); queryFactory.addBuilder("SpanNot", snot); + + SpanSynonymBuilder ssyn = new SpanSynonymBuilder(analyzer); +
spanFactory.addBuilder("SpanSynonym", ssyn); + queryFactory.addBuilder("SpanSynonym", ssyn); } public Query parse(InputStream xmlStream) throws ParserException { diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java new file mode 100644 index 000000000000..ba0330784ddb --- /dev/null +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/SpanSynonymBuilder.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.queryparser.xml.builders;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanBoostQuery;
+import org.apache.lucene.search.spans.SpanSynonymQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.queryparser.xml.DOMUtils;
+import org.apache.lucene.queryparser.xml.ParserException;
+import org.w3c.dom.Element;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Builder that analyzes the text of an element into a {@link SpanSynonymQuery}:
+ * every token the analyzer produces becomes one term of the query.
+ */
+public class SpanSynonymBuilder extends SpanBuilderBase {
+
+  private final Analyzer analyzer;
+
+  /**
+   * @param analyzer used to split the element text into the terms of the query
+   */
+  public SpanSynonymBuilder(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
+
+  /**
+   * Builds a {@link SpanSynonymQuery} over the analyzed text of the element,
+   * wrapped in a {@link SpanBoostQuery}.
+   *
+   * @param e element providing the {@code fieldName} attribute, the text to analyze,
+   *          and an optional {@code boost} attribute that defaults to 1.0
+   * @return the boosted span synonym query
+   * @throws ParserException when analyzing the text fails with an {@link IOException};
+   *         that exception is kept as the cause
+   */
+  @Override
+  public SpanQuery getSpanQuery(Element e) throws ParserException {
+    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
+    String value = DOMUtils.getNonBlankTextOrFail(e);
+
+    List<Term> termsList = new ArrayList<>();
+
+    try (TokenStream ts = analyzer.tokenStream(fieldName, value)) {
+      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+      ts.reset();
+      while (ts.incrementToken()) {
+        // deep copy: the term keeps its bytes after the stream moves to the next token
+        Term t = new Term(fieldName, BytesRef.deepCopyOf(termAtt.getBytesRef()));
+        termsList.add(t);
+      }
+      ts.end();
+    }
+    catch (IOException ioe) {
+      // keep the original exception as the cause so the failure can be diagnosed
+      throw new ParserException("IOException parsing value:" + value, ioe);
+    }
+
+    SpanSynonymQuery ssyn = new SpanSynonymQuery(termsList.toArray(new Term[termsList.size()]));
+    float boost = DOMUtils.getAttribute(e, "boost", 1.0f);
+    return new SpanBoostQuery(ssyn, boost);
+  }
+
+}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml index
fc37d96e6901..21cc74fa6a0b 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/SpanQuery.xml @@ -38,6 +38,7 @@ fire burn + go goes going went gone