Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LUCENE-7624: Move TermsQuery into core as TermInSetQuery
- Loading branch information
1 parent
5e9f927
commit 22940f5
Showing
13 changed files
with
451 additions
and
408 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
369 changes: 369 additions & 0 deletions
369
lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,369 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.search; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Objects; | ||
import java.util.Set; | ||
import java.util.SortedSet; | ||
|
||
import org.apache.lucene.index.Fields; | ||
import org.apache.lucene.index.IndexReader; | ||
import org.apache.lucene.index.LeafReader; | ||
import org.apache.lucene.index.LeafReaderContext; | ||
import org.apache.lucene.index.PostingsEnum; | ||
import org.apache.lucene.index.PrefixCodedTerms; | ||
import org.apache.lucene.index.PrefixCodedTerms.TermIterator; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.index.TermContext; | ||
import org.apache.lucene.index.TermState; | ||
import org.apache.lucene.index.Terms; | ||
import org.apache.lucene.index.TermsEnum; | ||
import org.apache.lucene.search.BooleanClause.Occur; | ||
import org.apache.lucene.util.Accountable; | ||
import org.apache.lucene.util.ArrayUtil; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.lucene.util.BytesRefBuilder; | ||
import org.apache.lucene.util.DocIdSetBuilder; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
|
||
/**
 * Specialization for a disjunction over many terms that behaves like a
 * {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
 * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 * <p>For instance in the following example, both {@code q1} and {@code q2}
 * would yield the same scores:
 * <pre class="prettyprint">
 * Query q1 = new TermInSetQuery(new Term("field", "foo"), new Term("field", "bar"));
 *
 * BooleanQuery bq = new BooleanQuery();
 * bq.add(new TermQuery(new Term("field", "foo")), Occur.SHOULD);
 * bq.add(new TermQuery(new Term("field", "bar")), Occur.SHOULD);
 * Query q2 = new ConstantScoreQuery(bq);
 * </pre>
 * <p>When there are few terms, this query executes like a regular disjunction.
 * However, when there are many terms, instead of merging iterators on the fly,
 * it will populate a bit set with matching docs and return a {@link Scorer}
 * over this bit set.
 * <p>NOTE: This query produces scores that are equal to its boost
 */
public class TermInSetQuery extends Query implements Accountable {

  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class);
  // Same threshold as MultiTermQueryConstantScoreWrapper: at or below this
  // many terms, the query rewrites to a plain boolean disjunction instead of
  // building a bit set.
  static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;

  private final boolean singleField; // whether all terms are from the same field
  // Terms stored sorted and prefix-compressed; this is the query's only term storage.
  private final PrefixCodedTerms termData;
  private final int termDataHashCode; // cached hashcode of termData

  /**
   * Creates a new {@link TermInSetQuery} from the given collection. It
   * can contain duplicate terms and multiple fields.
   */
  public TermInSetQuery(Collection<Term> terms) {
    Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
    // already sorted if we are a SortedSet with natural order
    boolean sorted = terms instanceof SortedSet && ((SortedSet<Term>)terms).comparator() == null;
    if (!sorted) {
      ArrayUtil.timSort(sortedTerms);
    }
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    Set<String> fields = new HashSet<>();
    Term previous = null;
    for (Term term : sortedTerms) {
      // input is sorted, so equal terms are adjacent: skip duplicates
      if (term.equals(previous) == false) {
        fields.add(term.field());
        builder.add(term);
      }
      previous = term;
    }
    singleField = fields.size() == 1;
    termData = builder.finish();
    termDataHashCode = termData.hashCode();
  }

  /**
   * Creates a new {@link TermInSetQuery} from the given collection for
   * a single field. It can contain duplicate terms.
   */
  public TermInSetQuery(String field, Collection<BytesRef> terms) {
    BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
    // already sorted if we are a SortedSet with natural order
    boolean sorted = terms instanceof SortedSet && ((SortedSet<BytesRef>)terms).comparator() == null;
    if (!sorted) {
      ArrayUtil.timSort(sortedTerms);
    }
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    // 'previous' holds a private copy of the last added bytes so callers may
    // reuse/mutate the BytesRef instances they passed in
    BytesRefBuilder previous = null;
    for (BytesRef term : sortedTerms) {
      if (previous == null) {
        previous = new BytesRefBuilder();
      } else if (previous.get().equals(term)) {
        continue; // deduplicate
      }
      builder.add(field, term);
      previous.copyBytes(term);
    }
    singleField = true;
    termData = builder.finish();
    termDataHashCode = termData.hashCode();
  }

  /**
   * Creates a new {@link TermInSetQuery} from the given {@link BytesRef} array for
   * a single field.
   */
  public TermInSetQuery(String field, BytesRef...terms) {
    this(field, Arrays.asList(terms));
  }

  /**
   * Creates a new {@link TermInSetQuery} from the given array. The array can
   * contain duplicate terms and multiple fields.
   */
  public TermInSetQuery(final Term... terms) {
    this(Arrays.asList(terms));
  }

  // If the term count is small enough, rewrite to a constant-score boolean
  // disjunction; otherwise keep this query as-is and let createWeight build a
  // bit set per segment.
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
    if (termData.size() <= threshold) {
      BooleanQuery.Builder bq = new BooleanQuery.Builder();
      TermIterator iterator = termData.iterator();
      for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
        // deep copy: the iterator reuses its BytesRef across next() calls
        bq.add(new TermQuery(new Term(iterator.field(), BytesRef.deepCopyOf(term))), Occur.SHOULD);
      }
      return new ConstantScoreQuery(bq.build());
    }
    return super.rewrite(reader);
  }

  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) &&
        equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(TermInSetQuery other) {
    // termData might be heavy to compare so check the hash code first
    return termDataHashCode == other.termDataHashCode &&
        termData.equals(other.termData);
  }

  @Override
  public int hashCode() {
    return 31 * classHash() + termDataHashCode;
  }

  /** Returns the terms wrapped in a PrefixCodedTerms. */
  public PrefixCodedTerms getTermData() {
    return termData;
  }

  @Override
  public String toString(String defaultField) {
    StringBuilder builder = new StringBuilder();
    boolean first = true;
    TermIterator iterator = termData.iterator();
    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
      if (!first) {
        builder.append(' ');
      }
      first = false;
      builder.append(new Term(iterator.field(), term).toString());
    }

    return builder.toString();
  }

  @Override
  public long ramBytesUsed() {
    return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.emptyList();
  }

  // Snapshot of a TermsEnum positioned on a matching term, so the enum can be
  // repositioned later via seekExact(term, state) without re-seeking from scratch.
  private static class TermAndState {
    final String field;
    final TermsEnum termsEnum;
    final BytesRef term;
    final TermState state;
    final int docFreq;
    final long totalTermFreq;

    TermAndState(String field, TermsEnum termsEnum) throws IOException {
      this.field = field;
      this.termsEnum = termsEnum;
      // deep copy: termsEnum.term() is only valid until the enum advances
      this.term = BytesRef.deepCopyOf(termsEnum.term());
      this.state = termsEnum.termState();
      this.docFreq = termsEnum.docFreq();
      this.totalTermFreq = termsEnum.totalTermFreq();
    }
  }

  // Tagged union: holds exactly one of a Weight (few matching terms, executed
  // as a disjunction) or a DocIdSet (many matching terms, pre-built bit set).
  private static class WeightOrDocIdSet {
    final Weight weight;
    final DocIdSet set;

    WeightOrDocIdSet(Weight weight) {
      this.weight = Objects.requireNonNull(weight);
      this.set = null;
    }

    WeightOrDocIdSet(DocIdSet bitset) {
      this.set = bitset;
      this.weight = null;
    }
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new ConstantScoreWeight(this, boost) {

      @Override
      public void extractTerms(Set<Term> terms) {
        // no-op
        // This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So likewise we hide its terms in
        // order to protect highlighters
      }

      /**
       * On the given leaf context, try to either rewrite to a disjunction if
       * there are few matching terms, or build a bitset containing matching docs.
       */
      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
        final LeafReader reader = context.reader();

        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
        // if there are too many terms, we will fall back to building the 'builder'
        // (matchingTerms == null thereafter signals bit-set mode)
        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
        assert termData.size() > threshold : "Query should have been rewritten";
        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
        DocIdSetBuilder builder = null;

        final Fields fields = reader.fields();
        String lastField = null;
        Terms terms = null;
        TermsEnum termsEnum = null;
        PostingsEnum docs = null;
        TermIterator iterator = termData.iterator();
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
          String field = iterator.field();
          // comparing references is fine here
          // (PrefixCodedTerms returns the same String instance per field run)
          if (field != lastField) {
            terms = fields.terms(field);
            if (terms == null) {
              // field does not exist in this segment: skip its terms
              termsEnum = null;
            } else {
              termsEnum = terms.iterator();
            }
            lastField = field;
          }
          if (termsEnum != null && termsEnum.seekExact(term)) {
            if (matchingTerms == null) {
              // already in bit-set mode: stream postings straight into the builder
              docs = termsEnum.postings(docs, PostingsEnum.NONE);
              builder.add(docs);
            } else if (matchingTerms.size() < threshold) {
              matchingTerms.add(new TermAndState(field, termsEnum));
            } else {
              // threshold just exceeded: switch to bit-set mode and replay the
              // terms collected so far into the builder
              assert matchingTerms.size() == threshold;
              if (singleField) {
                // common case: all terms are in the same field
                // use an optimized builder that leverages terms stats to be more efficient
                builder = new DocIdSetBuilder(reader.maxDoc(), terms);
              } else {
                // corner case: different fields
                // don't make assumptions about the docs we will get
                builder = new DocIdSetBuilder(reader.maxDoc());
              }
              docs = termsEnum.postings(docs, PostingsEnum.NONE);
              builder.add(docs);
              for (TermAndState t : matchingTerms) {
                t.termsEnum.seekExact(t.term, t.state);
                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
                builder.add(docs);
              }
              matchingTerms = null;
            }
          }
        }
        if (matchingTerms != null) {
          // few matching terms: execute as a constant-score disjunction,
          // reusing the term states so no extra seeks are needed
          assert builder == null;
          BooleanQuery.Builder bq = new BooleanQuery.Builder();
          for (TermAndState t : matchingTerms) {
            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
          }
          Query q = new ConstantScoreQuery(bq.build());
          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
          return new WeightOrDocIdSet(weight);
        } else {
          assert builder != null;
          return new WeightOrDocIdSet(builder.build());
        }
      }

      // Turns a DocIdSet into a constant-score Scorer; null when the set
      // matches no documents.
      private Scorer scorer(DocIdSet set) throws IOException {
        if (set == null) {
          return null;
        }
        final DocIdSetIterator disi = set.iterator();
        if (disi == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), disi);
      }

      @Override
      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
        if (weightOrBitSet.weight != null) {
          return weightOrBitSet.weight.bulkScorer(context);
        } else {
          final Scorer scorer = scorer(weightOrBitSet.set);
          if (scorer == null) {
            return null;
          }
          return new DefaultBulkScorer(scorer);
        }
      }

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
        if (weightOrBitSet.weight != null) {
          return weightOrBitSet.weight.scorer(context);
        } else {
          return scorer(weightOrBitSet.set);
        }
      }
    };
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.