LUCENE-10633: Dynamic pruning for sorting on SORTED(_SET) fields. (#1023)

This commit enables dynamic pruning for queries sorted on SORTED(_SET) fields by using postings to filter competitive documents.
apache · Jul 29, 2022 · 261db55 · 261db55
1 parent 317e1fa
commit 261db55
Show file tree

Hide file tree

Showing 7 changed files with 900 additions and 297 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -24,6 +24,10 @@ Optimizations
 
 * GITHUB#1020: Support #scoreSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)
 
+* LUCENE-10633: Added support for dynamic pruning to queries sorted by a string
+  field that is indexed with terms and SORTED or SORTED_SET doc values.
+  (Adrien Grand)
+
 Bug Fixes
 ---------------------
 (No changes)

diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
@@ -20,7 +20,6 @@
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 
@@ -211,282 +210,6 @@ public int compareTop(int doc) throws IOException {
     }
   }
 
-  /**
-   * Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
-   * {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
-   * string to their relative ordinal positions (using the index returned by {@link
-   * org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons
-   * using the ordinals. For medium to large results, this comparator will be much faster than
-   * {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
-   * it may be slower.
-   */
-  public static class TermOrdValComparator extends FieldComparator<BytesRef>
-      implements LeafFieldComparator {
-    /* Ords for each slot.
-    @lucene.internal */
-    final int[] ords;
-
-    /* Values for each slot.
-    @lucene.internal */
-    final BytesRef[] values;
-    private final BytesRefBuilder[] tempBRs;
-
-    /* Which reader last copied a value into the slot. When
-    we compare two slots, we just compare-by-ord if the
-    readerGen is the same; else we must compare the
-    values (slower).
-    @lucene.internal */
-    final int[] readerGen;
-
-    /* Gen of current reader we are on.
-    @lucene.internal */
-    int currentReaderGen = -1;
-
-    /* Current reader's doc ord/values.
-    @lucene.internal */
-    SortedDocValues termsIndex;
-
-    private final String field;
-
-    /* Bottom slot, or -1 if queue isn't full yet
-    @lucene.internal */
-    int bottomSlot = -1;
-
-    /* Bottom ord (same as ords[bottomSlot] once bottomSlot
-    is set).  Cached for faster compares.
-    @lucene.internal */
-    int bottomOrd;
-
-    /* True if current bottom slot matches the current
-    reader.
-    @lucene.internal */
-    boolean bottomSameReader;
-
-    /* Bottom value (same as values[bottomSlot] once
-     bottomSlot is set).  Cached for faster compares.
-    @lucene.internal */
-    BytesRef bottomValue;
-
-    /** Set by setTopValue. */
-    BytesRef topValue;
-
-    boolean topSameReader;
-    int topOrd;
-
-    /** -1 if missing values are sorted first, 1 if they are sorted last */
-    final int missingSortCmp;
-
-    /** Which ordinal to use for a missing value. */
-    final int missingOrd;
-
-    /** Creates this, sorting missing values first. */
-    public TermOrdValComparator(int numHits, String field) {
-      this(numHits, field, false);
-    }
-
-    /**
-     * Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to
-     * put missing values at the end.
-     */
-    public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
-      ords = new int[numHits];
-      values = new BytesRef[numHits];
-      tempBRs = new BytesRefBuilder[numHits];
-      readerGen = new int[numHits];
-      this.field = field;
-      if (sortMissingLast) {
-        missingSortCmp = 1;
-        missingOrd = Integer.MAX_VALUE;
-      } else {
-        missingSortCmp = -1;
-        missingOrd = -1;
-      }
-    }
-
-    private int getOrdForDoc(int doc) throws IOException {
-      if (termsIndex.advanceExact(doc)) {
-        return termsIndex.ordValue();
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public int compare(int slot1, int slot2) {
-      if (readerGen[slot1] == readerGen[slot2]) {
-        return ords[slot1] - ords[slot2];
-      }
-
-      final BytesRef val1 = values[slot1];
-      final BytesRef val2 = values[slot2];
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public int compareBottom(int doc) throws IOException {
-      assert bottomSlot != -1;
-      int docOrd = getOrdForDoc(doc);
-      if (docOrd == -1) {
-        docOrd = missingOrd;
-      }
-      if (bottomSameReader) {
-        // ord is precisely comparable, even in the equal case
-        return bottomOrd - docOrd;
-      } else if (bottomOrd >= docOrd) {
-        // the equals case always means bottom is > doc
-        // (because we set bottomOrd to the lower bound in
-        // setBottom):
-        return 1;
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public void copy(int slot, int doc) throws IOException {
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
-        values[slot] = null;
-      } else {
-        assert ord >= 0;
-        if (tempBRs[slot] == null) {
-          tempBRs[slot] = new BytesRefBuilder();
-        }
-        tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
-        values[slot] = tempBRs[slot].get();
-      }
-      ords[slot] = ord;
-      readerGen[slot] = currentReaderGen;
-    }
-
-    /** Retrieves the SortedDocValues for the field in this segment */
-    protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
-        throws IOException {
-      return DocValues.getSorted(context.reader(), field);
-    }
-
-    @Override
-    public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
-      termsIndex = getSortedDocValues(context, field);
-      currentReaderGen++;
-
-      if (topValue != null) {
-        // Recompute topOrd/SameReader
-        int ord = termsIndex.lookupTerm(topValue);
-        if (ord >= 0) {
-          topSameReader = true;
-          topOrd = ord;
-        } else {
-          topSameReader = false;
-          topOrd = -ord - 2;
-        }
-      } else {
-        topOrd = missingOrd;
-        topSameReader = true;
-      }
-      // System.out.println("  getLeafComparator topOrd=" + topOrd + " topSameReader=" +
-      // topSameReader);
-
-      if (bottomSlot != -1) {
-        // Recompute bottomOrd/SameReader
-        setBottom(bottomSlot);
-      }
-
-      return this;
-    }
-
-    @Override
-    public void setBottom(final int bottom) throws IOException {
-      bottomSlot = bottom;
-
-      bottomValue = values[bottomSlot];
-      if (currentReaderGen == readerGen[bottomSlot]) {
-        bottomOrd = ords[bottomSlot];
-        bottomSameReader = true;
-      } else {
-        if (bottomValue == null) {
-          // missingOrd is null for all segments
-          assert ords[bottomSlot] == missingOrd;
-          bottomOrd = missingOrd;
-          bottomSameReader = true;
-          readerGen[bottomSlot] = currentReaderGen;
-        } else {
-          final int ord = termsIndex.lookupTerm(bottomValue);
-          if (ord < 0) {
-            bottomOrd = -ord - 2;
-            bottomSameReader = false;
-          } else {
-            bottomOrd = ord;
-            // exact value match
-            bottomSameReader = true;
-            readerGen[bottomSlot] = currentReaderGen;
-            ords[bottomSlot] = bottomOrd;
-          }
-        }
-      }
-    }
-
-    @Override
-    public void setTopValue(BytesRef value) {
-      // null is fine: it means the last doc of the prior
-      // search was missing this value
-      topValue = value;
-      // System.out.println("setTopValue " + topValue);
-    }
-
-    @Override
-    public BytesRef value(int slot) {
-      return values[slot];
-    }
-
-    @Override
-    public int compareTop(int doc) throws IOException {
-
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
-      }
-
-      if (topSameReader) {
-        // ord is precisely comparable, even in the equal
-        // case
-        // System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
-        return topOrd - ord;
-      } else if (ord <= topOrd) {
-        // the equals case always means doc is < value
-        // (because we set lastOrd to the lower bound)
-        return 1;
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public int compareValues(BytesRef val1, BytesRef val2) {
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public void setScorer(Scorable scorer) {}
-  }
-
   /**
    * Sorts by field's natural Term sort order. All comparisons are done using BytesRef.compareTo,
    * which is slow for medium to large result sets but possibly very fast for very small results

diff --git a/lucene/core/src/java/org/apache/lucene/search/SortField.java b/lucene/core/src/java/org/apache/lucene/search/SortField.java
@@ -31,6 +31,7 @@
 import org.apache.lucene.search.comparators.FloatComparator;
 import org.apache.lucene.search.comparators.IntComparator;
 import org.apache.lucene.search.comparators.LongComparator;
+import org.apache.lucene.search.comparators.TermOrdValComparator;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;
@@ -536,8 +537,7 @@ public FieldComparator<?> getComparator(final int numHits, boolean enableSkippin
         break;
 
       case STRING:
-        return new FieldComparator.TermOrdValComparator(
-            numHits, field, missingValue == STRING_LAST);
+        return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);
 
       case STRING_VAL:
         fieldComparator =

diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.index.SortFieldProvider;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.search.comparators.TermOrdValComparator;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 
@@ -178,8 +179,7 @@ public void setMissingValue(Object missingValue) {
 
   @Override
   public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
-    return new FieldComparator.TermOrdValComparator(
-        numHits, getField(), missingValue == STRING_LAST) {
+    return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
       @Override
       protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
           throws IOException {