apache · jpountz · Nov 2, 2020 · Sep 13, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java b/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
@@ -172,10 +172,12 @@ public static OrdinalMap build(IndexReader.CacheKey owner, TermsEnum subs[], lon
 
   /** Cache key of whoever asked for this awful thing */
   public final IndexReader.CacheKey owner;
+  // number of global ordinals
+  final long valueCount;
   // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the the ordinal in the first segment that contains this term
-  final PackedLongValues globalOrdDeltas;
+  final LongValues globalOrdDeltas;
   // globalOrd -> first segment container
-  final PackedLongValues firstSegments;
+  final LongValues firstSegments;
   // for every segment, segmentOrd -> globalOrd
   final LongValues segmentToGlobalOrds[];
   // the map from/to segment ids
@@ -271,13 +273,25 @@ protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) {
       globalOrd++;
     }
 
-    this.firstSegments = firstSegments.build();
-    this.globalOrdDeltas = globalOrdDeltas.build();
+    long ramBytesUsed = BASE_RAM_BYTES_USED + segmentMap.ramBytesUsed();
+    this.valueCount = globalOrd;
+
+    // If the first segment contains all of the global ords, then we can apply a small optimization
+    // and hardcode the first segment indices and global ord deltas as all zeroes.
+    if (ordDeltaBits.length > 0 && ordDeltaBits[0] == 0L && ordDeltas[0].size() == this.valueCount) {
+      this.firstSegments = LongValues.ZEROES;
+      this.globalOrdDeltas = LongValues.ZEROES;
+    } else {
+      PackedLongValues packedFirstSegments = firstSegments.build();
+      PackedLongValues packedGlobalOrdDeltas = globalOrdDeltas.build();
+      this.firstSegments = packedFirstSegments;
+      this.globalOrdDeltas = packedGlobalOrdDeltas;
+      ramBytesUsed += packedFirstSegments.ramBytesUsed() + packedGlobalOrdDeltas.ramBytesUsed();
+    }
+
     // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
     segmentToGlobalOrds = new LongValues[subs.length];
-    long ramBytesUsed = BASE_RAM_BYTES_USED + this.globalOrdDeltas.ramBytesUsed()
-      + this.firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
-      + segmentMap.ramBytesUsed();
+    ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds);
     for (int i = 0; i < ordDeltas.length; ++i) {
       final PackedLongValues deltas = ordDeltas[i].build();
       if (ordDeltaBits[i] == 0L) {
@@ -317,6 +331,7 @@ public long get(long ord) {
         ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
       }
     }
+
     this.ramBytesUsed = ramBytesUsed;
   }
 
@@ -348,7 +363,7 @@ public int getFirstSegmentNumber(long globalOrd) {
    * Returns the total number of unique terms in global ord space.
    */
   public long getValueCount() {
-    return globalOrdDeltas.size();
+    return valueCount;
   }
 
   @Override
@@ -359,10 +374,9 @@ public long ramBytesUsed() {
   @Override
   public Collection<Accountable> getChildResources() {
     List<Accountable> resources = new ArrayList<>();
-    resources.add(Accountables.namedAccountable("global ord deltas", globalOrdDeltas));
-    resources.add(Accountables.namedAccountable("first segments", firstSegments));
     resources.add(Accountables.namedAccountable("segment map", segmentMap));
-    // TODO: would be nice to return actual child segment deltas too, but the optimizations are confusing
+    // TODO: would be nice to return the ordinal and segment maps too, but it's not straightforward
+    //  because of optimizations.
     return resources;
   }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java b/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
@@ -17,10 +17,6 @@
 package org.apache.lucene.index;
 
 
-import java.io.IOException;
-import java.lang.reflect.Field;
-import java.util.HashMap;
-
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.SortedDocValuesField;
@@ -32,6 +28,10 @@
 import org.apache.lucene.util.RamUsageTester;
 import org.apache.lucene.util.TestUtil;
 
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.util.HashMap;
+
 public class TestOrdinalMap extends LuceneTestCase {
 
   private static final Field ORDINAL_MAP_OWNER_FIELD;
@@ -46,7 +46,7 @@ public class TestOrdinalMap extends LuceneTestCase {
   private static final RamUsageTester.Accumulator ORDINAL_MAP_ACCUMULATOR = new RamUsageTester.Accumulator() {
 
     public long accumulateObject(Object o, long shallowSize, java.util.Map<Field,Object> fieldValues, java.util.Collection<Object> queue) {
-      if (o == LongValues.IDENTITY) {
+      if (o == LongValues.ZEROES || o == LongValues.IDENTITY) {
         return 0L;
       }
       if (o instanceof OrdinalMap) {
@@ -95,4 +95,53 @@ public void testRamBytesUsed() throws IOException {
     dir.close();
   }
 
+  /**
+   * Tests the case where one segment contains all of the global ords. In this case, we apply a
+   * small optimization and hardcode the first segment indices and global ord deltas as all zeroes.
+   */
+  public void testOneSegmentWithAllValues() throws IOException {
+    Directory dir = newDirectory();
+    // Use a plain IndexWriter that never merges so the two-segment layout is deterministic.
+    IndexWriterConfig cfg = new IndexWriterConfig(new MockAnalyzer(random()))
+        .setCodec(TestUtil.alwaysDocValuesFormat(TestUtil.getDefaultDocValuesFormat()))
+        .setMergePolicy(NoMergePolicy.INSTANCE);
+    IndexWriter iw = new IndexWriter(dir, cfg);
+
+    int numTerms = 1000;
+    for (int i = 0; i < numTerms; ++i) {
+      Document d = new Document();
+      d.add(new SortedDocValuesField("sdv", new BytesRef(String.valueOf(i))));
+      iw.addDocument(d);
+    }
+    iw.commit(); // seals the first segment, which holds every term
+
+    for (int i = 0; i < 10; ++i) {
+      Document d = new Document();
+      d.add(new SortedDocValuesField("sdv", new BytesRef(String.valueOf(random().nextInt(numTerms)))));
+      iw.addDocument(d);
+    }
+    iw.commit();
+
+    DirectoryReader r = DirectoryReader.open(iw);
+    SortedDocValues sdv = MultiDocValues.getSortedValues(r, "sdv");
+    assertNotNull(sdv);
+    assertTrue(sdv instanceof MultiDocValues.MultiSortedDocValues);
+
+    // Check that the optimization kicks in: both fields must be the shared ZEROES singleton.
+    OrdinalMap map = ((MultiDocValues.MultiSortedDocValues) sdv).mapping;
+    assertSame(LongValues.ZEROES, map.firstSegments);
+    assertSame(LongValues.ZEROES, map.globalOrdDeltas);
+
+    // Check the map's basic behavior.
+    assertEquals(numTerms, map.getValueCount());
+    for (int i = 0; i < numTerms; i++) {
+      assertEquals(0, map.getFirstSegmentNumber(i));
+      assertEquals(i, map.getFirstSegmentOrd(i));
+    }
+
+    iw.close();
+    r.close();
+    dir.close();
+  }
+
 }