Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. #13327

Merged
merged 9 commits into from
May 13, 2024
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ Optimizations
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
to count. (Greg Miller)

* GITHUB#13327: Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. (Bruno Roustant, David Smiley)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
Expand All @@ -35,10 +35,11 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntObjectHashMap;

/**
* A block-based terms index and dictionary that assigns terms to variable length blocks according
Expand Down Expand Up @@ -113,7 +114,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;

private final Map<String, FieldReader> fieldMap;
private final FieldInfos fieldInfos;
private final IntObjectHashMap<FieldReader> fieldMap;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR proposes to leverage the existing field-name -> FieldInfo map in FieldInfos so that we do not repeat the references to the field name strings here. Instead we use the field number (specific to the FieldInfos) as the key, which lets us use a compact primitive map.

Then below, in terms(String fieldName), we can use FieldInfos.fieldInfo(String fieldName) as a first mapping to the field number, and then use this compact map to get the Terms.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is what Lucene90PointsReader does today as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. It could benefit from the IntObjectHashMap.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there are many usages of Map<Integer, Object>. I could open some PRs for the places where memory (and performance) matter, once IntObjectHashMap is in.

private final List<String> fieldList;

final String segment;
Expand Down Expand Up @@ -157,7 +159,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// Read per-field details
String metaName =
IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
IntObjectHashMap<FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) {
Expand All @@ -175,7 +177,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, metaIn);
}
fieldMap = CollectionUtil.newHashMap(numFields);
fieldMap = new IntObjectHashMap<>(numFields);
for (int i = 0; i < numFields; ++i) {
final int field = metaIn.readVInt();
final long numTerms = metaIn.readVLong();
Expand Down Expand Up @@ -216,7 +218,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
final long indexStartFP = metaIn.readVLong();
FieldReader previous =
fieldMap.put(
fieldInfo.name,
fieldInfo.number,
new FieldReader(
this,
fieldInfo,
Expand Down Expand Up @@ -250,10 +252,9 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
fieldInfos = state.fieldInfos;
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
this.fieldList = sortFieldNames(fieldMap, state.fieldInfos);
success = true;
} finally {
if (!success) {
Expand All @@ -277,6 +278,16 @@ private static BytesRef readBytesRef(IndexInput in) throws IOException {
return bytes;
}

/**
 * Resolves each field-number key of {@code fieldMap} to its field name via {@code fieldInfos} and
 * returns the names in natural (ascending) order as an unmodifiable list.
 */
private static List<String> sortFieldNames(
    IntObjectHashMap<FieldReader> fieldMap, FieldInfos fieldInfos) {
  ArrayList<String> sortedNames = new ArrayList<>(fieldMap.size());
  for (IntCursor numberCursor : fieldMap.keys()) {
    sortedNames.add(fieldInfos.fieldInfo(numberCursor.value).name);
  }
  Collections.sort(sortedNames);
  return Collections.unmodifiableList(sortedNames);
}

// for debugging
// private static String toHex(int v) {
// return "0x" + Integer.toHexString(v);
Expand All @@ -301,7 +312,8 @@ public Iterator<String> iterator() {
@Override
public Terms terms(String field) throws IOException {
assert field != null;
return fieldMap.get(field);
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
return fieldInfo == null ? null : fieldMap.get(fieldInfo.number);
}

@Override
Expand Down
88 changes: 52 additions & 36 deletions lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
import static org.apache.lucene.index.FieldInfo.verifySameStoreTermVectors;
import static org.apache.lucene.index.FieldInfo.verifySameVectorOptions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand All @@ -34,7 +33,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;

/**
* Collection of {@link FieldInfo}s (accessible by number or by name).
Expand Down Expand Up @@ -62,11 +61,15 @@ public class FieldInfos implements Iterable<FieldInfo> {

// used only by fieldInfo(int)
private final FieldInfo[] byNumber;
private final HashMap<String, FieldInfo> byName;

private final HashMap<String, FieldInfo> byName = new HashMap<>();
private final Collection<FieldInfo> values; // for an unmodifiable iterator
/** Iterator in ascending order of field number. */
private final Collection<FieldInfo> values;

/** Constructs a new FieldInfos from an array of FieldInfo objects */
/**
* Constructs a new FieldInfos from an array of FieldInfo objects. The array can be used directly
* as the backing structure.
*/
public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false;
boolean hasPostings = false;
Expand All @@ -81,30 +84,21 @@ public FieldInfos(FieldInfo[] infos) {
String softDeletesField = null;
String parentField = null;

int size = 0; // number of elements in byNumberTemp, number of used array slots
FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
byName = CollectionUtil.newHashMap(infos.length);
int maxFieldNumber = -1;
boolean fieldNumberStrictlyAscending = true;
for (FieldInfo info : infos) {
if (info.number < 0) {
int fieldNumber = info.number;
if (fieldNumber < 0) {
throw new IllegalArgumentException(
"illegal field number: " + info.number + " for field " + info.name);
}
size = info.number >= size ? info.number + 1 : size;
dsmiley marked this conversation as resolved.
Show resolved Hide resolved
if (info.number >= byNumberTemp.length) { // grow array
byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
}
FieldInfo previous = byNumberTemp[info.number];
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ info.name
+ " have: "
+ info.number);
if (maxFieldNumber < fieldNumber) {
maxFieldNumber = fieldNumber;
} else {
fieldNumberStrictlyAscending = false;
}
byNumberTemp[info.number] = info;

previous = byName.put(info.name, info);
FieldInfo previous = byName.put(info.name, info);
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field names: "
Expand Down Expand Up @@ -156,15 +150,40 @@ public FieldInfos(FieldInfo[] infos) {
this.softDeletesField = softDeletesField;
this.parentField = parentField;

List<FieldInfo> valuesTemp = new ArrayList<>(infos.length);
byNumber = new FieldInfo[size];
for (int i = 0; i < size; i++) {
byNumber[i] = byNumberTemp[i];
if (byNumberTemp[i] != null) {
valuesTemp.add(byNumberTemp[i]);
if (fieldNumberStrictlyAscending && maxFieldNumber == infos.length - 1) {
// The input FieldInfo[] contains all fields numbered from 0 to infos.length - 1, and they are
// sorted, use it directly. This is an optimization when reading a segment with all fields
// since the FieldInfo[] is sorted.
byNumber = infos;
values = Arrays.asList(byNumber);
} else {
byNumber = new FieldInfo[maxFieldNumber + 1];
for (FieldInfo fieldInfo : infos) {
FieldInfo existing = byNumber[fieldInfo.number];
if (existing != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ existing.name
+ " and "
+ fieldInfo.name
+ " have: "
+ fieldInfo.number);
}
byNumber[fieldInfo.number] = fieldInfo;
}
if (maxFieldNumber == infos.length - 1) {
// No fields are missing, use byNumber.
values = Arrays.asList(byNumber);
} else {
if (!fieldNumberStrictlyAscending) {
// The below code is faster than
// Arrays.stream(byNumber).filter(Objects::nonNull).toList(),
// mainly when the input FieldInfo[] is small compared to maxFieldNumber.
Arrays.sort(infos, (fi1, fi2) -> Integer.compare(fi1.number, fi2.number));
}
values = Arrays.asList(infos);
}
}
values = Collections.unmodifiableCollection(valuesTemp);
}

/**
Expand Down Expand Up @@ -323,10 +342,7 @@ public FieldInfo fieldInfo(int fieldNumber) {
if (fieldNumber < 0) {
throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
}
if (fieldNumber >= byNumber.length) {
return null;
}
return byNumber[fieldNumber];
return fieldNumber >= byNumber.length ? null : byNumber[fieldNumber];
}

static final class FieldDimensions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.util.Arrays;
import org.apache.lucene.util.hppc.BitMixer;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntIntHashMap;

/**
Expand Down Expand Up @@ -94,7 +95,7 @@ int[] getArray() {
}
arrayCache = new int[inner.size()];
int i = 0;
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
arrayCache[i++] = cursor.value;
}
// we need to sort this array since "equals" method depend on this
Expand All @@ -114,7 +115,7 @@ long longHashCode() {
return hashCode;
}
hashCode = inner.size();
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
hashCode += BitMixer.mix(cursor.value);
}
hashUpdated = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.util.hppc;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
* Simplifies the implementation of iterators a bit. Modeled loosely after Google Guava's API.
*
* <p>Forked from com.carrotsearch.hppc.AbstractIterator
*/
public abstract class AbstractIterator<E> implements Iterator<E> {
private static final int NOT_CACHED = 0;
private static final int CACHED = 1;
private static final int AT_END = 2;

/** Current iterator state. */
private int state = NOT_CACHED;

/** The next element to be returned from {@link #next()} if fetched. */
private E nextElement;

@Override
public boolean hasNext() {
if (state == NOT_CACHED) {
state = CACHED;
nextElement = fetch();
}
return state == CACHED;
}

@Override
public E next() {
if (!hasNext()) {
throw new NoSuchElementException();
}

state = NOT_CACHED;
return nextElement;
}

/** Default implementation throws {@link UnsupportedOperationException}. */
@Override
public void remove() {
throw new UnsupportedOperationException();
}

/**
* Fetch next element. The implementation must return {@link #done()} when all elements have been
* fetched.
*
* @return Returns the next value for the iterator or chain-calls {@link #done()}.
*/
protected abstract E fetch();

/**
* Call when done.
*
* @return Returns a unique sentinel value to indicate end-of-iteration.
*/
protected final E done() {
state = AT_END;
return null;
}
}
Loading