Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. #13327

Merged
merged 9 commits into from
May 13, 2024
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ Optimizations
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
to count. (Greg Miller)

* GITHUB#13327: Reduce memory usage of field maps in FieldInfos and BlockTree TermsReader. (Bruno Roustant, David Smiley)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
Expand All @@ -35,10 +35,11 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntObjectHashMap;

/**
* A block-based terms index and dictionary that assigns terms to variable length blocks according
Expand Down Expand Up @@ -113,7 +114,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;

private final Map<String, FieldReader> fieldMap;
private final FieldInfos fieldInfos;
private final IntObjectHashMap<FieldReader> fieldMap;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR proposes to leverage the existing field-name -> FieldInfo map in FieldInfos so that we do not repeat the references to the field name strings here. Instead we use the field number (specific to the FieldInfos) as the key, which lets us use a compact primitive map.

Then below, in terms(String fieldName), we can use FieldInfos.fieldInfo(String fieldName) as a first mapping to the field number, and then use this compact map to get the Terms.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is what Lucene90PointsReader does today as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. It could benefit from the IntObjectHashMap.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there are many usages of Map<Integer, Object>. I could open some PRs for the places where memory (and performance) matter, once IntObjectHashMap is in.

private final List<String> fieldList;

final String segment;
Expand Down Expand Up @@ -157,7 +159,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// Read per-field details
String metaName =
IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
IntObjectHashMap<FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) {
Expand All @@ -175,7 +177,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, metaIn);
}
fieldMap = CollectionUtil.newHashMap(numFields);
fieldMap = new IntObjectHashMap<>(numFields);
for (int i = 0; i < numFields; ++i) {
final int field = metaIn.readVInt();
final long numTerms = metaIn.readVLong();
Expand Down Expand Up @@ -216,7 +218,7 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
final long indexStartFP = metaIn.readVLong();
FieldReader previous =
fieldMap.put(
fieldInfo.name,
fieldInfo.number,
new FieldReader(
this,
fieldInfo,
Expand Down Expand Up @@ -250,10 +252,9 @@ public Lucene90BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe
// correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
fieldInfos = state.fieldInfos;
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
this.fieldList = sortFieldNames(fieldMap, state.fieldInfos);
success = true;
} finally {
if (!success) {
Expand All @@ -277,6 +278,16 @@ private static BytesRef readBytesRef(IndexInput in) throws IOException {
return bytes;
}

/**
 * Resolves each field-number key of {@code fieldMap} to its field name via {@code fieldInfos} and
 * returns the names in natural (ascending) order as an unmodifiable list.
 */
private static List<String> sortFieldNames(
    IntObjectHashMap<FieldReader> fieldMap, FieldInfos fieldInfos) {
  ArrayList<String> sortedNames = new ArrayList<>(fieldMap.size());
  for (IntCursor numberCursor : fieldMap.keys()) {
    sortedNames.add(fieldInfos.fieldInfo(numberCursor.value).name);
  }
  Collections.sort(sortedNames);
  return Collections.unmodifiableList(sortedNames);
}

// for debugging
// private static String toHex(int v) {
// return "0x" + Integer.toHexString(v);
Expand All @@ -301,7 +312,8 @@ public Iterator<String> iterator() {
@Override
public Terms terms(String field) throws IOException {
assert field != null;
return fieldMap.get(field);
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
return fieldInfo == null ? null : fieldMap.get(fieldInfo.number);
}

@Override
Expand Down
88 changes: 52 additions & 36 deletions lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
import static org.apache.lucene.index.FieldInfo.verifySameStoreTermVectors;
import static org.apache.lucene.index.FieldInfo.verifySameVectorOptions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand All @@ -34,7 +33,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;

/**
* Collection of {@link FieldInfo}s (accessible by number or by name).
Expand Down Expand Up @@ -62,11 +61,15 @@ public class FieldInfos implements Iterable<FieldInfo> {

// used only by fieldInfo(int)
private final FieldInfo[] byNumber;
private final HashMap<String, FieldInfo> byName;

private final HashMap<String, FieldInfo> byName = new HashMap<>();
private final Collection<FieldInfo> values; // for an unmodifiable iterator
/** Iterator in ascending order of field number. */
private final Collection<FieldInfo> values;

/** Constructs a new FieldInfos from an array of FieldInfo objects */
/**
* Constructs a new FieldInfos from an array of FieldInfo objects. The array can be used directly
* as the backing structure.
*/
public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false;
boolean hasPostings = false;
Expand All @@ -81,30 +84,21 @@ public FieldInfos(FieldInfo[] infos) {
String softDeletesField = null;
String parentField = null;

int size = 0; // number of elements in byNumberTemp, number of used array slots
FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
byName = CollectionUtil.newHashMap(infos.length);
int maxFieldNumber = -1;
boolean fieldNumberStrictlyAscending = true;
for (FieldInfo info : infos) {
if (info.number < 0) {
int fieldNumber = info.number;
if (fieldNumber < 0) {
throw new IllegalArgumentException(
"illegal field number: " + info.number + " for field " + info.name);
}
size = info.number >= size ? info.number + 1 : size;
dsmiley marked this conversation as resolved.
Show resolved Hide resolved
if (info.number >= byNumberTemp.length) { // grow array
byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
}
FieldInfo previous = byNumberTemp[info.number];
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ previous.name
+ " and "
+ info.name
+ " have: "
+ info.number);
if (maxFieldNumber < fieldNumber) {
maxFieldNumber = fieldNumber;
} else {
fieldNumberStrictlyAscending = false;
}
byNumberTemp[info.number] = info;

previous = byName.put(info.name, info);
FieldInfo previous = byName.put(info.name, info);
if (previous != null) {
throw new IllegalArgumentException(
"duplicate field names: "
Expand Down Expand Up @@ -156,15 +150,40 @@ public FieldInfos(FieldInfo[] infos) {
this.softDeletesField = softDeletesField;
this.parentField = parentField;

List<FieldInfo> valuesTemp = new ArrayList<>(infos.length);
byNumber = new FieldInfo[size];
for (int i = 0; i < size; i++) {
byNumber[i] = byNumberTemp[i];
if (byNumberTemp[i] != null) {
valuesTemp.add(byNumberTemp[i]);
if (fieldNumberStrictlyAscending && maxFieldNumber == infos.length - 1) {
// The input FieldInfo[] contains all fields numbered from 0 to infos.length - 1, and they are
// sorted, use it directly. This is an optimization when reading a segment with all fields
// since the FieldInfo[] is sorted.
byNumber = infos;
values = Arrays.asList(byNumber);
} else {
byNumber = new FieldInfo[maxFieldNumber + 1];
for (FieldInfo fieldInfo : infos) {
FieldInfo existing = byNumber[fieldInfo.number];
if (existing != null) {
throw new IllegalArgumentException(
"duplicate field numbers: "
+ existing.name
+ " and "
+ fieldInfo.name
+ " have: "
+ fieldInfo.number);
}
byNumber[fieldInfo.number] = fieldInfo;
}
if (maxFieldNumber == infos.length - 1) {
// No fields are missing, use byNumber.
values = Arrays.asList(byNumber);
} else {
if (!fieldNumberStrictlyAscending) {
// The below code is faster than
// Arrays.stream(byNumber).filter(Objects::nonNull).toList(),
// mainly when the input FieldInfo[] is small compared to maxFieldNumber.
Arrays.sort(infos, (fi1, fi2) -> Integer.compare(fi1.number, fi2.number));
}
values = Arrays.asList(infos);
}
}
values = Collections.unmodifiableCollection(valuesTemp);
}

/**
Expand Down Expand Up @@ -323,10 +342,7 @@ public FieldInfo fieldInfo(int fieldNumber) {
if (fieldNumber < 0) {
throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
}
if (fieldNumber >= byNumber.length) {
return null;
}
return byNumber[fieldNumber];
return fieldNumber >= byNumber.length ? null : byNumber[fieldNumber];
}

static final class FieldDimensions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.util.Arrays;
import org.apache.lucene.util.hppc.BitMixer;
import org.apache.lucene.util.hppc.IntCursor;
import org.apache.lucene.util.hppc.IntIntHashMap;

/**
Expand Down Expand Up @@ -94,7 +95,7 @@ int[] getArray() {
}
arrayCache = new int[inner.size()];
int i = 0;
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
arrayCache[i++] = cursor.value;
}
// we need to sort this array since "equals" method depend on this
Expand All @@ -114,7 +115,7 @@ long longHashCode() {
return hashCode;
}
hashCode = inner.size();
for (IntIntHashMap.IntCursor cursor : inner.keys()) {
for (IntCursor cursor : inner.keys()) {
hashCode += BitMixer.mix(cursor.value);
}
hashUpdated = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.util.hppc;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
* Simplifies the implementation of iterators a bit. Modeled loosely after Google Guava's API.
*
* <p>Forked from com.carrotsearch.hppc.AbstractIterator
*/
public abstract class AbstractIterator<E> implements Iterator<E> {
private static final int NOT_CACHED = 0;
private static final int CACHED = 1;
private static final int AT_END = 2;

/** Current iterator state. */
private int state = NOT_CACHED;

/** The next element to be returned from {@link #next()} if fetched. */
private E nextElement;

@Override
public boolean hasNext() {
if (state == NOT_CACHED) {
state = CACHED;
nextElement = fetch();
}
return state == CACHED;
}

@Override
public E next() {
if (!hasNext()) {
throw new NoSuchElementException();
}

state = NOT_CACHED;
return nextElement;
}

/** Default implementation throws {@link UnsupportedOperationException}. */
@Override
public void remove() {
throw new UnsupportedOperationException();
}

/**
* Fetch next element. The implementation must return {@link #done()} when all elements have been
* fetched.
*
* @return Returns the next value for the iterator or chain-calls {@link #done()}.
*/
protected abstract E fetch();

/**
* Call when done.
*
* @return Returns a unique sentinel value to indicate end-of-iteration.
*/
protected final E done() {
state = AT_END;
return null;
}
}
Loading