LUCENE-9353: Move terms metadata to its own file. #1473

Merged: 10 commits, Jun 16, 2020
lucene/CHANGES.txt: 4 additions & 0 deletions
@@ -170,6 +170,10 @@ Improvements
* LUCENE-9342: TotalHits' relation will be EQUAL_TO when the number of hits is lower than TopDocsCollector's numHits
(Tomás Fernández Löbbe)

* LUCENE-9353: Metadata of the terms dictionary moved to its own file, with the
`.tmd` extension. This allows checksums of metadata to be verified when
opening indices. (Adrien Grand)

Optimizations
---------------------

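For context, here is a minimal sketch of the open-time verification that the new `.tmd` file enables. It mirrors the reader changes below: open a checksummed input, check the header, decode the payload, and verify the footer checksum even if decoding failed. The helper class and method are illustrative, not code from this PR; only the CodecUtil/Directory calls and the "BlockTreeTermsMeta"/"tmd" names come from the change itself.

```java
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.ChecksumIndexInput;

final class TermsMetaReadSketch { // illustrative helper, not part of the PR
  static void readVerifiedMeta(SegmentReadState state, int version) throws IOException {
    String metaName = IndexFileNames.segmentFileName(
        state.segmentInfo.name, state.segmentSuffix, "tmd");
    Throwable priorE = null;
    try (ChecksumIndexInput metaIn =
             state.directory.openChecksumInput(metaName, state.context)) {
      try {
        CodecUtil.checkIndexHeader(metaIn, "BlockTreeTermsMeta", version, version,
            state.segmentInfo.getId(), state.segmentSuffix);
        // ... decode per-field terms metadata from metaIn here ...
      } catch (Throwable e) {
        priorE = e; // remember the failure, still verify the checksum below
      } finally {
        // Validates the checksum over the whole file; if priorE is non-null it
        // is rethrown with checksum information attached, so real corruption is
        // reported instead of an arbitrary decoding error.
        CodecUtil.checkFooter(metaIn, priorE);
      }
    }
  }
}
```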
lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs.blocktree;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -35,6 +34,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;

/** Metadata is written to its own file. */
public static final int VERSION_META_FILE = 6;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
public static final int VERSION_CURRENT = VERSION_META_FILE;

/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";

/** Extension of terms meta file */
static final String TERMS_META_EXTENSION = "tmd";
final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";

// Open input to the main terms dict file (_X.tim)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
@@ -128,9 +135,9 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState

this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;

String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);

try {
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);

@@ -148,56 +155,80 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState
CodecUtil.retrieveChecksum(termsIn);

// Read per-field details
seekDir(termsIn);
seekDir(indexIn);
String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
Throwable priorE = null;
try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) {
try {
final IndexInput indexMetaIn, termsMetaIn;
if (version >= VERSION_META_FILE) {
CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
indexMetaIn = termsMetaIn = metaIn;
} else {
seekDir(termsIn);
seekDir(indexIn);
indexMetaIn = indexIn;
termsMetaIn = termsIn;
}

final int numFields = termsIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
}
final BytesRef rootCode = readBytesRef(termsIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsIn);
}
final long sumTotalTermFreq = termsIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong();
final int docCount = termsIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
final int numFields = termsMetaIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsMetaIn.readVInt();
final long numTerms = termsMetaIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn);
}
final BytesRef rootCode = readBytesRef(termsMetaIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsMetaIn);
}
final long sumTotalTermFreq = termsMetaIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong();
final int docCount = termsMetaIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsMetaIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn);
}
}
BytesRef minTerm = readBytesRef(termsMetaIn);
BytesRef maxTerm = readBytesRef(termsMetaIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn);
[Review comment, Contributor] do we have a test that tickles these cases?

[Reply, Author] Not directly, and these things are hard to test, though I agree we could do better. I opened https://issues.apache.org/jira/browse/LUCENE-9356 to try to improve the coverage of these code paths.

[Reply, Contributor] Thanks, Adrien
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn);
}
final long indexStartFP = indexMetaIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn);
}
}
} catch (Throwable exception) {
priorE = exception;
} finally {
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, priorE);
[Review comment, Contributor] again, I don't know if we have test coverage for the corrupted metadata?
} else if (priorE != null) {
IOUtils.rethrowAlways(priorE);
}
}
BytesRef minTerm = readBytesRef(termsIn);
BytesRef maxTerm = readBytesRef(termsIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {
lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -211,6 +211,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {

//private final static boolean SAVE_DOT_FILES = false;

private final SegmentWriteState state;
private final IndexOutput termsOut;
private final IndexOutput indexOut;
final int maxDoc;
@@ -262,6 +263,7 @@ public BlockTreeTermsWriter(SegmentWriteState state,
validateSettings(minItemsInBlock,
maxItemsInBlock);

this.state = state;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;

@@ -294,16 +296,6 @@ public BlockTreeTermsWriter(SegmentWriteState state,
}
}

/** Writes the terms file trailer. */
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}

/** Writes the index file trailer. */
private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
indexOut.writeLong(dirStart);
}

/** Throws {@code IllegalArgumentException} if any of these settings
* is invalid. */
public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) {
@@ -1060,36 +1052,35 @@ public void close() throws IOException {
return;
}
closed = true;


final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_META_EXTENSION);
boolean success = false;
try {

final long dirStart = termsOut.getFilePointer();
final long indexDirStart = indexOut.getFilePointer();
try (IndexOutput metaOut = state.directory.createOutput(metaName, state.context)) {

[Review comment, Contributor] Why is this file not created at the same time as indexOut and termsOut?

[Reply, Author] That would work too. I like keeping the index output open for as little time as possible when it doesn't make things worse otherwise.

CodecUtil.writeIndexHeader(metaOut, BlockTreeTermsReader.TERMS_META_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);

termsOut.writeVInt(fields.size());
metaOut.writeVInt(fields.size());

[Review comment, Contributor] @jpountz here I see the same lack of a dedicated serializer with paired write/read code. Could we have such a thing? It would improve readability and unit testing: one could mock the field metadata and check that serialization is applied correctly.

[Reply, Author] Your proposal sounds orthogonal to this pull request to me?

[Reply, Contributor] Yes, but maybe this is the opportunity to move the code and add more testability to BlockTreeTermsReader.java.

[Reply, Author] I prefer decoupling changes when possible.


for(FieldMetaData field : fields) {
//System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
termsOut.writeVInt(field.fieldInfo.number);
metaOut.writeVInt(field.fieldInfo.number);
assert field.numTerms > 0;
termsOut.writeVLong(field.numTerms);
termsOut.writeVInt(field.rootCode.length);
termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
metaOut.writeVLong(field.numTerms);
metaOut.writeVInt(field.rootCode.length);
metaOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
assert field.fieldInfo.getIndexOptions() != IndexOptions.NONE;
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
termsOut.writeVLong(field.sumTotalTermFreq);
metaOut.writeVLong(field.sumTotalTermFreq);
}
termsOut.writeVLong(field.sumDocFreq);
termsOut.writeVInt(field.docCount);
indexOut.writeVLong(field.indexStartFP);
writeBytesRef(termsOut, field.minTerm);
writeBytesRef(termsOut, field.maxTerm);
metaOut.writeVLong(field.sumDocFreq);
metaOut.writeVInt(field.docCount);
writeBytesRef(metaOut, field.minTerm);
writeBytesRef(metaOut, field.maxTerm);
metaOut.writeVLong(field.indexStartFP);
}
writeTrailer(termsOut, dirStart);
CodecUtil.writeFooter(termsOut);
writeIndexTrailer(indexOut, indexDirStart);
CodecUtil.writeFooter(indexOut);
CodecUtil.writeFooter(metaOut);
success = true;
} finally {
if (success) {
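Two review comments above ask about test coverage for corrupted metadata, which LUCENE-9356 tracks. Below is a minimal, hypothetical sketch of such a test using only standard Lucene APIs; the class name and the byte-flipping choice are illustrative, compound files are disabled so the per-segment `.tmd` file exists on disk, and it assumes the post-change behavior that the metadata checksum is verified when the index is opened.

```java
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class CorruptTermsMetaSketch { // hypothetical test sketch, not part of the PR
  public static void main(String[] args) throws Exception {
    Path path = Files.createTempDirectory("tmd-corruption");
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    config.setUseCompoundFile(false); // keep the .tmd file standalone on disk
    try (FSDirectory dir = FSDirectory.open(path);
         IndexWriter writer = new IndexWriter(dir, config)) {
      Document doc = new Document();
      doc.add(new TextField("body", "hello world", Field.Store.NO));
      writer.addDocument(doc);
      writer.commit();
    }
    // Flip one byte in each terms metadata file to simulate on-disk corruption.
    try (DirectoryStream<Path> tmdFiles = Files.newDirectoryStream(path, "*.tmd")) {
      for (Path tmd : tmdFiles) {
        byte[] bytes = Files.readAllBytes(tmd);
        bytes[bytes.length / 2] ^= 0x42;
        Files.write(tmd, bytes);
      }
    }
    // With metadata checksummed at open time, this should now fail, typically
    // with CorruptIndexException (or a decoding error carrying checksum info).
    try (FSDirectory dir = FSDirectory.open(path)) {
      DirectoryReader.open(dir).close();
      System.out.println("unexpected: index opened despite corruption");
    } catch (Exception expected) {
      System.out.println("caught as expected: " + expected);
    }
  }
}
```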