-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
LUCENE-9353: Move terms metadata to its own file. #1473
Changes from 2 commits
6edfcfa
9a4df99
9faa630
10b8c13
7587f1c
06e381a
84208aa
db5a8f2
3e225d8
8b8417e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,6 @@ | |
*/ | ||
package org.apache.lucene.codecs.blocktree; | ||
|
||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
|
@@ -35,6 +34,7 @@ | |
import org.apache.lucene.index.IndexOptions; | ||
import org.apache.lucene.index.SegmentReadState; | ||
import org.apache.lucene.index.Terms; | ||
import org.apache.lucene.store.ChecksumIndexInput; | ||
import org.apache.lucene.store.IndexInput; | ||
import org.apache.lucene.util.Accountable; | ||
import org.apache.lucene.util.Accountables; | ||
|
@@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer { | |
/** Suffixes are compressed to save space. */ | ||
public static final int VERSION_COMPRESSED_SUFFIXES = 5; | ||
|
||
/** Metadata is written to its own file. */ | ||
public static final int VERSION_META_FILE = 6; | ||
|
||
/** Current terms format. */ | ||
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES; | ||
public static final int VERSION_CURRENT = VERSION_META_FILE; | ||
|
||
/** Extension of terms index file */ | ||
static final String TERMS_INDEX_EXTENSION = "tip"; | ||
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex"; | ||
|
||
/** Extension of terms meta file */ | ||
static final String TERMS_META_EXTENSION = "tmd"; | ||
final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta"; | ||
|
||
// Open input to the main terms dict file (_X.tib) | ||
final IndexInput termsIn; | ||
// Open input to the terms index file (_X.tip) | ||
|
@@ -128,9 +135,9 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState | |
|
||
this.postingsReader = postingsReader; | ||
this.segment = state.segmentInfo.name; | ||
|
||
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); | ||
|
||
try { | ||
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); | ||
termsIn = state.directory.openInput(termsName, state.context); | ||
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); | ||
|
||
|
@@ -148,56 +155,80 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState | |
CodecUtil.retrieveChecksum(termsIn); | ||
|
||
// Read per-field details | ||
seekDir(termsIn); | ||
seekDir(indexIn); | ||
String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION); | ||
Map<String, FieldReader> fieldMap = null; | ||
Throwable priorE = null; | ||
try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) { | ||
try { | ||
final IndexInput indexMetaIn, termsMetaIn; | ||
if (version >= VERSION_META_FILE) { | ||
CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix); | ||
indexMetaIn = termsMetaIn = metaIn; | ||
} else { | ||
seekDir(termsIn); | ||
seekDir(indexIn); | ||
indexMetaIn = indexIn; | ||
termsMetaIn = termsIn; | ||
} | ||
|
||
final int numFields = termsIn.readVInt(); | ||
if (numFields < 0) { | ||
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn); | ||
} | ||
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1); | ||
for (int i = 0; i < numFields; ++i) { | ||
final int field = termsIn.readVInt(); | ||
final long numTerms = termsIn.readVLong(); | ||
if (numTerms <= 0) { | ||
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn); | ||
} | ||
final BytesRef rootCode = readBytesRef(termsIn); | ||
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); | ||
if (fieldInfo == null) { | ||
throw new CorruptIndexException("invalid field number: " + field, termsIn); | ||
} | ||
final long sumTotalTermFreq = termsIn.readVLong(); | ||
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written. | ||
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong(); | ||
final int docCount = termsIn.readVInt(); | ||
if (version < VERSION_META_LONGS_REMOVED) { | ||
final int longsSize = termsIn.readVInt(); | ||
if (longsSize < 0) { | ||
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); | ||
final int numFields = termsMetaIn.readVInt(); | ||
if (numFields < 0) { | ||
throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn); | ||
} | ||
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1); | ||
for (int i = 0; i < numFields; ++i) { | ||
final int field = termsMetaIn.readVInt(); | ||
final long numTerms = termsMetaIn.readVLong(); | ||
if (numTerms <= 0) { | ||
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn); | ||
} | ||
final BytesRef rootCode = readBytesRef(termsMetaIn); | ||
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); | ||
if (fieldInfo == null) { | ||
throw new CorruptIndexException("invalid field number: " + field, termsMetaIn); | ||
} | ||
final long sumTotalTermFreq = termsMetaIn.readVLong(); | ||
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written. | ||
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong(); | ||
final int docCount = termsMetaIn.readVInt(); | ||
if (version < VERSION_META_LONGS_REMOVED) { | ||
final int longsSize = termsMetaIn.readVInt(); | ||
if (longsSize < 0) { | ||
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn); | ||
} | ||
} | ||
BytesRef minTerm = readBytesRef(termsMetaIn); | ||
BytesRef maxTerm = readBytesRef(termsMetaIn); | ||
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs | ||
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn); | ||
} | ||
if (sumDocFreq < docCount) { // #postings must be >= #docs with field | ||
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn); | ||
} | ||
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings | ||
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn); | ||
} | ||
final long indexStartFP = indexMetaIn.readVLong(); | ||
FieldReader previous = fieldMap.put(fieldInfo.name, | ||
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, | ||
indexStartFP, indexIn, minTerm, maxTerm)); | ||
if (previous != null) { | ||
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn); | ||
} | ||
} | ||
} catch (Throwable exception) { | ||
priorE = exception; | ||
} finally { | ||
if (metaIn != null) { | ||
CodecUtil.checkFooter(metaIn, priorE); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. again, I don't know if we have test coverage for the corrupted metadata? |
||
} else if (priorE != null) { | ||
IOUtils.rethrowAlways(priorE); | ||
} | ||
} | ||
BytesRef minTerm = readBytesRef(termsIn); | ||
BytesRef maxTerm = readBytesRef(termsIn); | ||
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs | ||
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn); | ||
} | ||
if (sumDocFreq < docCount) { // #postings must be >= #docs with field | ||
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn); | ||
} | ||
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings | ||
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn); | ||
} | ||
final long indexStartFP = indexIn.readVLong(); | ||
FieldReader previous = fieldMap.put(fieldInfo.name, | ||
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, | ||
indexStartFP, indexIn, minTerm, maxTerm)); | ||
if (previous != null) { | ||
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn); | ||
} | ||
} | ||
List<String> fieldList = new ArrayList<>(fieldMap.keySet()); | ||
fieldList.sort(null); | ||
this.fieldMap = fieldMap; | ||
this.fieldList = Collections.unmodifiableList(fieldList); | ||
success = true; | ||
} finally { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -211,6 +211,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { | |
|
||
//private final static boolean SAVE_DOT_FILES = false; | ||
|
||
private final SegmentWriteState state; | ||
private final IndexOutput termsOut; | ||
private final IndexOutput indexOut; | ||
final int maxDoc; | ||
|
@@ -262,6 +263,7 @@ public BlockTreeTermsWriter(SegmentWriteState state, | |
validateSettings(minItemsInBlock, | ||
maxItemsInBlock); | ||
|
||
this.state = state; | ||
this.minItemsInBlock = minItemsInBlock; | ||
this.maxItemsInBlock = maxItemsInBlock; | ||
|
||
|
@@ -294,16 +296,6 @@ public BlockTreeTermsWriter(SegmentWriteState state, | |
} | ||
} | ||
|
||
/** Writes the terms file trailer. */ | ||
private void writeTrailer(IndexOutput out, long dirStart) throws IOException { | ||
out.writeLong(dirStart); | ||
} | ||
|
||
/** Writes the index file trailer. */ | ||
private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException { | ||
indexOut.writeLong(dirStart); | ||
} | ||
|
||
/** Throws {@code IllegalArgumentException} if any of these settings | ||
* is invalid. */ | ||
public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) { | ||
|
@@ -1060,36 +1052,35 @@ public void close() throws IOException { | |
return; | ||
} | ||
closed = true; | ||
|
||
|
||
final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_META_EXTENSION); | ||
boolean success = false; | ||
try { | ||
|
||
final long dirStart = termsOut.getFilePointer(); | ||
final long indexDirStart = indexOut.getFilePointer(); | ||
try (IndexOutput metaOut = state.directory.createOutput(metaName, state.context)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this file not created at the same time as the indexOut and termsOut? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would work too. I like keeping the index output open for as little time as possible when it doesn't make things worse otherwise. |
||
CodecUtil.writeIndexHeader(metaOut, BlockTreeTermsReader.TERMS_META_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT, | ||
state.segmentInfo.getId(), state.segmentSuffix); | ||
|
||
termsOut.writeVInt(fields.size()); | ||
metaOut.writeVInt(fields.size()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jpountz here I see the same lack of serializer write/read code. Could it be possible to have such a thing? It would improve readability and unit testing by only mocking fieldMetadatas and checking that serialization is correctly applied. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your proposal sounds orthogonal to this pull request to me? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, but maybe this is the opportunity to move the code and add more testability in BlockTreeTermsReader.java There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer decoupling changes when possible. |
||
|
||
for(FieldMetaData field : fields) { | ||
//System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); | ||
termsOut.writeVInt(field.fieldInfo.number); | ||
metaOut.writeVInt(field.fieldInfo.number); | ||
assert field.numTerms > 0; | ||
termsOut.writeVLong(field.numTerms); | ||
termsOut.writeVInt(field.rootCode.length); | ||
termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length); | ||
metaOut.writeVLong(field.numTerms); | ||
metaOut.writeVInt(field.rootCode.length); | ||
metaOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length); | ||
assert field.fieldInfo.getIndexOptions() != IndexOptions.NONE; | ||
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { | ||
termsOut.writeVLong(field.sumTotalTermFreq); | ||
metaOut.writeVLong(field.sumTotalTermFreq); | ||
} | ||
termsOut.writeVLong(field.sumDocFreq); | ||
termsOut.writeVInt(field.docCount); | ||
indexOut.writeVLong(field.indexStartFP); | ||
writeBytesRef(termsOut, field.minTerm); | ||
writeBytesRef(termsOut, field.maxTerm); | ||
metaOut.writeVLong(field.sumDocFreq); | ||
metaOut.writeVInt(field.docCount); | ||
writeBytesRef(metaOut, field.minTerm); | ||
writeBytesRef(metaOut, field.maxTerm); | ||
metaOut.writeVLong(field.indexStartFP); | ||
} | ||
writeTrailer(termsOut, dirStart); | ||
CodecUtil.writeFooter(termsOut); | ||
writeIndexTrailer(indexOut, indexDirStart); | ||
CodecUtil.writeFooter(indexOut); | ||
CodecUtil.writeFooter(metaOut); | ||
success = true; | ||
} finally { | ||
if (success) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we have a test that tickles these cases?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not directly, and these things are hard to test, though I agree we could do better. I opened https://issues.apache.org/jira/browse/LUCENE-9356 to try to improve the coverage of these code paths.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, Adrien