LUCENE-9353: Move terms metadata to its own file. (#1473)
jpountz committed Jun 16, 2020
1 parent 740bfc9 commit 0dac659
Showing 26 changed files with 231 additions and 221 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
@@ -63,6 +63,10 @@ Improvements
* LUCENE-9342: TotalHits' relation will be EQUAL_TO when the number of hits is lower than TopDocsCollector's numHits
(Tomás Fernández Löbbe)

* LUCENE-9353: Metadata of the terms dictionary moved to its own file, with the
`.tmd` extension. This allows metadata checksums to be verified and saves
seeks when opening an index. (Adrien Grand)

* LUCENE-9359: SegmentInfos#readCommit now always returns a
CorruptIndexException if the content of the file is invalid. (Adrien Grand)

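The BlockTreeTermsReader changes further down show the reader side of this. As a simplified sketch of that pattern (not the exact committed code; readFieldMetadata is a placeholder, and segment, state, version, postingsReader, indexIn and termsIn are assumed to be in scope), the small .tmd file is opened as a ChecksumIndexInput, fully checksummed via its footer, and the lengths it records are then used to validate the footers of the large terms and index files without extra seeks:

String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, "tmd");
long indexLength = -1, termsLength = -1;
Throwable priorE = null;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName, state.context)) {
  try {
    CodecUtil.checkIndexHeader(metaIn, "BlockTreeTermsMeta", version, version,
        state.segmentInfo.getId(), state.segmentSuffix);
    postingsReader.init(metaIn, state);    // postings metadata lives here too
    readFieldMetadata(metaIn);             // placeholder: per-field stats and index file pointers
    indexLength = metaIn.readLong();       // recorded lengths of the index and terms files
    termsLength = metaIn.readLong();
  } catch (Throwable exception) {
    priorE = exception;
  } finally {
    CodecUtil.checkFooter(metaIn, priorE); // full checksum of the small .tmd file
  }
}
// The meta checksum passed, so the recorded lengths can be trusted; only verify
// that the footers of the big files are structurally sound at those lengths.
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);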
@@ -21,6 +21,7 @@
import java.io.InputStream;
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -44,7 +45,8 @@ public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) t
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);
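The two-argument calls that recur in this and the following hunks, new FST<>(in, in, ...) and fst.save(out, out), reflect the FST API now taking its metadata input/output separately from its data input/output; callers whose metadata and data share a single stream simply pass the same object twice. A minimal sketch under that assumption (the roundTrip helper is hypothetical and fst is an already-built FST<Long>):

import java.io.IOException;

import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

// Hypothetical helper, not part of this commit: round-trips an FST<Long>
// through memory using the split metadata/data arguments.
static FST<Long> roundTrip(FST<Long> fst) throws IOException {
  ByteBuffersDataOutput out = ByteBuffersDataOutput.newResettableInstance();
  fst.save(out, out);                                           // metaOut, dataOut: same stream here
  DataInput in = out.toDataInput();
  return new FST<>(in, in, PositiveIntOutputs.getSingleton());  // metaIn, dataIn
}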
@@ -20,6 +20,7 @@
import java.io.InputStream;
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -47,7 +48,8 @@ public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) t
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
this.fst = new TokenInfoFST(fst);
}
@@ -148,7 +148,7 @@ private final class FieldIndexData implements Accountable {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
fst = new FST<>(clone, fstOutputs);
fst = new FST<>(clone, clone, fstOutputs);
clone.close();

/*
@@ -280,7 +280,7 @@ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IO
public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish();
if (fst != null) {
fst.save(out);
fst.save(out, out);
}
}
}
@@ -836,7 +836,7 @@ public void finish() throws IOException {

// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
root.index.save(indexOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);

// if (SAVE_DOT_FILES || DEBUG) {
@@ -78,7 +78,7 @@ final class OrdsFieldReader extends Terms implements Accountable {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);

/*
if (true) {
@@ -176,7 +176,7 @@ final class TermsReader extends Terms implements Accountable {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo));
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo));
}

@Override
@@ -209,7 +209,7 @@ public void close() throws IOException {
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
field.dict.save(out);
field.dict.save(out, out);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
@@ -70,10 +70,10 @@ public long ramBytesUsed() {
@Override
public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException {
if (blockEncoder == null) {
fst.save(output);
fst.save(output, output);
} else {
ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance();
fst.save(bytesDataOutput);
fst.save(bytesDataOutput, bytesDataOutput);
BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size());
output.writeVLong(encodedBytes.size());
encodedBytes.writeTo(output);
@@ -97,8 +97,8 @@ protected static FSTDictionary read(DataInput input, BlockDecoder blockDecoder,
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstOutputs, new OffHeapFSTStore());
FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
return new FSTDictionary(fst);
}

@@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs.blocktree;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -35,6 +34,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;

/** Metadata is written to its own file. */
public static final int VERSION_META_FILE = 6;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
public static final int VERSION_CURRENT = VERSION_META_FILE;

/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";

/** Extension of terms meta file */
static final String TERMS_META_EXTENSION = "tmd";
final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";

// Open input to the main terms dict file (_X.tib)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
@@ -128,76 +135,116 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState

this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;

String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);

try {
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);

String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexIn = state.directory.openInput(indexName, state.context);
CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);

// Have PostingsReader init itself
postingsReader.init(termsIn, state);
if (version < VERSION_META_FILE) {
// Have PostingsReader init itself
postingsReader.init(termsIn, state);

// Verifying the checksum against all bytes would be too costly, but for now we at least
// verify proper structure of the checksum footer. This is cheap and can detect some forms
// of corruption such as file truncation.
CodecUtil.retrieveChecksum(indexIn);
CodecUtil.retrieveChecksum(termsIn);
// Verifying the checksum against all bytes would be too costly, but for now we at least
// verify proper structure of the checksum footer. This is cheap and can detect some forms
// of corruption such as file truncation.
CodecUtil.retrieveChecksum(indexIn);
CodecUtil.retrieveChecksum(termsIn);
}

// Read per-field details
seekDir(termsIn);
seekDir(indexIn);
String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) {
try {
final IndexInput indexMetaIn, termsMetaIn;
if (version >= VERSION_META_FILE) {
CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
indexMetaIn = termsMetaIn = metaIn;
postingsReader.init(metaIn, state);
} else {
seekDir(termsIn);
seekDir(indexIn);
indexMetaIn = indexIn;
termsMetaIn = termsIn;
}

final int numFields = termsIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
}
final BytesRef rootCode = readBytesRef(termsIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsIn);
}
final long sumTotalTermFreq = termsIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong();
final int docCount = termsIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
final int numFields = termsMetaIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsMetaIn.readVInt();
final long numTerms = termsMetaIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn);
}
final BytesRef rootCode = readBytesRef(termsMetaIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsMetaIn);
}
final long sumTotalTermFreq = termsMetaIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong();
final int docCount = termsMetaIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsMetaIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn);
}
}
BytesRef minTerm = readBytesRef(termsMetaIn);
BytesRef maxTerm = readBytesRef(termsMetaIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn);
}
final long indexStartFP = indexMetaIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexMetaIn, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn);
}
}
if (version >= VERSION_META_FILE) {
indexLength = metaIn.readLong();
termsLength = metaIn.readLong();
}
} catch (Throwable exception) {
priorE = exception;
} finally {
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, priorE);
} else if (priorE != null) {
IOUtils.rethrowAlways(priorE);
}
}
BytesRef minTerm = readBytesRef(termsIn);
BytesRef maxTerm = readBytesRef(termsIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
if (version >= VERSION_META_FILE) {
// At this point the checksum of the meta file has been verified so the lengths are likely correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
} else {
assert indexLength == -1 : indexLength;
assert termsLength == -1 : termsLength;
}
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {