LUCENE-9353: Move terms metadata to its own file. (#1473)
jpountz committed Jun 16, 2020
1 parent 740bfc9 commit 0dac659
Showing 26 changed files with 231 additions and 221 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
@@ -63,6 +63,10 @@ Improvements
* LUCENE-9342: TotalHits' relation will be EQUAL_TO when the number of hits is lower than TopDocsCollector's numHits
(Tomás Fernández Löbbe)

* LUCENE-9353: Metadata of the terms dictionary moved to its own file, with the
`.tmd` extension. This allows metadata checksums to be verified and saves
seeks when opening an index. (Adrien Grand)

* LUCENE-9359: SegmentInfos#readCommit now always returns a
CorruptIndexException if the content of the file is invalid. (Adrien Grand)

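The BlockTreeTermsReader changes further down show the reader side of this. As a simplified sketch of that pattern (not the exact committed code; readFieldMetadata is a placeholder, and segment, state, version, postingsReader, indexIn and termsIn are assumed to be in scope), the small .tmd file is opened as a ChecksumIndexInput, fully checksummed via its footer, and the lengths it records are then used to validate the footers of the large terms and index files without extra seeks:

String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, "tmd");
long indexLength = -1, termsLength = -1;
Throwable priorE = null;
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName, state.context)) {
  try {
    CodecUtil.checkIndexHeader(metaIn, "BlockTreeTermsMeta", version, version,
        state.segmentInfo.getId(), state.segmentSuffix);
    postingsReader.init(metaIn, state);    // postings metadata lives here too
    readFieldMetadata(metaIn);             // placeholder: per-field stats and index file pointers
    indexLength = metaIn.readLong();       // recorded lengths of the index and terms files
    termsLength = metaIn.readLong();
  } catch (Throwable exception) {
    priorE = exception;
  } finally {
    CodecUtil.checkFooter(metaIn, priorE); // full checksum of the small .tmd file
  }
}
// The meta checksum passed, so the recorded lengths can be trusted; only verify
// that the footers of the big files are structurally sound at those lengths.
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);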
@@ -21,6 +21,7 @@
import java.io.InputStream;
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -44,7 +45,8 @@ public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) t
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);
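The two-argument calls that recur in this and the following hunks, new FST<>(in, in, ...) and fst.save(out, out), reflect the FST API now taking its metadata input/output separately from its data input/output; callers whose metadata and data share a single stream simply pass the same object twice. A minimal sketch under that assumption (the roundTrip helper is hypothetical and fst is an already-built FST<Long>):

import java.io.IOException;

import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

// Hypothetical helper, not part of this commit: round-trips an FST<Long>
// through memory using the split metadata/data arguments.
static FST<Long> roundTrip(FST<Long> fst) throws IOException {
  ByteBuffersDataOutput out = ByteBuffersDataOutput.newResettableInstance();
  fst.save(out, out);                                           // metaOut, dataOut: same stream here
  DataInput in = out.toDataInput();
  return new FST<>(in, in, PositiveIntOutputs.getSingleton());  // metaIn, dataIn
}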
@@ -20,6 +20,7 @@
import java.io.InputStream;
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -47,7 +48,8 @@ public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) t
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
this.fst = new TokenInfoFST(fst);
}
@@ -148,7 +148,7 @@ private final class FieldIndexData implements Accountable {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
fst = new FST<>(clone, fstOutputs);
fst = new FST<>(clone, clone, fstOutputs);
clone.close();

/*
@@ -280,7 +280,7 @@ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IO
public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish();
if (fst != null) {
fst.save(out);
fst.save(out, out);
}
}
}
@@ -836,7 +836,7 @@ public void finish() throws IOException {

// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
root.index.save(indexOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);

// if (SAVE_DOT_FILES || DEBUG) {
@@ -78,7 +78,7 @@ final class OrdsFieldReader extends Terms implements Accountable {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);

/*
if (true) {
@@ -176,7 +176,7 @@ final class TermsReader extends Terms implements Accountable {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo));
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo));
}

@Override
@@ -209,7 +209,7 @@ public void close() throws IOException {
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
field.dict.save(out);
field.dict.save(out, out);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
@@ -70,10 +70,10 @@ public long ramBytesUsed() {
@Override
public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException {
if (blockEncoder == null) {
fst.save(output);
fst.save(output, output);
} else {
ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance();
fst.save(bytesDataOutput);
fst.save(bytesDataOutput, bytesDataOutput);
BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size());
output.writeVLong(encodedBytes.size());
encodedBytes.writeTo(output);
@@ -97,8 +97,8 @@ protected static FSTDictionary read(DataInput input, BlockDecoder blockDecoder,
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstOutputs, new OffHeapFSTStore());
FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
return new FSTDictionary(fst);
}

@@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs.blocktree;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -35,6 +34,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;

/** Metadata is written to its own file. */
public static final int VERSION_META_FILE = 6;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
public static final int VERSION_CURRENT = VERSION_META_FILE;

/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";

/** Extension of terms meta file */
static final String TERMS_META_EXTENSION = "tmd";
final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";

// Open input to the main terms dict file (_X.tib)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
@@ -128,76 +135,116 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState

this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;

String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);

try {
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);

String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexIn = state.directory.openInput(indexName, state.context);
CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);

// Have PostingsReader init itself
postingsReader.init(termsIn, state);
if (version < VERSION_META_FILE) {
// Have PostingsReader init itself
postingsReader.init(termsIn, state);

// Verifying the checksum against all bytes would be too costly, but for now we at least
// verify proper structure of the checksum footer. This is cheap and can detect some forms
// of corruption such as file truncation.
CodecUtil.retrieveChecksum(indexIn);
CodecUtil.retrieveChecksum(termsIn);
// Verifying the checksum against all bytes would be too costly, but for now we at least
// verify proper structure of the checksum footer. This is cheap and can detect some forms
// of corruption such as file truncation.
CodecUtil.retrieveChecksum(indexIn);
CodecUtil.retrieveChecksum(termsIn);
}

// Read per-field details
seekDir(termsIn);
seekDir(indexIn);
String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) {
try {
final IndexInput indexMetaIn, termsMetaIn;
if (version >= VERSION_META_FILE) {
CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
indexMetaIn = termsMetaIn = metaIn;
postingsReader.init(metaIn, state);
} else {
seekDir(termsIn);
seekDir(indexIn);
indexMetaIn = indexIn;
termsMetaIn = termsIn;
}

final int numFields = termsIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
}
final BytesRef rootCode = readBytesRef(termsIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsIn);
}
final long sumTotalTermFreq = termsIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong();
final int docCount = termsIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
final int numFields = termsMetaIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsMetaIn.readVInt();
final long numTerms = termsMetaIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn);
}
final BytesRef rootCode = readBytesRef(termsMetaIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsMetaIn);
}
final long sumTotalTermFreq = termsMetaIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong();
final int docCount = termsMetaIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsMetaIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn);
}
}
BytesRef minTerm = readBytesRef(termsMetaIn);
BytesRef maxTerm = readBytesRef(termsMetaIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn);
}
final long indexStartFP = indexMetaIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexMetaIn, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn);
}
}
if (version >= VERSION_META_FILE) {
indexLength = metaIn.readLong();
termsLength = metaIn.readLong();
}
} catch (Throwable exception) {
priorE = exception;
} finally {
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, priorE);
} else if (priorE != null) {
IOUtils.rethrowAlways(priorE);
}
}
BytesRef minTerm = readBytesRef(termsIn);
BytesRef maxTerm = readBytesRef(termsIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
if (version >= VERSION_META_FILE) {
// At this point the checksum of the meta file has been verified so the lengths are likely correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
} else {
assert indexLength == -1 : indexLength;
assert termsLength == -1 : termsLength;
}
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {