LUCENE-9353: Move terms metadata to its own file. #1473

Merged: 10 commits, Jun 16, 2020
lucene/CHANGES.txt: 4 additions & 0 deletions
@@ -170,6 +170,10 @@ Improvements
* LUCENE-9342: TotalHits' relation will be EQUAL_TO when the number of hits is lower than TopDocsCollector's numHits
(Tomás Fernández Löbbe)

* LUCENE-9353: Metadata of the terms dictionary moved to its own file, with the
`.tmd` extension. This allows checksums of metadata to be verified when
opening indices. (Adrien Grand)

Optimizations
---------------------

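For context, here is a minimal sketch of the open-time verification that the new `.tmd` file enables. It mirrors the reader changes below: open a checksummed input, check the header, decode the payload, and verify the footer checksum even if decoding failed. The helper class and method are illustrative, not code from this PR; only the CodecUtil/Directory calls and the "BlockTreeTermsMeta"/"tmd" names come from the change itself.

```java
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.ChecksumIndexInput;

final class TermsMetaReadSketch { // illustrative helper, not part of the PR
  static void readVerifiedMeta(SegmentReadState state, int version) throws IOException {
    String metaName = IndexFileNames.segmentFileName(
        state.segmentInfo.name, state.segmentSuffix, "tmd");
    Throwable priorE = null;
    try (ChecksumIndexInput metaIn =
             state.directory.openChecksumInput(metaName, state.context)) {
      try {
        CodecUtil.checkIndexHeader(metaIn, "BlockTreeTermsMeta", version, version,
            state.segmentInfo.getId(), state.segmentSuffix);
        // ... decode per-field terms metadata from metaIn here ...
      } catch (Throwable e) {
        priorE = e; // remember the failure, still verify the checksum below
      } finally {
        // Validates the checksum over the whole file; if priorE is non-null it
        // is rethrown with checksum information attached, so real corruption is
        // reported instead of an arbitrary decoding error.
        CodecUtil.checkFooter(metaIn, priorE);
      }
    }
  }
}
```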
lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs.blocktree;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -35,6 +34,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;

/** Metadata is written to its own file. */
public static final int VERSION_META_FILE = 6;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
public static final int VERSION_CURRENT = VERSION_META_FILE;

/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";

/** Extension of terms meta file */
static final String TERMS_META_EXTENSION = "tmd";
final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";

// Open input to the main terms dict file (_X.tim)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
@@ -128,9 +135,9 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState

this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;

String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);

try {
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);

@@ -148,56 +155,80 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState
CodecUtil.retrieveChecksum(termsIn);

// Read per-field details
seekDir(termsIn);
seekDir(indexIn);
String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
Throwable priorE = null;
try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) {
try {
final IndexInput indexMetaIn, termsMetaIn;
if (version >= VERSION_META_FILE) {
CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
indexMetaIn = termsMetaIn = metaIn;
} else {
seekDir(termsIn);
seekDir(indexIn);
indexMetaIn = indexIn;
termsMetaIn = termsIn;
}

final int numFields = termsIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
}
final BytesRef rootCode = readBytesRef(termsIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsIn);
}
final long sumTotalTermFreq = termsIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong();
final int docCount = termsIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
final int numFields = termsMetaIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsMetaIn.readVInt();
final long numTerms = termsMetaIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn);
}
final BytesRef rootCode = readBytesRef(termsMetaIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsMetaIn);
}
final long sumTotalTermFreq = termsMetaIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong();
final int docCount = termsMetaIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsMetaIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn);
}
}
BytesRef minTerm = readBytesRef(termsMetaIn);
BytesRef maxTerm = readBytesRef(termsMetaIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn);
[Review comment, Contributor] do we have a test that tickles these cases?

[Reply, Author] Not directly, and these things are hard to test, though I agree we could do better. I opened https://issues.apache.org/jira/browse/LUCENE-9356 to try to improve the coverage of these code paths.

[Reply, Contributor] Thanks, Adrien
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn);
}
final long indexStartFP = indexMetaIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn);
}
}
} catch (Throwable exception) {
priorE = exception;
} finally {
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, priorE);
[Review comment, Contributor] again, I don't know if we have test coverage for the corrupted metadata?
} else if (priorE != null) {
IOUtils.rethrowAlways(priorE);
}
}
BytesRef minTerm = readBytesRef(termsIn);
BytesRef maxTerm = readBytesRef(termsIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {
lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -211,6 +211,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {

//private final static boolean SAVE_DOT_FILES = false;

private final SegmentWriteState state;
private final IndexOutput termsOut;
private final IndexOutput indexOut;
final int maxDoc;
@@ -262,6 +263,7 @@ public BlockTreeTermsWriter(SegmentWriteState state,
validateSettings(minItemsInBlock,
maxItemsInBlock);

this.state = state;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;

@@ -294,16 +296,6 @@ public BlockTreeTermsWriter(SegmentWriteState state,
}
}

/** Writes the terms file trailer. */
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}

/** Writes the index file trailer. */
private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
indexOut.writeLong(dirStart);
}

/** Throws {@code IllegalArgumentException} if any of these settings
* is invalid. */
public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) {
@@ -1060,36 +1052,35 @@ public void close() throws IOException {
return;
}
closed = true;


final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_META_EXTENSION);
boolean success = false;
try {

final long dirStart = termsOut.getFilePointer();
final long indexDirStart = indexOut.getFilePointer();
try (IndexOutput metaOut = state.directory.createOutput(metaName, state.context)) {

[Review comment, Contributor] Why is this file not created at the same time as indexOut and termsOut?

[Reply, Author] That would work too. I like keeping the index output open for as little time as possible when it doesn't make things worse otherwise.

CodecUtil.writeIndexHeader(metaOut, BlockTreeTermsReader.TERMS_META_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);

termsOut.writeVInt(fields.size());
metaOut.writeVInt(fields.size());

[Review comment, Contributor] @jpountz here I see the same lack of a dedicated serializer with paired write/read code. Could we have such a thing? It would improve readability and unit testing: one could mock the field metadata and check that serialization is applied correctly.

[Reply, Author] Your proposal sounds orthogonal to this pull request to me?

[Reply, Contributor] Yes, but maybe this is the opportunity to move the code and add more testability to BlockTreeTermsReader.java.

[Reply, Author] I prefer decoupling changes when possible.


for(FieldMetaData field : fields) {
//System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
termsOut.writeVInt(field.fieldInfo.number);
metaOut.writeVInt(field.fieldInfo.number);
assert field.numTerms > 0;
termsOut.writeVLong(field.numTerms);
termsOut.writeVInt(field.rootCode.length);
termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
metaOut.writeVLong(field.numTerms);
metaOut.writeVInt(field.rootCode.length);
metaOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
assert field.fieldInfo.getIndexOptions() != IndexOptions.NONE;
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
termsOut.writeVLong(field.sumTotalTermFreq);
metaOut.writeVLong(field.sumTotalTermFreq);
}
termsOut.writeVLong(field.sumDocFreq);
termsOut.writeVInt(field.docCount);
indexOut.writeVLong(field.indexStartFP);
writeBytesRef(termsOut, field.minTerm);
writeBytesRef(termsOut, field.maxTerm);
metaOut.writeVLong(field.sumDocFreq);
metaOut.writeVInt(field.docCount);
writeBytesRef(metaOut, field.minTerm);
writeBytesRef(metaOut, field.maxTerm);
metaOut.writeVLong(field.indexStartFP);
}
writeTrailer(termsOut, dirStart);
CodecUtil.writeFooter(termsOut);
writeIndexTrailer(indexOut, indexDirStart);
CodecUtil.writeFooter(indexOut);
CodecUtil.writeFooter(metaOut);
success = true;
} finally {
if (success) {
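Two review comments above ask about test coverage for corrupted metadata, which LUCENE-9356 tracks. Below is a minimal, hypothetical sketch of such a test using only standard Lucene APIs; the class name and the byte-flipping choice are illustrative, compound files are disabled so the per-segment `.tmd` file exists on disk, and it assumes the post-change behavior that the metadata checksum is verified when the index is opened.

```java
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class CorruptTermsMetaSketch { // hypothetical test sketch, not part of the PR
  public static void main(String[] args) throws Exception {
    Path path = Files.createTempDirectory("tmd-corruption");
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    config.setUseCompoundFile(false); // keep the .tmd file standalone on disk
    try (FSDirectory dir = FSDirectory.open(path);
         IndexWriter writer = new IndexWriter(dir, config)) {
      Document doc = new Document();
      doc.add(new TextField("body", "hello world", Field.Store.NO));
      writer.addDocument(doc);
      writer.commit();
    }
    // Flip one byte in each terms metadata file to simulate on-disk corruption.
    try (DirectoryStream<Path> tmdFiles = Files.newDirectoryStream(path, "*.tmd")) {
      for (Path tmd : tmdFiles) {
        byte[] bytes = Files.readAllBytes(tmd);
        bytes[bytes.length / 2] ^= 0x42;
        Files.write(tmd, bytes);
      }
    }
    // With metadata checksummed at open time, this should now fail, typically
    // with CorruptIndexException (or a decoding error carrying checksum info).
    try (FSDirectory dir = FSDirectory.open(path)) {
      DirectoryReader.open(dir).close();
      System.out.println("unexpected: index opened despite corruption");
    } catch (Exception expected) {
      System.out.println("caught as expected: " + expected);
    }
  }
}
```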