Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-9324: Add an ID to SegmentCommitInfo #1434

Merged
merged 17 commits into from Apr 18, 2020
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Expand Up @@ -143,6 +143,9 @@ Improvements
* LUCENE-9304: Removed ThreadState abstraction from DocumentsWriter which allows pooling of DWPT directly and
improves the approachability of the IndexWriter code. (Simon Willnauer)

* LUCENE-9324: Add an ID to SegmentCommitInfo in order to compare commits for equality and make
snapshots incremental on generational files. (Simon Willnauer, Mike Mccandless, Adrien Grant)

Optimizations
---------------------

Expand Down
Expand Up @@ -768,7 +768,6 @@ public void testUnsupportedOldIndexes() throws Exception {
writer.close();
}
}
writer = null;
}

ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
Expand Down Expand Up @@ -830,8 +829,12 @@ public void testAddOldIndexes() throws IOException {
IndexWriter w = new IndexWriter(targetDir, newIndexWriterConfig(new MockAnalyzer(random())));
w.addIndexes(oldDir);
w.close();
targetDir.close();

SegmentInfos si = SegmentInfos.readLatestCommit(targetDir);
assertNull("none of the segments should have been upgraded",
si.asList().stream().filter( // depending on the MergePolicy we might see these segments merged away
sci -> sci.getId() != null && sci.info.getVersion().onOrAfter(Version.LUCENE_8_6_0) == false
).findAny().orElse(null));
if (VERBOSE) {
System.out.println("\nTEST: done adding indices; now close");
}
Expand Down Expand Up @@ -862,7 +865,9 @@ public void testAddOldIndexesReader() throws IOException {
TestUtil.addIndexesSlowly(w, reader);
w.close();
reader.close();

SegmentInfos si = SegmentInfos.readLatestCommit(targetDir);
assertNull("all SCIs should have an id now",
si.asList().stream().filter(sci -> sci.getId() == null).findAny().orElse(null));
targetDir.close();
}
}
Expand Down Expand Up @@ -1367,6 +1372,20 @@ public void testIndexCreatedVersion() throws IOException {
}
}

public void testSegmentCommitInfoId() throws IOException {
for (String name : oldNames) {
Directory dir = oldIndexDirs.get(name);
SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
for (SegmentCommitInfo info : infos) {
if (info.info.getVersion().onOrAfter(Version.LUCENE_8_6_0)) {
assertNotNull(info.toString(), info.getId());
} else {
assertNull(info.toString(), info.getId());
}
}
}
}

public void verifyUsesDefaultCodec(Directory dir, String name) throws Exception {
DirectoryReader r = DirectoryReader.open(dir);
for (LeafReaderContext context : r.leaves()) {
Expand All @@ -1392,6 +1411,7 @@ private int checkAllSegmentsUpgraded(Directory dir, int indexCreatedVersion) thr
}
for (SegmentCommitInfo si : infos) {
assertEquals(Version.LATEST, si.info.getVersion());
assertNotNull(si.getId());
}
assertEquals(Version.LATEST, infos.getCommitLuceneVersion());
assertEquals(indexCreatedVersion, infos.getIndexCreatedVersionMajor());
Expand Down
Expand Up @@ -419,7 +419,7 @@ FlushedSegment flush(DocumentsWriter.FlushNotifications flushNotifications) thro
pendingUpdates.clearDeleteTerms();
segmentInfo.setFiles(new HashSet<>(directory.getCreatedFiles()));

final SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(segmentInfo, 0, flushState.softDelCountOnFlush, -1L, -1L, -1L);
final SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(segmentInfo, 0, flushState.softDelCountOnFlush, -1L, -1L, -1L, StringHelper.randomId());
if (infoStream.isEnabled("DWPT")) {
infoStream.message("DWPT", "new segment has " + (flushState.liveDocs == null ? 0 : flushState.delCountOnFlush) + " deleted docs");
infoStream.message("DWPT", "new segment has " + flushState.softDelCountOnFlush + " soft-deleted docs");
Expand Down
6 changes: 3 additions & 3 deletions lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
Expand Up @@ -3003,7 +3003,7 @@ public long addIndexes(CodecReader... readers) throws IOException {
notifyAll();
}
}
SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, 0, numSoftDeleted, -1L, -1L, -1L);
SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, 0, numSoftDeleted, -1L, -1L, -1L, StringHelper.randomId());

info.setFiles(new HashSet<>(trackingDir.getCreatedFiles()));
trackingDir.clearCreatedFiles();
Expand Down Expand Up @@ -3081,7 +3081,7 @@ private SegmentCommitInfo copySegmentAsIs(SegmentCommitInfo info, String segName
info.info.getUseCompoundFile(), info.info.getCodec(),
info.info.getDiagnostics(), info.info.getId(), info.info.getAttributes(), info.info.getIndexSort());
SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getSoftDelCount(), info.getDelGen(),
info.getFieldInfosGen(), info.getDocValuesGen());
info.getFieldInfosGen(), info.getDocValuesGen(), info.getId());
s1monw marked this conversation as resolved.
Show resolved Hide resolved

newInfo.setFiles(info.info.files());
newInfoPerCommit.setFieldInfosFiles(info.getFieldInfosFiles());
Expand Down Expand Up @@ -4273,7 +4273,7 @@ synchronized private void _mergeInit(MergePolicy.OneMerge merge) throws IOExcept
details.put("mergeMaxNumSegments", "" + merge.maxNumSegments);
details.put("mergeFactor", Integer.toString(merge.segments.size()));
setDiagnostics(si, SOURCE_MERGE, details);
merge.setMergeInfo(new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L));
merge.setMergeInfo(new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L, StringHelper.randomId()));

if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "merge seg=" + merge.info.info.name + " " + segString(merge.segments));
Expand Down
Expand Up @@ -18,6 +18,7 @@


import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
Expand All @@ -26,6 +27,8 @@
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.util.StringHelper;

/** Embeds a [read-only] SegmentInfo and adds per-commit
* fields.
*
Expand All @@ -35,6 +38,9 @@ public class SegmentCommitInfo {
/** The {@link SegmentInfo} that we wrap. */
public final SegmentInfo info;

/** Id that uniquely identifies this segment commit. */
private byte[] id;

// How many deleted docs in the segment:
private int delCount;

Expand Down Expand Up @@ -79,7 +85,6 @@ public class SegmentCommitInfo {

/**
* Sole constructor.
*
* @param info
* {@link SegmentInfo} that we wrap
* @param delCount
Expand All @@ -90,8 +95,9 @@ public class SegmentCommitInfo {
* FieldInfos generation number (used to name field-infos files)
* @param docValuesGen
* DocValues generation number (used to name doc-values updates files)
* @param id Id that uniquely identifies this segment commit. This id must be 16 bytes long. See {@link StringHelper#randomId()}
*/
public SegmentCommitInfo(SegmentInfo info, int delCount, int softDelCount, long delGen, long fieldInfosGen, long docValuesGen) {
public SegmentCommitInfo(SegmentInfo info, int delCount, int softDelCount, long delGen, long fieldInfosGen, long docValuesGen, byte[] id) {
this.info = info;
this.delCount = delCount;
this.softDelCount = softDelCount;
Expand All @@ -101,6 +107,10 @@ public SegmentCommitInfo(SegmentInfo info, int delCount, int softDelCount, long
this.nextWriteFieldInfosGen = fieldInfosGen == -1 ? 1 : fieldInfosGen + 1;
this.docValuesGen = docValuesGen;
this.nextWriteDocValuesGen = docValuesGen == -1 ? 1 : docValuesGen + 1;
this.id = id;
if (id != null && id.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("invalid id: " + Arrays.toString(id));
}
}

/** Returns the per-field DocValues updates files. */
Expand Down Expand Up @@ -138,7 +148,7 @@ public void setFieldInfosFiles(Set<String> fieldInfosFiles) {
void advanceDelGen() {
delGen = nextWriteDelGen;
nextWriteDelGen = delGen+1;
sizeInBytes = -1;
generationAdvanced();
}

/** Called if there was an exception while writing
Expand All @@ -162,7 +172,7 @@ void setNextWriteDelGen(long v) {
void advanceFieldInfosGen() {
fieldInfosGen = nextWriteFieldInfosGen;
nextWriteFieldInfosGen = fieldInfosGen + 1;
sizeInBytes = -1;
generationAdvanced();
}

/**
Expand All @@ -187,7 +197,7 @@ void setNextWriteFieldInfosGen(long v) {
void advanceDocValuesGen() {
docValuesGen = nextWriteDocValuesGen;
nextWriteDocValuesGen = docValuesGen + 1;
sizeInBytes = -1;
generationAdvanced();
}

/**
Expand Down Expand Up @@ -251,7 +261,7 @@ long getBufferedDeletesGen() {
void setBufferedDeletesGen(long v) {
if (bufferedDeletesGen == -1) {
bufferedDeletesGen = v;
sizeInBytes = -1;
generationAdvanced();
} else {
throw new IllegalStateException("buffered deletes gen should only be set once");
}
Expand Down Expand Up @@ -355,6 +365,9 @@ public String toString(int pendingDelCount) {
if (softDelCount > 0) {
s += " :softDel=" + softDelCount;
}
if (this.id != null) {
s += " :id=" + StringHelper.idToString(id);
}

return s;
}
Expand All @@ -366,7 +379,7 @@ public String toString() {

@Override
public SegmentCommitInfo clone() {
SegmentCommitInfo other = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, docValuesGen);
SegmentCommitInfo other = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, docValuesGen, getId());
// Not clear that we need to carry over nextWriteDelGen
// (i.e. do we ever clone after a failed write and
// before the next successful write?), but just do it to
Expand All @@ -388,4 +401,17 @@ public SegmentCommitInfo clone() {
final int getDelCount(boolean includeSoftDeletes) {
return includeSoftDeletes ? getDelCount() + getSoftDelCount() : getDelCount();
}

private void generationAdvanced() {
sizeInBytes = -1;
id = StringHelper.randomId();
}

/**
* Returns and Id that uniquely identifies this segment commit or <code>null</code> if there is no ID assigned.
* This ID changes each time the the segment changes due to a delete, doc-value or field update.
*/
public byte[] getId() {
return id == null ? null : id.clone();
}
}
38 changes: 34 additions & 4 deletions lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
Expand Up @@ -124,7 +124,9 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
public static final int VERSION_72 = 8;
/** The version that recorded softDelCount */
public static final int VERSION_74 = 9;
static final int VERSION_CURRENT = VERSION_74;
/** The version that recorded SegmentCommitInfo IDs */
public static final int VERSION_86 = 10;
static final int VERSION_CURRENT = VERSION_86;

/** Name of the generation reference file name */
private static final String OLD_SEGMENTS_GEN = "segments.gen";
Expand Down Expand Up @@ -374,7 +376,24 @@ public static final SegmentInfos readCommit(Directory directory, ChecksumIndexIn
if (softDelCount + delCount > info.maxDoc()) {
throw new CorruptIndexException("invalid deletion count: " + softDelCount + delCount + " vs maxDoc=" + info.maxDoc(), input);
}
SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, dvGen);
final byte[] sciId;
if (format > VERSION_74) {
byte marker = input.readByte();
switch (marker) {
case 1:
sciId = new byte[StringHelper.ID_LENGTH];
input.readBytes(sciId, 0, sciId.length);
break;
case 0:
sciId = null;
break;
default:
throw new CorruptIndexException("invalid SegmentCommitInfo ID marker: " + marker, input);
}
} else {
sciId = null;
}
SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, dvGen, sciId);
siPerCommit.setFieldInfosFiles(input.readSetOfStrings());
final Map<Integer,Set<String>> dvUpdateFiles;
final int numDVFields = input.readInt();
Expand Down Expand Up @@ -460,7 +479,7 @@ private void write(Directory directory) throws IOException {

try {
segnOutput = directory.createOutput(segmentFileName, IOContext.DEFAULT);
write(directory, segnOutput);
write(segnOutput);
segnOutput.close();
directory.sync(Collections.singleton(segmentFileName));
success = true;
Expand All @@ -479,7 +498,7 @@ private void write(Directory directory) throws IOException {
}

/** Write ourselves to the provided {@link IndexOutput} */
public void write(Directory directory, IndexOutput out) throws IOException {
public void write(IndexOutput out) throws IOException {
CodecUtil.writeIndexHeader(out, "segments", VERSION_CURRENT,
StringHelper.randomId(), Long.toString(generation, Character.MAX_RADIX));
out.writeVInt(Version.LATEST.major);
Expand Down Expand Up @@ -537,6 +556,17 @@ public void write(Directory directory, IndexOutput out) throws IOException {
throw new IllegalStateException("cannot write segment: invalid maxDoc segment=" + si.name + " maxDoc=" + si.maxDoc() + " softDelCount=" + softDelCount);
}
out.writeInt(softDelCount);
// we ensure that there is a valid ID for this SCI just in case
// this is manually upgraded outside of IW
byte[] sciId = siPerCommit.getId();
if (sciId != null) {
out.writeByte((byte)1);
assert sciId.length == StringHelper.ID_LENGTH : "invalid SegmentCommitInfo#id: " + Arrays.toString(sciId);
out.writeBytes(sciId, 0, sciId.length);
} else {
out.writeByte((byte)0);
}

out.writeSetOfStrings(siPerCommit.getFieldInfosFiles());
final Map<Integer,Set<String>> dvUpdatesFiles = siPerCommit.getDocValuesUpdatesFiles();
out.writeInt(dvUpdatesFiles.size());
Expand Down
7 changes: 7 additions & 0 deletions lucene/core/src/java/org/apache/lucene/util/Version.java
Expand Up @@ -102,6 +102,13 @@ public final class Version {
@Deprecated
public static final Version LUCENE_8_5_1 = new Version(8, 5, 1);

/**
* Match settings and bugs in Lucene's 8.6.0 release.
* @deprecated Use latest
*/
@Deprecated
public static final Version LUCENE_8_6_0 = new Version(8, 6, 0);

/**
* Match settings and bugs in Lucene's 9.0.0 release.
* <p>
Expand Down
2 changes: 1 addition & 1 deletion lucene/core/src/test/org/apache/lucene/index/TestDoc.java
Expand Up @@ -238,7 +238,7 @@ private SegmentCommitInfo merge(Directory dir, SegmentCommitInfo si1, SegmentCom
}
}

return new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L);
return new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L, StringHelper.randomId());
}


Expand Down