From d21ae987b5196a2778bd9c30e4b53ee024c9ad42 Mon Sep 17 00:00:00 2001 From: fudongying Date: Sat, 27 May 2023 23:34:08 +0800 Subject: [PATCH 1/6] feat: soft delete optimize --- .../src/java/org/apache/lucene/index/MergePolicy.java | 10 +++++++++- .../org/apache/lucene/index/TieredMergePolicy.java | 5 ++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index 3eb527edb19..6b90b10836e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -741,8 +741,16 @@ public boolean useCompoundFile( * non-deleted documents is set. */ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IOException { - long byteSize = info.sizeInBytes(); int delCount = mergeContext.numDeletesToMerge(info); + return size(info, delCount); + } + + /** + * Return the byte size of the provided {@link SegmentCommitInfo}, pro-rated by percentage of + * non-deleted documents is set. + */ + protected final long size(SegmentCommitInfo info, int delCount) throws IOException { + long byteSize = info.sizeInBytes(); assert assertDelCount(delCount, info); double delRatio = info.info.maxDoc() <= 0 ? 0d : (double) delCount / (double) info.info.maxDoc(); diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 304bbdd2e57..5893bb72aa1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -295,9 +295,8 @@ private List getSortedBySegmentSize( List sortedBySize = new ArrayList<>(); for (SegmentCommitInfo info : infos) { - sortedBySize.add( - new SegmentSizeAndDocs( - info, size(info, mergeContext), mergeContext.numDeletesToMerge(info))); + final int delCount = mergeContext.numDeletesToMerge(info); + sortedBySize.add(new SegmentSizeAndDocs(info, size(info, delCount), delCount)); } sortedBySize.sort( From 43454b91746bb3a01d1af6a61a6d50ffae52901d Mon Sep 17 00:00:00 2001 From: fudongying Date: Fri, 9 Jun 2023 11:44:08 +0800 Subject: [PATCH 2/6] cache numDeletesToMerge to reduce calculate --- .../lucene/index/CachingMergeContext.java | 61 +++++++++++++++++++ .../org/apache/lucene/index/IndexWriter.java | 13 ++-- 2 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java diff --git a/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java b/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java new file mode 100644 index 00000000000..9661ab91e76 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Set; +import org.apache.lucene.util.InfoStream; + +/** + * a wrapper of IndexWriter MergeContext. Try to cache the {@link + * #numDeletesToMerge(SegmentCommitInfo)} result in merge phase, to avoid duplicate calculation + */ +public class CachingMergeContext implements MergePolicy.MergeContext { + final MergePolicy.MergeContext mergeContext; + final HashMap cachedNumDeletesToMerge = new HashMap<>(); + + CachingMergeContext(MergePolicy.MergeContext mergeContext) { + this.mergeContext = mergeContext; + } + + @Override + public final int numDeletesToMerge(SegmentCommitInfo info) throws IOException { + Integer numDeletesToMerge = cachedNumDeletesToMerge.get(info); + if (numDeletesToMerge != null) { + return numDeletesToMerge; + } + numDeletesToMerge = mergeContext.numDeletesToMerge(info); + cachedNumDeletesToMerge.put(info, numDeletesToMerge); + return numDeletesToMerge; + } + + @Override + public final int numDeletedDocs(SegmentCommitInfo info) { + return mergeContext.numDeletedDocs(info); + } + + @Override + public final InfoStream getInfoStream() { + return mergeContext.getInfoStream(); + } + + @Override + public final Set getMergingSegments() { + return mergeContext.getMergingSegments(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 33cec219fac..b562f7740d2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -2214,10 +2214,11 @@ public void forceMergeDeletes(boolean doWait) throws IOException { } final MergePolicy mergePolicy = config.getMergePolicy(); + final CachingMergeContext cachingMergeContext = new CachingMergeContext(this); MergePolicy.MergeSpecification spec; boolean newMergesFound = false; synchronized (this) { - spec = mergePolicy.findForcedDeletesMerges(segmentInfos, this); + spec = mergePolicy.findForcedDeletesMerges(segmentInfos, cachingMergeContext); newMergesFound = spec != null; if (newMergesFound) { final int numMerges = spec.merges.size(); @@ -2327,6 +2328,7 @@ private synchronized MergePolicy.MergeSpecification updatePendingMerges( } final MergePolicy.MergeSpecification spec; + final CachingMergeContext cachingMergeContext = new CachingMergeContext(this); if (maxNumSegments != UNBOUNDED_MAX_MERGE_SEGMENTS) { assert trigger == MergeTrigger.EXPLICIT || trigger == MergeTrigger.MERGE_FINISHED : "Expected EXPLICT or MERGE_FINISHED as trigger even with maxNumSegments set but was: " @@ -2334,7 +2336,10 @@ private synchronized MergePolicy.MergeSpecification updatePendingMerges( spec = mergePolicy.findForcedMerges( - segmentInfos, maxNumSegments, Collections.unmodifiableMap(segmentsToMerge), this); + segmentInfos, + maxNumSegments, + Collections.unmodifiableMap(segmentsToMerge), + cachingMergeContext); if (spec != null) { final int numMerges = spec.merges.size(); for (int i = 0; i < numMerges; i++) { @@ -2346,7 +2351,7 @@ private synchronized MergePolicy.MergeSpecification updatePendingMerges( switch (trigger) { case GET_READER: case COMMIT: - spec = mergePolicy.findFullFlushMerges(trigger, segmentInfos, this); + spec = mergePolicy.findFullFlushMerges(trigger, segmentInfos, cachingMergeContext); break; case ADD_INDEXES: throw new IllegalStateException( @@ -2358,7 +2363,7 @@ private synchronized MergePolicy.MergeSpecification updatePendingMerges( case SEGMENT_FLUSH: case CLOSING: default: - spec = mergePolicy.findMerges(trigger, segmentInfos, this); + spec = mergePolicy.findMerges(trigger, segmentInfos, cachingMergeContext); } } if (spec != null) { From 5aaa1d9c766114233cf04c070f3493f338837f40 Mon Sep 17 00:00:00 2001 From: fudongying Date: Fri, 9 Jun 2023 11:57:09 +0800 Subject: [PATCH 3/6] Revert "feat: soft delete optimize" This reverts commit d21ae987b5196a2778bd9c30e4b53ee024c9ad42. --- .../src/java/org/apache/lucene/index/MergePolicy.java | 10 +--------- .../org/apache/lucene/index/TieredMergePolicy.java | 5 +++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index 6b90b10836e..3eb527edb19 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -741,16 +741,8 @@ public boolean useCompoundFile( * non-deleted documents is set. */ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IOException { - int delCount = mergeContext.numDeletesToMerge(info); - return size(info, delCount); - } - - /** - * Return the byte size of the provided {@link SegmentCommitInfo}, pro-rated by percentage of - * non-deleted documents is set. - */ - protected final long size(SegmentCommitInfo info, int delCount) throws IOException { long byteSize = info.sizeInBytes(); + int delCount = mergeContext.numDeletesToMerge(info); assert assertDelCount(delCount, info); double delRatio = info.info.maxDoc() <= 0 ? 0d : (double) delCount / (double) info.info.maxDoc(); diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 5893bb72aa1..304bbdd2e57 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -295,8 +295,9 @@ private List getSortedBySegmentSize( List sortedBySize = new ArrayList<>(); for (SegmentCommitInfo info : infos) { - final int delCount = mergeContext.numDeletesToMerge(info); - sortedBySize.add(new SegmentSizeAndDocs(info, size(info, delCount), delCount)); + sortedBySize.add( + new SegmentSizeAndDocs( + info, size(info, mergeContext), mergeContext.numDeletesToMerge(info))); } sortedBySize.sort( From 44629fdc9724c2b674817131c03fba666dd35187 Mon Sep 17 00:00:00 2001 From: fudongying Date: Fri, 9 Jun 2023 13:07:12 +0800 Subject: [PATCH 4/6] chore: add some test cases --- .../lucene/index/TestCachingMergeContext.java | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 lucene/core/src/test/org/apache/lucene/index/TestCachingMergeContext.java diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCachingMergeContext.java b/lucene/core/src/test/org/apache/lucene/index/TestCachingMergeContext.java new file mode 100644 index 00000000000..3210d54540d --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestCachingMergeContext.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.Set; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.InfoStream; + +public class TestCachingMergeContext extends LuceneTestCase { + public void testNumDeletesToMerge() throws IOException { + MockMergeContext mergeContext = new MockMergeContext(); + CachingMergeContext cachingMergeContext = new CachingMergeContext(mergeContext); + assertEquals(cachingMergeContext.numDeletesToMerge(null), 1); + assertEquals(cachingMergeContext.cachedNumDeletesToMerge.size(), 1); + assertEquals( + cachingMergeContext.cachedNumDeletesToMerge.getOrDefault(null, -1), Integer.valueOf(1)); + assertEquals(mergeContext.count, 1); + + // increase the mock count + mergeContext.numDeletesToMerge(null); + assertEquals(mergeContext.count, 2); + + // assert the cache result still one + assertEquals(cachingMergeContext.numDeletesToMerge(null), 1); + assertEquals(cachingMergeContext.cachedNumDeletesToMerge.size(), 1); + assertEquals( + cachingMergeContext.cachedNumDeletesToMerge.getOrDefault(null, -1), Integer.valueOf(1)); + } + + private static final class MockMergeContext implements MergePolicy.MergeContext { + int count = 0; + + @Override + public final int numDeletesToMerge(SegmentCommitInfo info) throws IOException { + this.count += 1; + return this.count; + } + + @Override + public int numDeletedDocs(SegmentCommitInfo info) { + return 0; + } + + @Override + public InfoStream getInfoStream() { + return null; + } + + @Override + public Set getMergingSegments() { + return null; + } + } +} From 366182d6574bbe2829daccbc312fac9fd1b43786 Mon Sep 17 00:00:00 2001 From: fudongying Date: Fri, 9 Jun 2023 16:16:42 +0800 Subject: [PATCH 5/6] chore: pkg-private class as jpountz comment --- .../src/java/org/apache/lucene/index/CachingMergeContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java b/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java index 9661ab91e76..a9974c64e0b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/CachingMergeContext.java @@ -25,7 +25,7 @@ * a wrapper of IndexWriter MergeContext. Try to cache the {@link * #numDeletesToMerge(SegmentCommitInfo)} result in merge phase, to avoid duplicate calculation */ -public class CachingMergeContext implements MergePolicy.MergeContext { +class CachingMergeContext implements MergePolicy.MergeContext { final MergePolicy.MergeContext mergeContext; final HashMap cachedNumDeletesToMerge = new HashMap<>(); From bcb8dacde6f7014346a1c6034cbf2ec2a32d1baf Mon Sep 17 00:00:00 2001 From: fudongying Date: Fri, 9 Jun 2023 16:27:38 +0800 Subject: [PATCH 6/6] chore: add change log --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 50229e12dae..9de8f3b9009 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -177,6 +177,8 @@ Optimizations * GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun) +* GITHUB#12339: Optimize part of duplicate calculation numDeletesToMerge in merge phase (fudongying) + Bug Fixes ---------------------