From edfd968d61febd5fd2592d7c5ea3ef1a7dfd5189 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Thu, 28 Sep 2023 23:15:08 +0800 Subject: [PATCH 1/6] reduce fst block size --- .../codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index a7eb438489f9..2bbf29990e09 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -494,6 +494,7 @@ public void compileIndex( final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) .shouldShareNonSingletonNodes(false) + .bytesPageBits(10) .build(); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); From 43e67cc0278f6f80e572c0830ecc0af86fa9959e Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Thu, 28 Sep 2023 23:40:05 +0800 Subject: [PATCH 2/6] add CHANGES --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index dce07644fab3..ec3b33a122e6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -163,6 +163,8 @@ Optimizations * GITHUB#12382: Faster top-level conjunctions on term queries when sorting by descending score. (Adrien Grand) +* GITHUB#12573: Reduce block size of FST BytesStore in BlockTreeTermsWriter. 
(Guo Feng) + Changes in runtime behavior --------------------- From 521d693631664f442fd01c56d17196dbd773f665 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Thu, 28 Sep 2023 23:41:59 +0800 Subject: [PATCH 3/6] fix --- lucene/CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ec3b33a122e6..86eac66d37ab 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -163,7 +163,7 @@ Optimizations * GITHUB#12382: Faster top-level conjunctions on term queries when sorting by descending score. (Adrien Grand) -* GITHUB#12573: Reduce block size of FST BytesStore in BlockTreeTermsWriter. (Guo Feng) +* GITHUB#12604: Reduce block size of FST BytesStore in BlockTreeTermsWriter. (Guo Feng) Changes in runtime behavior --------------------- From 1eb6a99d5db452f2af4bc0bb04472db3c5b812ac Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Wed, 4 Oct 2023 00:23:04 +0800 Subject: [PATCH 4/6] estimate bit --- .../blocktree/Lucene90BlockTreeTermsWriter.java | 14 +++++++++++++- .../src/java/org/apache/lucene/util/fst/FST.java | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 2bbf29990e09..1b0bd3568c9f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -52,6 +52,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.PackedInts; /* TODO: @@ -490,11 +491,22 @@ public void compileIndex( } } + long estimateSize = prefix.length; + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (FST<BytesRef> subIndex : 
block.subIndices) { + estimateSize += subIndex.numBytes(); + } + } + } + int estimateBitsRequired = PackedInts.bitsRequired(estimateSize); + int pageBits = Math.min(15, Math.max(6, estimateBitsRequired)); + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) .shouldShareNonSingletonNodes(false) - .bytesPageBits(10) + .bytesPageBits(pageBits) .build(); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 816a28572681..fb356c2c9c7e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -520,6 +520,10 @@ void finish(long newStartNode) throws IOException { bytes.finish(); } + public long numBytes() { + return bytes.getPosition(); + } + public T getEmptyOutput() { return emptyOutput; } From a4a496424fcb47499cdcc90ff5ff44d9cb5f6668 Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Wed, 4 Oct 2023 00:50:03 +0800 Subject: [PATCH 5/6] update changes --- lucene/CHANGES.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 86eac66d37ab..0e9cc5b59abb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -163,7 +163,8 @@ Optimizations * GITHUB#12382: Faster top-level conjunctions on term queries when sorting by descending score. (Adrien Grand) -* GITHUB#12604: Reduce block size of FST BytesStore in BlockTreeTermsWriter. (Guo Feng) +* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter + to reducing GC load during indexing. 
(Guo Feng) Changes in runtime behavior --------------------- From 21b6858fb6473842edd3088b037e76d02547790c Mon Sep 17 00:00:00 2001 From: "guofeng.my" Date: Wed, 4 Oct 2023 11:40:59 +0800 Subject: [PATCH 6/6] fix change --- lucene/CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0e9cc5b59abb..2449db7978ac 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -164,7 +164,7 @@ Optimizations descending score. (Adrien Grand) * GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter - to reducing GC load during indexing. (Guo Feng) + to reduce GC load during indexing. (Guo Feng) Changes in runtime behavior ---------------------