From a79d307c3520e9afdb4f0a01bc7eca5af76f3e6e Mon Sep 17 00:00:00 2001 From: nickyulin Date: Mon, 13 Oct 2025 14:51:45 +0800 Subject: [PATCH 1/7] perf: Added configurable limit for PhraseQuery#builder terms to solve the problem of excessive memory usage in ultra-long text search case --- .../java/org/apache/lucene/search/PhraseQuery.java | 14 ++++++++++++++ .../org/apache/lucene/search/TestPhraseQuery.java | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index b762fced2eef..a8ae7a95de93 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -74,12 +74,14 @@ public class PhraseQuery extends Query { public static class Builder { private int slop; + private int termLimit; private final List terms; private final IntArrayList positions; /** Sole constructor. */ public Builder() { slop = 0; + termLimit = -1; terms = new ArrayList<>(); positions = new IntArrayList(); } @@ -94,6 +96,14 @@ public Builder setSlop(int slop) { return this; } + /** + * Set the term limit. + */ + public Builder setTermLimit(int value) { + this.termLimit = value; + return this; + } + /** * Adds a term to the end of the query phrase. The relative position of the term is the one * immediately after the last term added. 
@@ -128,6 +138,10 @@ public Builder add(Term term, int position) { + " and " + terms.get(0).field()); } + if (termLimit > 0 && terms.size() >= termLimit) { + throw new IllegalArgumentException("The current value of terms is " + + terms.size() + ", which exceeds the limit of " + termLimit); + } terms.add(term); positions.add(position); return this; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java index a569fb5c5c91..17a38e36c6b5 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java @@ -743,6 +743,20 @@ public void testBackwardPositions() throws Exception { }); } + public void testPhraseQueryTermLimit() throws Exception { + PhraseQuery.Builder builder = new PhraseQuery.Builder(); + int termLimit = 1000; + builder.setTermLimit(termLimit); + for (int i = 0; i < termLimit; i++) { + builder.add(new Term("field", "one" + i), i + 1); + } + expectThrows( + IllegalArgumentException.class, + () -> { + builder.add(new Term("field", "three"), termLimit + 1); + }); + } + private static final String[] DOCS = new String[] { "a b c d e f g h", From 1acef659eff5661cd5664b5a8f2c173f0fdd886c Mon Sep 17 00:00:00 2001 From: nickyulin Date: Mon, 13 Oct 2025 15:39:17 +0800 Subject: [PATCH 2/7] perf: Added configurable term threshold for PhraseQuery#Builder to solve the problem of excessive memory usage in ultra-long text search case --- .../java/org/apache/lucene/search/PhraseQuery.java | 12 ++++++------ .../org/apache/lucene/search/TestPhraseQuery.java | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index a8ae7a95de93..d18efc7bc432 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ 
b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -74,14 +74,14 @@ public class PhraseQuery extends Query { public static class Builder { private int slop; - private int termLimit; + private int termThreshold; private final List terms; private final IntArrayList positions; /** Sole constructor. */ public Builder() { slop = 0; - termLimit = -1; + termThreshold = -1; terms = new ArrayList<>(); positions = new IntArrayList(); } @@ -99,8 +99,8 @@ public Builder setSlop(int slop) { /** * Set the term limit. */ - public Builder setTermLimit(int value) { - this.termLimit = value; + public Builder setTermThreshold(int value) { + this.termThreshold = value; return this; } @@ -138,9 +138,9 @@ public Builder add(Term term, int position) { + " and " + terms.get(0).field()); } - if (termLimit > 0 && terms.size() >= termLimit) { + if (termThreshold > 0 && terms.size() >= termThreshold) { throw new IllegalArgumentException("The current value of terms is " + - terms.size() + ", which exceeds the limit of " + termLimit); + terms.size() + ", which exceeds the limit of " + termThreshold); } terms.add(term); positions.add(position); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java index 17a38e36c6b5..ac0740b5ce64 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java @@ -746,7 +746,7 @@ public void testBackwardPositions() throws Exception { public void testPhraseQueryTermLimit() throws Exception { PhraseQuery.Builder builder = new PhraseQuery.Builder(); int termLimit = 1000; - builder.setTermLimit(termLimit); + builder.setTermThreshold(termLimit); for (int i = 0; i < termLimit; i++) { builder.add(new Term("field", "one" + i), i + 1); } From b65921fd56e56577dd60a7972a5dc2f47669758c Mon Sep 17 00:00:00 2001 From: nickyulin Date: Mon, 13 Oct 2025 15:41:23 +0800 Subject: 
[PATCH 3/7] perf: Added configurable term threshold for PhraseQuery#Builder to solve the problem of excessive memory usage in ultra-long text search case --- lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index d18efc7bc432..874278b1bd45 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -97,7 +97,7 @@ public Builder setSlop(int slop) { } /** - * Set the term limit. + * Set the term threshold. */ public Builder setTermThreshold(int value) { this.termThreshold = value; From 8b92cd218f626967f619a4a8dab858921f3db677 Mon Sep 17 00:00:00 2001 From: nickyulin Date: Mon, 13 Oct 2025 16:24:30 +0800 Subject: [PATCH 4/7] perf: Added configurable term threshold for PhraseQuery#Builder to solve the problem of excessive memory usage in ultra-long text search case --- .../java/org/apache/lucene/search/PhraseQuery.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 874278b1bd45..7e518deeadf6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -96,9 +96,7 @@ public Builder setSlop(int slop) { return this; } - /** - * Set the term threshold. - */ + /** Set the term threshold. 
*/ public Builder setTermThreshold(int value) { this.termThreshold = value; return this; @@ -139,8 +137,11 @@ public Builder add(Term term, int position) { + terms.get(0).field()); } if (termThreshold > 0 && terms.size() >= termThreshold) { - throw new IllegalArgumentException("The current value of terms is " + - terms.size() + ", which exceeds the limit of " + termThreshold); + throw new IllegalArgumentException( + "The current value of terms is " + + terms.size() + + ", which exceeds the limit of " + + termThreshold); } terms.add(term); positions.add(position); From 5b07af2c98ddb57333588dfd5e660577eb2360ac Mon Sep 17 00:00:00 2001 From: nickyulin Date: Mon, 13 Oct 2025 18:05:05 +0800 Subject: [PATCH 5/7] add changes entry --- lucene/CHANGES.txt | 2 ++ .../src/java/org/apache/lucene/search/PhraseQuery.java | 2 +- .../test/org/apache/lucene/search/TestPhraseQuery.java | 10 +++++----- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 15151a955ef0..49961c08e972 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -174,6 +174,8 @@ Improvements * GITHUB#15184: Refactoring internal HNSWGraphBuilder's APIs and avoid creating new scorer for each call (Patrick Zhai) +* GITHUB#15332: Solve the problem of excessive memory usage in ultra-long text search case (linyunanit) + Optimizations --------------------- * GITHUB#15140: Optimize TopScoreDocCollector with TernaryLongHeap for improved performance over Binary-LongHeap. 
(Ramakrishna Chilaka) diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 7e518deeadf6..a9bf7e9780fc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -138,7 +138,7 @@ public Builder add(Term term, int position) { } if (termThreshold > 0 && terms.size() >= termThreshold) { throw new IllegalArgumentException( - "The current value of terms is " + "The current number of terms is " + terms.size() + ", which exceeds the limit of " + termThreshold); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java index ac0740b5ce64..dec7b06eb2a7 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java @@ -745,15 +745,15 @@ public void testBackwardPositions() throws Exception { public void testPhraseQueryTermLimit() throws Exception { PhraseQuery.Builder builder = new PhraseQuery.Builder(); - int termLimit = 1000; - builder.setTermThreshold(termLimit); - for (int i = 0; i < termLimit; i++) { - builder.add(new Term("field", "one" + i), i + 1); + int termThreshold = 5; + builder.setTermThreshold(termThreshold); + for (int i = 0; i < termThreshold; i++) { + builder.add(new Term("field", "one" + i), i); } expectThrows( IllegalArgumentException.class, () -> { - builder.add(new Term("field", "three"), termLimit + 1); + builder.add(new Term("field", "three"), termThreshold); }); } From e41623d0fd95a002ac82711bea9e6b1c99e19968 Mon Sep 17 00:00:00 2001 From: nickyulin Date: Tue, 14 Oct 2025 00:32:34 +0800 Subject: [PATCH 6/7] Add PhraseQuery.Builder.setMaxTerms() method to limit the maximum number of terms and excessive memory use. 
--- lucene/CHANGES.txt | 2 +- .../org/apache/lucene/search/PhraseQuery.java | 20 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 49961c08e972..86518e6e9b65 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -174,7 +174,7 @@ Improvements * GITHUB#15184: Refactoring internal HNSWGraphBuilder's APIs and avoid creating new scorer for each call (Patrick Zhai) -* GITHUB#15332: Solve the problem of excessive memory usage in ultra-long text search case (linyunanit) +* GITHUB#15332: Add PhraseQuery.Builder.setMaxTerms() method to limit the maximum number of terms and excessive memory use (linyunanit) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index a9bf7e9780fc..222a8cf2f52a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -74,14 +74,14 @@ public class PhraseQuery extends Query { public static class Builder { private int slop; - private int termThreshold; + private int maxTerms; private final List terms; private final IntArrayList positions; /** Sole constructor. */ public Builder() { slop = 0; - termThreshold = -1; + maxTerms = -1; terms = new ArrayList<>(); positions = new IntArrayList(); } @@ -96,9 +96,15 @@ public Builder setSlop(int slop) { return this; } - /** Set the term threshold. */ - public Builder setTermThreshold(int value) { - this.termThreshold = value; + /** + * Set the maximum number of terms allowed in the phrase query. + * This helps prevent excessive memory usage for very long phrases. + * + *

<p>If the number of terms added via {@link #add(Term)} or {@link #add(Term, int)} + * exceeds this threshold, an {@link IllegalArgumentException} will be thrown. + */ + public Builder setMaxTerms(int maxTerms) { + this.maxTerms = maxTerms; return this; } @@ -136,12 +142,12 @@ public Builder add(Term term, int position) { + " and " + terms.get(0).field()); } - if (termThreshold > 0 && terms.size() >= termThreshold) { + if (maxTerms > 0 && terms.size() >= maxTerms) { throw new IllegalArgumentException( "The current number of terms is " + terms.size() + ", which exceeds the limit of " - + termThreshold); + + maxTerms); } terms.add(term); positions.add(position); From 2a3b1bebf123f1f8e16ff98359b2907bdc24e0e6 Mon Sep 17 00:00:00 2001 From: nickyulin Date: Tue, 14 Oct 2025 00:44:32 +0800 Subject: [PATCH 7/7] =?UTF-8?q?Optimizing=20test=20cases=EF=BC=9AtestPhras?= =?UTF-8?q?eQueryMaxTerms#testPhraseQueryTermLimit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/java/org/apache/lucene/search/PhraseQuery.java | 8 ++++---- .../test/org/apache/lucene/search/TestPhraseQuery.java | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 222a8cf2f52a..122554747394 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -97,11 +97,11 @@ public Builder setSlop(int slop) { } /** - * Set the maximum number of terms allowed in the phrase query. - * This helps prevent excessive memory usage for very long phrases. + * Set the maximum number of terms allowed in the phrase query. This helps prevent excessive + * memory usage for very long phrases. * - *

<p>If the number of terms added via {@link #add(Term)} or {@link #add(Term, int)} - * exceeds this threshold, an {@link IllegalArgumentException} will be thrown. + *

<p>If the number of terms added via {@link #add(Term)} or {@link #add(Term, int)} exceeds + * this threshold, an {@link IllegalArgumentException} will be thrown. */ public Builder setMaxTerms(int maxTerms) { this.maxTerms = maxTerms; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java index dec7b06eb2a7..654b93991db2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java @@ -743,10 +743,10 @@ public void testBackwardPositions() throws Exception { }); } - public void testPhraseQueryTermLimit() throws Exception { + public void testPhraseQueryMaxTerms() throws Exception { PhraseQuery.Builder builder = new PhraseQuery.Builder(); int termThreshold = 5; - builder.setTermThreshold(termThreshold); + builder.setMaxTerms(termThreshold); for (int i = 0; i < termThreshold; i++) { builder.add(new Term("field", "one" + i), i); }