From 7af57d80e5730b480f374907cfc600a2cb63311c Mon Sep 17 00:00:00 2001 From: Du Bin Date: Mon, 2 Mar 2026 13:54:25 +0000 Subject: [PATCH] [common] Fix O(2^n) complexity in FileIndexPredicate.getRequiredNames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove redundant child.visit(this) call in getRequiredNames() that caused exponential time complexity for deeply nested OR predicates (e.g. IN clauses). The visitor called child.visit(this) twice per child — once discarding the result, then again using it — doubling work at each tree level. For IN clauses with <= 20 values producing right-nested OR trees of depth N, this caused O(2^N) leaf visits instead of O(N), hanging production CPUs. Closes #7230 --- .../paimon/fileindex/FileIndexPredicate.java | 1 - .../fileindex/FileIndexPredicateTest.java | 141 ++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 paimon-common/src/test/java/org/apache/paimon/fileindex/FileIndexPredicateTest.java diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/FileIndexPredicate.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/FileIndexPredicate.java index 0ad3238a4ca8..c43f2189817c 100644 --- a/paimon-common/src/main/java/org/apache/paimon/fileindex/FileIndexPredicate.java +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/FileIndexPredicate.java @@ -127,7 +127,6 @@ public Set visit(LeafPredicate predicate) { public Set visit(CompoundPredicate predicate) { Set result = new HashSet<>(); for (Predicate child : predicate.children()) { - child.visit(this); result.addAll(child.visit(this)); } return result; diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/FileIndexPredicateTest.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/FileIndexPredicateTest.java new file mode 100644 index 000000000000..c1b354c0ca46 --- /dev/null +++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/FileIndexPredicateTest.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.fileindex; + +import org.apache.paimon.predicate.CompoundPredicate; +import org.apache.paimon.predicate.Or; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.predicate.PredicateBuilder; +import org.apache.paimon.predicate.PredicateVisitor; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link FileIndexPredicate}, specifically the getRequiredNames visitor. */ +public class FileIndexPredicateTest { + + /** + * Verifies that getRequiredNames (via the PredicateVisitor in FileIndexPredicate) visits each + * child node exactly once, not twice. Before the fix, the visitor called child.visit(this) + * twice per child — once discarding the result (line 130) and once using it (line 131). This + * caused O(2^n) complexity for deeply nested OR predicates (e.g., IN clauses with <= 20 + * values). + * + *

This test builds a deeply nested OR predicate tree (simulating an IN clause) and counts + * the total number of leaf visits. With the fix, the count should be exactly N (one per leaf). + * Before the fix, it would be 2^N - 1. + */ + @Test + public void testGetRequiredNamesLinearComplexity() { + RowType rowType = RowType.of(DataTypes.INT()); + PredicateBuilder builder = new PredicateBuilder(rowType); + + // Build an OR chain of 20 equality predicates: col0 = 0 OR col0 = 1 OR ... OR col0 = 19 + // PredicateBuilder.or() uses reduce, creating a right-nested binary tree of depth ~20. + List equals = new ArrayList<>(); + for (int i = 0; i < 20; i++) { + equals.add(builder.equal(0, i)); + } + Predicate inPredicate = PredicateBuilder.or(equals); + + // Count leaf visits using the same visitor pattern as getRequiredNames + AtomicInteger visitCount = new AtomicInteger(0); + Set result = + inPredicate.visit( + new PredicateVisitor>() { + @Override + public Set visit( + org.apache.paimon.predicate.LeafPredicate predicate) { + visitCount.incrementAndGet(); + return new HashSet<>(predicate.fieldNames()); + } + + @Override + public Set visit(CompoundPredicate predicate) { + Set names = new HashSet<>(); + for (Predicate child : predicate.children()) { + names.addAll(child.visit(this)); + } + return names; + } + }); + + // The result should contain the field name + assertThat(result).containsExactly("f0"); + + // With correct linear traversal, each of the 20 leaves is visited exactly once, + // plus we traverse ~19 compound nodes. The leaf visit count should be exactly 20. + assertThat(visitCount.get()).isEqualTo(20); + } + + /** + * Tests that getRequiredNames completes in reasonable time for large OR predicates. Before the + * fix, this would hang due to O(2^n) complexity. + */ + @Test + public void testGetRequiredNamesPerformance() { + RowType rowType = RowType.of(DataTypes.INT()); + PredicateBuilder builder = new PredicateBuilder(rowType); + + // Build an OR chain of 20 equality predicates + List equals = new ArrayList<>(); + for (int i = 0; i < 20; i++) { + equals.add(builder.equal(0, i)); + } + Predicate inPredicate = PredicateBuilder.or(equals); + + long startNanos = System.nanoTime(); + + // Use the same visitor pattern as FileIndexPredicate.getRequiredNames + Set result = + inPredicate.visit( + new PredicateVisitor>() { + @Override + public Set visit( + org.apache.paimon.predicate.LeafPredicate predicate) { + return new HashSet<>(predicate.fieldNames()); + } + + @Override + public Set visit(CompoundPredicate predicate) { + Set names = new HashSet<>(); + for (Predicate child : predicate.children()) { + names.addAll(child.visit(this)); + } + return names; + } + }); + + long elapsedMs = (System.nanoTime() - startNanos) / 1_000_000; + + assertThat(result).containsExactly("f0"); + // Should complete in under 100ms with linear complexity. + // Before the fix with 20 nodes: 2^20 = ~1 million visits, would take seconds+. + assertThat(elapsedMs).isLessThan(100); + } +}