From 61f54f90ca7969d4c1ca11567bff4c81335d67fe Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Tue, 28 Apr 2026 15:14:04 +0800 Subject: [PATCH 1/3] Fix ArrayIndexOutOfBoundsException when reading vector search results with Lance format --- .../operation/DataEvolutionSplitRead.java | 20 ++++-- .../paimon/format/lance/jni/LanceReader.java | 5 ++ .../format/lance/LanceVectorSearchTest.java | 64 +++++++++++++++++++ 3 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java index c0e70ff4f892..c2f10b9a2b0c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java @@ -183,7 +183,8 @@ private RecordReader createReader( dataFilePathFactory, needMergeFiles.get(0), formatBuilder, - rowRanges)); + rowRanges, + readRowType)); } else { suppliers.add( @@ -308,7 +309,8 @@ private DataEvolutionFileReader createUnionReader( bunch, dataFilePathFactory, formatReaderMapping, - rowRanges)); + rowRanges, + readRowType)); } } @@ -330,7 +332,8 @@ private FileRecordReader createFileReader( DataFilePathFactory dataFilePathFactory, DataFileMeta file, Builder formatBuilder, - List rowRanges) + List rowRanges, + RowType readRowType) throws IOException { String formatIdentifier = DataFilePathFactory.formatIdentifier(file.fileName()); long schemaId = file.schemaId(); @@ -345,7 +348,7 @@ private FileRecordReader createFileReader( ? schema : schemaFetcher.apply(schemaId))); return createFileReader( - partition, file, dataFilePathFactory, formatReaderMapping, rowRanges); + partition, file, dataFilePathFactory, formatReaderMapping, rowRanges, readRowType); } private RecordReader createFileReader( @@ -353,7 +356,8 @@ private RecordReader createFileReader( FieldBunch bunch, DataFilePathFactory dataFilePathFactory, FormatReaderMapping formatReaderMapping, - List rowRanges) + List rowRanges, + RowType readRowType) throws IOException { if (bunch.files().size() == 1) { return createFileReader( @@ -361,7 +365,8 @@ private RecordReader createFileReader( bunch.files().get(0), dataFilePathFactory, formatReaderMapping, - rowRanges); + rowRanges, + readRowType); } List> readerSuppliers = new ArrayList<>(); for (DataFileMeta file : bunch.files()) { @@ -394,7 +399,8 @@ private FileRecordReader createFileReader( DataFileMeta file, DataFilePathFactory dataFilePathFactory, FormatReaderMapping formatReaderMapping, - List rowRanges) + List rowRanges, + RowType readRowType) throws IOException { RoaringBitmap32 selection = file.toFileSelection(rowRanges); FormatReaderContext formatReaderContext = diff --git a/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java b/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java index 5aba599079d0..9a3ae750721d 100644 --- a/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java +++ b/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java @@ -65,6 +65,11 @@ public LanceReader( .filter(fileFieldNames::contains) .collect(Collectors.toList()); if (existingFields.isEmpty()) { +<<<<<<< HEAD +======= + // Read at least one column to get the correct row count. + // ArrowBatchReader maps by name; unmatched projected fields become null. +>>>>>>> e3ea8868b ([lance] Fix ArrayIndexOutOfBoundsException when reading vector search results with Lance format) existingFields = Collections.singletonList(fileFieldNames.iterator().next()); } this.arrowReader = reader.readAll(existingFields, ranges, batchSize); diff --git a/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java b/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java new file mode 100644 index 000000000000..1d152b9b21bc --- /dev/null +++ b/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.lance; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.catalog.Catalog; +import org.apache.paimon.catalog.CatalogContext; +import org.apache.paimon.catalog.CatalogFactory; +import org.apache.paimon.fs.Path; +import org.apache.paimon.options.Options; +import org.apache.paimon.schema.Schema; +import org.apache.paimon.table.source.VectorSearchBuilderTest; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.utils.TraceableFileIO; + +import org.junit.jupiter.api.BeforeEach; + +import static org.apache.paimon.options.CatalogOptions.WAREHOUSE; + +/** Test vector search with Lance file format. */ +public class LanceVectorSearchTest extends VectorSearchBuilderTest { + + @BeforeEach + public void beforeEach() throws Catalog.DatabaseAlreadyExistException { + database = "default"; + warehouse = new Path(TraceableFileIO.SCHEME + "://" + tempPath.toString()); + Options options = new Options(); + options.set(WAREHOUSE, warehouse.toUri().toString()); + CatalogContext context = CatalogContext.create(options, new TraceableFileIO.Loader(), null); + catalog = CatalogFactory.createCatalog(context); + catalog.createDatabase(database, true); + } + + @Override + protected Schema schemaDefault() { + return Schema.newBuilder() + .column("id", DataTypes.INT()) + .column("vec", new ArrayType(DataTypes.FLOAT())) + .option(CoreOptions.BUCKET.key(), "-1") + .option(CoreOptions.ROW_TRACKING_ENABLED.key(), "true") + .option(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true") + .option(CoreOptions.FILE_FORMAT.key(), "lance") + .option("test.vector.dimension", "2") + .option("test.vector.metric", "l2") + .build(); + } +} From bb4142205fe4ff789fb7dc64c3b3c809e25a7161 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Tue, 28 Apr 2026 15:28:18 +0800 Subject: [PATCH 2/3] fix merge issue --- .../java/org/apache/paimon/format/lance/jni/LanceReader.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java b/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java index 9a3ae750721d..430be8e0340e 100644 --- a/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java +++ b/paimon-lance/src/main/java/org/apache/paimon/format/lance/jni/LanceReader.java @@ -65,11 +65,8 @@ public LanceReader( .filter(fileFieldNames::contains) .collect(Collectors.toList()); if (existingFields.isEmpty()) { -<<<<<<< HEAD -======= // Read at least one column to get the correct row count. // ArrowBatchReader maps by name; unmatched projected fields become null. ->>>>>>> e3ea8868b ([lance] Fix ArrayIndexOutOfBoundsException when reading vector search results with Lance format) existingFields = Collections.singletonList(fileFieldNames.iterator().next()); } this.arrowReader = reader.readAll(existingFields, ranges, batchSize); From 5374253391c1db9d2054b959e3c8eb0e1bd9def4 Mon Sep 17 00:00:00 2001 From: xiaohongbo Date: Tue, 28 Apr 2026 16:19:37 +0800 Subject: [PATCH 3/3] disable testVectorSearchWithCosineMetric --- .../apache/paimon/format/lance/LanceVectorSearchTest.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java b/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java index 1d152b9b21bc..a8107977bba4 100644 --- a/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java +++ b/paimon-lance/src/test/java/org/apache/paimon/format/lance/LanceVectorSearchTest.java @@ -31,6 +31,8 @@ import org.apache.paimon.utils.TraceableFileIO; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import static org.apache.paimon.options.CatalogOptions.WAREHOUSE; @@ -61,4 +63,9 @@ protected Schema schemaDefault() { .option("test.vector.metric", "l2") .build(); } + + @Disabled("Cosine metric uses Tantivy index which requires Hadoop dependencies") + @Test + @Override + public void testVectorSearchWithCosineMetric() {} }