From 1f64bad28d79bd41155e20e0e0b41681e09e934a Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 2 May 2024 20:37:20 +0100 Subject: [PATCH 01/37] Add a MemorySegment Vector scorer - for scoring without copying on-heap --- .../benchmark/jmh/VectorScorerBenchmark.java | 113 +++++++++ lucene/core/src/java/module-info.java | 2 + .../lucene99/Lucene99HnswVectorsFormat.java | 4 +- .../FlatVectorScorerProvider.java | 56 +++++ .../vectorization/VectorizationProvider.java | 12 +- .../internal/vectorization/DotProduct.java | 41 ++++ .../internal/vectorization/Euclidean.java | 40 ++++ .../vectorization/MemorySegmentAccess.java | 27 +++ ...MemorySegmentByteVectorScorerSupplier.java | 129 +++++++++++ .../MemorySegmentFlatVectorsScorer.java | 97 ++++++++ .../PanamaVectorUtilSupport.java | 65 +++--- .../lucene/store/MemorySegmentIndexInput.java | 35 ++- .../TestLucene99HnswVectorsFormat.java | 13 +- .../vectorization/TestVectorScorer.java | 215 ++++++++++++++++++ .../search/BaseKnnVectorQueryTestCase.java | 36 +-- .../search/TestKnnByteVectorQueryMMap.java | 37 +++ 16 files changed, 874 insertions(+), 48 deletions(-) create mode 100644 lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java create mode 100644 lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java create mode 100644 lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java 
create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java new file mode 100644 index 00000000000..5db830efe97 --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues;
+import org.apache.lucene.internal.vectorization.FlatVectorScorerProvider;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+import org.openjdk.jmh.annotations.*;
+
+/** Benchmarks byte dot-product scoring, with and without the jdk.incubator.vector module. */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Benchmark)
+// first iteration is complete garbage, so make sure we really warmup
+@Warmup(iterations = 4, time = 1)
+// real iterations. not useful to spend tons of time here, better to fork more
+@Measurement(iterations = 5, time = 1)
+// engage some noise reduction
+@Fork(
+    value = 1, // TODO restore to 3,
+    jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
+public class VectorScorerBenchmark {
+
+  @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
+  int size;
+
+  Directory dir;
+  IndexInput in;
+  RandomAccessVectorValues vectorValues;
+  byte[] vec1, vec2;
+  RandomVectorScorerSupplier scorer;
+
+  @Setup(Level.Iteration)
+  public void init() throws IOException {
+    vec1 = new byte[size];
+    vec2 = new byte[size];
+    ThreadLocalRandom.current().nextBytes(vec1);
+    ThreadLocalRandom.current().nextBytes(vec2);
+
+    dir = new MMapDirectory(Files.createTempDirectory("VectorScorerBenchmark"));
+    try (IndexOutput out = dir.createOutput("vector.data", IOContext.DEFAULT)) {
+      out.writeBytes(vec1, 0, vec1.length);
+      out.writeBytes(vec2, 0, vec2.length);
+    }
+    in = dir.openInput("vector.data", IOContext.DEFAULT);
+    vectorValues = vectorValues(size, 2, in);
+    var provider = FlatVectorScorerProvider.createDefault();
+    scorer = provider.getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues);
+
+    // Ensure we're using the right vector scorer
+    var name = provider.getClass().getSimpleName();
+    boolean panama =
+        Object.class.getModule().getLayer().findModule("jdk.incubator.vector").isPresent();
+    var expected = panama ? "MemorySegmentFlatVectorsScorer" : "DefaultFlatVectorScorer";
+    if (!name.equals(expected)) {
+      throw new AssertionError("expected " + expected + ", got:" + name);
+    }
+  }
+
+  // Must run at the same level as init(): with the default (Trial) level every iteration but the
+  // last would leak its directory and input, since init() overwrites the fields each time.
+  @TearDown(Level.Iteration)
+  public void teardown() throws IOException {
+    IOUtils.close(in, dir); // close the input before the directory that produced it
+  }
+
+  @Benchmark
+  public float binaryDotProductDefault() throws IOException {
+    // score twice to invalidate and re-read the vector at the first position
+    return scorer.scorer(0).score(1) + scorer.scorer(1).score(0);
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public float binaryDotProductMemSeg() throws IOException {
+    // score twice to invalidate and re-read the vector at the first position
+    return scorer.scorer(0).score(1) + scorer.scorer(1).score(0);
+  }
+
+  /** Exposes {@code in} as {@code size} dense off-heap byte vectors of dimension {@code dims}. */
+  static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in)
+      throws IOException {
+    return new OffHeapByteVectorValues.DenseOffHeapVectorValues(
+        dims, size, in.slice("test", 0, in.length()), dims);
+  }
+}
diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index 94ff818c499..a8ebf98ad2b 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -64,6 +64,8 @@ exports org.apache.lucene.util.quantization; exports org.apache.lucene.codecs.hnsw; + exports org.apache.lucene.internal.vectorization to + org.apache.lucene.benchmark.jmh; provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 8c78a0cb0a0..9dfe13e00fc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -22,13 +22,13 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import
org.apache.lucene.internal.vectorization.FlatVectorScorerProvider; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; @@ -139,7 +139,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { /** The format for storing, reading, merging vectors on disk */ private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerProvider.createDefault()); private final int numMergeWorkers; private final TaskExecutor mergeExec; diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java new file mode 100644 index 00000000000..0b2d1732ad0 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.lucene.internal.vectorization;
+
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.MethodType;
+import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
+import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
+
+/** A provider of FlatVectorsScorer; static factory methods only, never instantiated. */
+public final class FlatVectorScorerProvider {
+  private FlatVectorScorerProvider() {}
+
+  /** Returns the default FlatVectorsScorer. TODO: find a better name than default. */
+  public static FlatVectorsScorer createDefault() {
+    // we only enable the MemorySegment scorer if the Panama vector provider is also enabled
+    return isPanamaVectorUtilSupportEnabled() ? lookup() : new DefaultFlatVectorScorer();
+  }
+
+  /** Instantiates MemorySegmentFlatVectorsScorer by name, handing it a default scorer. */
+  public static FlatVectorsScorer lookup() {
+    try {
+      var cls =
+          Class.forName("org.apache.lucene.internal.vectorization.MemorySegmentFlatVectorsScorer");
+      var ctorType = MethodType.methodType(void.class, FlatVectorsScorer.class);
+      var mh = MethodHandles.lookup().findConstructor(cls, ctorType);
+      return (FlatVectorsScorer) mh.invoke(new DefaultFlatVectorScorer());
+    } catch (RuntimeException | Error e) {
+      throw e; // already unchecked: propagate as-is instead of hiding it in a wrapper
+    } catch (Throwable t) {
+      throw new RuntimeException(t);
+    }
+  }
+
+  private static boolean isPanamaVectorUtilSupportEnabled() {
+    var name = VectorizationProvider.getInstance().getClass().getSimpleName();
+    assert name.equals("PanamaVectorizationProvider")
+        || name.equals("DefaultVectorizationProvider");
+    return name.equals("PanamaVectorizationProvider");
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index 158cdde5e0e..5affb3a9e98 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -177,7 +177,10 @@ private static Optional lookupVectorModule() { } // add all possible callers here as FQCN: - private static final Set VALID_CALLERS =
Set.of("org.apache.lucene.util.VectorUtil"); + private static final Set VALID_CALLERS = + Set.of( + "org.apache.lucene.util.VectorUtil", + "org.apache.lucene.internal.vectorization.FlatVectorScorerProvider"); private static void ensureCaller() { final boolean validCaller = @@ -198,6 +201,11 @@ private static void ensureCaller() { private static final class Holder { private Holder() {} - static final VectorizationProvider INSTANCE = lookup(false); + // TODO: this is not quite right. But we should be able to run tests with Panama Vector + static boolean testMode() { + return TESTS_VECTOR_SIZE.isPresent() || TESTS_FORCE_INTEGER_VECTORS; + } + + static final VectorizationProvider INSTANCE = lookup(testMode()); } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java new file mode 100644 index 00000000000..a9ed482b763 --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.internal.vectorization;
+
+import java.io.IOException;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+
+/** Dot product divided by 2 * 2^14 * dims (2^14 bounds a signed-byte product), plus 0.5. */
+final class DotProduct extends MemorySegmentByteVectorScorerSupplier {
+  DotProduct(
+      int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) {
+    super(dims, maxOrd, vectorByteSize, input, values);
+  }
+
+  @Override
+  public float score(int node) throws IOException {
+    final float denominator = (float) (dims * (1 << 15));
+    int raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2));
+    return 0.5f + raw / denominator;
+  }
+
+  @Override
+  public DotProduct copy() throws IOException {
+    return new DotProduct(dims, maxOrd, vectorByteSize, input.clone(), values);
+  }
+}
diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java new file mode 100644 index 00000000000..9c0e73e90f8 --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.internal.vectorization;
+
+import java.io.IOException;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+
+/** Scores byte vectors as {@code 1 / (1 + squareDistance)}. */
+final class Euclidean extends MemorySegmentByteVectorScorerSupplier {
+  Euclidean(
+      int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) {
+    super(dims, maxOrd, vectorByteSize, input, values);
+  }
+
+  @Override
+  public float score(int node) throws IOException {
+    int squared = PanamaVectorUtilSupport.squareDistance(first, getSegment(node, scratch2));
+    return 1f / (1f + squared);
+  }
+
+  @Override
+  public Euclidean copy() throws IOException {
+    return new Euclidean(dims, maxOrd, vectorByteSize, input.clone(), values);
+  }
+}
diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java new file mode 100644 index 00000000000..2afc607f3e3 --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import java.lang.foreign.MemorySegment; + +/** Provides access to the memory segment backing the implementing object. */ +public interface MemorySegmentAccess { + + /** Returns the memory segment for position {@code pos} and length {@code len}, or null. */ + MemorySegment segmentSliceOrNull(long pos, int len) throws IOException; +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java new file mode 100644 index 00000000000..4dcb922bcbe --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+package org.apache.lucene.internal.vectorization;
+
+import java.io.IOException;
+import java.lang.foreign.MemorySegment;
+import java.util.Optional;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.store.FilterIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+import org.apache.lucene.util.hnsw.RandomVectorScorer;
+import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+
+/** A scorer of vectors whose element size is byte. */
+public abstract sealed class MemorySegmentByteVectorScorerSupplier
+    implements RandomVectorScorerSupplier, RandomVectorScorer permits DotProduct, Euclidean {
+  final int vectorByteSize;
+  final int dims;
+  final int maxOrd;
+  final IndexInput input;
+  final MemorySegmentAccess memorySegmentAccess;
+  final RandomAccessVectorValues values; // to support ordToDoc/getAcceptOrds
+  final byte[] scratch1, scratch2; // on-heap buffers used when no segment slice is available
+
+  MemorySegment first; // the vector score(...) compares against; rebound by every scorer(...)
+
+  /**
+   * Returns a scorer for {@code type} if {@code input}, once unwrapped, implements {@link
+   * MemorySegmentAccess} and the similarity is supported; otherwise an empty optional.
+   */
+  public static Optional<MemorySegmentByteVectorScorerSupplier> create(
+      int dims,
+      int maxOrd,
+      int vectorByteSize,
+      VectorSimilarityFunction type,
+      IndexInput input,
+      RandomAccessVectorValues values) {
+    input = FilterIndexInput.unwrap(input);
+    if (!(input instanceof MemorySegmentAccess)) {
+      return Optional.empty();
+    }
+    checkInvariants(maxOrd, vectorByteSize, input);
+    return switch (type) {
+      case DOT_PRODUCT -> Optional.of(new DotProduct(dims, maxOrd, vectorByteSize, input, values));
+      case EUCLIDEAN -> Optional.of(new Euclidean(dims, maxOrd, vectorByteSize, input, values));
+      case MAXIMUM_INNER_PRODUCT, COSINE -> Optional.empty(); // TODO: implement both
+    };
+  }
+
+  MemorySegmentByteVectorScorerSupplier(
+      int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) {
+    this.vectorByteSize = vectorByteSize;
+    this.dims = dims;
+    this.maxOrd = maxOrd;
+    this.input = input;
+    this.memorySegmentAccess = (MemorySegmentAccess) input;
+    this.values = values;
+    scratch1 = new byte[vectorByteSize];
+    scratch2 = new byte[vectorByteSize];
+  }
+
+  static void checkInvariants(int maxOrd, int vectorByteLength, IndexInput input) {
+    if (input.length() < (long) vectorByteLength * maxOrd) {
+      throw new IllegalArgumentException("input length is less than expected vector data");
+    }
+  }
+
+  final void checkOrdinal(int ord, int maxOrd) {
+    if (ord < 0 || ord >= maxOrd) {
+      throw new IllegalArgumentException("illegal ordinal: " + ord);
+    }
+  }
+
+  /** Returns the vector at {@code ord} as a segment, read into {@code scratch} if need be. */
+  protected final MemorySegment getSegment(int ord, byte[] scratch) throws IOException {
+    checkOrdinal(ord, maxOrd);
+    // widen before multiplying: an int product wraps once ord * vectorByteSize exceeds 2^31 - 1
+    long byteOffset = (long) ord * vectorByteSize; // TODO: random + meta size
+    MemorySegment seg = memorySegmentAccess.segmentSliceOrNull(byteOffset, vectorByteSize);
+    if (seg == null) {
+      input.seek(byteOffset);
+      input.readBytes(scratch, 0, vectorByteSize);
+      seg = MemorySegment.ofArray(scratch);
+    }
+    return seg;
+  }
+
+  public final RandomVectorScorer scorer(byte[] target) {
+    first = MemorySegment.ofArray(target);
+    return this;
+  }
+
+  @Override
+  public final RandomVectorScorer scorer(int ord) throws IOException {
+    first = getSegment(ord, scratch1);
+    return this;
+  }
+
+  @Override
+  public final int maxOrd() {
+    return maxOrd;
+  }
+
+  @Override
+  public final int ordToDoc(int ord) {
+    return values.ordToDoc(ord);
+  }
+
+  @Override
+  public final Bits getAcceptOrds(Bits acceptDocs) {
+    return values.getAcceptOrds(acceptDocs);
+  }
+}
diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java new file mode 100644 index 00000000000..f95ec27dc20 --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+package org.apache.lucene.internal.vectorization;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+import org.apache.lucene.util.hnsw.RandomVectorScorer;
+import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+
+/** Scores byte vectors through memory segments where possible, otherwise delegates. */
+public class MemorySegmentFlatVectorsScorer implements FlatVectorsScorer {
+
+  private final FlatVectorsScorer delegate;
+
+  /** Creates a scorer that hands every request it does not handle itself to {@code delegate}. */
+  public MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) {
+    this.delegate = delegate;
+  }
+
+  @Override
+  public RandomVectorScorerSupplier getRandomVectorScorerSupplier(
+      VectorSimilarityFunction similarityType, RandomAccessVectorValues vectorValues)
+      throws IOException {
+    var segmentScorer = memorySegmentScorer(similarityType, vectorValues);
+    return segmentScorer != null
+        ? segmentScorer
+        : delegate.getRandomVectorScorerSupplier(similarityType, vectorValues);
+  }
+
+  @Override
+  public RandomVectorScorer getRandomVectorScorer(
+      VectorSimilarityFunction similarityType,
+      RandomAccessVectorValues vectorValues,
+      float[] target)
+      throws IOException {
+    // float targets are never scored here (only binary vectors are), so always delegate
+    return delegate.getRandomVectorScorer(similarityType, vectorValues, target);
+  }
+
+  @Override
+  public RandomVectorScorer getRandomVectorScorer(
+      VectorSimilarityFunction similarityType, RandomAccessVectorValues vectorValues, byte[] target)
+      throws IOException {
+    checkDimensions(target.length, vectorValues.dimension());
+    var segmentScorer = memorySegmentScorer(similarityType, vectorValues);
+    if (segmentScorer == null) {
+      return delegate.getRandomVectorScorer(similarityType, vectorValues, target);
+    }
+    return segmentScorer.scorer(target);
+  }
+
+  /** Returns a segment-backed scorer, or null; currently only supports binary vectors. */
+  private static MemorySegmentByteVectorScorerSupplier memorySegmentScorer(
+      VectorSimilarityFunction similarityType, RandomAccessVectorValues vectorValues)
+      throws IOException {
+    if (!(vectorValues instanceof RandomAccessVectorValues.Bytes)
+        || vectorValues.getSlice() == null) {
+      return null;
+    }
+    return MemorySegmentByteVectorScorerSupplier.create(
+            vectorValues.dimension(),
+            vectorValues.size(),
+            vectorValues.getVectorByteLength(),
+            similarityType,
+            vectorValues.getSlice(),
+            vectorValues)
+        .orElse(null);
+  }
+
+  static void checkDimensions(int queryLen, int fieldLen) {
+    if (queryLen != fieldLen) {
+      throw new IllegalArgumentException(
+          "vector query dimension: " + queryLen + " differs from field dimension: " + fieldLen);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "MemorySegmentFlatVectorsScorer()";
+  }
+}
diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 9e447612215..abfb3bf4e1e 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.internal.vectorization; +import static java.lang.foreign.ValueLayout.JAVA_BYTE; +import static java.nio.ByteOrder.LITTLE_ENDIAN; import static jdk.incubator.vector.VectorOperators.ADD; import static jdk.incubator.vector.VectorOperators.B2I; import static jdk.incubator.vector.VectorOperators.B2S; @@ -23,6 +25,7 @@ import static jdk.incubator.vector.VectorOperators.S2I; import static jdk.incubator.vector.VectorOperators.ZERO_EXTEND_B2S; +import java.lang.foreign.MemorySegment; import jdk.incubator.vector.ByteVector; import jdk.incubator.vector.FloatVector; import jdk.incubator.vector.IntVector; @@ -307,39 +310,44 @@ private float squareDistanceBody(float[] a, float[] b, int limit) { @Override public int dotProduct(byte[] a, byte[] b) { + return dotProduct(MemorySegment.ofArray(a), MemorySegment.ofArray(b)); + } + + public static
int dotProduct(MemorySegment a, MemorySegment b) { + assert a.byteSize() == b.byteSize(); int i = 0; int res = 0; // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit // vectors (256-bit on intel to dodge performance landmines) - if (a.length >= 16 && HAS_FAST_INTEGER_VECTORS) { + if (a.byteSize() >= 16 && HAS_FAST_INTEGER_VECTORS) { // compute vectorized dot product consistent with VPDPBUSD instruction if (VECTOR_BITSIZE >= 512) { - i += BYTE_SPECIES.loopBound(a.length); + i += BYTE_SPECIES.loopBound(a.byteSize()); res += dotProductBody512(a, b, i); } else if (VECTOR_BITSIZE == 256) { - i += BYTE_SPECIES.loopBound(a.length); + i += BYTE_SPECIES.loopBound(a.byteSize()); res += dotProductBody256(a, b, i); } else { // tricky: we don't have SPECIES_32, so we workaround with "overlapping read" - i += ByteVector.SPECIES_64.loopBound(a.length - ByteVector.SPECIES_64.length()); + i += ByteVector.SPECIES_64.loopBound(a.byteSize() - ByteVector.SPECIES_64.length()); res += dotProductBody128(a, b, i); } } // scalar tail - for (; i < a.length; i++) { - res += b[i] * a[i]; + for (; i < a.byteSize(); i++) { + res += b.get(JAVA_BYTE, i) * a.get(JAVA_BYTE, i); } return res; } /** vectorized dot product body (512 bit vectors) */ - private int dotProductBody512(byte[] a, byte[] b, int limit) { + private static int dotProductBody512(MemorySegment a, MemorySegment b, int limit) { IntVector acc = IntVector.zero(INT_SPECIES); for (int i = 0; i < limit; i += BYTE_SPECIES.length()) { - ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i); - ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(BYTE_SPECIES, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(BYTE_SPECIES, b, i, LITTLE_ENDIAN); // 16-bit multiply: avoid AVX-512 heavy multiply on zmm Vector va16 = va8.convertShape(B2S, SHORT_SPECIES, 0); @@ -355,11 +363,11 @@ private int dotProductBody512(byte[] a, byte[] b, 
int limit) { } /** vectorized dot product body (256 bit vectors) */ - private int dotProductBody256(byte[] a, byte[] b, int limit) { + private static int dotProductBody256(MemorySegment a, MemorySegment b, int limit) { IntVector acc = IntVector.zero(IntVector.SPECIES_256); for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) { - ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i); - ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN); // 32-bit multiply and add into accumulator Vector va32 = va8.convertShape(B2I, IntVector.SPECIES_256, 0); @@ -371,13 +379,13 @@ private int dotProductBody256(byte[] a, byte[] b, int limit) { } /** vectorized dot product body (128 bit vectors) */ - private int dotProductBody128(byte[] a, byte[] b, int limit) { + private static int dotProductBody128(MemorySegment a, MemorySegment b, int limit) { IntVector acc = IntVector.zero(IntVector.SPECIES_128); // 4 bytes at a time (re-loading half the vector each time!) 
for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length() >> 1) { // load 8 bytes - ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i); - ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN); // process first "half" only: 16-bit multiply Vector va16 = va8.convert(B2S, 0); @@ -689,35 +697,40 @@ private float[] cosineBody128(byte[] a, byte[] b, int limit) { @Override public int squareDistance(byte[] a, byte[] b) { + return squareDistance(MemorySegment.ofArray(a), MemorySegment.ofArray(b)); + } + + public static int squareDistance(MemorySegment a, MemorySegment b) { + assert a.byteSize() == b.byteSize(); int i = 0; int res = 0; // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit // vectors (256-bit on intel to dodge performance landmines) - if (a.length >= 16 && HAS_FAST_INTEGER_VECTORS) { + if (a.byteSize() >= 16 && HAS_FAST_INTEGER_VECTORS) { if (VECTOR_BITSIZE >= 256) { - i += BYTE_SPECIES.loopBound(a.length); + i += BYTE_SPECIES.loopBound((int) a.byteSize()); res += squareDistanceBody256(a, b, i); } else { - i += ByteVector.SPECIES_64.loopBound(a.length); + i += ByteVector.SPECIES_64.loopBound((int) a.byteSize()); res += squareDistanceBody128(a, b, i); } } // scalar tail - for (; i < a.length; i++) { - int diff = a[i] - b[i]; + for (; i < a.byteSize(); i++) { + int diff = a.get(JAVA_BYTE, i) - b.get(JAVA_BYTE, i); res += diff * diff; } return res; } /** vectorized square distance body (256+ bit vectors) */ - private int squareDistanceBody256(byte[] a, byte[] b, int limit) { + private static int squareDistanceBody256(MemorySegment a, MemorySegment b, int limit) { IntVector acc = IntVector.zero(INT_SPECIES); for (int i = 0; i < limit; i += BYTE_SPECIES.length()) { - ByteVector va8 = 
ByteVector.fromArray(BYTE_SPECIES, a, i); - ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(BYTE_SPECIES, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(BYTE_SPECIES, b, i, LITTLE_ENDIAN); // 32-bit sub, multiply, and add into accumulators // TODO: uses AVX-512 heavy multiply on zmm, should we just use 256-bit vectors on AVX-512? @@ -731,14 +744,14 @@ private int squareDistanceBody256(byte[] a, byte[] b, int limit) { } /** vectorized square distance body (128 bit vectors) */ - private int squareDistanceBody128(byte[] a, byte[] b, int limit) { + private static int squareDistanceBody128(MemorySegment a, MemorySegment b, int limit) { // 128-bit implementation, which must "split up" vectors due to widening conversions // it doesn't help to do the overlapping read trick, due to 32-bit multiply in the formula IntVector acc1 = IntVector.zero(IntVector.SPECIES_128); IntVector acc2 = IntVector.zero(IntVector.SPECIES_128); for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) { - ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i); - ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN); // 16-bit sub Vector va16 = va8.convertShape(B2S, ShortVector.SPECIES_128, 0); diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 7d1e2572fdb..9a6e950430e 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -24,6 +24,7 @@ import java.nio.ByteOrder; import java.util.Arrays; import java.util.Objects; +import 
org.apache.lucene.internal.vectorization.MemorySegmentAccess; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.GroupVIntUtil; @@ -34,7 +35,8 @@ * chunkSizePower). */ @SuppressWarnings("preview") -abstract class MemorySegmentIndexInput extends IndexInput implements RandomAccessInput { +abstract class MemorySegmentIndexInput extends IndexInput + implements RandomAccessInput, MemorySegmentAccess { static final ValueLayout.OfByte LAYOUT_BYTE = ValueLayout.JAVA_BYTE; static final ValueLayout.OfShort LAYOUT_LE_SHORT = ValueLayout.JAVA_SHORT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN); @@ -503,6 +505,23 @@ MemorySegmentIndexInput buildSlice(String sliceDescription, long offset, long le } } + public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { + if (pos + len > length) { + throw handlePositionalIOOBE(null, "segmentSliceOrNull", pos); + } + final int si = (int) (pos >> chunkSizePower); + final MemorySegment seg = segments[si]; + final long segOffset = pos & chunkSizeMask; + if (checkIndex(segOffset + len, seg.byteSize() + 1)) { + return seg.asSlice(segOffset, len); + } + return null; + } + + static boolean checkIndex(long index, long length) { + return index >= 0 && index < length; + } + @Override public final void close() throws IOException { if (curSegment == null) { @@ -614,6 +633,16 @@ public long readLong(long pos) throws IOException { throw alreadyClosed(e); } } + + @Override + public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { + try { + Objects.checkIndex(pos + len, this.length + 1); + return curSegment.asSlice(pos, len); + } catch (IndexOutOfBoundsException e) { + throw handlePositionalIOOBE(e, "segmentSliceOrNull", pos); + } + } } /** This class adds offset support to MemorySegmentIndexInput, which is needed for slices. 
*/ @@ -679,6 +708,10 @@ public long readLong(long pos) throws IOException { return super.readLong(pos + offset); } + public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { + return super.segmentSliceOrNull(pos + offset, len); + } + @Override MemorySegmentIndexInput buildSlice(String sliceDescription, long ofs, long length) { return super.buildSlice(sliceDescription, this.offset + ofs, length); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java index 0f84f8ab4ae..54f9bda2af8 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java @@ -16,6 +16,11 @@ */ package org.apache.lucene.codecs.lucene99; +import static java.lang.String.format; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.oneOf; + +import java.util.Locale; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; @@ -37,9 +42,11 @@ public KnnVectorsFormat knnVectorsFormat() { return new Lucene99HnswVectorsFormat(10, 20); } }; - String expectedString = - "Lucene99HnswVectorsFormat(name=Lucene99HnswVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=DefaultFlatVectorScorer()))"; - assertEquals(expectedString, customCodec.knnVectorsFormat().toString()); + String expectedPattern = + "Lucene99HnswVectorsFormat(name=Lucene99HnswVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s()))"; + var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer"); + var memSegScorer = format(Locale.ROOT, expectedPattern, "MemorySegmentFlatVectorsScorer"); + 
assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); } public void testLimits() { diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java new file mode 100644 index 00000000000..71bc5dd1498 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.internal.vectorization; + +import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; +import static org.hamcrest.Matchers.equalTo; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.junit.BeforeClass; + +// @com.carrotsearch.randomizedtesting.annotations.Repeat(iterations = 10) +public class TestVectorScorer extends LuceneTestCase { + + static final FlatVectorsScorer DEFAULT_SCORER = new DefaultFlatVectorScorer(); + static final FlatVectorsScorer MEMSEG_SCORER = FlatVectorScorerProvider.lookup(); + + @BeforeClass + public static void beforeClass() throws Exception { + assumeTrue( + "Test only works when the Memory segment scorer is present.", + MEMSEG_SCORER.getClass() != DEFAULT_SCORER.getClass()); + } + + public void testSimpleScorer() throws IOException { + testSimpleScorer(MMapDirectory.DEFAULT_MAX_CHUNK_SIZE); + } + + public void testSimpleScorerSmallChunkSize() throws IOException { + long maxChunkSize = random().nextLong(4, 16); + testSimpleScorer(maxChunkSize); + } + + public void testSimpleScorerMedChunkSize() throws IOException { + // a chunk size where in some vectors will be copied on-heap, while others remain off-heap + 
testSimpleScorer(64); + } + + void testSimpleScorer(long maxChunkSize) throws IOException { + try (Directory dir = new MMapDirectory(createTempDir(getTestName()), maxChunkSize)) { + for (int dims : List.of(31, 32, 33)) { + // System.out.println("testing with dim=" + dims); + // dimensions that, in some scenarios, cross the mmap chunk sizes + byte[][] vectors = new byte[2][dims]; + String fileName = getTestName() + "-" + dims; + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + for (int i = 0; i < dims; i++) { + vectors[0][i] = (byte) i; + vectors[1][i] = (byte) (dims - i); + } + byte[] bytes = concat(vectors[0], vectors[1]); + out.writeBytes(bytes, 0, bytes.length); + } + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + var vectorValues = vectorValues(dims, 2, in); + for (var sim : List.of(DOT_PRODUCT, EUCLIDEAN)) { + for (var ords : List.of(List.of(0, 1), List.of(1, 0))) { + int idx0 = ords.get(0); + int idx1 = ords.get(1); + + // getRandomVectorScorerSupplier + var scorer1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + float expected = scorer1.scorer(idx0).score(idx1); + var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + assertThat(scorer2.scorer(idx0).score(idx1), equalTo(expected)); + + // getRandomVectorScorer + var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); + assertThat(scorer3.score(idx1), equalTo(expected)); + var scorer4 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); + assertThat(scorer4.score(idx1), equalTo(expected)); + } + } + } + } + } + } + + public void testRandomScorer() throws IOException { + testRandomScorer(MMapDirectory.DEFAULT_MAX_CHUNK_SIZE, BYTE_ARRAY_RANDOM_FUNC); + } + + public void testRandomScorerMax() throws IOException { + testRandomScorer(MMapDirectory.DEFAULT_MAX_CHUNK_SIZE, BYTE_ARRAY_MAX_FUNC); + } + + public void testRandomScorerMin() throws IOException { + 
testRandomScorer(MMapDirectory.DEFAULT_MAX_CHUNK_SIZE, BYTE_ARRAY_MIN_FUNC); + } + + public void testRandomSmallChunkSize() throws IOException { + long maxChunkSize = randomLongBetween(32, 128); + testRandomScorer(maxChunkSize, BYTE_ARRAY_RANDOM_FUNC); + } + + void testRandomScorer(long maxChunkSize, Function byteArraySupplier) + throws IOException { + try (Directory dir = new MMapDirectory(createTempDir(getTestName()), maxChunkSize)) { + final int dims = randomIntBetween(1, 4096); + final int size = randomIntBetween(2, 100); + final byte[][] vectors = new byte[size][]; + String fileName = getTestName() + "-" + dims; + // System.out.println("Testing, maxChunkSize=" + maxChunkSize + ",fn=" + fileName); + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + for (int i = 0; i < size; i++) { + var vec = byteArraySupplier.apply(dims); + out.writeBytes(vec, 0, vec.length); + vectors[i] = vec; + } + } + + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + var vectorValues = vectorValues(dims, size, in); + for (int times = 0; times < TIMES; times++) { + for (var sim : List.of(DOT_PRODUCT, EUCLIDEAN)) { + int idx0 = randomIntBetween(0, size - 1); + int idx1 = randomIntBetween(0, size - 1); // may be the same as idx0 - which is ok. 
+ + // getRandomVectorScorerSupplier + var scorer1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + float expected = scorer1.scorer(idx0).score(idx1); + var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + assertThat(scorer2.scorer(idx0).score(idx1), equalTo(expected)); + + // getRandomVectorScorer + var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); + assertThat(scorer3.score(idx1), equalTo(expected)); + var scorer4 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); + assertThat(scorer4.score(idx1), equalTo(expected)); + } + } + } + } + } + + // TODO: add initial offset tests + + static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in) + throws IOException { + return new OffHeapByteVectorValues.DenseOffHeapVectorValues( + dims, size, in.slice("test", 0, in.length()), dims); + } + + /** Concatenates byte arrays. */ + static byte[] concat(byte[]... arrays) throws IOException { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + for (var ba : arrays) { + baos.write(ba); + } + return baos.toByteArray(); + } + } + + static int randomIntBetween(int minInclusive, int maxInclusive) { + return RandomNumbers.randomIntBetween(random(), minInclusive, maxInclusive); + } + + static long randomLongBetween(long minInclusive, long maxInclusive) { + return RandomNumbers.randomLongBetween(random(), minInclusive, maxInclusive); + } + + static Function BYTE_ARRAY_RANDOM_FUNC = + size -> { + byte[] ba = new byte[size]; + for (int i = 0; i < size; i++) { + ba[i] = (byte) random().nextInt(); + } + return ba; + }; + + static Function BYTE_ARRAY_MAX_FUNC = + size -> { + byte[] ba = new byte[size]; + Arrays.fill(ba, Byte.MAX_VALUE); + return ba; + }; + + static Function BYTE_ARRAY_MIN_FUNC = + size -> { + byte[] ba = new byte[size]; + Arrays.fill(ba, Byte.MIN_VALUE); + return ba; + }; + + static final int TIMES = 100; // a loop iteration times +} 
diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index be1526503ff..b38328e0dc2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -50,6 +50,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.codecs.asserting.AssertingCodec; import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.store.BaseDirectoryWrapper; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BitSet; @@ -77,6 +78,13 @@ abstract Field getKnnVectorField( abstract Field getKnnVectorField(String name, float[] vector); + /** + * Creates a new directory. Subclasses can override to test different directory implementations. + */ + protected BaseDirectoryWrapper newDirectoryForTest() { + return LuceneTestCase.newDirectory(random()); + } + public void testEquals() { AbstractKnnVectorQuery q1 = getKnnVectorQuery("f1", new float[] {0, 1}, 10); Query filter1 = new TermQuery(new Term("id", "id1")); @@ -308,7 +316,7 @@ public void testScoreEuclidean() throws IOException { } public void testScoreCosine() throws IOException { - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { for (int j = 1; j <= 5; j++) { Document doc = new Document(); @@ -383,7 +391,7 @@ public void testScoreMIP() throws IOException { } public void testExplain() throws IOException { - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { for (int j = 0; j < 5; j++) { Document doc = new Document(); @@ -410,7 +418,7 @@ public void testExplain() throws IOException { 
} public void testExplainMultipleSegments() throws IOException { - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { for (int j = 0; j < 5; j++) { Document doc = new Document(); @@ -443,7 +451,7 @@ public void testSkewedIndex() throws IOException { * number of top K documents, but no more than K documents in total (otherwise we might occasionally * randomly fail to find one). */ - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { int r = 0; for (int i = 0; i < 5; i++) { @@ -479,7 +487,7 @@ public void testRandom() throws IOException { int dimension = atLeast(5); int numIters = atLeast(10); boolean everyDocHasAVector = random().nextBoolean(); - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { RandomIndexWriter w = new RandomIndexWriter(random(), d); for (int i = 0; i < numDocs; i++) { Document doc = new Document(); @@ -518,7 +526,7 @@ public void testRandomWithFilter() throws IOException { int numDocs = 1000; int dimension = atLeast(5); int numIters = atLeast(10); - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { // Always use the default kNN format to have predictable behavior around when it hits // visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN // format @@ -604,7 +612,7 @@ public void testRandomWithFilter() throws IOException { public void testFilterWithSameScore() throws IOException { int numDocs = 100; int dimension = atLeast(5); - try (Directory d = newDirectory()) { + try (Directory d = newDirectoryForTest()) { // Always use the default kNN format to have predictable behavior around when it hits // visitedLimit. 
This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN // format @@ -644,7 +652,7 @@ public void testFilterWithSameScore() throws IOException { } public void testDeletes() throws IOException { - try (Directory dir = newDirectory(); + try (Directory dir = newDirectoryForTest(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { final int numDocs = atLeast(100); final int dim = 30; @@ -688,7 +696,7 @@ public void testDeletes() throws IOException { } public void testAllDeletes() throws IOException { - try (Directory dir = newDirectory(); + try (Directory dir = newDirectoryForTest(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { final int numDocs = atLeast(100); final int dim = 30; @@ -717,7 +725,7 @@ public void testAllDeletes() throws IOException { */ public void testNoLiveDocsReader() throws IOException { IndexWriterConfig iwc = newIndexWriterConfig(); - try (Directory dir = newDirectory(); + try (Directory dir = newDirectoryForTest(); IndexWriter w = new IndexWriter(dir, iwc)) { final int numDocs = 10; final int dim = 30; @@ -745,7 +753,7 @@ public void testNoLiveDocsReader() throws IOException { */ public void testBitSetQuery() throws IOException { IndexWriterConfig iwc = newIndexWriterConfig(); - try (Directory dir = newDirectory(); + try (Directory dir = newDirectoryForTest(); IndexWriter w = new IndexWriter(dir, iwc)) { final int numDocs = 100; final int dim = 30; @@ -853,7 +861,7 @@ Directory getIndexStore(String field, float[]... contents) throws IOException { Directory getIndexStore( String field, VectorSimilarityFunction vectorSimilarityFunction, float[]... 
contents) throws IOException { - Directory indexStore = newDirectory(); + Directory indexStore = newDirectoryForTest(); RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore); for (int i = 0; i < contents.length; ++i) { Document doc = new Document(); @@ -886,7 +894,7 @@ Directory getIndexStore( * preserving the order of the added documents. */ private Directory getStableIndexStore(String field, float[]... contents) throws IOException { - Directory indexStore = newDirectory(); + Directory indexStore = newDirectoryForTest(); try (IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig())) { for (int i = 0; i < contents.length; ++i) { Document doc = new Document(); @@ -1030,7 +1038,7 @@ public int hashCode() { } public void testSameFieldDifferentFormats() throws IOException { - try (Directory directory = newDirectory()) { + try (Directory directory = newDirectoryForTest()) { MockAnalyzer mockAnalyzer = new MockAnalyzer(random()); IndexWriterConfig iwc = newIndexWriterConfig(mockAnalyzer); KnnVectorsFormat format1 = randomVectorFormat(VectorEncoding.FLOAT32); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java new file mode 100644 index 00000000000..812ac1b312a --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.store.BaseDirectoryWrapper; +import org.apache.lucene.tests.store.MockDirectoryWrapper; + +@com.carrotsearch.randomizedtesting.annotations.Repeat(iterations = 100) +public class TestKnnByteVectorQueryMMap extends TestKnnByteVectorQuery { + + @Override + protected BaseDirectoryWrapper newDirectoryForTest() { + try { + return new MockDirectoryWrapper( + random(), new MMapDirectory(createTempDir("TestKnnByteVectorQueryMMap"))); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} From 8313c88cc59532f2e88d9d29694c018ff9627a92 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 12:22:13 +0100 Subject: [PATCH 02/37] refactoring --- .../benchmark/jmh/VectorScorerBenchmark.java | 2 +- lucene/core/src/java/module-info.java | 2 -- .../hnsw}/FlatVectorScorerProvider.java | 21 ++++++++++++------- .../lucene99/Lucene99HnswVectorsFormat.java | 2 +- .../vectorization/VectorizationProvider.java | 2 +- ...MemorySegmentByteVectorScorerSupplier.java | 1 + .../MemorySegmentAccess.java | 8 +++++-- .../lucene/store/MemorySegmentIndexInput.java | 1 - .../vectorization/TestVectorScorer.java | 1 + 9 files changed, 25 insertions(+), 15 deletions(-) rename lucene/core/src/java/org/apache/lucene/{internal/vectorization => codecs/hnsw}/FlatVectorScorerProvider.java (81%) rename lucene/core/src/java21/org/apache/lucene/{internal/vectorization => 
store}/MemorySegmentAccess.java (87%) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 5db830efe97..28c4d70b0ac 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -22,8 +22,8 @@ import java.nio.file.Files; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; -import org.apache.lucene.internal.vectorization.FlatVectorScorerProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index a8ebf98ad2b..94ff818c499 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -64,8 +64,6 @@ exports org.apache.lucene.util.quantization; exports org.apache.lucene.codecs.hnsw; - exports org.apache.lucene.internal.vectorization to - org.apache.lucene.benchmark.jmh; provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java similarity index 81% rename from lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java rename to lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java index 0b2d1732ad0..6cc3ef0d7d3 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/FlatVectorScorerProvider.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java @@ -15,17 +15,20 @@ * limitations under the License. */ -package org.apache.lucene.internal.vectorization; +package org.apache.lucene.codecs.hnsw; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; -import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; -import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.internal.vectorization.VectorizationProvider; -/** A provider of FlatVectorsScorer. */ +/** + * A utility class that provides access to the default FlatVectorsScorer. + * + * @lucene.experimental + */ public class FlatVectorScorerProvider { - /** Returns the default FlatVectorsScorer. TODO: find a better name than default. */ + /** Returns the default FlatVectorsScorer. */ public static FlatVectorsScorer createDefault() { if (isPanamaVectorUtilSupportEnabled()) { // we only enable this scorer if the Panama vector provider is also enabled @@ -49,8 +52,12 @@ public static FlatVectorsScorer lookup() { private static boolean isPanamaVectorUtilSupportEnabled() { var name = VectorizationProvider.getInstance().getClass().getSimpleName(); - assert name.equals("PanamaVectorizationProvider") - || name.equals("DefaultVectorizationProvider"); + assert assertExpectedProvider(name); return name.equals("PanamaVectorizationProvider"); } + + static boolean assertExpectedProvider(String name) { + return name.equals("PanamaVectorizationProvider") + || name.equals("DefaultVectorizationProvider"); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 9dfe13e00fc..4eac258462a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -22,13 +22,13 @@ import 
org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.internal.vectorization.FlatVectorScorerProvider; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index 5affb3a9e98..850b0a0677b 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -180,7 +180,7 @@ private static Optional lookupVectorModule() { private static final Set VALID_CALLERS = Set.of( "org.apache.lucene.util.VectorUtil", - "org.apache.lucene.internal.vectorization.FlatVectorScorerProvider"); + "org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider"); private static void ensureCaller() { final boolean validCaller = diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index 4dcb922bcbe..f9b924dbde3 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -22,6 
+22,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.MemorySegmentAccess; import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccess.java similarity index 87% rename from lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java rename to lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccess.java index 2afc607f3e3..d200014e664 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentAccess.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccess.java @@ -14,12 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.internal.vectorization; +package org.apache.lucene.store; import java.io.IOException; import java.lang.foreign.MemorySegment; -/** Provides access to the backing memory segment. */ +/** + * Provides access to the backing memory segment. + * + *

Expert API, allows access to the backing memory. + */ public interface MemorySegmentAccess { /** Returns the memory segment for a given position and length, or null. */ diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 9a6e950430e..20c3d2c737c 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -24,7 +24,6 @@ import java.nio.ByteOrder; import java.util.Arrays; import java.util.Objects; -import org.apache.lucene.internal.vectorization.MemorySegmentAccess; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.GroupVIntUtil; diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index 71bc5dd1498..9d4ce4e15c8 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.function.Function; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; import org.apache.lucene.store.Directory; From 8c6ab61d9a2ee7603861074b4268be12a1d77a4b Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 12:23:17 +0100 Subject: [PATCH 03/37] restore --- .../org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java 
b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 28c4d70b0ac..4b57d9d424c 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -43,7 +43,7 @@ @Measurement(iterations = 5, time = 1) // engage some noise reduction @Fork( - value = 1, // TODO restore to 3, + value = 3, jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"}) public class VectorScorerBenchmark { From 89aa9a258ea78db7db032447ec1564f92e4d3957 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 12:31:32 +0100 Subject: [PATCH 04/37] renames and cleanup --- lucene/CHANGES.txt | 2 ++ ...duct.java => DotProductByteVectorScorerSupplier.java} | 9 +++++---- ...idean.java => EuclideanByteVectorScorerSupplier.java} | 9 +++++---- .../MemorySegmentByteVectorScorerSupplier.java | 9 ++++++--- .../apache/lucene/search/TestKnnByteVectorQueryMMap.java | 1 - 5 files changed, 18 insertions(+), 12 deletions(-) rename lucene/core/src/java21/org/apache/lucene/internal/vectorization/{DotProduct.java => DotProductByteVectorScorerSupplier.java} (82%) rename lucene/core/src/java21/org/apache/lucene/internal/vectorization/{Euclidean.java => EuclideanByteVectorScorerSupplier.java} (81%) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1557689c1e3..7161f568baf 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -328,6 +328,8 @@ Optimizations * GITHUB#13321: Improve compressed int4 quantized vector search by utilizing SIMD inline with the decompression process. 
(Ben Trent) +* GITHUB#13339: Add a MemorySegment Vector scorer - for scoring without copying on-heap (Chris Hegarty) + Bug Fixes --------------------- diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java similarity index 82% rename from lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java rename to lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java index a9ed482b763..ffa22a9e9a2 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProduct.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java @@ -20,9 +20,9 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; -final class DotProduct extends MemorySegmentByteVectorScorerSupplier { +final class DotProductByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { - DotProduct( + DotProductByteVectorScorerSupplier( int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) { super(dims, maxOrd, vectorByteSize, input, values); } @@ -35,7 +35,8 @@ public float score(int node) throws IOException { } @Override - public DotProduct copy() throws IOException { - return new DotProduct(dims, maxOrd, vectorByteSize, input.clone(), values); + public DotProductByteVectorScorerSupplier copy() throws IOException { + return new DotProductByteVectorScorerSupplier( + dims, maxOrd, vectorByteSize, input.clone(), values); } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java similarity index 81% rename from lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java 
rename to lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java index 9c0e73e90f8..9c840553881 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Euclidean.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java @@ -20,9 +20,9 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; -final class Euclidean extends MemorySegmentByteVectorScorerSupplier { +final class EuclideanByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { - Euclidean( + EuclideanByteVectorScorerSupplier( int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) { super(dims, maxOrd, vectorByteSize, input, values); } @@ -34,7 +34,8 @@ public float score(int node) throws IOException { } @Override - public Euclidean copy() throws IOException { - return new Euclidean(dims, maxOrd, vectorByteSize, input.clone(), values); + public EuclideanByteVectorScorerSupplier copy() throws IOException { + return new EuclideanByteVectorScorerSupplier( + dims, maxOrd, vectorByteSize, input.clone(), values); } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index f9b924dbde3..dad08ada254 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -30,7 +30,8 @@ /** A scorer of vectors whose element size is byte. 
*/ public abstract sealed class MemorySegmentByteVectorScorerSupplier - implements RandomVectorScorerSupplier, RandomVectorScorer permits DotProduct, Euclidean { + implements RandomVectorScorerSupplier, RandomVectorScorer + permits DotProductByteVectorScorerSupplier, EuclideanByteVectorScorerSupplier { final int vectorByteSize; final int dims; final int maxOrd; @@ -59,8 +60,10 @@ public static Optional create( } checkInvariants(maxOrd, vectorByteSize, input); return switch (type) { - case DOT_PRODUCT -> Optional.of(new DotProduct(dims, maxOrd, vectorByteSize, input, values)); - case EUCLIDEAN -> Optional.of(new Euclidean(dims, maxOrd, vectorByteSize, input, values)); + case DOT_PRODUCT -> Optional.of( + new DotProductByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, input, values)); + case EUCLIDEAN -> Optional.of( + new EuclideanByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, input, values)); case MAXIMUM_INNER_PRODUCT -> Optional.empty(); // TODO: implement MAXIMUM_INNER_PRODUCT case COSINE -> Optional.empty(); // TODO: implement Cosine }; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java index 812ac1b312a..3010749da72 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java @@ -22,7 +22,6 @@ import org.apache.lucene.tests.store.BaseDirectoryWrapper; import org.apache.lucene.tests.store.MockDirectoryWrapper; -@com.carrotsearch.randomizedtesting.annotations.Repeat(iterations = 100) public class TestKnnByteVectorQueryMMap extends TestKnnByteVectorQuery { @Override From ede3dfe62af81a606500b2ebeec87db2aeaba9c4 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 15:49:56 +0100 Subject: [PATCH 05/37] move creation to VectorizationProvider - much nicer! 
--- .../codecs/hnsw/FlatVectorScorerProvider.java | 63 ------------------- .../lucene99/Lucene99HnswVectorsFormat.java | 4 +- .../DefaultVectorizationProvider.java | 8 +++ .../vectorization/VectorizationProvider.java | 12 ++-- .../PanamaVectorizationProvider.java | 7 +++ .../vectorization/TestVectorScorer.java | 4 +- 6 files changed, 24 insertions(+), 74 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java deleted file mode 100644 index 6cc3ef0d7d3..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerProvider.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.codecs.hnsw; - -import java.lang.invoke.MethodHandles; -import java.lang.invoke.MethodType; -import org.apache.lucene.internal.vectorization.VectorizationProvider; - -/** - * A utility class that provides access to the default FlatVectorsScorer. 
- * - * @lucene.experimental - */ -public class FlatVectorScorerProvider { - - /** Returns the default FlatVectorsScorer. */ - public static FlatVectorsScorer createDefault() { - if (isPanamaVectorUtilSupportEnabled()) { - // we only enable this scorer if the Panama vector provider is also enabled - return lookup(); - } - return new DefaultFlatVectorScorer(); - } - - public static FlatVectorsScorer lookup() { - try { - var cls = - Class.forName("org.apache.lucene.internal.vectorization.MemorySegmentFlatVectorsScorer"); - var lookup = MethodHandles.lookup(); - var mh = - lookup.findConstructor(cls, MethodType.methodType(void.class, FlatVectorsScorer.class)); - return (FlatVectorsScorer) mh.invoke(new DefaultFlatVectorScorer()); - } catch (Throwable t) { - throw new RuntimeException(t); - } - } - - private static boolean isPanamaVectorUtilSupportEnabled() { - var name = VectorizationProvider.getInstance().getClass().getSimpleName(); - assert assertExpectedProvider(name); - return name.equals("PanamaVectorizationProvider"); - } - - static boolean assertExpectedProvider(String name) { - return name.equals("PanamaVectorizationProvider") - || name.equals("DefaultVectorizationProvider"); - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 4eac258462a..ac3474a0346 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -22,13 +22,13 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; import org.apache.lucene.codecs.lucene90.IndexedDISI; import 
org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; @@ -139,7 +139,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { /** The format for storing, reading, merging vectors on disk */ private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(FlatVectorScorerProvider.createDefault()); + new Lucene99FlatVectorsFormat(VectorizationProvider.getInstance().newFlatVectorScorer()); private final int numMergeWorkers; private final TaskExecutor mergeExec; diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java index f3d9aa95fd3..f64ec931026 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java @@ -17,6 +17,9 @@ package org.apache.lucene.internal.vectorization; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; + /** Default provider returning scalar implementations. 
*/ final class DefaultVectorizationProvider extends VectorizationProvider { @@ -30,4 +33,9 @@ final class DefaultVectorizationProvider extends VectorizationProvider { public VectorUtilSupport getVectorUtilSupport() { return vectorUtilSupport; } + + @Override + public FlatVectorsScorer newFlatVectorScorer() { + return new DefaultFlatVectorScorer(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index 850b0a0677b..f1c9cb79e7a 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -27,6 +27,7 @@ import java.util.function.Predicate; import java.util.logging.Logger; import java.util.stream.Stream; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.util.Constants; import org.apache.lucene.util.VectorUtil; @@ -91,6 +92,8 @@ public static VectorizationProvider getInstance() { */ public abstract VectorUtilSupport getVectorUtilSupport(); + public abstract FlatVectorsScorer newFlatVectorScorer(); + // *** Lookup mechanism: *** private static final Logger LOG = Logger.getLogger(VectorizationProvider.class.getName()); @@ -180,7 +183,7 @@ private static Optional lookupVectorModule() { private static final Set VALID_CALLERS = Set.of( "org.apache.lucene.util.VectorUtil", - "org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider"); + "org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat"); private static void ensureCaller() { final boolean validCaller = @@ -201,11 +204,6 @@ private static void ensureCaller() { private static final class Holder { private Holder() {} - // TODO: this is not quite right. 
But we should be able to run tests with Panama Vector - static boolean testMode() { - return TESTS_VECTOR_SIZE.isPresent() || TESTS_FORCE_INTEGER_VECTORS; - } - - static final VectorizationProvider INSTANCE = lookup(testMode()); + static final VectorizationProvider INSTANCE = lookup(false); } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index 11901d74f42..516a3ea9f69 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -21,6 +21,8 @@ import java.util.Locale; import java.util.logging.Logger; import jdk.incubator.vector.FloatVector; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.util.Constants; import org.apache.lucene.util.SuppressForbidden; @@ -73,4 +75,9 @@ private static T doPrivileged(PrivilegedAction action) { public VectorUtilSupport getVectorUtilSupport() { return vectorUtilSupport; } + + @Override + public FlatVectorsScorer newFlatVectorScorer() { + return new MemorySegmentFlatVectorsScorer(new DefaultFlatVectorScorer()); + } } diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index 9d4ce4e15c8..b6997e028cb 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -27,7 +27,6 @@ import java.util.List; import java.util.function.Function; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; -import org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider; import 
org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; import org.apache.lucene.store.Directory; @@ -43,7 +42,8 @@ public class TestVectorScorer extends LuceneTestCase { static final FlatVectorsScorer DEFAULT_SCORER = new DefaultFlatVectorScorer(); - static final FlatVectorsScorer MEMSEG_SCORER = FlatVectorScorerProvider.lookup(); + static final FlatVectorsScorer MEMSEG_SCORER = + VectorizationProvider.lookup(true).newFlatVectorScorer(); @BeforeClass public static void beforeClass() throws Exception { From 2f6a9e2cc4a6764c3fe3f3ca89372b1449eae13e Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 16:28:01 +0100 Subject: [PATCH 06/37] fix benchmark --- .../lucene/benchmark/jmh/VectorScorerBenchmark.java | 7 ++++--- lucene/core/src/java/module-info.java | 2 ++ .../apache/lucene/distribution/TestModularLayer.java | 12 +++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 4b57d9d424c..8887783bdec 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -22,8 +22,8 @@ import java.nio.file.Files; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; -import org.apache.lucene.codecs.hnsw.FlatVectorScorerProvider; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -71,11 +71,12 @@ public void init() throws IOException { in = dir.openInput("vector.data", IOContext.DEFAULT); vectorValues = vectorValues(size, 2, 
in); scorer = - FlatVectorScorerProvider.createDefault() + VectorizationProvider.getInstance() + .newFlatVectorScorer() .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues); // Ensure we're using the right vector scorer - var name = FlatVectorScorerProvider.createDefault().getClass().getSimpleName(); + var name = VectorizationProvider.getInstance().newFlatVectorScorer().getClass().getSimpleName(); if (Object.class.getModule().getLayer().findModule("jdk.incubator.vector").isPresent()) { if (!name.equals("MemorySegmentFlatVectorsScorer")) { throw new AssertionError("expected MemorySegmentFlatVectorsScorer, got:" + name); diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index 94ff818c499..a8ebf98ad2b 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -64,6 +64,8 @@ exports org.apache.lucene.util.quantization; exports org.apache.lucene.codecs.hnsw; + exports org.apache.lucene.internal.vectorization to + org.apache.lucene.benchmark.jmh; provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; diff --git a/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java b/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java index 84d5df2256a..8384a93cee7 100644 --- a/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java +++ b/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.distribution; +import static org.hamcrest.Matchers.is; + import java.io.IOException; import java.io.InputStream; import java.lang.module.Configuration; @@ -355,7 +357,15 @@ public void testAllExportedPackagesInSync() throws IOException { boolean isInternal = export.source().startsWith("org.apache.lucene.internal"); if (isInternal) { Assertions.assertThat(export.targets()) 
- .containsExactlyInAnyOrder("org.apache.lucene.test_framework"); + .as("We only support qualified exports of internal packages") + .isNotEmpty(); + var allowable = + List.of("org.apache.lucene.test_framework", "org.apache.lucene.benchmark.jmh"); + for (String target : export.targets()) { + Assertions.assertThat(allowable.contains(target)) + .as("Qualified export to unexpected package: " + target) + .isEqualTo(true); + } } return isInternal; }); From c6ef6ea5b234203e59b4bb06ce84d6d0f3dc2471 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 16:41:38 +0100 Subject: [PATCH 07/37] unused import --- .../test/org/apache/lucene/distribution/TestModularLayer.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java b/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java index 8384a93cee7..0d6d1ee8558 100644 --- a/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java +++ b/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.distribution; -import static org.hamcrest.Matchers.is; - import java.io.IOException; import java.io.InputStream; import java.lang.module.Configuration; From 7a1faa1ed5eccd602c0bcccfe596d481c939d4f7 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 17:10:16 +0100 Subject: [PATCH 08/37] MemorySegmentAccessInput refactor --- .../DotProductByteVectorScorerSupplier.java | 8 +++++-- .../EuclideanByteVectorScorerSupplier.java | 8 +++++-- ...MemorySegmentByteVectorScorerSupplier.java | 23 ++++++++++--------- ...ess.java => MemorySegmentAccessInput.java} | 4 +++- .../lucene/store/MemorySegmentIndexInput.java | 2 +- 5 files changed, 28 insertions(+), 17 deletions(-) rename lucene/core/src/java21/org/apache/lucene/store/{MemorySegmentAccess.java => MemorySegmentAccessInput.java} (90%) diff --git 
a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java index ffa22a9e9a2..8b3f423f65b 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java @@ -17,13 +17,17 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; -import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.MemorySegmentAccessInput; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; final class DotProductByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { DotProductByteVectorScorerSupplier( - int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) { + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { super(dims, maxOrd, vectorByteSize, input, values); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java index 9c840553881..3570d13a1d6 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java @@ -17,13 +17,17 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; -import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.MemorySegmentAccessInput; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; final class EuclideanByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { EuclideanByteVectorScorerSupplier( 
- int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) { + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { super(dims, maxOrd, vectorByteSize, input, values); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index dad08ada254..59e5f35acce 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -22,7 +22,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.MemorySegmentAccess; +import org.apache.lucene.store.MemorySegmentAccessInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; @@ -35,8 +35,7 @@ public abstract sealed class MemorySegmentByteVectorScorerSupplier final int vectorByteSize; final int dims; final int maxOrd; - final IndexInput input; - final MemorySegmentAccess memorySegmentAccess; + final MemorySegmentAccessInput input; final RandomAccessVectorValues values; // to support ordToDoc/getAcceptOrds final byte[] scratch1, scratch2; @@ -55,27 +54,30 @@ public static Optional create( IndexInput input, RandomAccessVectorValues values) { input = FilterIndexInput.unwrap(input); - if (!(input instanceof MemorySegmentAccess)) { + if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); } checkInvariants(maxOrd, vectorByteSize, input); return switch (type) { case DOT_PRODUCT -> Optional.of( - new DotProductByteVectorScorerSupplier(dims, 
maxOrd, vectorByteSize, input, values)); + new DotProductByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); case EUCLIDEAN -> Optional.of( - new EuclideanByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, input, values)); + new EuclideanByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); case MAXIMUM_INNER_PRODUCT -> Optional.empty(); // TODO: implement MAXIMUM_INNER_PRODUCT case COSINE -> Optional.empty(); // TODO: implement Cosine }; } MemorySegmentByteVectorScorerSupplier( - int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) { + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { this.vectorByteSize = vectorByteSize; this.dims = dims; this.maxOrd = maxOrd; this.input = input; - this.memorySegmentAccess = (MemorySegmentAccess) input; this.values = values; scratch1 = new byte[vectorByteSize]; scratch2 = new byte[vectorByteSize]; @@ -96,10 +98,9 @@ final void checkOrdinal(int ord, int maxOrd) { protected final MemorySegment getSegment(int ord, byte[] scratch) throws IOException { checkOrdinal(ord, maxOrd); int byteOffset = ord * vectorByteSize; // TODO: random + meta size - MemorySegment seg = memorySegmentAccess.segmentSliceOrNull(byteOffset, vectorByteSize); + MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); if (seg == null) { - input.seek(byteOffset); - input.readBytes(scratch, 0, vectorByteSize); + input.readBytes(byteOffset, scratch, 0, vectorByteSize); seg = MemorySegment.ofArray(scratch); } return seg; diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccess.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java similarity index 90% rename from lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccess.java rename to lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java index 
d200014e664..7c22eccdcf1 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccess.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentAccessInput.java @@ -24,8 +24,10 @@ * * <p>
Expert API, allows access to the backing memory. */ -public interface MemorySegmentAccess { +public interface MemorySegmentAccessInput extends RandomAccessInput, Cloneable { /** Returns the memory segment for a given position and length, or null. */ MemorySegment segmentSliceOrNull(long pos, int len) throws IOException; + + MemorySegmentAccessInput clone(); } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 20c3d2c737c..5f7052a1f55 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -35,7 +35,7 @@ */ @SuppressWarnings("preview") abstract class MemorySegmentIndexInput extends IndexInput - implements RandomAccessInput, MemorySegmentAccess { + implements RandomAccessInput, MemorySegmentAccessInput { static final ValueLayout.OfByte LAYOUT_BYTE = ValueLayout.JAVA_BYTE; static final ValueLayout.OfShort LAYOUT_LE_SHORT = ValueLayout.JAVA_SHORT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN); From d8c76d8c1c14933850d8998b19198ff1c05afa30 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 3 May 2024 17:15:05 +0100 Subject: [PATCH 09/37] fix benchmark again --- .../lucene/internal/vectorization/VectorizationProvider.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index f1c9cb79e7a..bba66da240d 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -182,8 +182,9 @@ private static Optional lookupVectorModule() { // add all possible callers here as FQCN: private static final Set 
VALID_CALLERS = Set.of( - "org.apache.lucene.util.VectorUtil", - "org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat"); + "org.apache.lucene.benchmark.jmh.VectorScorerBenchmark", + "org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat", + "org.apache.lucene.util.VectorUtil"); private static void ensureCaller() { final boolean validCaller = From 3b2bc6325be000304696fbd7172744d4964670d4 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 4 May 2024 09:32:12 +0100 Subject: [PATCH 10/37] remove scorer name checking from benchmark --- .../lucene/benchmark/jmh/VectorScorerBenchmark.java | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 8887783bdec..696e44aeef7 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -74,18 +74,6 @@ public void init() throws IOException { VectorizationProvider.getInstance() .newFlatVectorScorer() .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues); - - // Ensure we're using the right vector scorer - var name = VectorizationProvider.getInstance().newFlatVectorScorer().getClass().getSimpleName(); - if (Object.class.getModule().getLayer().findModule("jdk.incubator.vector").isPresent()) { - if (!name.equals("MemorySegmentFlatVectorsScorer")) { - throw new AssertionError("expected MemorySegmentFlatVectorsScorer, got:" + name); - } - } else { - if (!name.equals("DefaultFlatVectorScorer")) { - throw new AssertionError("expected DefaultFlatVectorScorer, got:" + name); - } - } } @TearDown From 8b14344efc82b6157075446d2a72166524333a45 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 4 May 2024 10:00:43 +0100 Subject: [PATCH 11/37] add level of indirection to avoid directly using 
VectorizationProvider --- .../benchmark/jmh/VectorScorerBenchmark.java | 5 +-- lucene/core/src/java/module-info.java | 2 - .../codecs/hnsw/FlatVectorScorerUtil.java | 40 +++++++++++++++++++ .../lucene99/Lucene99HnswVectorsFormat.java | 4 +- .../vectorization/VectorizationProvider.java | 3 +- .../lucene/distribution/TestModularLayer.java | 10 +---- 6 files changed, 46 insertions(+), 18 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 696e44aeef7..08f245852e9 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -22,8 +22,8 @@ import java.nio.file.Files; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; -import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -71,8 +71,7 @@ public void init() throws IOException { in = dir.openInput("vector.data", IOContext.DEFAULT); vectorValues = vectorValues(size, 2, in); scorer = - VectorizationProvider.getInstance() - .newFlatVectorScorer() + FlatVectorScorerUtil.newFlatVectorScorer() .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues); } diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index a8ebf98ad2b..94ff818c499 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -64,8 +64,6 @@ exports org.apache.lucene.util.quantization; exports 
org.apache.lucene.codecs.hnsw; - exports org.apache.lucene.internal.vectorization to - org.apache.lucene.benchmark.jmh; provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java new file mode 100644 index 00000000000..5a1100b1119 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.hnsw; + +import org.apache.lucene.internal.vectorization.VectorizationProvider; + +/** + * Utilities for {@link FlatVectorsScorer}. + * + * @lucene.experimental + */ +public final class FlatVectorScorerUtil { + + private static final VectorizationProvider IMPL = VectorizationProvider.getInstance(); + + private FlatVectorScorerUtil() {} + + /** + * Creates a new flat vector scorer. Scorers created through this factory method may be optimized + * on certain platforms. Otherwise, a DefaultFlatVectorScorer is returned. 
+ */ + public static FlatVectorsScorer newFlatVectorScorer() { + return IMPL.newFlatVectorScorer(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index ac3474a0346..43676f9fc66 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -22,13 +22,13 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; @@ -139,7 +139,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { /** The format for storing, reading, merging vectors on disk */ private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(VectorizationProvider.getInstance().newFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.newFlatVectorScorer()); private final int numMergeWorkers; private final TaskExecutor mergeExec; diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index bba66da240d..8b046e942b7 100644 --- 
a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -182,8 +182,7 @@ private static Optional lookupVectorModule() { // add all possible callers here as FQCN: private static final Set VALID_CALLERS = Set.of( - "org.apache.lucene.benchmark.jmh.VectorScorerBenchmark", - "org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat", + "org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil", "org.apache.lucene.util.VectorUtil"); private static void ensureCaller() { diff --git a/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java b/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java index 0d6d1ee8558..84d5df2256a 100644 --- a/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java +++ b/lucene/distribution.tests/src/test/org/apache/lucene/distribution/TestModularLayer.java @@ -355,15 +355,7 @@ public void testAllExportedPackagesInSync() throws IOException { boolean isInternal = export.source().startsWith("org.apache.lucene.internal"); if (isInternal) { Assertions.assertThat(export.targets()) - .as("We only support qualified exports of internal packages") - .isNotEmpty(); - var allowable = - List.of("org.apache.lucene.test_framework", "org.apache.lucene.benchmark.jmh"); - for (String target : export.targets()) { - Assertions.assertThat(allowable.contains(target)) - .as("Qualified export to unexpected package: " + target) - .isEqualTo(true); - } + .containsExactlyInAnyOrder("org.apache.lucene.test_framework"); } return isInternal; }); From 653bd2643ac98d975c16716947902bca3d2ad168 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 4 May 2024 21:44:48 +0100 Subject: [PATCH 12/37] unwrap only test filter index inputs --- .../org/apache/lucene/store/FilterIndexInput.java | 14 ++++++++++++++ .../MemorySegmentByteVectorScorerSupplier.java | 
2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java index 5b4a5c506ee..6f452e0b3f5 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java @@ -40,6 +40,20 @@ public static IndexInput unwrap(IndexInput in) { return in; } + /** + * Unwraps all test FilterIndexInputs until the first non-test FilterIndexInput IndexInput + * instance and returns it + */ + public static IndexInput unwrapOnlyTest(IndexInput in) { + while (in instanceof FilterIndexInput + && in.getClass().getName().equals("org.apache.lucene.tests.store.MockIndexInputWrapper")) { + in = ((FilterIndexInput) in).in; + } + return in; + } + + // org.apache.lucene.tests.store.MockIndexInputWrapper" + protected final IndexInput in; /** Creates a FilterIndexInput with a resource description and wrapped delegate IndexInput */ diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index 59e5f35acce..e300596b3a7 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -53,7 +53,7 @@ public static Optional create( VectorSimilarityFunction type, IndexInput input, RandomAccessVectorValues values) { - input = FilterIndexInput.unwrap(input); + input = FilterIndexInput.unwrapOnlyTest(input); if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); } From fa6db6856269b6675d673f22f071a589f0750eee Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 6 May 2024 09:33:12 +0100 Subject: [PATCH 13/37] add 
FilterIndexInputAccess to register test filter classes --- .../tests/FilterIndexInputAccess.java | 31 +++++++++++++++++++ .../lucene/internal/tests/TestSecrets.java | 15 +++++++++ .../apache/lucene/store/FilterIndexInput.java | 11 +++++-- .../tests/store/MockIndexInputWrapper.java | 6 ++++ .../SlowClosingMockIndexInputWrapper.java | 6 ++++ .../SlowOpeningMockIndexInputWrapper.java | 6 ++++ 6 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/internal/tests/FilterIndexInputAccess.java diff --git a/lucene/core/src/java/org/apache/lucene/internal/tests/FilterIndexInputAccess.java b/lucene/core/src/java/org/apache/lucene/internal/tests/FilterIndexInputAccess.java new file mode 100644 index 00000000000..eee40b43610 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/internal/tests/FilterIndexInputAccess.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.internal.tests; + +import org.apache.lucene.store.FilterIndexInput; + +/** + * Access to {@link org.apache.lucene.store.FilterIndexInput} internals exposed to the test + * framework. 
+ * + * @lucene.internal + */ +public interface FilterIndexInputAccess { + /** Adds the given test FilterIndexInput class. */ + void addTestFilterType(Class cls); +} diff --git a/lucene/core/src/java/org/apache/lucene/internal/tests/TestSecrets.java b/lucene/core/src/java/org/apache/lucene/internal/tests/TestSecrets.java index e2d74fc6ae6..cfcf2008c3e 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/tests/TestSecrets.java +++ b/lucene/core/src/java/org/apache/lucene/internal/tests/TestSecrets.java @@ -23,6 +23,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.SegmentReader; +import org.apache.lucene.store.FilterIndexInput; /** * A set of static methods returning accessors for internal, package-private functionality in @@ -48,12 +49,14 @@ public final class TestSecrets { ensureInitialized.accept(ConcurrentMergeScheduler.class); ensureInitialized.accept(SegmentReader.class); ensureInitialized.accept(IndexWriter.class); + ensureInitialized.accept(FilterIndexInput.class); } private static IndexPackageAccess indexPackageAccess; private static ConcurrentMergeSchedulerAccess cmsAccess; private static SegmentReaderAccess segmentReaderAccess; private static IndexWriterAccess indexWriterAccess; + private static FilterIndexInputAccess filterIndexInputAccess; private TestSecrets() {} @@ -81,6 +84,12 @@ public static IndexWriterAccess getIndexWriterAccess() { return Objects.requireNonNull(indexWriterAccess); } + /** Return the accessor to internal secrets for an {@link FilterIndexInput}. */ + public static FilterIndexInputAccess getFilterInputIndexAccess() { + ensureCaller(); + return Objects.requireNonNull(filterIndexInputAccess); + } + /** For internal initialization only. 
*/ public static void setIndexWriterAccess(IndexWriterAccess indexWriterAccess) { ensureNull(TestSecrets.indexWriterAccess); @@ -105,6 +114,12 @@ public static void setSegmentReaderAccess(SegmentReaderAccess segmentReaderAcces TestSecrets.segmentReaderAccess = segmentReaderAccess; } + /** For internal initialization only. */ + public static void setFilterInputIndexAccess(FilterIndexInputAccess filterIndexInputAccess) { + ensureNull(TestSecrets.filterIndexInputAccess); + TestSecrets.filterIndexInputAccess = filterIndexInputAccess; + } + private static void ensureNull(Object ob) { if (ob != null) { throw new AssertionError( diff --git a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java index 6f452e0b3f5..59669ebffd3 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java @@ -17,6 +17,8 @@ package org.apache.lucene.store; import java.io.IOException; +import java.util.concurrent.CopyOnWriteArrayList; +import org.apache.lucene.internal.tests.TestSecrets; /** * IndexInput implementation that delegates calls to another directory. 
This class can be used to @@ -29,6 +31,12 @@ */ public class FilterIndexInput extends IndexInput { + static final CopyOnWriteArrayList> TEST_FILTER_INPUTS = new CopyOnWriteArrayList<>(); + + static { + TestSecrets.setFilterInputIndexAccess(TEST_FILTER_INPUTS::add); + } + /** * Unwraps all FilterIndexInputs until the first non-FilterIndexInput IndexInput instance and * returns it @@ -45,8 +53,7 @@ public static IndexInput unwrap(IndexInput in) { * instance and returns it */ public static IndexInput unwrapOnlyTest(IndexInput in) { - while (in instanceof FilterIndexInput - && in.getClass().getName().equals("org.apache.lucene.tests.store.MockIndexInputWrapper")) { + while (in instanceof FilterIndexInput && TEST_FILTER_INPUTS.contains(in.getClass())) { in = ((FilterIndexInput) in).in; } return in; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java index 39c41d46825..09015ed3fcf 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Map; import java.util.Set; +import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; @@ -27,6 +28,11 @@ * Used by MockDirectoryWrapper to create an input stream that keeps track of when it's been closed. 
*/ public class MockIndexInputWrapper extends FilterIndexInput { + + static { + TestSecrets.getFilterInputIndexAccess().addTestFilterType(MockIndexInputWrapper.class); + } + private MockDirectoryWrapper dir; final String name; private volatile boolean closed; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java index 490fbaddd6f..73197a66155 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java @@ -17,6 +17,7 @@ package org.apache.lucene.tests.store; import java.io.IOException; +import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.ThreadInterruptedException; @@ -28,6 +29,11 @@ */ class SlowClosingMockIndexInputWrapper extends MockIndexInputWrapper { + static { + TestSecrets.getFilterInputIndexAccess() + .addTestFilterType(SlowClosingMockIndexInputWrapper.class); + } + public SlowClosingMockIndexInputWrapper( MockDirectoryWrapper dir, String name, IndexInput delegate) { super(dir, name, delegate, null); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java index ec31d40e594..da0e13537c9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java @@ -17,6 +17,7 @@ package org.apache.lucene.tests.store; import java.io.IOException; +import org.apache.lucene.internal.tests.TestSecrets; import 
org.apache.lucene.store.IndexInput; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.ThreadInterruptedException; @@ -27,6 +28,11 @@ */ class SlowOpeningMockIndexInputWrapper extends MockIndexInputWrapper { + static { + TestSecrets.getFilterInputIndexAccess() + .addTestFilterType(SlowOpeningMockIndexInputWrapper.class); + } + @SuppressForbidden(reason = "Thread sleep") public SlowOpeningMockIndexInputWrapper( MockDirectoryWrapper dir, String name, IndexInput delegate) throws IOException { From 0223c94bcdfce32cf9b46717849a4ba411921132 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 6 May 2024 10:09:28 +0100 Subject: [PATCH 14/37] Add cosine and max inner product --- .../CosineByteVectorScorerSupplier.java | 44 ++++++++++++++++ ...xInnerProductByteVectorScorerSupplier.java | 52 +++++++++++++++++++ ...MemorySegmentByteVectorScorerSupplier.java | 19 +++++-- .../PanamaVectorUtilSupport.java | 36 +++++++------ .../vectorization/TestVectorScorer.java | 6 ++- 5 files changed, 135 insertions(+), 22 deletions(-) create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java new file mode 100644 index 00000000000..274888fc61b --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import org.apache.lucene.store.MemorySegmentAccessInput; +import org.apache.lucene.util.hnsw.RandomAccessVectorValues; + +final class CosineByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { + + CosineByteVectorScorerSupplier( + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { + super(dims, maxOrd, vectorByteSize, input, values); + } + + @Override + public float score(int node) throws IOException { + float raw = PanamaVectorUtilSupport.cosine(first, getSegment(node, scratch2)); + return (1 + raw) / 2; + } + + @Override + public CosineByteVectorScorerSupplier copy() throws IOException { + return new CosineByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, input.clone(), values); + } +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java new file mode 100644 index 00000000000..2ea4848b33e --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import org.apache.lucene.store.MemorySegmentAccessInput; +import org.apache.lucene.util.hnsw.RandomAccessVectorValues; + +final class MaxInnerProductByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { + + MaxInnerProductByteVectorScorerSupplier( + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { + super(dims, maxOrd, vectorByteSize, input, values); + } + + @Override + public float score(int node) throws IOException { + float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); + return scaleMaxInnerProductScore(raw); + } + + static float scaleMaxInnerProductScore(float vectorDotProductSimilarity) { + if (vectorDotProductSimilarity < 0) { + return 1 / (1 + -1 * vectorDotProductSimilarity); + } + return vectorDotProductSimilarity + 1; + } + + @Override + public MaxInnerProductByteVectorScorerSupplier copy() throws IOException { + return new MaxInnerProductByteVectorScorerSupplier( + dims, maxOrd, vectorByteSize, input.clone(), values); + } +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java 
b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index e300596b3a7..a47abaf1d5b 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -28,10 +28,18 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -/** A scorer of vectors whose element size is byte. */ +/** + * A scorer of vectors whose element size is byte. + * + * <p>
This class is both a scorer supplier and a scorer. Since score suppliers and their scorers are + * not thread-safe, this allows to share per-thread state and temporary scratch buffers. + */ public abstract sealed class MemorySegmentByteVectorScorerSupplier implements RandomVectorScorerSupplier, RandomVectorScorer - permits DotProductByteVectorScorerSupplier, EuclideanByteVectorScorerSupplier { + permits CosineByteVectorScorerSupplier, + DotProductByteVectorScorerSupplier, + EuclideanByteVectorScorerSupplier, + MaxInnerProductByteVectorScorerSupplier { final int vectorByteSize; final int dims; final int maxOrd; @@ -59,12 +67,15 @@ public static Optional create( } checkInvariants(maxOrd, vectorByteSize, input); return switch (type) { + case COSINE -> Optional.of( + new CosineByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); case DOT_PRODUCT -> Optional.of( new DotProductByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); case EUCLIDEAN -> Optional.of( new EuclideanByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); - case MAXIMUM_INNER_PRODUCT -> Optional.empty(); // TODO: implement MAXIMUM_INNER_PRODUCT - case COSINE -> Optional.empty(); // TODO: implement Cosine + case MAXIMUM_INNER_PRODUCT -> Optional.of( + new MaxInnerProductByteVectorScorerSupplier( + dims, maxOrd, vectorByteSize, msInput, values)); }; } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index abfb3bf4e1e..867d0c684cb 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -577,6 +577,10 @@ private int int4DotProductBody128(byte[] a, byte[] b, int limit) { @Override public float cosine(byte[] a, byte[] b) { + return 
cosine(MemorySegment.ofArray(a), MemorySegment.ofArray(b)); + } + + public static float cosine(MemorySegment a, MemorySegment b) { int i = 0; int sum = 0; int norm1 = 0; @@ -584,17 +588,17 @@ public float cosine(byte[] a, byte[] b) { // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit // vectors (256-bit on intel to dodge performance landmines) - if (a.length >= 16 && HAS_FAST_INTEGER_VECTORS) { + if (a.byteSize() >= 16 && HAS_FAST_INTEGER_VECTORS) { final float[] ret; if (VECTOR_BITSIZE >= 512) { - i += BYTE_SPECIES.loopBound(a.length); + i += BYTE_SPECIES.loopBound((int) a.byteSize()); ret = cosineBody512(a, b, i); } else if (VECTOR_BITSIZE == 256) { - i += BYTE_SPECIES.loopBound(a.length); + i += BYTE_SPECIES.loopBound((int) a.byteSize()); ret = cosineBody256(a, b, i); } else { // tricky: we don't have SPECIES_32, so we workaround with "overlapping read" - i += ByteVector.SPECIES_64.loopBound(a.length - ByteVector.SPECIES_64.length()); + i += ByteVector.SPECIES_64.loopBound(a.byteSize() - ByteVector.SPECIES_64.length()); ret = cosineBody128(a, b, i); } sum += ret[0]; @@ -603,9 +607,9 @@ public float cosine(byte[] a, byte[] b) { } // scalar tail - for (; i < a.length; i++) { - byte elem1 = a[i]; - byte elem2 = b[i]; + for (; i < a.byteSize(); i++) { + byte elem1 = a.get(JAVA_BYTE, i); + byte elem2 = b.get(JAVA_BYTE, i); sum += elem1 * elem2; norm1 += elem1 * elem1; norm2 += elem2 * elem2; @@ -614,13 +618,13 @@ public float cosine(byte[] a, byte[] b) { } /** vectorized cosine body (512 bit vectors) */ - private float[] cosineBody512(byte[] a, byte[] b, int limit) { + private static float[] cosineBody512(MemorySegment a, MemorySegment b, int limit) { IntVector accSum = IntVector.zero(INT_SPECIES); IntVector accNorm1 = IntVector.zero(INT_SPECIES); IntVector accNorm2 = IntVector.zero(INT_SPECIES); for (int i = 0; i < limit; i += BYTE_SPECIES.length()) { - ByteVector va8 = ByteVector.fromArray(BYTE_SPECIES, a, i); - 
ByteVector vb8 = ByteVector.fromArray(BYTE_SPECIES, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(BYTE_SPECIES, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(BYTE_SPECIES, b, i, LITTLE_ENDIAN); // 16-bit multiply: avoid AVX-512 heavy multiply on zmm Vector va16 = va8.convertShape(B2S, SHORT_SPECIES, 0); @@ -644,13 +648,13 @@ private float[] cosineBody512(byte[] a, byte[] b, int limit) { } /** vectorized cosine body (256 bit vectors) */ - private float[] cosineBody256(byte[] a, byte[] b, int limit) { + private static float[] cosineBody256(MemorySegment a, MemorySegment b, int limit) { IntVector accSum = IntVector.zero(IntVector.SPECIES_256); IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_256); IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_256); for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) { - ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i); - ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i); + ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN); // 16-bit multiply, and add into accumulators Vector va32 = va8.convertShape(B2I, IntVector.SPECIES_256, 0); @@ -669,13 +673,13 @@ private float[] cosineBody256(byte[] a, byte[] b, int limit) { } /** vectorized cosine body (128 bit vectors) */ - private float[] cosineBody128(byte[] a, byte[] b, int limit) { + private static float[] cosineBody128(MemorySegment a, MemorySegment b, int limit) { IntVector accSum = IntVector.zero(IntVector.SPECIES_128); IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_128); IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_128); for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length() >> 1) { - ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i); - ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i); + ByteVector va8 = 
ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN); + ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN); // process first half only: 16-bit multiply Vector va16 = va8.convert(B2S, 0); diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index b6997e028cb..5b1a8487a34 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -16,8 +16,10 @@ */ package org.apache.lucene.internal.vectorization; +import static org.apache.lucene.index.VectorSimilarityFunction.COSINE; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; +import static org.apache.lucene.index.VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT; import static org.hamcrest.Matchers.equalTo; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; @@ -83,7 +85,7 @@ void testSimpleScorer(long maxChunkSize) throws IOException { } try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { var vectorValues = vectorValues(dims, 2, in); - for (var sim : List.of(DOT_PRODUCT, EUCLIDEAN)) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { for (var ords : List.of(List.of(0, 1), List.of(1, 0))) { int idx0 = ords.get(0); int idx1 = ords.get(1); @@ -142,7 +144,7 @@ void testRandomScorer(long maxChunkSize, Function byteArraySupp try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { var vectorValues = vectorValues(dims, size, in); for (int times = 0; times < TIMES; times++) { - for (var sim : List.of(DOT_PRODUCT, EUCLIDEAN)) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { int idx0 = randomIntBetween(0, size - 1); 
int idx1 = randomIntBetween(0, size - 1); // may be the same as idx0 - which is ok. From ccb1d09d81bda8f41295418334a1fae0f837b87d Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 6 May 2024 15:14:59 +0100 Subject: [PATCH 15/37] rework into nested classes --- .../CosineByteVectorScorerSupplier.java | 44 ------- .../DotProductByteVectorScorerSupplier.java | 46 ------- .../EuclideanByteVectorScorerSupplier.java | 45 ------- ...xInnerProductByteVectorScorerSupplier.java | 52 -------- ...MemorySegmentByteVectorScorerSupplier.java | 113 ++++++++++++++++-- 5 files changed, 101 insertions(+), 199 deletions(-) delete mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java delete mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java delete mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java delete mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java deleted file mode 100644 index 274888fc61b..00000000000 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/CosineByteVectorScorerSupplier.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.internal.vectorization; - -import java.io.IOException; -import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; - -final class CosineByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { - - CosineByteVectorScorerSupplier( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); - } - - @Override - public float score(int node) throws IOException { - float raw = PanamaVectorUtilSupport.cosine(first, getSegment(node, scratch2)); - return (1 + raw) / 2; - } - - @Override - public CosineByteVectorScorerSupplier copy() throws IOException { - return new CosineByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, input.clone(), values); - } -} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java deleted file mode 100644 index 8b3f423f65b..00000000000 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/DotProductByteVectorScorerSupplier.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.internal.vectorization; - -import java.io.IOException; -import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; - -final class DotProductByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { - - DotProductByteVectorScorerSupplier( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); - } - - @Override - public float score(int node) throws IOException { - // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len - float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); - return 0.5f + raw / (float) (dims * (1 << 15)); - } - - @Override - public DotProductByteVectorScorerSupplier copy() throws IOException { - return new DotProductByteVectorScorerSupplier( - dims, maxOrd, vectorByteSize, input.clone(), values); - } -} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java deleted file mode 100644 index 3570d13a1d6..00000000000 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/EuclideanByteVectorScorerSupplier.java +++ /dev/null @@ 
-1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.internal.vectorization; - -import java.io.IOException; -import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; - -final class EuclideanByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { - - EuclideanByteVectorScorerSupplier( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); - } - - @Override - public float score(int node) throws IOException { - float raw = PanamaVectorUtilSupport.squareDistance(first, getSegment(node, scratch2)); - return 1 / (1f + raw); - } - - @Override - public EuclideanByteVectorScorerSupplier copy() throws IOException { - return new EuclideanByteVectorScorerSupplier( - dims, maxOrd, vectorByteSize, input.clone(), values); - } -} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java deleted file mode 100644 index 
2ea4848b33e..00000000000 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MaxInnerProductByteVectorScorerSupplier.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.internal.vectorization; - -import java.io.IOException; -import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; - -final class MaxInnerProductByteVectorScorerSupplier extends MemorySegmentByteVectorScorerSupplier { - - MaxInnerProductByteVectorScorerSupplier( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); - } - - @Override - public float score(int node) throws IOException { - float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); - return scaleMaxInnerProductScore(raw); - } - - static float scaleMaxInnerProductScore(float vectorDotProductSimilarity) { - if (vectorDotProductSimilarity < 0) { - return 1 / (1 + -1 * vectorDotProductSimilarity); - } - return vectorDotProductSimilarity + 1; - } - - @Override - public MaxInnerProductByteVectorScorerSupplier copy() 
throws IOException { - return new MaxInnerProductByteVectorScorerSupplier( - dims, maxOrd, vectorByteSize, input.clone(), values); - } -} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index a47abaf1d5b..43874dfe2d3 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -35,11 +35,7 @@ * not thread-safe, this allows to share per-thread state and temporary scratch buffers. */ public abstract sealed class MemorySegmentByteVectorScorerSupplier - implements RandomVectorScorerSupplier, RandomVectorScorer - permits CosineByteVectorScorerSupplier, - DotProductByteVectorScorerSupplier, - EuclideanByteVectorScorerSupplier, - MaxInnerProductByteVectorScorerSupplier { + implements RandomVectorScorerSupplier, RandomVectorScorer { final int vectorByteSize; final int dims; final int maxOrd; @@ -67,15 +63,12 @@ public static Optional create( } checkInvariants(maxOrd, vectorByteSize, input); return switch (type) { - case COSINE -> Optional.of( - new CosineByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); + case COSINE -> Optional.of(new Cosine(dims, maxOrd, vectorByteSize, msInput, values)); case DOT_PRODUCT -> Optional.of( - new DotProductByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); - case EUCLIDEAN -> Optional.of( - new EuclideanByteVectorScorerSupplier(dims, maxOrd, vectorByteSize, msInput, values)); + new DotProduct(dims, maxOrd, vectorByteSize, msInput, values)); + case EUCLIDEAN -> Optional.of(new Euclidean(dims, maxOrd, vectorByteSize, msInput, values)); case MAXIMUM_INNER_PRODUCT -> Optional.of( - new MaxInnerProductByteVectorScorerSupplier( - dims, maxOrd, 
vectorByteSize, msInput, values)); + new MaxInnerProduct(dims, maxOrd, vectorByteSize, msInput, values)); }; } @@ -142,4 +135,100 @@ public final int ordToDoc(int ord) { public final Bits getAcceptOrds(Bits acceptDocs) { return values.getAcceptOrds(acceptDocs); } + + static final class Cosine extends MemorySegmentByteVectorScorerSupplier { + + Cosine( + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { + super(dims, maxOrd, vectorByteSize, input, values); + } + + @Override + public float score(int node) throws IOException { + float raw = PanamaVectorUtilSupport.cosine(first, getSegment(node, scratch2)); + return (1 + raw) / 2; + } + + @Override + public Cosine copy() throws IOException { + return new Cosine(dims, maxOrd, vectorByteSize, input.clone(), values); + } + } + + static final class DotProduct extends MemorySegmentByteVectorScorerSupplier { + + DotProduct( + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { + super(dims, maxOrd, vectorByteSize, input, values); + } + + @Override + public float score(int node) throws IOException { + // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len + float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); + return 0.5f + raw / (float) (dims * (1 << 15)); + } + + @Override + public DotProduct copy() throws IOException { + return new DotProduct(dims, maxOrd, vectorByteSize, input.clone(), values); + } + } + + static final class Euclidean extends MemorySegmentByteVectorScorerSupplier { + + Euclidean( + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { + super(dims, maxOrd, vectorByteSize, input, values); + } + + @Override + public float score(int node) throws IOException { + float raw = PanamaVectorUtilSupport.squareDistance(first, getSegment(node, scratch2)); + return 1 / 
(1f + raw); + } + + @Override + public Euclidean copy() throws IOException { + return new Euclidean(dims, maxOrd, vectorByteSize, input.clone(), values); + } + } + + static final class MaxInnerProduct extends MemorySegmentByteVectorScorerSupplier { + + MaxInnerProduct( + int dims, + int maxOrd, + int vectorByteSize, + MemorySegmentAccessInput input, + RandomAccessVectorValues values) { + super(dims, maxOrd, vectorByteSize, input, values); + } + + @Override + public float score(int node) throws IOException { + float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); + if (raw < 0) { + return 1 / (1 + -1 * raw); + } + return raw + 1; + } + + @Override + public MaxInnerProduct copy() throws IOException { + return new MaxInnerProduct(dims, maxOrd, vectorByteSize, input.clone(), values); + } + } } From 2a1ba051a308833040764d0109daa39f40b0ab42 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 09:59:20 +0100 Subject: [PATCH 16/37] remove unwanted comment --- .../core/src/java/org/apache/lucene/store/FilterIndexInput.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java index 59669ebffd3..9e60a51790f 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java @@ -59,8 +59,6 @@ public static IndexInput unwrapOnlyTest(IndexInput in) { return in; } - // org.apache.lucene.tests.store.MockIndexInputWrapper" - protected final IndexInput in; /** Creates a FilterIndexInput with a resource description and wrapped delegate IndexInput */ From 5f0c5537f4e020854a10cd32036862335cdfe1fb Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 10:21:31 +0100 Subject: [PATCH 17/37] add more tests --- .../codecs/hnsw/TestFlatVectorScorerUtil.java | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create 
mode 100644 lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java new file mode 100644 index 00000000000..884f54c0059 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.hnsw; + +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.oneOf; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.hamcrest.Matcher; +import org.hamcrest.MatcherAssert; + +public class TestFlatVectorScorerUtil extends LuceneTestCase { + + public void testDefaultOrMemSegScorer() { + var scorer = FlatVectorScorerUtil.newFlatVectorScorer(); + System.out.println("HEGO: " + scorer); + assertThat( + scorer.toString(), + is(oneOf("DefaultFlatVectorScorer()", "MemorySegmentFlatVectorsScorer()"))); + } + + // Tests that the creation of another scorer does not disturb previous scorers + public void testMultipleScorers() throws IOException { + byte[] vec0 = new byte[] {0, 0, 0, 0}; + byte[] vec1 = new byte[] {1, 1, 1, 1}; + byte[] vec2 = new byte[] {32, 32, 32, 32}; + + try (Directory dir = new MMapDirectory(createTempDir(getTestName()))) { + try (IndexOutput out = dir.createOutput("testFoo", IOContext.DEFAULT)) { + out.writeBytes(vec0, 0, vec0.length); + out.writeBytes(vec1, 0, vec1.length); + out.writeBytes(vec2, 0, vec2.length); + } + try (IndexInput in = dir.openInput("testFoo", IOContext.DEFAULT)) { + var vectorValues = vectorValues(4, 3, in); + var factory = FlatVectorScorerUtil.newFlatVectorScorer(); + var ss = factory.getRandomVectorScorerSupplier(EUCLIDEAN, vectorValues); + + var scorerAgainstOrd0 = ss.scorer(0); + var firstScore = scorerAgainstOrd0.score(1); + // ensure that the creation of 
another scorer does not disturb previous scorers + var scorerAgainstOrd2 = ss.scorer(2); + var scoreAgain = scorerAgainstOrd0.score(1); + + assertThat(scoreAgain, equalTo(firstScore)); + } + } + } + + public static void assertThat(T actual, Matcher matcher) { + MatcherAssert.assertThat("", actual, matcher); + } + + static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in) + throws IOException { + return new OffHeapByteVectorValues.DenseOffHeapVectorValues( + dims, size, in.slice("test", 0, in.length()), dims); + } +} From 7dedf445f1bb7341fc0d0a7803e23bb2cb132b19 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 11:58:32 +0100 Subject: [PATCH 18/37] fix bug --- .../lucene/benchmark/jmh/VectorScorerBenchmark.java | 13 ++++++------- .../MemorySegmentByteVectorScorerSupplier.java | 11 +++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 08f245852e9..6f6ac13368b 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -31,7 +31,7 @@ import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; -import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; +import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.openjdk.jmh.annotations.*; @BenchmarkMode(Mode.Throughput) @@ -54,7 +54,7 @@ public class VectorScorerBenchmark { IndexInput in; RandomAccessVectorValues vectorValues; byte[] vec1, vec2; - RandomVectorScorerSupplier scorer; + RandomVectorScorer scorer; @Setup(Level.Iteration) public void init() throws IOException { @@ -72,7 +72,8 @@ public void init() throws IOException { vectorValues = 
vectorValues(size, 2, in); scorer = FlatVectorScorerUtil.newFlatVectorScorer() - .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues); + .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues) + .scorer(0); } @TearDown @@ -82,15 +83,13 @@ public void teardown() throws IOException { @Benchmark public float binaryDotProductDefault() throws IOException { - // score twice to invalidate and re-read the vector at the first position - return scorer.scorer(0).score(1) + scorer.scorer(1).score(0); + return scorer.score(1); } @Benchmark @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float binaryDotProductMemSeg() throws IOException { - // score twice to invalidate and re-read the vector at the first position - return scorer.scorer(0).score(1) + scorer.scorer(1).score(0); + return scorer.score(1); } static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in) diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index 43874dfe2d3..39780f907ea 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -31,8 +31,7 @@ /** * A scorer of vectors whose element size is byte. * - *

This class is both a scorer supplier and a scorer. Since score suppliers and their scorers are - * not thread-safe, this allows to share per-thread state and temporary scratch buffers. + *

This class is both a scorer supplier and a scorer. */ public abstract sealed class MemorySegmentByteVectorScorerSupplier implements RandomVectorScorerSupplier, RandomVectorScorer { @@ -115,10 +114,14 @@ public final RandomVectorScorer scorer(byte[] target) { return this; } + @Override + public abstract MemorySegmentByteVectorScorerSupplier copy() throws IOException; + @Override public final RandomVectorScorer scorer(int ord) throws IOException { - first = getSegment(ord, scratch1); - return this; + var copy = copy(); + copy.first = copy.getSegment(ord, copy.scratch1); + return copy; } @Override From 330f55a3288dec4330692e34a17d46edff3212bc Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 12:13:21 +0100 Subject: [PATCH 19/37] use as raw scorer in SQ --- .../Lucene99ScalarQuantizedVectorsFormat.java | 3 ++- .../TestLucene99HnswQuantizedVectorsFormat.java | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java index c10f87da2a6..e09d331ef0b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java @@ -19,6 +19,7 @@ import java.io.IOException; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; @@ -48,7 +49,7 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat { static final String VECTOR_DATA_EXTENSION = "veq"; private static final FlatVectorsFormat rawVectorFormat = - new Lucene99FlatVectorsFormat(new 
DefaultFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.newFlatVectorScorer()); /** The minimum confidence interval */ private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index fb8ffe369f4..0c8ad329f6e 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -16,11 +16,15 @@ */ package org.apache.lucene.codecs.lucene99; +import static java.lang.String.format; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.oneOf; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Locale; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; @@ -181,9 +185,11 @@ public KnnVectorsFormat knnVectorsFormat() { 10, 20, 1, (byte) 4, false, 0.9f, null); } }; - String expectedString = - "Lucene99HnswScalarQuantizedVectorsFormat(name=Lucene99HnswScalarQuantizedVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=DefaultFlatVectorScorer())))"; - assertEquals(expectedString, customCodec.knnVectorsFormat().toString()); + String expectedPattern = + "Lucene99HnswScalarQuantizedVectorsFormat(name=Lucene99HnswScalarQuantizedVectorsFormat, maxConn=10, beamWidth=20, 
flatVectorFormat=Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s())))"; + var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer"); + var memSegScorer = format(Locale.ROOT, expectedPattern, "MemorySegmentFlatVectorsScorer"); + assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); } public void testLimits() { From ed570376723ff7e6090c84265bf188c84d83e6fd Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 12:44:01 +0100 Subject: [PATCH 20/37] more test improvements --- .../codecs/hnsw/TestFlatVectorScorerUtil.java | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java index 884f54c0059..dd52c622476 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java @@ -16,12 +16,16 @@ */ package org.apache.lucene.codecs.hnsw; +import static org.apache.lucene.index.VectorSimilarityFunction.COSINE; +import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; +import static org.apache.lucene.index.VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.oneOf; import java.io.IOException; +import java.util.List; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; 
@@ -50,12 +54,12 @@ public void testMultipleScorers() throws IOException { byte[] vec2 = new byte[] {32, 32, 32, 32}; try (Directory dir = new MMapDirectory(createTempDir(getTestName()))) { - try (IndexOutput out = dir.createOutput("testFoo", IOContext.DEFAULT)) { + try (IndexOutput out = dir.createOutput("testMultipleScorers", IOContext.DEFAULT)) { out.writeBytes(vec0, 0, vec0.length); out.writeBytes(vec1, 0, vec1.length); out.writeBytes(vec2, 0, vec2.length); } - try (IndexInput in = dir.openInput("testFoo", IOContext.DEFAULT)) { + try (IndexInput in = dir.openInput("testMultipleScorers", IOContext.DEFAULT)) { var vectorValues = vectorValues(4, 3, in); var factory = FlatVectorScorerUtil.newFlatVectorScorer(); var ss = factory.getRandomVectorScorerSupplier(EUCLIDEAN, vectorValues); @@ -64,6 +68,7 @@ public void testMultipleScorers() throws IOException { var firstScore = scorerAgainstOrd0.score(1); // ensure that the creation of another scorer does not disturb previous scorers var scorerAgainstOrd2 = ss.scorer(2); + assertThat(ss.scorer(2), equalTo(scorerAgainstOrd2)); // just to avoid unused warnings var scoreAgain = scorerAgainstOrd0.score(1); assertThat(scoreAgain, equalTo(firstScore)); @@ -71,6 +76,24 @@ public void testMultipleScorers() throws IOException { } } + public void testCheckDimensions() throws IOException { + byte[] vec0 = new byte[4]; + try (Directory dir = new MMapDirectory(createTempDir(getTestName()))) { + try (IndexOutput out = dir.createOutput("testCheckDimensions", IOContext.DEFAULT)) { + out.writeBytes(vec0, 0, vec0.length); + } + try (IndexInput in = dir.openInput("testCheckDimensions", IOContext.DEFAULT)) { + var vectorValues = vectorValues(4, 1, in); + var factory = FlatVectorScorerUtil.newFlatVectorScorer(); + for (var sim : List.of(COSINE, DOT_PRODUCT, EUCLIDEAN, MAXIMUM_INNER_PRODUCT)) { + expectThrows( + IllegalArgumentException.class, + () -> factory.getRandomVectorScorer(sim, vectorValues, new byte[5])); + } + } + } + } + public 
static void assertThat(T actual, Matcher matcher) { MatcherAssert.assertThat("", actual, matcher); } From 05ebc423d2200750f532aea67cf5bbd8200a7f33 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 12:47:48 +0100 Subject: [PATCH 21/37] unused --- .../org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java index dd52c622476..d38361f2b79 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java @@ -67,8 +67,8 @@ public void testMultipleScorers() throws IOException { var scorerAgainstOrd0 = ss.scorer(0); var firstScore = scorerAgainstOrd0.score(1); // ensure that the creation of another scorer does not disturb previous scorers + @SuppressWarnings("unused") var scorerAgainstOrd2 = ss.scorer(2); - assertThat(ss.scorer(2), equalTo(scorerAgainstOrd2)); // just to avoid unused warnings var scoreAgain = scorerAgainstOrd0.score(1); assertThat(scoreAgain, equalTo(firstScore)); From cba82815761822d297881a76c757b2e1345a3751 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 9 May 2024 14:09:30 +0100 Subject: [PATCH 22/37] more testing --- ...rerUtil.java => TestFlatVectorScorer.java} | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) rename lucene/core/src/test/org/apache/lucene/codecs/hnsw/{TestFlatVectorScorerUtil.java => TestFlatVectorScorer.java} (75%) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java similarity index 75% rename from lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java rename to 
lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index d38361f2b79..b9f5a427f26 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorerUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -37,7 +37,7 @@ import org.hamcrest.Matcher; import org.hamcrest.MatcherAssert; -public class TestFlatVectorScorerUtil extends LuceneTestCase { +public class TestFlatVectorScorer extends LuceneTestCase { public void testDefaultOrMemSegScorer() { var scorer = FlatVectorScorerUtil.newFlatVectorScorer(); @@ -47,19 +47,28 @@ public void testDefaultOrMemSegScorer() { is(oneOf("DefaultFlatVectorScorer()", "MemorySegmentFlatVectorsScorer()"))); } - // Tests that the creation of another scorer does not disturb previous scorers public void testMultipleScorers() throws IOException { + testMultipleScorersImpl(TestFlatVectorScorer::newDirectory); + } + + public void testMultipleScorersMMap() throws IOException { + testMultipleScorersImpl(() -> new MMapDirectory(createTempDir(getTestName()))); + } + + // Tests that the creation of another scorer does not disturb previous scorers + void testMultipleScorersImpl(ThrowingSupplier newDirectory) throws IOException { byte[] vec0 = new byte[] {0, 0, 0, 0}; byte[] vec1 = new byte[] {1, 1, 1, 1}; - byte[] vec2 = new byte[] {32, 32, 32, 32}; + byte[] vec2 = new byte[] {15, 15, 15, 15}; - try (Directory dir = new MMapDirectory(createTempDir(getTestName()))) { - try (IndexOutput out = dir.createOutput("testMultipleScorers", IOContext.DEFAULT)) { + String fileName = getTestName(); + try (Directory dir = newDirectory.get()) { + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { out.writeBytes(vec0, 0, vec0.length); out.writeBytes(vec1, 0, vec1.length); out.writeBytes(vec2, 0, vec2.length); } - try (IndexInput in = dir.openInput("testMultipleScorers", IOContext.DEFAULT)) { + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { 
var vectorValues = vectorValues(4, 3, in); var factory = FlatVectorScorerUtil.newFlatVectorScorer(); var ss = factory.getRandomVectorScorerSupplier(EUCLIDEAN, vectorValues); @@ -77,12 +86,21 @@ public void testMultipleScorers() throws IOException { } public void testCheckDimensions() throws IOException { + testCheckDimensionsImpl(TestFlatVectorScorer::newDirectory); + } + + public void testCheckDimensionsMMap() throws IOException { + testCheckDimensionsImpl(() -> new MMapDirectory(createTempDir(getTestName()))); + } + + void testCheckDimensionsImpl(ThrowingSupplier newDirectory) throws IOException { byte[] vec0 = new byte[4]; - try (Directory dir = new MMapDirectory(createTempDir(getTestName()))) { - try (IndexOutput out = dir.createOutput("testCheckDimensions", IOContext.DEFAULT)) { + String fileName = getTestName(); + try (Directory dir = newDirectory.get()) { + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { out.writeBytes(vec0, 0, vec0.length); } - try (IndexInput in = dir.openInput("testCheckDimensions", IOContext.DEFAULT)) { + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { var vectorValues = vectorValues(4, 1, in); var factory = FlatVectorScorerUtil.newFlatVectorScorer(); for (var sim : List.of(COSINE, DOT_PRODUCT, EUCLIDEAN, MAXIMUM_INNER_PRODUCT)) { @@ -103,4 +121,9 @@ static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in) return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("test", 0, in.length()), dims); } + + @FunctionalInterface + interface ThrowingSupplier { + T get() throws IOException; + } } From 8b3f3c2c3b4abc4d122c7fc80fa0f285910d9cf0 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 10 May 2024 16:31:48 +0100 Subject: [PATCH 23/37] expand test --- .../codecs/hnsw/TestFlatVectorScorer.java | 156 ++++++++++++++---- 1 file changed, 126 insertions(+), 30 deletions(-) diff --git 
a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index b9f5a427f26..a8443c61bb7 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -24,9 +24,17 @@ import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.oneOf; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues; +import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorScorer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -37,41 +45,61 @@ import org.hamcrest.Matcher; import org.hamcrest.MatcherAssert; +@LuceneTestCase.AwaitsFix(bugUrl = "") public class TestFlatVectorScorer extends LuceneTestCase { + static volatile AtomicInteger count = new AtomicInteger(); + final FlatVectorsScorer flatVectorsScorer; + final ThrowingSupplier newDirectory; + + public TestFlatVectorScorer( + FlatVectorsScorer flatVectorsScorer, ThrowingSupplier newDirectory) { + this.flatVectorsScorer = flatVectorsScorer; + this.newDirectory = newDirectory; + } + + @ParametersFactory + public static Iterable parametersFactory() { + var scorers = + List.of( + new DefaultFlatVectorScorer(), + new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()), + FlatVectorScorerUtil.newFlatVectorScorer()); + var dirs = + List.>of( + TestFlatVectorScorer::newDirectory, + () -> new 
MMapDirectory(createTempDir(count.getAndIncrement() + "-"))); + + List objs = new ArrayList<>(); + for (var scorer : scorers) { + for (var dir : dirs) { + objs.add(new Object[] {scorer, dir}); + } + } + return objs; + } + public void testDefaultOrMemSegScorer() { var scorer = FlatVectorScorerUtil.newFlatVectorScorer(); - System.out.println("HEGO: " + scorer); assertThat( scorer.toString(), is(oneOf("DefaultFlatVectorScorer()", "MemorySegmentFlatVectorsScorer()"))); } - public void testMultipleScorers() throws IOException { - testMultipleScorersImpl(TestFlatVectorScorer::newDirectory); - } - - public void testMultipleScorersMMap() throws IOException { - testMultipleScorersImpl(() -> new MMapDirectory(createTempDir(getTestName()))); - } - // Tests that the creation of another scorer does not disturb previous scorers - void testMultipleScorersImpl(ThrowingSupplier newDirectory) throws IOException { + public void testMultipleByteScorers() throws IOException { byte[] vec0 = new byte[] {0, 0, 0, 0}; byte[] vec1 = new byte[] {1, 1, 1, 1}; byte[] vec2 = new byte[] {15, 15, 15, 15}; - String fileName = getTestName(); + String fileName = "testMultipleByteScorers"; try (Directory dir = newDirectory.get()) { try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { - out.writeBytes(vec0, 0, vec0.length); - out.writeBytes(vec1, 0, vec1.length); - out.writeBytes(vec2, 0, vec2.length); + out.writeBytes(concat(vec0, vec1, vec2), 0, vec0.length * 3); } try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { - var vectorValues = vectorValues(4, 3, in); - var factory = FlatVectorScorerUtil.newFlatVectorScorer(); - var ss = factory.getRandomVectorScorerSupplier(EUCLIDEAN, vectorValues); + var vectorValues = byteVectorValues(4, 3, in); + var ss = flatVectorsScorer.getRandomVectorScorerSupplier(EUCLIDEAN, vectorValues); var scorerAgainstOrd0 = ss.scorer(0); var firstScore = scorerAgainstOrd0.score(1); @@ -85,23 +113,42 @@ void 
testMultipleScorersImpl(ThrowingSupplier newDirectory) throws IO } } - public void testCheckDimensions() throws IOException { - testCheckDimensionsImpl(TestFlatVectorScorer::newDirectory); - } + // Tests that the creation of another scorer does not disturb previous scorers + public void testMultipleFloatScorers() throws IOException { + float[] vec0 = new float[] {0, 0, 0, 0}; + float[] vec1 = new float[] {1, 1, 1, 1}; + float[] vec2 = new float[] {15, 15, 15, 15}; - public void testCheckDimensionsMMap() throws IOException { - testCheckDimensionsImpl(() -> new MMapDirectory(createTempDir(getTestName()))); + String fileName = "testMultipleFloatScorers"; + try (Directory dir = newDirectory.get()) { + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + out.writeBytes(concat(vec0, vec1, vec2), 0, vec0.length * Float.BYTES * 3); + } + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + var vectorValues = floatVectorValues(4, 3, in); + var ss = flatVectorsScorer.getRandomVectorScorerSupplier(EUCLIDEAN, vectorValues); + + var scorerAgainstOrd0 = ss.scorer(0); + var firstScore = scorerAgainstOrd0.score(1); + // ensure that the creation of another scorer does not disturb previous scorers + @SuppressWarnings("unused") + var scorerAgainstOrd2 = ss.scorer(2); + var scoreAgain = scorerAgainstOrd0.score(1); + + assertThat(scoreAgain, equalTo(firstScore)); + } + } } - void testCheckDimensionsImpl(ThrowingSupplier newDirectory) throws IOException { + public void testCheckByteDimensions() throws IOException { byte[] vec0 = new byte[4]; - String fileName = getTestName(); + String fileName = "testCheckByteDimensions"; try (Directory dir = newDirectory.get()) { try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { out.writeBytes(vec0, 0, vec0.length); } try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { - var vectorValues = vectorValues(4, 1, in); + var vectorValues = byteVectorValues(4, 1, in); var factory = 
FlatVectorScorerUtil.newFlatVectorScorer(); for (var sim : List.of(COSINE, DOT_PRODUCT, EUCLIDEAN, MAXIMUM_INNER_PRODUCT)) { expectThrows( @@ -112,18 +159,67 @@ void testCheckDimensionsImpl(ThrowingSupplier newDirectory) throws IO } } - public static void assertThat(T actual, Matcher matcher) { - MatcherAssert.assertThat("", actual, matcher); + public void testCheckFloatDimensions() throws IOException { + float[] vec0 = new float[4]; + String fileName = "testCheckFloatDimensions"; + try (Directory dir = newDirectory.get()) { + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + out.writeBytes(concat(vec0), 0, vec0.length * Float.BYTES); + } + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + var vectorValues = floatVectorValues(4, 1, in); + var factory = FlatVectorScorerUtil.newFlatVectorScorer(); + for (var sim : List.of(COSINE, DOT_PRODUCT, EUCLIDEAN, MAXIMUM_INNER_PRODUCT)) { + expectThrows( + IllegalArgumentException.class, + () -> factory.getRandomVectorScorer(sim, vectorValues, new float[5])); + } + } + } } - static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in) + static RandomAccessVectorValues byteVectorValues(int dims, int size, IndexInput in) throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( - dims, size, in.slice("test", 0, in.length()), dims); + dims, size, in.slice("byteValues", 0, in.length()), dims); + } + + static RandomAccessVectorValues floatVectorValues(int dims, int size, IndexInput in) + throws IOException { + return new OffHeapFloatVectorValues.DenseOffHeapVectorValues( + dims, size, in.slice("floatValues", 0, in.length()), dims * Float.BYTES); + } + + /** Concatenates float arrays as byte[]. */ + public static byte[] concat(float[]... 
arrays) throws IOException { + var bb = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + for (var fa : arrays) { + for (var f : fa) { + bb.putFloat(0, f); + baos.write(bb.array()); + } + } + return baos.toByteArray(); + } + } + + /** Concatenates byte arrays. */ + public static byte[] concat(byte[]... arrays) throws IOException { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + for (var ba : arrays) { + baos.write(ba); + } + return baos.toByteArray(); + } + } + + public static void assertThat(T actual, Matcher matcher) { + MatcherAssert.assertThat("", actual, matcher); } @FunctionalInterface - interface ThrowingSupplier { + public interface ThrowingSupplier { T get() throws IOException; } } From 17923f2152fe7407b0bb7a7d02fa70744ad58890 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 12 May 2024 15:06:06 +0100 Subject: [PATCH 24/37] separate supplier and scorer --- .../MemorySegmentByteVectorScorer.java | 135 +++++++++++ ...MemorySegmentByteVectorScorerSupplier.java | 229 ++++++++---------- .../MemorySegmentFlatVectorsScorer.java | 26 +- 3 files changed, 246 insertions(+), 144 deletions(-) create mode 100644 lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java new file mode 100644 index 00000000000..7bc67f5c5fd --- /dev/null +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java @@ -0,0 +1,135 @@ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import java.lang.foreign.MemorySegment; +import java.util.Optional; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.FilterIndexInput; +import 
org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.MemorySegmentAccessInput; +import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.util.hnsw.RandomVectorScorer; + +abstract sealed class MemorySegmentByteVectorScorer + extends RandomVectorScorer.AbstractRandomVectorScorer { + + final int vectorByteSize; + final MemorySegmentAccessInput input; + final MemorySegment query; + byte[] scratch; + + /** + * Return an optional whose value, if present, is the scorer. Otherwise, an empty optional is + * returned. + */ + public static Optional create( + VectorSimilarityFunction type, + IndexInput input, + RandomAccessVectorValues values, + byte[] queryVector) { + input = FilterIndexInput.unwrapOnlyTest(input); + if (!(input instanceof MemorySegmentAccessInput msInput)) { + return Optional.empty(); + } + checkInvariants(values.size(), values.getVectorByteLength(), input); + return switch (type) { + case COSINE -> Optional.of(new CosineScorer(msInput, values, queryVector)); + case DOT_PRODUCT -> Optional.of(new DotProductScorer(msInput, values, queryVector)); + case EUCLIDEAN -> Optional.of(new EuclideanScorer(msInput, values, queryVector)); + case MAXIMUM_INNER_PRODUCT -> Optional.of( + new MaxInnerProductScorer(msInput, values, queryVector)); + }; + } + + MemorySegmentByteVectorScorer( + MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] queryVector) { + super(values); + this.input = input; + this.vectorByteSize = values.getVectorByteLength(); + this.query = MemorySegment.ofArray(queryVector); + } + + final MemorySegment getSegment(int ord) throws IOException { + checkOrdinal(ord); + long byteOffset = (long) ord * vectorByteSize; + MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); + if (seg == null) { + if (scratch == null) { + scratch = new byte[vectorByteSize]; + } + input.readBytes(byteOffset, scratch, 0, vectorByteSize); + seg = MemorySegment.ofArray(scratch); + } + return seg; 
+ } + + static void checkInvariants(int maxOrd, int vectorByteLength, IndexInput input) { + if (input.length() < (long) vectorByteLength * maxOrd) { + throw new IllegalArgumentException("input length is less than expected vector data"); + } + } + + final void checkOrdinal(int ord) { + if (ord < 0 || ord >= maxOrd()) { + throw new IllegalArgumentException("illegal ordinal: " + ord); + } + } + + static final class CosineScorer extends MemorySegmentByteVectorScorer { + CosineScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + super(input, values, query); + } + + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + float raw = PanamaVectorUtilSupport.cosine(query, getSegment(node)); + return (1 + raw) / 2; + } + } + + static final class DotProductScorer extends MemorySegmentByteVectorScorer { + DotProductScorer( + MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + super(input, values, query); + } + + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len + float raw = PanamaVectorUtilSupport.dotProduct(query, getSegment(node)); + return 0.5f + raw / (float) (query.byteSize() * (1 << 15)); + } + } + + static final class EuclideanScorer extends MemorySegmentByteVectorScorer { + EuclideanScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + super(input, values, query); + } + + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + float raw = PanamaVectorUtilSupport.squareDistance(query, getSegment(node)); + return 1 / (1f + raw); + } + } + + static final class MaxInnerProductScorer extends MemorySegmentByteVectorScorer { + MaxInnerProductScorer( + MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + super(input, values, query); + } + + @Override + public float 
score(int node) throws IOException { + checkOrdinal(node); + float raw = PanamaVectorUtilSupport.dotProduct(query, getSegment(node)); + if (raw < 0) { + return 1 / (1 + -1 * raw); + } + return raw + 1; + } + } +} diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java index 39780f907ea..64152a86b09 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java @@ -23,215 +23,190 @@ import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; /** - * A scorer of vectors whose element size is byte. - * - *

This class is both a scorer supplier and a scorer. + * A score supplier of vectors whose element size is byte. */ public abstract sealed class MemorySegmentByteVectorScorerSupplier - implements RandomVectorScorerSupplier, RandomVectorScorer { + implements RandomVectorScorerSupplier { final int vectorByteSize; - final int dims; final int maxOrd; final MemorySegmentAccessInput input; - final RandomAccessVectorValues values; // to support ordToDoc/getAcceptOrds - final byte[] scratch1, scratch2; - - MemorySegment first; + byte[] scratch1, scratch2; /** - * Return an optional whose value, if present, is the scorer. Otherwise, an empty optional is - * returned. + * Return an optional whose value, if present, is the scorer supplier. Otherwise, an empty + * optional is returned. */ - public static Optional create( - int dims, - int maxOrd, - int vectorByteSize, - VectorSimilarityFunction type, - IndexInput input, - RandomAccessVectorValues values) { + static Optional create( + VectorSimilarityFunction type, IndexInput input, RandomAccessVectorValues values) { input = FilterIndexInput.unwrapOnlyTest(input); if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); } - checkInvariants(maxOrd, vectorByteSize, input); + checkInvariants(values.size(), values.getVectorByteLength(), input); return switch (type) { - case COSINE -> Optional.of(new Cosine(dims, maxOrd, vectorByteSize, msInput, values)); - case DOT_PRODUCT -> Optional.of( - new DotProduct(dims, maxOrd, vectorByteSize, msInput, values)); - case EUCLIDEAN -> Optional.of(new Euclidean(dims, maxOrd, vectorByteSize, msInput, values)); - case MAXIMUM_INNER_PRODUCT -> Optional.of( - new MaxInnerProduct(dims, maxOrd, vectorByteSize, msInput, values)); + case COSINE -> Optional.of(new CosineSupplier(msInput, values)); + case DOT_PRODUCT -> Optional.of(new DotProductSupplier(msInput, values)); + case EUCLIDEAN -> Optional.of(new EuclideanSupplier(msInput, values)); + case MAXIMUM_INNER_PRODUCT -> 
Optional.of(new MaxInnerProductSupplier(msInput, values)); }; } MemorySegmentByteVectorScorerSupplier( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - this.vectorByteSize = vectorByteSize; - this.dims = dims; - this.maxOrd = maxOrd; + MemorySegmentAccessInput input, RandomAccessVectorValues values) { this.input = input; this.values = values; - scratch1 = new byte[vectorByteSize]; - scratch2 = new byte[vectorByteSize]; + this.vectorByteSize = values.getVectorByteLength(); + this.maxOrd = values.size(); } static void checkInvariants(int maxOrd, int vectorByteLength, IndexInput input) { if (input.length() < (long) vectorByteLength * maxOrd) { - throw new IllegalArgumentException("input length not equal to expected vector data"); + throw new IllegalArgumentException("input length is less than expected vector data"); } } - final void checkOrdinal(int ord, int maxOrd) { + final void checkOrdinal(int ord) { if (ord < 0 || ord >= maxOrd) { throw new IllegalArgumentException("illegal ordinal: " + ord); } } - protected final MemorySegment getSegment(int ord, byte[] scratch) throws IOException { - checkOrdinal(ord, maxOrd); - int byteOffset = ord * vectorByteSize; // TODO: random + meta size + final MemorySegment getFirstSegment(int ord) throws IOException { + long byteOffset = (long) ord * vectorByteSize; MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); if (seg == null) { - input.readBytes(byteOffset, scratch, 0, vectorByteSize); - seg = MemorySegment.ofArray(scratch); + if (scratch1 == null) { + scratch1 = new byte[vectorByteSize]; + } + input.readBytes(byteOffset, scratch1, 0, vectorByteSize); + seg = MemorySegment.ofArray(scratch1); } return seg; } - public final RandomVectorScorer scorer(byte[] target) { - first = MemorySegment.ofArray(target); - return this; - } - - @Override - public abstract MemorySegmentByteVectorScorerSupplier copy() throws IOException; - - @Override - 
public final RandomVectorScorer scorer(int ord) throws IOException { - var copy = copy(); - copy.first = copy.getSegment(ord, copy.scratch1); - return copy; - } - - @Override - public final int maxOrd() { - return maxOrd; - } - - @Override - public final int ordToDoc(int ord) { - return values.ordToDoc(ord); - } - - @Override - public final Bits getAcceptOrds(Bits acceptDocs) { - return values.getAcceptOrds(acceptDocs); + final MemorySegment getSecondSegment(int ord) throws IOException { + long byteOffset = (long) ord * vectorByteSize; + MemorySegment seg = input.segmentSliceOrNull(byteOffset, vectorByteSize); + if (seg == null) { + if (scratch2 == null) { + scratch2 = new byte[vectorByteSize]; + } + input.readBytes(byteOffset, scratch2, 0, vectorByteSize); + seg = MemorySegment.ofArray(scratch2); + } + return seg; } - static final class Cosine extends MemorySegmentByteVectorScorerSupplier { + static final class CosineSupplier extends MemorySegmentByteVectorScorerSupplier { - Cosine( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); + CosineSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + super(input, values); } @Override - public float score(int node) throws IOException { - float raw = PanamaVectorUtilSupport.cosine(first, getSegment(node, scratch2)); - return (1 + raw) / 2; + public RandomVectorScorer scorer(int ord) { + checkOrdinal(ord); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + float raw = PanamaVectorUtilSupport.cosine(getFirstSegment(ord), getSecondSegment(node)); + return (1 + raw) / 2; + } + }; } @Override - public Cosine copy() throws IOException { - return new Cosine(dims, maxOrd, vectorByteSize, input.clone(), values); + public CosineSupplier copy() throws IOException { + return new 
CosineSupplier(input.clone(), values); } } - static final class DotProduct extends MemorySegmentByteVectorScorerSupplier { + static final class DotProductSupplier extends MemorySegmentByteVectorScorerSupplier { - DotProduct( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); + DotProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + super(input, values); } @Override - public float score(int node) throws IOException { - // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len - float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); - return 0.5f + raw / (float) (dims * (1 << 15)); + public RandomVectorScorer scorer(int ord) { + checkOrdinal(ord); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len + float raw = + PanamaVectorUtilSupport.dotProduct(getFirstSegment(ord), getSecondSegment(node)); + return 0.5f + raw / (float) (values.dimension() * (1 << 15)); + } + }; } @Override - public DotProduct copy() throws IOException { - return new DotProduct(dims, maxOrd, vectorByteSize, input.clone(), values); + public DotProductSupplier copy() throws IOException { + return new DotProductSupplier(input.clone(), values); } } - static final class Euclidean extends MemorySegmentByteVectorScorerSupplier { + static final class EuclideanSupplier extends MemorySegmentByteVectorScorerSupplier { - Euclidean( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); + EuclideanSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + super(input, values); } @Override 
- public float score(int node) throws IOException { - float raw = PanamaVectorUtilSupport.squareDistance(first, getSegment(node, scratch2)); - return 1 / (1f + raw); + public RandomVectorScorer scorer(int ord) { + checkOrdinal(ord); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + float raw = + PanamaVectorUtilSupport.squareDistance(getFirstSegment(ord), getSecondSegment(node)); + return 1 / (1f + raw); + } + }; } @Override - public Euclidean copy() throws IOException { - return new Euclidean(dims, maxOrd, vectorByteSize, input.clone(), values); + public EuclideanSupplier copy() throws IOException { + return new EuclideanSupplier(input.clone(), values); } } - static final class MaxInnerProduct extends MemorySegmentByteVectorScorerSupplier { + static final class MaxInnerProductSupplier extends MemorySegmentByteVectorScorerSupplier { - MaxInnerProduct( - int dims, - int maxOrd, - int vectorByteSize, - MemorySegmentAccessInput input, - RandomAccessVectorValues values) { - super(dims, maxOrd, vectorByteSize, input, values); + MaxInnerProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + super(input, values); } @Override - public float score(int node) throws IOException { - float raw = PanamaVectorUtilSupport.dotProduct(first, getSegment(node, scratch2)); - if (raw < 0) { - return 1 / (1 + -1 * raw); - } - return raw + 1; + public RandomVectorScorer scorer(int ord) { + checkOrdinal(ord); + return new RandomVectorScorer.AbstractRandomVectorScorer(values) { + @Override + public float score(int node) throws IOException { + checkOrdinal(node); + float raw = + PanamaVectorUtilSupport.dotProduct(getFirstSegment(ord), getSecondSegment(node)); + if (raw < 0) { + return 1 / (1 + -1 * raw); + } + return raw + 1; + } + }; } @Override - public MaxInnerProduct copy() throws IOException { - return new MaxInnerProduct(dims, maxOrd, vectorByteSize, 
input.clone(), values); + public MaxInnerProductSupplier copy() throws IOException { + return new MaxInnerProductSupplier(input.clone(), values); } } } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java index f95ec27dc20..7b181a0bb3f 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java @@ -39,12 +39,7 @@ public RandomVectorScorerSupplier getRandomVectorScorerSupplier( if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { var scorer = MemorySegmentByteVectorScorerSupplier.create( - vectorValues.dimension(), - vectorValues.size(), - vectorValues.getVectorByteLength(), - similarityType, - vectorValues.getSlice(), - vectorValues); + similarityType, vectorValues.getSlice(), vectorValues); if (scorer.isPresent()) { return scorer.get(); } @@ -64,23 +59,20 @@ public RandomVectorScorer getRandomVectorScorer( @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityType, RandomAccessVectorValues vectorValues, byte[] target) + VectorSimilarityFunction similarityType, + RandomAccessVectorValues vectorValues, + byte[] queryVector) throws IOException { - checkDimensions(target.length, vectorValues.dimension()); + checkDimensions(queryVector.length, vectorValues.dimension()); if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { var scorer = - MemorySegmentByteVectorScorerSupplier.create( - vectorValues.dimension(), - vectorValues.size(), - vectorValues.getVectorByteLength(), - similarityType, - vectorValues.getSlice(), - vectorValues); + MemorySegmentByteVectorScorer.create( + similarityType, vectorValues.getSlice(), vectorValues, 
queryVector); if (scorer.isPresent()) { - return scorer.get().scorer(target); + return scorer.get(); } } - return delegate.getRandomVectorScorer(similarityType, vectorValues, target); + return delegate.getRandomVectorScorer(similarityType, vectorValues, queryVector); } static void checkDimensions(int queryLen, int fieldLen) { From eca47c35dba6e3ab4372c7e057561e32f82bfabd Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 12 May 2024 15:22:20 +0100 Subject: [PATCH 25/37] fix benchmark --- .../benchmark/jmh/VectorScorerBenchmark.java | 36 ++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 6f6ac13368b..7f6a09eb5af 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -23,7 +23,9 @@ import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; +import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -32,6 +34,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; +import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.openjdk.jmh.annotations.*; @BenchmarkMode(Mode.Throughput) @@ -69,7 +72,7 @@ public void init() throws IOException { out.writeBytes(vec2, 0, vec2.length); } in = dir.openInput("vector.data", IOContext.DEFAULT); - vectorValues = 
vectorValues(size, 2, in); + vectorValues = vectorValues(size, 2, in, DOT_PRODUCT); scorer = FlatVectorScorerUtil.newFlatVectorScorer() .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues) @@ -92,9 +95,34 @@ public float binaryDotProductMemSeg() throws IOException { return scorer.score(1); } - static RandomAccessVectorValues vectorValues(int dims, int size, IndexInput in) - throws IOException { + static RandomAccessVectorValues vectorValues( + int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( - dims, size, in.slice("test", 0, in.length()), dims); + dims, size, in.slice("test", 0, in.length()), dims, new ThrowingFlatVectorScorer(), sim); + } + + static final class ThrowingFlatVectorScorer implements FlatVectorsScorer { + + @Override + public RandomVectorScorerSupplier getRandomVectorScorerSupplier( + VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) { + throw new UnsupportedOperationException(); + } + + @Override + public RandomVectorScorer getRandomVectorScorer( + VectorSimilarityFunction similarityFunction, + RandomAccessVectorValues vectorValues, + float[] target) { + throw new UnsupportedOperationException(); + } + + @Override + public RandomVectorScorer getRandomVectorScorer( + VectorSimilarityFunction similarityFunction, + RandomAccessVectorValues vectorValues, + byte[] target) { + throw new UnsupportedOperationException(); + } } } From 8efed149d404e7d044a5d31f8dd7e20cbb1414a5 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 12 May 2024 15:27:08 +0100 Subject: [PATCH 26/37] include Lucene99 in the name --- ... 
=> Lucene99MemorySegmentByteVectorScorer.java} | 14 +++++++------- ...ne99MemorySegmentByteVectorScorerSupplier.java} | 12 ++++++------ .../MemorySegmentFlatVectorsScorer.java | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) rename lucene/core/src/java21/org/apache/lucene/internal/vectorization/{MemorySegmentByteVectorScorer.java => Lucene99MemorySegmentByteVectorScorer.java} (89%) rename lucene/core/src/java21/org/apache/lucene/internal/vectorization/{MemorySegmentByteVectorScorerSupplier.java => Lucene99MemorySegmentByteVectorScorerSupplier.java} (93%) diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java similarity index 89% rename from lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java rename to lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java index 7bc67f5c5fd..41c106fe428 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java @@ -10,7 +10,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; -abstract sealed class MemorySegmentByteVectorScorer +abstract sealed class Lucene99MemorySegmentByteVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { final int vectorByteSize; @@ -22,7 +22,7 @@ abstract sealed class MemorySegmentByteVectorScorer * Return an optional whose value, if present, is the scorer. Otherwise, an empty optional is * returned. 
*/ - public static Optional create( + public static Optional create( VectorSimilarityFunction type, IndexInput input, RandomAccessVectorValues values, @@ -41,7 +41,7 @@ public static Optional create( }; } - MemorySegmentByteVectorScorer( + Lucene99MemorySegmentByteVectorScorer( MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] queryVector) { super(values); this.input = input; @@ -75,7 +75,7 @@ final void checkOrdinal(int ord) { } } - static final class CosineScorer extends MemorySegmentByteVectorScorer { + static final class CosineScorer extends Lucene99MemorySegmentByteVectorScorer { CosineScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { super(input, values, query); } @@ -88,7 +88,7 @@ public float score(int node) throws IOException { } } - static final class DotProductScorer extends MemorySegmentByteVectorScorer { + static final class DotProductScorer extends Lucene99MemorySegmentByteVectorScorer { DotProductScorer( MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { super(input, values, query); @@ -103,7 +103,7 @@ public float score(int node) throws IOException { } } - static final class EuclideanScorer extends MemorySegmentByteVectorScorer { + static final class EuclideanScorer extends Lucene99MemorySegmentByteVectorScorer { EuclideanScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { super(input, values, query); } @@ -116,7 +116,7 @@ public float score(int node) throws IOException { } } - static final class MaxInnerProductScorer extends MemorySegmentByteVectorScorer { + static final class MaxInnerProductScorer extends Lucene99MemorySegmentByteVectorScorer { MaxInnerProductScorer( MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { super(input, values, query); diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java 
b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java similarity index 93% rename from lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java rename to lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index 363a17a5630..90b3bfb014c 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; /** A score supplier of vectors whose element size is byte. */ -public abstract sealed class MemorySegmentByteVectorScorerSupplier +public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier implements RandomVectorScorerSupplier { final int vectorByteSize; final int maxOrd; @@ -55,7 +55,7 @@ static Optional create( }; } - MemorySegmentByteVectorScorerSupplier( + Lucene99MemorySegmentByteVectorScorerSupplier( MemorySegmentAccessInput input, RandomAccessVectorValues values) { this.input = input; this.values = values; @@ -101,7 +101,7 @@ final MemorySegment getSecondSegment(int ord) throws IOException { return seg; } - static final class CosineSupplier extends MemorySegmentByteVectorScorerSupplier { + static final class CosineSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { CosineSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { super(input, values); @@ -126,7 +126,7 @@ public CosineSupplier copy() throws IOException { } } - static final class DotProductSupplier extends MemorySegmentByteVectorScorerSupplier { + static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { DotProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { 
super(input, values); @@ -153,7 +153,7 @@ public DotProductSupplier copy() throws IOException { } } - static final class EuclideanSupplier extends MemorySegmentByteVectorScorerSupplier { + static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { EuclideanSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { super(input, values); @@ -179,7 +179,7 @@ public EuclideanSupplier copy() throws IOException { } } - static final class MaxInnerProductSupplier extends MemorySegmentByteVectorScorerSupplier { + static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { MaxInnerProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { super(input, values); diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java index 7b181a0bb3f..70144ace300 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java @@ -38,7 +38,7 @@ public RandomVectorScorerSupplier getRandomVectorScorerSupplier( // currently only supports binary vectors if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { var scorer = - MemorySegmentByteVectorScorerSupplier.create( + Lucene99MemorySegmentByteVectorScorerSupplier.create( similarityType, vectorValues.getSlice(), vectorValues); if (scorer.isPresent()) { return scorer.get(); @@ -66,7 +66,7 @@ public RandomVectorScorer getRandomVectorScorer( checkDimensions(queryVector.length, vectorValues.dimension()); if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { var scorer = - MemorySegmentByteVectorScorer.create( + 
Lucene99MemorySegmentByteVectorScorer.create( similarityType, vectorValues.getSlice(), vectorValues, queryVector); if (scorer.isPresent()) { return scorer.get(); From 9edb42311b731e6ba5f7261e736a320ee7664a8a Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 12 May 2024 19:22:38 +0100 Subject: [PATCH 27/37] fix license header --- .../Lucene99MemorySegmentByteVectorScorer.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java index 41c106fe428..aae36204240 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.internal.vectorization; import java.io.IOException; From 244352e5e91b9d643eed6365da8cf62cdc25cbbe Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 17 May 2024 10:58:10 +0100 Subject: [PATCH 28/37] clean up and more tests --- .../lucene/store/MemorySegmentIndexInput.java | 25 ++-- .../vectorization/TestVectorScorer.java | 135 ++++++++++++++++-- 2 files changed, 131 insertions(+), 29 deletions(-) diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 5680cf7e35a..737db76561b 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -548,19 +548,6 @@ MemorySegmentIndexInput buildSlice(String sliceDescription, long offset, long le } } - public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { - if (pos + len > length) { - throw handlePositionalIOOBE(null, "segmentSliceOrNull", pos); - } - final int si = (int) (pos >> chunkSizePower); - final MemorySegment seg = segments[si]; - final long segOffset = pos & chunkSizeMask; - if (checkIndex(segOffset + len, seg.byteSize() + 1)) { - return seg.asSlice(segOffset, len); - } - return null; - } - static boolean checkIndex(long index, long length) { return index >= 0 && index < length; } @@ -752,7 +739,17 @@ public long readLong(long pos) throws IOException { } public MemorySegment segmentSliceOrNull(long pos, int len) throws IOException { - return super.segmentSliceOrNull(pos + offset, len); + if (pos + len > length) { + throw handlePositionalIOOBE(null, "segmentSliceOrNull", pos); + } + pos = pos + offset; + final int si = (int) (pos >> chunkSizePower); + final MemorySegment seg = segments[si]; + final long segOffset = pos & chunkSizeMask; + if (checkIndex(segOffset + len, seg.byteSize() + 1)) { + return seg.asSlice(segOffset, len); + } + 
return null; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index faea7e29ee3..02b2e4c2fb6 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -20,13 +20,14 @@ import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; import static org.apache.lucene.index.VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT; -import static org.hamcrest.Matchers.equalTo; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.Objects; +import java.util.Random; import java.util.function.Function; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; @@ -41,9 +42,10 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.junit.BeforeClass; -// @com.carrotsearch.randomizedtesting.annotations.Repeat(iterations = 10) public class TestVectorScorer extends LuceneTestCase { + private static final double DELTA = 1e-5; + static final FlatVectorsScorer DEFAULT_SCORER = new DefaultFlatVectorScorer(); static final FlatVectorsScorer MEMSEG_SCORER = VectorizationProvider.lookup(true).newFlatVectorScorer(); @@ -70,12 +72,11 @@ public void testSimpleScorerMedChunkSize() throws IOException { } void testSimpleScorer(long maxChunkSize) throws IOException { - try (Directory dir = new MMapDirectory(createTempDir(getTestName()), maxChunkSize)) { + try (Directory dir = new MMapDirectory(createTempDir("testSimpleScorer"), maxChunkSize)) { for (int dims : List.of(31, 32, 33)) { - // System.out.println("testing with dim=" 
+ dims); // dimensions that, in some scenarios, cross the mmap chunk sizes byte[][] vectors = new byte[2][dims]; - String fileName = getTestName() + "-" + dims; + String fileName = "bar-" + dims; try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { for (int i = 0; i < dims; i++) { vectors[0][i] = (byte) i; @@ -95,13 +96,13 @@ void testSimpleScorer(long maxChunkSize) throws IOException { var scorer1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); float expected = scorer1.scorer(idx0).score(idx1); var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); - assertThat(scorer2.scorer(idx0).score(idx1), equalTo(expected)); + assertEquals(scorer2.scorer(idx0).score(idx1), expected, DELTA); // getRandomVectorScorer var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); - assertThat(scorer3.score(idx1), equalTo(expected)); + assertEquals(scorer3.score(idx1), expected, DELTA); var scorer4 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); - assertThat(scorer4.score(idx1), equalTo(expected)); + assertEquals(scorer4.score(idx1), expected, DELTA); } } } @@ -128,12 +129,11 @@ public void testRandomSmallChunkSize() throws IOException { void testRandomScorer(long maxChunkSize, Function byteArraySupplier) throws IOException { - try (Directory dir = new MMapDirectory(createTempDir(getTestName()), maxChunkSize)) { + try (Directory dir = new MMapDirectory(createTempDir("testRandomScorer"), maxChunkSize)) { final int dims = randomIntBetween(1, 4096); final int size = randomIntBetween(2, 100); final byte[][] vectors = new byte[size][]; - String fileName = getTestName() + "-" + dims; - // System.out.println("Testing, maxChunkSize=" + maxChunkSize + ",fn=" + fileName); + String fileName = "foo-" + dims; try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { for (int i = 0; i < size; i++) { var vec = byteArraySupplier.apply(dims); @@ -153,20 +153,115 @@ void 
testRandomScorer(long maxChunkSize, Function byteArraySupp var scorer1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); float expected = scorer1.scorer(idx0).score(idx1); var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); - assertThat(scorer2.scorer(idx0).score(idx1), equalTo(expected)); + assertEquals(scorer2.scorer(idx0).score(idx1), expected, DELTA); // getRandomVectorScorer var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); - assertThat(scorer3.score(idx1), equalTo(expected)); + assertEquals(scorer3.score(idx1), expected, DELTA); var scorer4 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); - assertThat(scorer4.score(idx1), equalTo(expected)); + assertEquals(scorer4.score(idx1), expected, DELTA); } } } } } - // TODO: add initial offset tests + public void testRandomSliceSmall() throws IOException { + testRandomSliceImpl(30, 64, 1, BYTE_ARRAY_RANDOM_FUNC); + } + + public void testRandomSlice() throws IOException { + int dims = randomIntBetween(1, 4096); + long maxChunkSize = randomLongBetween(32, 128); + int size = randomIntBetween(1, 129); + testRandomSliceImpl(dims, maxChunkSize, size, BYTE_ARRAY_RANDOM_FUNC); + } + + void testRandomSliceImpl( + int dims, long maxChunkSize, int initialPadding, Function byteArraySupplier) + throws IOException { + try (Directory dir = new MMapDirectory(createTempDir("testRandomSliceImpl"), maxChunkSize)) { + final int size = randomIntBetween(2, 100); + final byte[][] vectors = new byte[size][]; + String fileName = "baz-" + dims; + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + byte[] ba = new byte[initialPadding]; + out.writeBytes(ba, 0, ba.length); + for (int i = 0; i < size; i++) { + var vec = byteArraySupplier.apply(dims); + out.writeBytes(vec, 0, vec.length); + vectors[i] = vec; + } + } + + try (var outter = dir.openInput(fileName, IOContext.DEFAULT); + var in = outter.slice("slice", 
initialPadding, outter.length() - initialPadding)) { + for (int times = 0; times < TIMES; times++) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { + var vectorValues = vectorValues(dims, size, in, sim); + int idx0 = randomIntBetween(0, size - 1); + int idx1 = randomIntBetween(0, size - 1); // may be the same as idx0 - which is ok. + + // getRandomVectorScorerSupplier + var scorer1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + float expected = scorer1.scorer(idx0).score(idx1); + var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + assertEquals(scorer2.scorer(idx0).score(idx1), expected, DELTA); + + // getRandomVectorScorer + var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); + assertEquals(scorer3.score(idx1), expected, DELTA); + var scorer4 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, vectors[idx0]); + assertEquals(scorer4.score(idx1), expected, DELTA); + } + } + } + } + } + + // Tests with a large amount of data (> 2GB), which ensures that data offsets do not overflow + @Nightly + public void testLarge() throws IOException { + try (Directory dir = new MMapDirectory(createTempDir("testLarge"))) { + final int dims = 8192; + final int size = 262500; + final String fileName = "large-" + dims; + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + for (int i = 0; i < size; i++) { + var vec = vector(i, dims); + out.writeBytes(vec, 0, vec.length); + } + } + + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + assert in.length() > Integer.MAX_VALUE; + for (int times = 0; times < TIMES; times++) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { + var vectorValues = vectorValues(dims, size, in, sim); + int ord1 = randomIntBetween(0, size - 1); + int ord2 = size - 1; + for (var ords : List.of(List.of(ord1, ord2), List.of(ord2, ord1))) { + int idx0 = ords.getFirst(); + 
int idx1 = ords.getLast(); + + // getRandomVectorScorerSupplier + var scorer1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + float expected = scorer1.scorer(idx0).score(idx1); + var scorer2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + assertEquals(scorer2.scorer(idx0).score(idx1), expected, DELTA); + + // getRandomVectorScorer + var query = vector(idx0, dims); + var scorer3 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, query); + assertEquals(scorer3.score(idx1), expected, DELTA); + var scorer4 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, query); + assertEquals(scorer4.score(idx1), expected, DELTA); + } + } + } + } + } + } RandomAccessVectorValues vectorValues( int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { @@ -174,6 +269,16 @@ RandomAccessVectorValues vectorValues( dims, size, in.slice("byteValues", 0, in.length()), dims, MEMSEG_SCORER, sim); } + // creates the vector based on the given ordinal, which is reproducible given the ord and dims + static byte[] vector(int ord, int dims) { + var random = new Random(Objects.hash(ord, dims)); + byte[] ba = new byte[dims]; + for (int i = 0; i < dims; i++) { + ba[i] = (byte) RandomNumbers.randomIntBetween(random, Byte.MIN_VALUE, Byte.MAX_VALUE); + } + return ba; + } + /** Concatenates byte arrays. */ static byte[] concat(byte[]... 
arrays) throws IOException { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { From 2a7096ebb5355f27a9f505d22b91248e619d9e01 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 09:59:52 +0100 Subject: [PATCH 29/37] test copies in threads do not interfere with each other --- .../vectorization/TestVectorScorer.java | 83 +++++++++++++++++-- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index 02b2e4c2fb6..3c9c5693ad4 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -27,8 +27,15 @@ import java.util.Arrays; import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.Random; +import java.util.concurrent.Callable; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.IntStream; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; @@ -40,6 +47,7 @@ import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.junit.BeforeClass; public class TestVectorScorer extends LuceneTestCase { @@ -173,19 +181,20 @@ public void testRandomSliceSmall() throws IOException { public void testRandomSlice() throws IOException { int dims = randomIntBetween(1, 4096); long maxChunkSize = randomLongBetween(32, 128); - int size = randomIntBetween(1, 129); - 
testRandomSliceImpl(dims, maxChunkSize, size, BYTE_ARRAY_RANDOM_FUNC); + int initialOffset = randomIntBetween(1, 129); + testRandomSliceImpl(dims, maxChunkSize, initialOffset, BYTE_ARRAY_RANDOM_FUNC); } + // Tests with a slice that has a non-zero initial offset void testRandomSliceImpl( - int dims, long maxChunkSize, int initialPadding, Function byteArraySupplier) + int dims, long maxChunkSize, int initialOffset, Function byteArraySupplier) throws IOException { try (Directory dir = new MMapDirectory(createTempDir("testRandomSliceImpl"), maxChunkSize)) { final int size = randomIntBetween(2, 100); final byte[][] vectors = new byte[size][]; String fileName = "baz-" + dims; try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { - byte[] ba = new byte[initialPadding]; + byte[] ba = new byte[initialOffset]; out.writeBytes(ba, 0, ba.length); for (int i = 0; i < size; i++) { var vec = byteArraySupplier.apply(dims); @@ -195,7 +204,7 @@ void testRandomSliceImpl( } try (var outter = dir.openInput(fileName, IOContext.DEFAULT); - var in = outter.slice("slice", initialPadding, outter.length() - initialPadding)) { + var in = outter.slice("slice", initialOffset, outter.length() - initialOffset)) { for (int times = 0; times < TIMES; times++) { for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { var vectorValues = vectorValues(dims, size, in, sim); @@ -219,6 +228,70 @@ void testRandomSliceImpl( } } + // Tests that copies in threads do not interfere with each other + public void testCopiesAcrossThreads() throws Exception { + final long maxChunkSize = 32; + final int dims = 34; // dimensions that are larger than the chunk size, to force fallback + byte[] vec1 = new byte[dims]; + byte[] vec2 = new byte[dims]; + IntStream.range(0, dims).forEach(i -> vec1[i] = 1); + IntStream.range(0, dims).forEach(i -> vec2[i] = 2); + try (Directory dir = new MMapDirectory(createTempDir("testRace"), maxChunkSize)) { + String fileName = "biz-" + dims; + try 
(IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + byte[] bytes = concat(vec1, vec1, vec2, vec2); + out.writeBytes(bytes, 0, bytes.length); + } + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { + var vectorValues = vectorValues(dims, 4, in, sim); + var scoreSupplier = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + var expectedScore1 = scoreSupplier.scorer(0).score(1); + var expectedScore2 = scoreSupplier.scorer(2).score(3); + + var scorer = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + var tasks = + List.>>of( + new AssertingScoreCallable(scorer.copy().scorer(0), 1, expectedScore1), + new AssertingScoreCallable(scorer.copy().scorer(2), 3, expectedScore2)); + var executor = Executors.newFixedThreadPool(2); + var results = executor.invokeAll(tasks); + executor.shutdown(); + assertTrue(executor.awaitTermination(30, TimeUnit.SECONDS)); + assertEquals(results.stream().filter(Predicate.not(Future::isDone)).count(), 0L); + for (var res : results) { + assertTrue("Unexpected exception" + res.get(), res.get().isEmpty()); + } + } + } + } + } + + // A callable that scores the given ord and scorer and asserts the expected result. 
+ static class AssertingScoreCallable implements Callable> { + final RandomVectorScorer scorer; + final int ord; + final float expectedScore; + + AssertingScoreCallable(RandomVectorScorer scorer, int ord, float expectedScore) { + this.scorer = scorer; + this.ord = ord; + this.expectedScore = expectedScore; + } + + @Override + public Optional call() throws Exception { + try { + for (int i = 0; i < 100; i++) { + assertEquals(scorer.score(ord), expectedScore, DELTA); + } + } catch (Throwable t) { + return Optional.of(t); + } + return Optional.empty(); + } + } + // Tests with a large amount of data (> 2GB), which ensures that data offsets do not overflow @Nightly public void testLarge() throws IOException { From e018da1d1d1e9df4ae32df4076ea8be2a71bbeb4 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 10:12:51 +0100 Subject: [PATCH 30/37] fix compilation --- .../apache/lucene/internal/vectorization/TestVectorScorer.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index 3c9c5693ad4..0e475ab59c4 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -46,6 +46,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.junit.BeforeClass; @@ -254,7 +255,7 @@ public void testCopiesAcrossThreads() throws Exception { List.>>of( new AssertingScoreCallable(scorer.copy().scorer(0), 1, expectedScore1), new AssertingScoreCallable(scorer.copy().scorer(2), 3, expectedScore2)); - var 
executor = Executors.newFixedThreadPool(2); + var executor = Executors.newFixedThreadPool(2, new NamedThreadFactory("copiesThreads")); var results = executor.invokeAll(tasks); executor.shutdown(); assertTrue(executor.awaitTermination(30, TimeUnit.SECONDS)); From a7439074bb59ade3dae1f00c5db6a9ad0984237a Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 11:36:34 +0100 Subject: [PATCH 31/37] static instance --- .../apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java | 3 +++ .../org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java | 2 +- .../codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java | 3 ++- .../internal/vectorization/DefaultVectorizationProvider.java | 4 ++-- .../lucene/internal/vectorization/VectorizationProvider.java | 2 +- .../internal/vectorization/PanamaVectorizationProvider.java | 4 ++-- .../org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java | 2 +- .../lucene/internal/vectorization/TestVectorScorer.java | 4 ++-- 8 files changed, 14 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index 7fd500e85a2..c112d2ccdbe 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -29,6 +29,9 @@ * @lucene.experimental */ public class DefaultFlatVectorScorer implements FlatVectorsScorer { + + public static final DefaultFlatVectorScorer INSTANCE = new DefaultFlatVectorScorer(); + @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java index 5a1100b1119..5082b3f492b 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java @@ -35,6 +35,6 @@ private FlatVectorScorerUtil() {} * on certain platforms. Otherwise, a DefaultFlatVectorScorer is returned. */ public static FlatVectorsScorer newFlatVectorScorer() { - return IMPL.newFlatVectorScorer(); + return IMPL.getFlatVectorScorer(); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java index e09d331ef0b..15e2e843df9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java @@ -102,7 +102,8 @@ public Lucene99ScalarQuantizedVectorsFormat( this.bits = (byte) bits; this.confidenceInterval = confidenceInterval; this.compress = compress; - this.flatVectorScorer = new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()); + this.flatVectorScorer = + new Lucene99ScalarQuantizedVectorScorer(DefaultFlatVectorScorer.INSTANCE); } public static float calculateDefaultConfidenceInterval(int vectorDimension) { diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java index f64ec931026..f894c514db3 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java @@ -35,7 +35,7 @@ public VectorUtilSupport getVectorUtilSupport() { } @Override - public FlatVectorsScorer newFlatVectorScorer() { - return new DefaultFlatVectorScorer(); + public FlatVectorsScorer getFlatVectorScorer() { + 
return DefaultFlatVectorScorer.INSTANCE; } } diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index 2cecfa2c6b7..9831ac1147a 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -92,7 +92,7 @@ public static VectorizationProvider getInstance() { */ public abstract VectorUtilSupport getVectorUtilSupport(); - public abstract FlatVectorsScorer newFlatVectorScorer(); + public abstract FlatVectorsScorer getFlatVectorScorer(); // *** Lookup mechanism: *** diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index 516a3ea9f69..655bcf9f69d 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -77,7 +77,7 @@ public VectorUtilSupport getVectorUtilSupport() { } @Override - public FlatVectorsScorer newFlatVectorScorer() { - return new MemorySegmentFlatVectorsScorer(new DefaultFlatVectorScorer()); + public FlatVectorsScorer getFlatVectorScorer() { + return new MemorySegmentFlatVectorsScorer(DefaultFlatVectorScorer.INSTANCE); } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index 5c8fa6cc2cf..71c5fd3694c 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -62,7 +62,7 @@ public TestFlatVectorScorer( public static Iterable 
parametersFactory() { var scorers = List.of( - new DefaultFlatVectorScorer(), + DefaultFlatVectorScorer.INSTANCE, new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()), FlatVectorScorerUtil.newFlatVectorScorer()); var dirs = diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index 0e475ab59c4..b58293bca9a 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -55,9 +55,9 @@ public class TestVectorScorer extends LuceneTestCase { private static final double DELTA = 1e-5; - static final FlatVectorsScorer DEFAULT_SCORER = new DefaultFlatVectorScorer(); + static final FlatVectorsScorer DEFAULT_SCORER = DefaultFlatVectorScorer.INSTANCE; static final FlatVectorsScorer MEMSEG_SCORER = - VectorizationProvider.lookup(true).newFlatVectorScorer(); + VectorizationProvider.lookup(true).getFlatVectorScorer(); @BeforeClass public static void beforeClass() throws Exception { From c8c70ee0eba4410c83b25802fddb0fa05df0f576 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 11:45:41 +0100 Subject: [PATCH 32/37] new -> get --- .../apache/lucene/benchmark/jmh/VectorScorerBenchmark.java | 2 +- .../org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java | 6 +++--- .../lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java | 2 +- .../lucene99/Lucene99ScalarQuantizedVectorsFormat.java | 2 +- .../org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 7f6a09eb5af..50a8709e30d 100644 --- 
a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -74,7 +74,7 @@ public void init() throws IOException { in = dir.openInput("vector.data", IOContext.DEFAULT); vectorValues = vectorValues(size, 2, in, DOT_PRODUCT); scorer = - FlatVectorScorerUtil.newFlatVectorScorer() + FlatVectorScorerUtil.getFlatVectorScorer() .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues) .scorer(0); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java index 5082b3f492b..7e320ff17af 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java @@ -31,10 +31,10 @@ public final class FlatVectorScorerUtil { private FlatVectorScorerUtil() {} /** - * Creates a new flat vector scorer. Scorers created through this factory method may be optimized - * on certain platforms. Otherwise, a DefaultFlatVectorScorer is returned. + * Gets a flat vector scorer. Scorers retrieved through this method may be optimized on certain + * platforms. Otherwise, a DefaultFlatVectorScorer is returned. 
*/ - public static FlatVectorsScorer newFlatVectorScorer() { + public static FlatVectorsScorer getFlatVectorScorer() { return IMPL.getFlatVectorScorer(); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 43676f9fc66..0dd0cacd18d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -139,7 +139,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { /** The format for storing, reading, merging vectors on disk */ private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.newFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getFlatVectorScorer()); private final int numMergeWorkers; private final TaskExecutor mergeExec; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java index 15e2e843df9..eef1a955c68 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java @@ -49,7 +49,7 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat { static final String VECTOR_DATA_EXTENSION = "veq"; private static final FlatVectorsFormat rawVectorFormat = - new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.newFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getFlatVectorScorer()); /** The minimum confidence interval */ private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f; diff --git 
a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index 71c5fd3694c..df0949997a5 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -64,7 +64,7 @@ public static Iterable parametersFactory() { List.of( DefaultFlatVectorScorer.INSTANCE, new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()), - FlatVectorScorerUtil.newFlatVectorScorer()); + FlatVectorScorerUtil.getFlatVectorScorer()); var dirs = List.>of( TestFlatVectorScorer::newDirectory, @@ -80,7 +80,7 @@ public static Iterable parametersFactory() { } public void testDefaultOrMemSegScorer() { - var scorer = FlatVectorScorerUtil.newFlatVectorScorer(); + var scorer = FlatVectorScorerUtil.getFlatVectorScorer(); assertThat( scorer.toString(), is(oneOf("DefaultFlatVectorScorer()", "MemorySegmentFlatVectorsScorer()"))); From b5a3f45cb960db3e5757947111b7bfce8634336d Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 11:51:47 +0100 Subject: [PATCH 33/37] one more INSTANCE --- .../vectorization/MemorySegmentFlatVectorsScorer.java | 4 ++++ .../internal/vectorization/PanamaVectorizationProvider.java | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java index 70144ace300..1c7deb9c761 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java @@ -17,6 +17,7 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; +import 
org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.hnsw.RandomAccessVectorValues; @@ -25,6 +26,9 @@ public class MemorySegmentFlatVectorsScorer implements FlatVectorsScorer { + public static final MemorySegmentFlatVectorsScorer INSTANCE = + new MemorySegmentFlatVectorsScorer(DefaultFlatVectorScorer.INSTANCE); + private final FlatVectorsScorer delegate; public MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) { diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index 655bcf9f69d..34b84010d2a 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -21,7 +21,6 @@ import java.util.Locale; import java.util.logging.Logger; import jdk.incubator.vector.FloatVector; -import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.util.Constants; import org.apache.lucene.util.SuppressForbidden; @@ -78,6 +77,6 @@ public VectorUtilSupport getVectorUtilSupport() { @Override public FlatVectorsScorer getFlatVectorScorer() { - return new MemorySegmentFlatVectorsScorer(DefaultFlatVectorScorer.INSTANCE); + return MemorySegmentFlatVectorsScorer.INSTANCE; } } From ad271f33b1a016b20e68b4a22aa17e71283503ed Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 12:23:16 +0100 Subject: [PATCH 34/37] make private --- .../internal/vectorization/MemorySegmentFlatVectorsScorer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java index 1c7deb9c761..3f29c57d7ea 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java @@ -31,7 +31,7 @@ public class MemorySegmentFlatVectorsScorer implements FlatVectorsScorer { private final FlatVectorsScorer delegate; - public MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) { + private MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) { this.delegate = delegate; } From c42c9a13588b359af24b9856d90651c37f1ab4e6 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 15:55:16 +0100 Subject: [PATCH 35/37] add lucene99 --- .../lucene/benchmark/jmh/VectorScorerBenchmark.java | 2 +- .../apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java | 8 ++++---- .../lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java | 2 +- .../lucene99/Lucene99ScalarQuantizedVectorsFormat.java | 2 +- .../vectorization/DefaultVectorizationProvider.java | 2 +- .../internal/vectorization/VectorizationProvider.java | 3 ++- ...r.java => Lucene99MemorySegmentFlatVectorsScorer.java} | 8 ++++---- .../vectorization/PanamaVectorizationProvider.java | 4 ++-- .../apache/lucene/codecs/hnsw/TestFlatVectorScorer.java | 4 ++-- .../lucene/internal/vectorization/TestVectorScorer.java | 2 +- 10 files changed, 19 insertions(+), 18 deletions(-) rename lucene/core/src/java21/org/apache/lucene/internal/vectorization/{MemorySegmentFlatVectorsScorer.java => Lucene99MemorySegmentFlatVectorsScorer.java} (91%) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index 50a8709e30d..c4d3040f283 
100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -74,7 +74,7 @@ public void init() throws IOException { in = dir.openInput("vector.data", IOContext.DEFAULT); vectorValues = vectorValues(size, 2, in, DOT_PRODUCT); scorer = - FlatVectorScorerUtil.getFlatVectorScorer() + FlatVectorScorerUtil.getLucene99FlatVectorsScorer() .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues) .scorer(0); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java index 7e320ff17af..808d7b3cc88 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java @@ -31,10 +31,10 @@ public final class FlatVectorScorerUtil { private FlatVectorScorerUtil() {} /** - * Gets a flat vector scorer. Scorers retrieved through this method may be optimized on certain - * platforms. Otherwise, a DefaultFlatVectorScorer is returned. + * Returns a FlatVectorsScorer that supports the Lucene99 format. Scorers retrieved through this + * method may be optimized on certain platforms. Otherwise, a DefaultFlatVectorScorer is returned. 
*/ - public static FlatVectorsScorer getFlatVectorScorer() { - return IMPL.getFlatVectorScorer(); + public static FlatVectorsScorer getLucene99FlatVectorsScorer() { + return IMPL.getLucene99FlatVectorsScorer(); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 0dd0cacd18d..3238fd1f4ae 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -139,7 +139,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { /** The format for storing, reading, merging vectors on disk */ private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); private final int numMergeWorkers; private final TaskExecutor mergeExec; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java index eef1a955c68..26fa791468d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java @@ -49,7 +49,7 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat { static final String VECTOR_DATA_EXTENSION = "veq"; private static final FlatVectorsFormat rawVectorFormat = - new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); /** The minimum confidence interval */ private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f; 
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java index f894c514db3..c5193aa23de 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java @@ -35,7 +35,7 @@ public VectorUtilSupport getVectorUtilSupport() { } @Override - public FlatVectorsScorer getFlatVectorScorer() { + public FlatVectorsScorer getLucene99FlatVectorsScorer() { return DefaultFlatVectorScorer.INSTANCE; } } diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index 9831ac1147a..a236c303eb4 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -92,7 +92,8 @@ public static VectorizationProvider getInstance() { */ public abstract VectorUtilSupport getVectorUtilSupport(); - public abstract FlatVectorsScorer getFlatVectorScorer(); + /** Returns a FlatVectorsScorer that supports the Lucene99 format. 
*/ + public abstract FlatVectorsScorer getLucene99FlatVectorsScorer(); // *** Lookup mechanism: *** diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java similarity index 91% rename from lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java rename to lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java index 3f29c57d7ea..c0c491a2b62 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java @@ -24,14 +24,14 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -public class MemorySegmentFlatVectorsScorer implements FlatVectorsScorer { +public class Lucene99MemorySegmentFlatVectorsScorer implements FlatVectorsScorer { - public static final MemorySegmentFlatVectorsScorer INSTANCE = - new MemorySegmentFlatVectorsScorer(DefaultFlatVectorScorer.INSTANCE); + public static final Lucene99MemorySegmentFlatVectorsScorer INSTANCE = + new Lucene99MemorySegmentFlatVectorsScorer(DefaultFlatVectorScorer.INSTANCE); private final FlatVectorsScorer delegate; - private MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) { + private Lucene99MemorySegmentFlatVectorsScorer(FlatVectorsScorer delegate) { this.delegate = delegate; } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index 34b84010d2a..87f7cf2baf7 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ 
b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -76,7 +76,7 @@ public VectorUtilSupport getVectorUtilSupport() { } @Override - public FlatVectorsScorer getFlatVectorScorer() { - return MemorySegmentFlatVectorsScorer.INSTANCE; + public FlatVectorsScorer getLucene99FlatVectorsScorer() { + return Lucene99MemorySegmentFlatVectorsScorer.INSTANCE; } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index df0949997a5..74687cae8cd 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -64,7 +64,7 @@ public static Iterable parametersFactory() { List.of( DefaultFlatVectorScorer.INSTANCE, new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()), - FlatVectorScorerUtil.getFlatVectorScorer()); + FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); var dirs = List.>of( TestFlatVectorScorer::newDirectory, @@ -80,7 +80,7 @@ public static Iterable parametersFactory() { } public void testDefaultOrMemSegScorer() { - var scorer = FlatVectorScorerUtil.getFlatVectorScorer(); + var scorer = FlatVectorScorerUtil.getLucene99FlatVectorsScorer(); assertThat( scorer.toString(), is(oneOf("DefaultFlatVectorScorer()", "MemorySegmentFlatVectorsScorer()"))); diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index b58293bca9a..ce2ad6854a2 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -57,7 +57,7 @@ public class TestVectorScorer extends LuceneTestCase { static final FlatVectorsScorer DEFAULT_SCORER = 
DefaultFlatVectorScorer.INSTANCE; static final FlatVectorsScorer MEMSEG_SCORER = - VectorizationProvider.lookup(true).getFlatVectorScorer(); + VectorizationProvider.lookup(true).getLucene99FlatVectorsScorer(); @BeforeClass public static void beforeClass() throws Exception { From e6cac8b5a51dafb9fe4a84628b68e89c5dd91777 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 16:02:26 +0100 Subject: [PATCH 36/37] fix toString --- .../vectorization/Lucene99MemorySegmentFlatVectorsScorer.java | 2 +- .../org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java | 2 +- .../codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java | 2 +- .../lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java index c0c491a2b62..78dd70d4d83 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java @@ -88,6 +88,6 @@ static void checkDimensions(int queryLen, int fieldLen) { @Override public String toString() { - return "MemorySegmentFlatVectorsScorer()"; + return "Lucene99MemorySegmentFlatVectorsScorer()"; } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index 74687cae8cd..9bce1f10a43 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -83,7 +83,7 @@ public void testDefaultOrMemSegScorer() { var scorer = FlatVectorScorerUtil.getLucene99FlatVectorsScorer(); assertThat( 
scorer.toString(), - is(oneOf("DefaultFlatVectorScorer()", "MemorySegmentFlatVectorsScorer()"))); + is(oneOf("DefaultFlatVectorScorer()", "Lucene99MemorySegmentFlatVectorsScorer()"))); } // Tests that the creation of another scorer does not disturb previous scorers diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index 8245b89edc0..986de011969 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -226,7 +226,7 @@ public KnnVectorsFormat knnVectorsFormat() { String expectedPattern = "Lucene99HnswScalarQuantizedVectorsFormat(name=Lucene99HnswScalarQuantizedVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s())))"; var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer"); - var memSegScorer = format(Locale.ROOT, expectedPattern, "MemorySegmentFlatVectorsScorer"); + var memSegScorer = format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java index 54f9bda2af8..4579fb35f0e 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java +++ 
b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java @@ -45,7 +45,7 @@ public KnnVectorsFormat knnVectorsFormat() { String expectedPattern = "Lucene99HnswVectorsFormat(name=Lucene99HnswVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s()))"; var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer"); - var memSegScorer = format(Locale.ROOT, expectedPattern, "MemorySegmentFlatVectorsScorer"); + var memSegScorer = format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); } From 80229fbece8da6019289d29eac0c0487b3641e20 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 21 May 2024 16:59:23 +0100 Subject: [PATCH 37/37] tidy --- .../lucene99/TestLucene99HnswQuantizedVectorsFormat.java | 3 ++- .../lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index 986de011969..d7e232485eb 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -226,7 +226,8 @@ public KnnVectorsFormat knnVectorsFormat() { String expectedPattern = "Lucene99HnswScalarQuantizedVectorsFormat(name=Lucene99HnswScalarQuantizedVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), 
rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s())))"; var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer"); - var memSegScorer = format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); + var memSegScorer = + format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java index 4579fb35f0e..aea32b0a13d 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswVectorsFormat.java @@ -45,7 +45,8 @@ public KnnVectorsFormat knnVectorsFormat() { String expectedPattern = "Lucene99HnswVectorsFormat(name=Lucene99HnswVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s()))"; var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer"); - var memSegScorer = format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); + var memSegScorer = + format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); }