From 6839182c28860244570056130bc17f98a4510fc6 Mon Sep 17 00:00:00 2001
From: Kaival Parikh <kaivalp2000@gmail.com>
Date: Thu, 16 Oct 2025 17:13:58 +0000
Subject: [PATCH 1/6] Align float vectors to 64 bytes

---
 .../benchmark/jmh/VectorScorerBenchmark.java  | 121 +++++++++++++++---
 .../lucene99/Lucene99FlatVectorsWriter.java   |  33 +++--
 2 files changed, 123 insertions(+), 31 deletions(-)

diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
index 66e72bf11c3b..035ea98e4fde 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
@@ -19,12 +19,16 @@
 import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.file.Files;
+import java.util.Random;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
 import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
 import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues;
+import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.Directory;
@@ -62,57 +66,140 @@
     value = 3,
     jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
 public class VectorScorerBenchmark {
+  private static final float EPSILON = 1e-4f;
 
   @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
   public int size;
 
+  @Param({"0", "1", "2", "4", "6", "8", "16", "20", "32", "50", "64", "100", "128", "255", "256"})
+  public int padBytes;
+
   Directory dir;
-  IndexInput in;
-  KnnVectorValues vectorValues;
+  IndexInput bytesIn;
+  IndexInput floatsIn;
+  KnnVectorValues byteVectorValues;
+  KnnVectorValues floatVectorValues;
   byte[] vec1, vec2;
-  UpdateableRandomVectorScorer scorer;
+  float[] floatsA, floatsB;
+  float expectedBytes, expectedFloats;
+  UpdateableRandomVectorScorer byteScorer;
+  UpdateableRandomVectorScorer floatScorer;
 
   @Setup(Level.Iteration)
   public void init() throws IOException {
+    Random random = ThreadLocalRandom.current();
+
     vec1 = new byte[size];
     vec2 = new byte[size];
-    ThreadLocalRandom.current().nextBytes(vec1);
-    ThreadLocalRandom.current().nextBytes(vec2);
+    random.nextBytes(vec1);
+    random.nextBytes(vec2);
+    expectedBytes = DOT_PRODUCT.compare(vec1, vec2);
+
+    // random float arrays for float methods
+    floatsA = new float[size];
+    floatsB = new float[size];
+    for (int i = 0; i < size; ++i) {
+      floatsA[i] = random.nextFloat();
+      floatsB[i] = random.nextFloat();
+    }
+    expectedFloats = DOT_PRODUCT.compare(floatsA, floatsB);
 
     dir = new MMapDirectory(Files.createTempDirectory("VectorScorerBenchmark"));
-    try (IndexOutput out = dir.createOutput("vector.data", IOContext.DEFAULT)) {
+    try (IndexOutput out = dir.createOutput("byteVector.data", IOContext.DEFAULT)) {
+      out.writeBytes(new byte[padBytes], 0, padBytes);
+
       out.writeBytes(vec1, 0, vec1.length);
       out.writeBytes(vec2, 0, vec2.length);
     }
-    in = dir.openInput("vector.data", IOContext.DEFAULT);
-    vectorValues = vectorValues(size, 2, in, DOT_PRODUCT);
-    scorer =
+    try (IndexOutput out = dir.createOutput("floatVector.data", IOContext.DEFAULT)) {
+      out.writeBytes(new byte[padBytes], 0, padBytes);
+
+      byte[] buffer = new byte[size * Float.BYTES];
+      ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floatsA);
+      out.writeBytes(buffer, 0, buffer.length);
+      ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floatsB);
+      out.writeBytes(buffer, 0, buffer.length);
+    }
+
+    bytesIn = dir.openInput("byteVector.data", IOContext.DEFAULT);
+    byteVectorValues = byteVectorValues(DOT_PRODUCT);
+    byteScorer =
+        FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
+            .getRandomVectorScorerSupplier(DOT_PRODUCT, byteVectorValues)
+            .scorer();
+    byteScorer.setScoringOrdinal(0);
+
+    floatsIn = dir.openInput("floatVector.data", IOContext.DEFAULT);
+    floatVectorValues = floatVectorValues(DOT_PRODUCT);
+    floatScorer =
         FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
-            .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues)
+            .getRandomVectorScorerSupplier(DOT_PRODUCT, floatVectorValues)
             .scorer();
-    scorer.setScoringOrdinal(0);
+    floatScorer.setScoringOrdinal(0);
   }
 
   @TearDown
   public void teardown() throws IOException {
-    IOUtils.close(dir, in);
+    IOUtils.close(dir, bytesIn, floatsIn); // release both inputs opened in init()
   }
 
   @Benchmark
   public float binaryDotProductDefault() throws IOException {
-    return scorer.score(1);
+    float result = byteScorer.score(1);
+    if (Math.abs(result - expectedBytes) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedBytes + " but got " + result);
+    }
+    return result;
   }
 
   @Benchmark
   @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
   public float binaryDotProductMemSeg() throws IOException {
-    return scorer.score(1);
+    float result = byteScorer.score(1);
+    if (Math.abs(result - expectedBytes) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedBytes + " but got " + result);
+    }
+    return result;
+  }
+
+  @Benchmark
+  public float floatDotProductDefault() throws IOException {
+    float result = floatScorer.score(1);
+    if (Math.abs(result - expectedFloats) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedFloats + " but got " + result);
+    }
+    return result;
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public float floatDotProductMemSeg() throws IOException {
+    float result = floatScorer.score(1);
+    if (Math.abs(result - expectedFloats) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedFloats + " but got " + result);
+    }
+    return result;
   }
 
-  static KnnVectorValues vectorValues(
-      int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException {
+  KnnVectorValues byteVectorValues(VectorSimilarityFunction sim) throws IOException {
     return new OffHeapByteVectorValues.DenseOffHeapVectorValues(
-        dims, size, in.slice("test", 0, in.length()), dims, new ThrowingFlatVectorScorer(), sim);
+        size,
+        2,
+        bytesIn.slice("test", padBytes, size * 2L),
+        size,
+        new ThrowingFlatVectorScorer(),
+        sim);
+  }
+
+  KnnVectorValues floatVectorValues(VectorSimilarityFunction sim) throws IOException {
+    int byteSize = size * Float.BYTES;
+    return new OffHeapFloatVectorValues.DenseOffHeapVectorValues(
+        size,
+        2,
+        floatsIn.slice("test", padBytes, byteSize * 2L),
+        byteSize,
+        new ThrowingFlatVectorScorer(),
+        sim);
   }
 
   static final class ThrowingFlatVectorScorer implements FlatVectorsScorer {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 1432f5ea46b8..3289909a09ed 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -153,9 +153,18 @@ public long ramBytesUsed() {
     return total;
   }
 
+  /** Align vectors for optimal vectorized performance. */
+  private static long alignOutput(IndexOutput output, VectorEncoding encoding) throws IOException {
+    return output.alignFilePointer(
+        switch (encoding) {
+          case BYTE -> Float.BYTES;
+          case FLOAT32 -> 64;
+        });
+  }
+
   private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException {
     // write vector values
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    long vectorDataOffset = alignOutput(vectorData, fieldData.fieldInfo.getVectorEncoding());
     switch (fieldData.fieldInfo.getVectorEncoding()) {
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
@@ -190,19 +199,18 @@ private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocM
     mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);
 
     // write vector values
-    long vectorDataOffset =
-        switch (fieldData.fieldInfo.getVectorEncoding()) {
-          case BYTE -> writeSortedByteVectors(fieldData, ordMap);
-          case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
-        };
+    long vectorDataOffset = alignOutput(vectorData, fieldData.fieldInfo.getVectorEncoding());
+    switch (fieldData.fieldInfo.getVectorEncoding()) {
+      case BYTE -> writeSortedByteVectors(fieldData, ordMap);
+      case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
+    }
     long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
 
     writeMeta(fieldData.fieldInfo, maxDoc, vectorDataOffset, vectorDataLength, newDocsWithField);
   }
 
-  private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
+  private void writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
       throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
     final ByteBuffer buffer =
         ByteBuffer.allocate(fieldData.dim * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
     for (int ordinal : ordMap) {
@@ -210,23 +218,20 @@ private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
       buffer.asFloatBuffer().put(vector);
       vectorData.writeBytes(buffer.array(), buffer.array().length);
     }
-    return vectorDataOffset;
   }
 
-  private long writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+  private void writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) throws IOException {
     for (int ordinal : ordMap) {
       byte[] vector = (byte[]) fieldData.vectors.get(ordinal);
       vectorData.writeBytes(vector, vector.length);
     }
-    return vectorDataOffset;
   }
 
   @Override
   public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
     // Since we know we will not be searching for additional indexing, we can just write the
     // the vectors directly to the new segment.
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
     // No need to use temporary file as we don't have to re-open for reading
     DocsWithFieldSet docsWithField =
         switch (fieldInfo.getVectorEncoding()) {
@@ -252,7 +257,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   @Override
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);

From 5764ac8174ab3f54e40af5ce72d7a843f71e3c42 Mon Sep 17 00:00:00 2001
From: Kaival Parikh <kaivalp2000@gmail.com>
Date: Mon, 20 Oct 2025 19:17:53 +0000
Subject: [PATCH 2/6] Also align temp file used during merge

---
 .../lucene99/Lucene99FlatVectorsWriter.java     | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 3289909a09ed..b511028bdb81 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -257,13 +257,13 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   @Override
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
-    long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);
     IndexInput vectorDataInput = null;
     try {
       // write the vector data to a temporary file
+      long tempVectorDataOffset = alignOutput(tempVectorData, fieldInfo.getVectorEncoding());
       DocsWithFieldSet docsWithField =
           switch (fieldInfo.getVectorEncoding()) {
             case BYTE ->
@@ -277,6 +277,8 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
                     KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(
                         fieldInfo, mergeState));
           };
+      long vectorDataLength = tempVectorData.getFilePointer() - tempVectorDataOffset;
+
       CodecUtil.writeFooter(tempVectorData);
       IOUtils.close(tempVectorData);
 
@@ -288,10 +290,13 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
               tempVectorData.getName(),
               IOContext.DEFAULT.withHints(
                   FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM));
+      vectorDataInput.seek(tempVectorDataOffset);
+
       // copy the temporary file vectors to the actual data file
-      vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
+      long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
+      vectorData.copyBytes(vectorDataInput, vectorDataLength);
+
       CodecUtil.retrieveChecksum(vectorDataInput);
-      long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
       writeMeta(
           fieldInfo,
           segmentWriteState.segmentInfo.maxDoc(),
@@ -310,7 +315,8 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
                     new OffHeapByteVectorValues.DenseOffHeapVectorValues(
                         fieldInfo.getVectorDimension(),
                         docsWithField.cardinality(),
-                        finalVectorDataInput,
+                        finalVectorDataInput.slice(
+                            "temp-vector-data", tempVectorDataOffset, vectorDataLength),
                         fieldInfo.getVectorDimension() * Byte.BYTES,
                         vectorsScorer,
                         fieldInfo.getVectorSimilarityFunction()));
@@ -320,7 +326,8 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
                     new OffHeapFloatVectorValues.DenseOffHeapVectorValues(
                         fieldInfo.getVectorDimension(),
                         docsWithField.cardinality(),
-                        finalVectorDataInput,
+                        finalVectorDataInput.slice(
+                            "temp-vector-data", tempVectorDataOffset, vectorDataLength),
                         fieldInfo.getVectorDimension() * Float.BYTES,
                         vectorsScorer,
                         fieldInfo.getVectorSimilarityFunction()));

From f9402e9c7b279e99649a89c0286ee7b7e51b3b8c Mon Sep 17 00:00:00 2001
From: Kaival Parikh <kaivalp2000@gmail.com>
Date: Tue, 21 Oct 2025 17:03:19 +0000
Subject: [PATCH 3/6] Revert "Also align temp file used during merge"

This reverts commit 5764ac8174ab3f54e40af5ce72d7a843f71e3c42.
---
 .../lucene99/Lucene99FlatVectorsWriter.java     | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index b511028bdb81..3289909a09ed 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -257,13 +257,13 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   @Override
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+    long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);
     IndexInput vectorDataInput = null;
     try {
       // write the vector data to a temporary file
-      long tempVectorDataOffset = alignOutput(tempVectorData, fieldInfo.getVectorEncoding());
       DocsWithFieldSet docsWithField =
           switch (fieldInfo.getVectorEncoding()) {
             case BYTE ->
@@ -277,8 +277,6 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
                     KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(
                         fieldInfo, mergeState));
           };
-      long vectorDataLength = tempVectorData.getFilePointer() - tempVectorDataOffset;
-
       CodecUtil.writeFooter(tempVectorData);
       IOUtils.close(tempVectorData);
 
@@ -290,13 +288,10 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
               tempVectorData.getName(),
               IOContext.DEFAULT.withHints(
                   FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM));
-      vectorDataInput.seek(tempVectorDataOffset);
-
       // copy the temporary file vectors to the actual data file
-      long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
-      vectorData.copyBytes(vectorDataInput, vectorDataLength);
-
+      vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
       CodecUtil.retrieveChecksum(vectorDataInput);
+      long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
       writeMeta(
           fieldInfo,
           segmentWriteState.segmentInfo.maxDoc(),
@@ -315,8 +310,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
                     new OffHeapByteVectorValues.DenseOffHeapVectorValues(
                         fieldInfo.getVectorDimension(),
                         docsWithField.cardinality(),
-                        finalVectorDataInput.slice(
-                            "temp-vector-data", tempVectorDataOffset, vectorDataLength),
+                        finalVectorDataInput,
                         fieldInfo.getVectorDimension() * Byte.BYTES,
                         vectorsScorer,
                         fieldInfo.getVectorSimilarityFunction()));
@@ -326,8 +320,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
                     new OffHeapFloatVectorValues.DenseOffHeapVectorValues(
                         fieldInfo.getVectorDimension(),
                         docsWithField.cardinality(),
-                        finalVectorDataInput.slice(
-                            "temp-vector-data", tempVectorDataOffset, vectorDataLength),
+                        finalVectorDataInput,
                         fieldInfo.getVectorDimension() * Float.BYTES,
                         vectorsScorer,
                         fieldInfo.getVectorSimilarityFunction()));

From 0f46f3c4a0d1dbe6bc7559de000d87c872605a50 Mon Sep 17 00:00:00 2001
From: Kaival Parikh <kaivalp2000@gmail.com>
Date: Wed, 12 Nov 2025 04:54:05 +0000
Subject: [PATCH 4/6] Refactor + add comment + CHANGES.txt entry

---
 lucene/CHANGES.txt                            |  3 +++
 .../lucene99/Lucene99FlatVectorsWriter.java   | 25 +++++++++++--------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0742b8d96fa6..936af863cd11 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -243,6 +243,9 @@ Other
   Applications using SecurityManager now need to grant SerializablePermission("serialFilter")
   to the analysis-smartcn module. (Uwe Schindler, Isaac David)
 
+* GITHUB#15341: Align float vectors on disk to 64 bytes, for optimal performance on Arm Neoverse
+  machines. (Mike McCandless, Kaival Parikh)
+
 Build
 ---------------------
 * Upgrade forbiddenapis to version 3.10.  (Uwe Schindler)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 3289909a09ed..3416a131735d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -153,19 +153,19 @@ public long ramBytesUsed() {
     return total;
   }
 
-  /** Align vectors for optimal vectorized performance. */
   private static long alignOutput(IndexOutput output, VectorEncoding encoding) throws IOException {
     return output.alignFilePointer(
         switch (encoding) {
           case BYTE -> Float.BYTES;
-          case FLOAT32 -> 64;
+          case FLOAT32 -> 64; // optimal alignment for Arm Neoverse machines.
         });
   }
 
   private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException {
     // write vector values
-    long vectorDataOffset = alignOutput(vectorData, fieldData.fieldInfo.getVectorEncoding());
-    switch (fieldData.fieldInfo.getVectorEncoding()) {
+    VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
+    switch (encoding) {
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
     }
@@ -199,8 +199,9 @@ private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocM
     mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);
 
     // write vector values
-    long vectorDataOffset = alignOutput(vectorData, fieldData.fieldInfo.getVectorEncoding());
-    switch (fieldData.fieldInfo.getVectorEncoding()) {
+    VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
+    switch (encoding) {
       case BYTE -> writeSortedByteVectors(fieldData, ordMap);
       case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
     }
@@ -231,10 +232,11 @@ private void writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) thro
   public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
     // Since we know we will not be searching for additional indexing, we can just write the
     // the vectors directly to the new segment.
-    long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
+    VectorEncoding encoding = fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     // No need to use temporary file as we don't have to re-open for reading
     DocsWithFieldSet docsWithField =
-        switch (fieldInfo.getVectorEncoding()) {
+        switch (encoding) {
           case BYTE ->
               writeByteVectorData(
                   vectorData,
@@ -257,7 +259,8 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   @Override
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
-    long vectorDataOffset = alignOutput(vectorData, fieldInfo.getVectorEncoding());
+    VectorEncoding encoding = fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);
@@ -265,7 +268,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
     try {
       // write the vector data to a temporary file
       DocsWithFieldSet docsWithField =
-          switch (fieldInfo.getVectorEncoding()) {
+          switch (encoding) {
             case BYTE ->
                 writeByteVectorData(
                     tempVectorData,
@@ -303,7 +306,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       vectorDataInput = null;
 
       final RandomVectorScorerSupplier randomVectorScorerSupplier =
-          switch (fieldInfo.getVectorEncoding()) {
+          switch (encoding) {
             case BYTE ->
                 vectorsScorer.getRandomVectorScorerSupplier(
                     fieldInfo.getVectorSimilarityFunction(),

From 8b70ce896474b0a38be176c59727d7481e45062b Mon Sep 17 00:00:00 2001
From: Kaival Parikh <kaivalp2000@gmail.com>
Date: Mon, 17 Nov 2025 21:28:06 +0000
Subject: [PATCH 5/6] Only apply the optimal byte alignment if it will hold for
 all vectors

i.e. only applied when dimension is a multiple of 16

Also add Javadoc comment about the alignment
---
 .../lucene99/Lucene99FlatVectorsFormat.java   |  4 ++++
 .../lucene99/Lucene99FlatVectorsWriter.java   | 24 ++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
index c8ef2709db66..2bee5dfe898a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
@@ -63,6 +63,10 @@
  *       that only in sparse case
  * </ul>
  *
+ * <p>NOTE: This format aligns float vectors of specific dimensions (multiples of 16) to 64 bytes in
+ * the index, for optimal performance on Arm Neoverse machines. There may be a small performance
+ * penalty in using float vectors of other dimensions on these machines.
+ *
  * @lucene.experimental
  */
 public final class Lucene99FlatVectorsFormat extends FlatVectorsFormat {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 3416a131735d..d611ed30b20a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -153,18 +153,27 @@ public long ramBytesUsed() {
     return total;
   }
 
-  private static long alignOutput(IndexOutput output, VectorEncoding encoding) throws IOException {
+  private static long alignOutput(IndexOutput output, VectorEncoding encoding, int dimension)
+      throws IOException {
     return output.alignFilePointer(
         switch (encoding) {
           case BYTE -> Float.BYTES;
-          case FLOAT32 -> 64; // optimal alignment for Arm Neoverse machines.
+          case FLOAT32 -> {
+            if (dimension % 16 == 0) {
+              yield 64; // optimal alignment for Arm Neoverse machines.
+            }
+            // vector dimension is such that 64 byte alignment will not hold for all subsequent
+            // vectors, use next best alignment that will hold.
+            yield Float.BYTES;
+          }
         });
   }
 
   private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException {
     // write vector values
     VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
-    long vectorDataOffset = alignOutput(vectorData, encoding);
+    int dimension = fieldData.fieldInfo.getVectorDimension();
+    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
     switch (encoding) {
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
@@ -200,7 +209,8 @@ private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocM
 
     // write vector values
     VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
-    long vectorDataOffset = alignOutput(vectorData, encoding);
+    int dimension = fieldData.fieldInfo.getVectorDimension();
+    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
     switch (encoding) {
       case BYTE -> writeSortedByteVectors(fieldData, ordMap);
       case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
@@ -233,7 +243,8 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
     // Since we know we will not be searching for additional indexing, we can just write the
     // the vectors directly to the new segment.
     VectorEncoding encoding = fieldInfo.getVectorEncoding();
-    long vectorDataOffset = alignOutput(vectorData, encoding);
+    int dimension = fieldInfo.getVectorDimension();
+    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
     // No need to use temporary file as we don't have to re-open for reading
     DocsWithFieldSet docsWithField =
         switch (encoding) {
@@ -260,7 +271,8 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
     VectorEncoding encoding = fieldInfo.getVectorEncoding();
-    long vectorDataOffset = alignOutput(vectorData, encoding);
+    int dimension = fieldInfo.getVectorDimension();
+    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);

From 08ab76e9411ddb10242dfb495f71d08293004c2d Mon Sep 17 00:00:00 2001
From: Kaival Parikh <kaivalp2000@gmail.com>
Date: Wed, 19 Nov 2025 18:09:30 +0000
Subject: [PATCH 6/6] Undo "Only apply the optimal byte alignment if it will
 hold for all vectors"

Also add comment about padBytes in VectorScorerBenchmark (used to capture performance impact of byte alignment)
---
 .../benchmark/jmh/VectorScorerBenchmark.java  |  4 ++--
 .../lucene99/Lucene99FlatVectorsFormat.java   |  8 ++++---
 .../lucene99/Lucene99FlatVectorsWriter.java   | 24 +++++--------------
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
index 035ea98e4fde..10b6818f2a43 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
@@ -71,8 +71,8 @@ public class VectorScorerBenchmark {
   @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
   public int size;
 
-  @Param({"0", "1", "2", "4", "6", "8", "16", "20", "32", "50", "64", "100", "128", "255", "256"})
-  public int padBytes;
+  @Param({"0", "1", "4", "64"})
+  public int padBytes; // capture performance impact of byte alignment in the index
 
   Directory dir;
   IndexInput bytesIn;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
index 2bee5dfe898a..46be88836ca1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
@@ -63,9 +63,11 @@
  *       that only in sparse case
  * </ul>
  *
- * <p>NOTE: This format aligns float vectors of specific dimensions (multiples of 16) to 64 bytes in
- * the index, for optimal performance on Arm Neoverse machines. There may be a small performance
- * penalty in using float vectors of other dimensions on these machines.
+ * <p>NOTE: Arm Neoverse machines incur a performance overhead when reading data that is not
+ * aligned to 64 bytes, so this format aligns the start of each field's float vector data in the
+ * <code>.vec</code> file to that size. Searching float vectors whose dimension is <b>not</b> a
+ * multiple of 16 (i.e. whose size is not a multiple of 64 bytes) may still incur that penalty,
+ * because the alignment will not hold for every vector in the file.
  *
  * @lucene.experimental
  */
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index d611ed30b20a..3416a131735d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -153,27 +153,18 @@ public long ramBytesUsed() {
     return total;
   }
 
-  private static long alignOutput(IndexOutput output, VectorEncoding encoding, int dimension)
-      throws IOException {
+  private static long alignOutput(IndexOutput output, VectorEncoding encoding) throws IOException {
     return output.alignFilePointer(
         switch (encoding) {
           case BYTE -> Float.BYTES;
-          case FLOAT32 -> {
-            if (dimension % 16 == 0) {
-              yield 64; // optimal alignment for Arm Neoverse machines.
-            }
-            // vector dimension is such that 64 byte alignment will not hold for all subsequent
-            // vectors, use next best alignment that will hold.
-            yield Float.BYTES;
-          }
+          case FLOAT32 -> 64; // optimal alignment for Arm Neoverse machines.
         });
   }
 
   private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException {
     // write vector values
     VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
-    int dimension = fieldData.fieldInfo.getVectorDimension();
-    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     switch (encoding) {
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
@@ -209,8 +200,7 @@ private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocM
 
     // write vector values
     VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
-    int dimension = fieldData.fieldInfo.getVectorDimension();
-    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     switch (encoding) {
       case BYTE -> writeSortedByteVectors(fieldData, ordMap);
       case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
@@ -243,8 +233,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
     // Since we know we will not be searching for additional indexing, we can just write the
     // the vectors directly to the new segment.
     VectorEncoding encoding = fieldInfo.getVectorEncoding();
-    int dimension = fieldInfo.getVectorDimension();
-    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     // No need to use temporary file as we don't have to re-open for reading
     DocsWithFieldSet docsWithField =
         switch (encoding) {
@@ -271,8 +260,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
     VectorEncoding encoding = fieldInfo.getVectorEncoding();
-    int dimension = fieldInfo.getVectorDimension();
-    long vectorDataOffset = alignOutput(vectorData, encoding, dimension);
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);