@@ -38,7 +38,9 @@
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -265,6 +267,104 @@ public static void convertRowVectorColumns(InternalRow row, GenericInternalRow r
}
}

// ---------------------------------------------------------------------------
// Blob descriptor-mode helpers (Parquet DESCRIPTOR read path)
// ---------------------------------------------------------------------------

/**
* Detects BLOB columns from Spark StructType metadata annotations.
*
* @param schema Spark StructType (may be null)
* @return set of field ordinals that are BLOB columns; empty set if none found
*/
public static Set<Integer> detectBlobColumnsFromMetadata(StructType schema) {
Set<Integer> blobColumnIndices = new LinkedHashSet<>();
if (schema == null) {
return blobColumnIndices;
}
StructField[] fields = schema.fields();
for (int i = 0; i < fields.length; i++) {
StructField field = fields[i];
if (field.metadata().contains(HoodieSchema.TYPE_METADATA_FIELD)) {
String typeStr = field.metadata().getString(HoodieSchema.TYPE_METADATA_FIELD);
HoodieSchema parsed = HoodieSchema.parseTypeDescriptor(typeStr);
if (parsed != null && parsed.getType() == HoodieSchemaType.BLOB) {
blobColumnIndices.add(i);
}
}
}
return blobColumnIndices;
}
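
For reference, a minimal sketch of the annotation this helper looks for, mirroring the blobMeta() helper in the new test class below; the payload field name and the blobStructType variable are illustrative placeholders:

    // A column is treated as a BLOB when its Spark metadata carries TYPE_METADATA_FIELD
    // and the descriptor parses to HoodieSchemaType.BLOB.
    Metadata blobMetadata = new MetadataBuilder()
        .putString(HoodieSchema.TYPE_METADATA_FIELD, HoodieSchemaType.BLOB.name())
        .build();
    // blobStructType stands for the {type, data, reference} struct defined by HoodieSchema.Blob.
    StructField payload = new StructField("payload", blobStructType, true, blobMetadata);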

/**
* Strips the {@code data} sub-field from BLOB struct columns so the Parquet reader
* skips the binary column chunk entirely (genuine I/O savings).
*
* <p>The returned schema has 2-field blob structs: {@code {type, reference}} instead of
* the full {@code {type, data, reference}}. Use {@link #buildBlobNullPadRowMapper} to
* re-insert null at the {@code data} position after reading.
*
* @param schema the original Spark schema
* @param blobColumns ordinals of blob columns (from {@link #detectBlobColumnsFromMetadata})
* @return a new StructType with the {@code data} sub-field removed from blob structs
*/
public static StructType stripBlobDataField(StructType schema, Set<Integer> blobColumns) {
StructField[] fields = schema.fields();
StructField[] newFields = new StructField[fields.length];
for (int i = 0; i < fields.length; i++) {
if (blobColumns.contains(i) && fields[i].dataType() instanceof StructType) {
StructType blobStruct = (StructType) fields[i].dataType();
List<StructField> kept = new ArrayList<>();
for (StructField sub : blobStruct.fields()) {
if (!sub.name().equals(HoodieSchema.Blob.INLINE_DATA_FIELD)) {
kept.add(sub);
}
}
StructType strippedStruct = new StructType(kept.toArray(new StructField[0]));
newFields[i] = new StructField(fields[i].name(), strippedStruct, fields[i].nullable(), fields[i].metadata());
} else {
newFields[i] = fields[i];
}
}
return new StructType(newFields);
}

/**
* Returns a {@link Function} that expands 2-field blob structs {@code {type, reference}}
* back to 3-field structs {@code {type, null, reference}} by inserting null at the
* {@code data} position, then applies the projection callback.
*
* @param readSchema the Spark schema of incoming rows (blob structs have 2 fields)
* @param blobColumns ordinals of blob columns in {@code readSchema}
* @param projectionCallback called with the expanded row; must copy any data it needs to retain
* @return a function that converts one row and returns the projected result
*/
public static Function<InternalRow, InternalRow> buildBlobNullPadRowMapper(
StructType readSchema,
Set<Integer> blobColumns,
Function<InternalRow, InternalRow> projectionCallback) {
int numFields = readSchema.fields().length;
GenericInternalRow buffer = new GenericInternalRow(numFields);
return row -> {
for (int i = 0; i < numFields; i++) {
if (row.isNullAt(i)) {
buffer.setNullAt(i);
} else if (blobColumns.contains(i)) {
InternalRow blobStruct = row.getStruct(i, 2);
// Expand {type, reference} → {type, null, reference}
GenericInternalRow expanded = new GenericInternalRow(3);
expanded.update(0, blobStruct.isNullAt(0) ? null : blobStruct.getUTF8String(0));
expanded.setNullAt(1);
expanded.update(2, blobStruct.isNullAt(1) ? null : blobStruct.getStruct(1, HoodieSchema.Blob.getReferenceFieldCount()));
buffer.update(i, expanded);
} else {
buffer.update(i, row.get(i, readSchema.apply(i).dataType()));
}
}
return projectionCallback.apply(buffer);
};
}
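
Taken together, a rough sketch of how these three helpers chain on a descriptor-mode Parquet scan; requiredSchema, projection, and prunedScan are placeholders, and the real wiring lives in SparkFileFormatInternalRowReaderContext below:

    // 1) find blob columns, 2) prune their `data` sub-field before the Parquet scan,
    // 3) null-pad each scanned row back to the 3-field blob shape and project it.
    Set<Integer> blobCols = VectorConversionUtils.detectBlobColumnsFromMetadata(requiredSchema);
    StructType prunedSchema = VectorConversionUtils.stripBlobDataField(requiredSchema, blobCols);
    // ... run the Parquet reader with prunedSchema so the binary `data` chunk is never
    //     fetched, yielding prunedScan: Iterator<InternalRow> ...
    Function<InternalRow, InternalRow> padAndProject =
        VectorConversionUtils.buildBlobNullPadRowMapper(prunedSchema, blobCols, projection);
    while (prunedScan.hasNext()) {
      InternalRow out = padAndProject.apply(prunedScan.next()); // blob `data` is null here
    }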

/**
* Re-attaches {@link HoodieSchema#TYPE_METADATA_FIELD} to Spark fields that are
* Arrow {@code FixedSizeList<Float32|Float64, dim>} in the Lance file.
@@ -97,7 +97,29 @@ class SparkFileFormatInternalRowReaderContext(baseFileReader: SparkColumnarFileR
structType
}

val (readSchema, readFilters) = getSchemaAndFiltersForRead(parquetReadStructType, hasRowIndexField)
// Blob DESCRIPTOR mode: strip `data` sub-field from blob structs for Parquet base files.
// Applied after vector rewrite; not applied to Lance base files or log files.
val isParquetBaseFile = FSUtils.isBaseFile(filePath) && !isLanceBaseFile
val isBlobDescriptorMode = isParquetBaseFile && {
val hadoopConf = storageConfiguration.unwrapAs(classOf[Configuration])
import org.apache.hudi.common.config.HoodieReaderConfig
val modeValue = hadoopConf.get(HoodieReaderConfig.BLOB_INLINE_READ_MODE.key(),
HoodieReaderConfig.BLOB_INLINE_READ_MODE.defaultValue())
modeValue.equalsIgnoreCase(HoodieReaderConfig.BLOB_INLINE_READ_MODE_DESCRIPTOR)
}
val blobColumnIndices: Set[Int] = if (isBlobDescriptorMode) {
VectorConversionUtils.detectBlobColumnsFromMetadata(parquetReadStructType).asScala.map(_.intValue()).toSet
} else {
Set.empty
}
val blobReadStructType = if (blobColumnIndices.nonEmpty) {
val javaBlobCols: java.util.Set[Integer] = blobColumnIndices.map(Integer.valueOf).asJava
VectorConversionUtils.stripBlobDataField(parquetReadStructType, javaBlobCols)
} else {
parquetReadStructType
}

val (readSchema, readFilters) = getSchemaAndFiltersForRead(blobReadStructType, hasRowIndexField)
if (FSUtils.isLogFile(filePath)) {
// NOTE: now only primary key based filtering is supported for log files
new HoodieSparkFileReaderFactory(storage).newParquetFileReader(filePath)
@@ -120,12 +142,18 @@
readSchema, StructType(Seq.empty), getSchemaHandler.getInternalSchemaOpt,
readFilters, storage.getConf.asInstanceOf[StorageConfiguration[Configuration]], tableSchemaOpt))

// Post-process: convert binary VECTOR columns back to typed arrays
if (vectorColumnInfo.nonEmpty) {
SparkFileFormatInternalRowReaderContext.wrapWithVectorConversion(rawIterator, vectorColumnInfo, readSchema)
// Post-process: re-insert null `data` field into blob structs, then convert vectors
val blobPaddedIterator = if (blobColumnIndices.nonEmpty) {
SparkFileFormatInternalRowReaderContext.wrapWithBlobNullPadding(rawIterator, blobColumnIndices, readSchema, parquetReadStructType)
} else {
rawIterator
}

if (vectorColumnInfo.nonEmpty) {
SparkFileFormatInternalRowReaderContext.wrapWithVectorConversion(blobPaddedIterator, vectorColumnInfo, if (blobColumnIndices.nonEmpty) parquetReadStructType else readSchema)
} else {
blobPaddedIterator
}
Contributor:
🤖 For MOR tables with log files, the log-file branch (line 123-126) reads with the full requiredSchema, so log records keep their populated data field, while base-file records under DESCRIPTOR get data=null. After merge the user sees a mix: records updated via log have bytes, records still in base have null. Is this the intended semantics, or should DESCRIPTOR also null the data on log-file rows for consistency?

}
}

@@ -375,4 +403,23 @@ object SparkFileFormatInternalRowReaderContext {
}
}

/**
* Wraps a closable iterator to re-insert null {@code data} fields into blob structs
* after Parquet DESCRIPTOR mode read (expanding 2-field → 3-field structs).
*/
private[hudi] def wrapWithBlobNullPadding(
iterator: ClosableIterator[InternalRow],
blobColumnIndices: Set[Int],
readSchema: StructType,
targetSchema: StructType): ClosableIterator[InternalRow] = {
val javaBlobCols: java.util.Set[Integer] = blobColumnIndices.map(Integer.valueOf).asJava
val projection = UnsafeProjection.create(targetSchema)
val mapper = VectorConversionUtils.buildBlobNullPadRowMapper(readSchema, javaBlobCols, projection.apply(_))
new ClosableIterator[InternalRow] {
override def hasNext: Boolean = iterator.hasNext
override def next(): InternalRow = mapper.apply(iterator.next())
override def close(): Unit = iterator.close()
}
}

}
@@ -0,0 +1,177 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.io.storage;

import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.common.schema.HoodieSchemaType;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;
import org.junit.jupiter.api.Test;

import java.util.Set;
import java.util.function.Function;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
* Unit tests for the blob descriptor-mode helpers in {@link VectorConversionUtils}.
*/
public class TestVectorConversionUtilsBlob {

private static Metadata blobMeta() {
return new MetadataBuilder()
.putString(HoodieSchema.TYPE_METADATA_FIELD, HoodieSchemaType.BLOB.name())
.build();
}

private static StructType blobStruct3Field() {
return new StructType(new StructField[] {
new StructField(HoodieSchema.Blob.TYPE, DataTypes.StringType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.INLINE_DATA_FIELD, DataTypes.BinaryType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE,
new StructType(new StructField[] {
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_PATH, DataTypes.StringType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_OFFSET, DataTypes.LongType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_LENGTH, DataTypes.LongType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_IS_MANAGED, DataTypes.BooleanType, true, Metadata.empty())
}), true, Metadata.empty())
});
}

@Test
public void detectBlobColumnsFromMetadataFindsMarkedFields() {
StructType schema = new StructType(new StructField[] {
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("payload", blobStruct3Field(), true, blobMeta()),
new StructField("name", DataTypes.StringType, true, Metadata.empty())
});
Set<Integer> blobs = VectorConversionUtils.detectBlobColumnsFromMetadata(schema);
assertEquals(1, blobs.size());
assertTrue(blobs.contains(1));
}

@Test
public void detectBlobColumnsFromMetadataReturnsEmptyForNonBlob() {
StructType schema = new StructType(new StructField[] {
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("name", DataTypes.StringType, true, Metadata.empty())
});
assertTrue(VectorConversionUtils.detectBlobColumnsFromMetadata(schema).isEmpty());
}

@Test
public void detectBlobColumnsFromMetadataNullSchema() {
assertTrue(VectorConversionUtils.detectBlobColumnsFromMetadata(null).isEmpty());
}

@Test
public void stripBlobDataFieldRemovesDataAndPreservesOthers() {
StructType schema = new StructType(new StructField[] {
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("payload", blobStruct3Field(), true, blobMeta())
});
Set<Integer> blobs = VectorConversionUtils.detectBlobColumnsFromMetadata(schema);

StructType stripped = VectorConversionUtils.stripBlobDataField(schema, blobs);

// Top-level fields preserved.
assertEquals(2, stripped.fields().length);
assertEquals("id", stripped.fields()[0].name());
assertEquals("payload", stripped.fields()[1].name());
// Top-level metadata preserved.
assertTrue(stripped.fields()[1].metadata().contains(HoodieSchema.TYPE_METADATA_FIELD));

// Blob struct now has 2 fields: type, reference (no data).
StructType blob = (StructType) stripped.fields()[1].dataType();
assertEquals(2, blob.fields().length);
assertEquals(HoodieSchema.Blob.TYPE, blob.fields()[0].name());
assertEquals(HoodieSchema.Blob.EXTERNAL_REFERENCE, blob.fields()[1].name());
assertFalse(blob.getFieldIndex(HoodieSchema.Blob.INLINE_DATA_FIELD).isDefined());
}

@Test
public void buildBlobNullPadRowMapperReinsertsNullData() {
StructType readSchema = new StructType(new StructField[] {
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("payload",
new StructType(new StructField[] {
new StructField(HoodieSchema.Blob.TYPE, DataTypes.StringType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE,
new StructType(new StructField[] {
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_PATH, DataTypes.StringType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_OFFSET, DataTypes.LongType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_LENGTH, DataTypes.LongType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_IS_MANAGED, DataTypes.BooleanType, true, Metadata.empty())
}), true, Metadata.empty())
}), true, blobMeta())
});

GenericInternalRow blob2Field = new GenericInternalRow(2);
blob2Field.update(0, UTF8String.fromString(HoodieSchema.Blob.INLINE));
blob2Field.setNullAt(1); // reference null
GenericInternalRow input = new GenericInternalRow(2);
input.update(0, 42);
input.update(1, blob2Field);

Function<InternalRow, InternalRow> mapper =
VectorConversionUtils.buildBlobNullPadRowMapper(readSchema, java.util.Collections.singleton(1), row -> row);
InternalRow out = mapper.apply(input);

assertEquals(42, out.getInt(0));
InternalRow expanded = out.getStruct(1, 3);
assertEquals(HoodieSchema.Blob.INLINE,
expanded.getUTF8String(0).toString());
assertTrue(expanded.isNullAt(1), "data should be null after pad");
assertTrue(expanded.isNullAt(2), "reference was null in input");
}

@Test
public void buildBlobNullPadRowMapperHandlesNullBlobRow() {
StructType readSchema = new StructType(new StructField[] {
new StructField("payload",
new StructType(new StructField[] {
new StructField(HoodieSchema.Blob.TYPE, DataTypes.StringType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE,
new StructType(new StructField[] {
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_PATH, DataTypes.StringType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_OFFSET, DataTypes.LongType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_LENGTH, DataTypes.LongType, true, Metadata.empty()),
new StructField(HoodieSchema.Blob.EXTERNAL_REFERENCE_IS_MANAGED, DataTypes.BooleanType, true, Metadata.empty())
}), true, Metadata.empty())
}), true, blobMeta())
});

GenericInternalRow input = new GenericInternalRow(1);
input.setNullAt(0);

Function<InternalRow, InternalRow> mapper =
VectorConversionUtils.buildBlobNullPadRowMapper(readSchema, java.util.Collections.singleton(0), row -> row);
InternalRow out = mapper.apply(input);
assertTrue(out.isNullAt(0), "null blob row stays null");
}
}
@@ -113,7 +113,10 @@ public class HoodieReaderConfig extends HoodieConfig {
.withValidValues(BLOB_INLINE_READ_MODE_CONTENT, BLOB_INLINE_READ_MODE_DESCRIPTOR)
.withDocumentation("How Hudi interprets INLINE BLOB values on read. "
+ "CONTENT (default) returns the raw inline bytes. "
+ "DESCRIPTOR returns an OUT_OF_LINE-shaped reference pointing at the backing "
+ "Lance file with the INLINE payload's position and size, so callers can defer "
+ "the byte read via read_blob().");
+ "DESCRIPTOR suppresses the inline bytes (data field is null) and returns metadata only, "
+ "avoiding the I/O cost of reading large binary payloads. "
+ "For Lance files, the reference struct is populated with blob stream coordinates "
+ "so read_blob() can materialize bytes on demand. "
+ "For Parquet files, the data column is skipped via nested column projection; "
+ "read_blob() automatically downgrades that scan to CONTENT so bytes are materialized.");
}
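
For a reader-side illustration, a minimal sketch of switching a Spark scan to DESCRIPTOR mode; the reader context above resolves the value from the unwrapped Hadoop configuration, so setting it there directly is shown here (passing it as a Hudi read option is assumed to propagate to that configuration as well; basePath is a placeholder):

    SparkSession spark = SparkSession.builder().appName("blob-descriptor-read").getOrCreate();
    spark.sparkContext().hadoopConfiguration().set(
        HoodieReaderConfig.BLOB_INLINE_READ_MODE.key(),
        HoodieReaderConfig.BLOB_INLINE_READ_MODE_DESCRIPTOR);
    Dataset<Row> df = spark.read().format("hudi").load(basePath);
    // Blob columns now come back as {type, data = null, reference}; bytes can be
    // materialized on demand per the documentation above.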