-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat(vector): add VECTOR type to HoodieSchema #18146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
7bb7c8e
51e4e4f
630fa00
07d42d8
ead6acd
48b1d17
726a6f7
38c881b
a5e52c6
5d7f7bd
d786515
c9fb73d
8f3b855
fa3a743
cec2075
008a5ba
57bbf8d
3e870f3
f68e550
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -123,6 +123,7 @@ public class HoodieSchema implements Serializable { | |
| static { | ||
| LogicalTypes.register(VariantLogicalType.VARIANT_LOGICAL_TYPE_NAME, new VariantLogicalTypeFactory()); | ||
| LogicalTypes.register(BlobLogicalType.BLOB_LOGICAL_TYPE_NAME, new BlobLogicalTypeFactory()); | ||
| LogicalTypes.register(VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME, new VectorLogicalTypeFactory()); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -173,6 +174,8 @@ public static HoodieSchema fromAvroSchema(Schema avroSchema) { | |
| return new HoodieSchema.Variant(avroSchema); | ||
| } else if (logicalType == BlobLogicalType.blob()) { | ||
| return new HoodieSchema.Blob(avroSchema); | ||
| } else if (logicalType instanceof VectorLogicalType) { | ||
| return new HoodieSchema.Vector(avroSchema); | ||
| } | ||
| } | ||
| return new HoodieSchema(avroSchema); | ||
|
|
@@ -645,6 +648,62 @@ public static HoodieSchema.Blob createBlob() { | |
| return new HoodieSchema.Blob(Blob.DEFAULT_NAME); | ||
| } | ||
|
|
||
| /** | ||
| * Creates Vector schema with default name and specified dimension. | ||
| * Defaults to {@link Vector.VectorElementType#FLOAT} element type. | ||
| * | ||
| * <p>The generated FIXED type name encodes dimension and element type (e.g., {@code vector_float_128}) | ||
| * to avoid Avro name collisions when multiple vector columns exist in the same record.</p> | ||
| * | ||
| * @param dimension vector dimension (must be > 0) | ||
| * @return new HoodieSchema.Vector | ||
| */ | ||
| public static HoodieSchema.Vector createVector(int dimension) { | ||
| return createVector(dimension, Vector.VectorElementType.FLOAT); | ||
| } | ||
|
|
||
| /** | ||
| * Creates Vector schema with custom name and dimension. | ||
| * Defaults to {@link Vector.VectorElementType#FLOAT} element type. | ||
| * | ||
| * @param name FIXED type name (must not be null or empty) | ||
| * @param dimension vector dimension (must be > 0) | ||
| * @return new HoodieSchema.Vector | ||
| */ | ||
| public static HoodieSchema.Vector createVector(String name, int dimension) { | ||
| return createVector(name, dimension, Vector.VectorElementType.FLOAT); | ||
| } | ||
|
|
||
| /** | ||
| * Creates Vector schema with custom dimension and element type. | ||
| * | ||
| * <p>The generated FIXED type name encodes dimension and element type (e.g., {@code vector_double_256}) | ||
| * to avoid Avro name collisions when multiple vector columns exist in the same record.</p> | ||
| * | ||
| * @param dimension vector dimension (must be > 0) | ||
| * @param elementType element type (use {@link Vector.VectorElementType#FLOAT} or {@link Vector.VectorElementType#DOUBLE}) | ||
| * @return new HoodieSchema.Vector | ||
| */ | ||
| public static HoodieSchema.Vector createVector(int dimension, Vector.VectorElementType elementType) { | ||
| String vectorName = Vector.DEFAULT_NAME + "_" + elementType.name().toLowerCase() + "_" + dimension; | ||
| return createVector(vectorName, dimension, elementType); | ||
| } | ||
|
|
||
| /** | ||
| * Creates Vector schema with custom name, dimension, and element type. | ||
| * | ||
| * @param name FIXED type name (must not be null or empty) | ||
| * @param dimension vector dimension (must be > 0) | ||
| * @param elementType element type (use {@link Vector.VectorElementType#FLOAT} or {@link Vector.VectorElementType#DOUBLE}) | ||
| * @return new HoodieSchema.Vector | ||
| */ | ||
| public static HoodieSchema.Vector createVector(String name, int dimension, Vector.VectorElementType elementType) { | ||
| ValidationUtils.checkArgument(name != null && !name.isEmpty(), | ||
| () -> "Vector name must not be null or empty"); | ||
| Schema vectorSchema = Vector.createSchema(name, dimension, elementType); | ||
| return new HoodieSchema.Vector(vectorSchema); | ||
| } | ||
|
|
||
| /** | ||
| * Returns the Hudi schema version information. | ||
| * | ||
|
|
@@ -1551,6 +1610,215 @@ public int hashCode() { | |
| } | ||
| } | ||
|
|
||
| public static class Vector extends HoodieSchema { | ||
| private static final String DEFAULT_NAME = "vector"; | ||
|
|
||
| /** | ||
| * Enum representing vector element data types. | ||
| */ | ||
| public enum VectorElementType { | ||
| FLOAT(4), | ||
| DOUBLE(8), | ||
| INT8(1); | ||
|
|
||
| private final int elementSize; | ||
|
|
||
| VectorElementType(int elementSize) { | ||
| this.elementSize = elementSize; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the byte size of a single element. | ||
| * | ||
| * @return number of bytes per element | ||
| */ | ||
| public int getElementSize() { | ||
| return elementSize; | ||
| } | ||
|
|
||
| /** | ||
| * Converts a string to VectorElementType enum. | ||
| * | ||
| * @param name the element type name (e.g., "FLOAT", "DOUBLE", "INT8") | ||
| * @return the corresponding enum value | ||
| * @throws IllegalArgumentException if name is unknown | ||
| */ | ||
| public static VectorElementType fromString(String name) { | ||
| for (VectorElementType type : values()) { | ||
| if (type.name().equalsIgnoreCase(name)) { | ||
| return type; | ||
| } | ||
| } | ||
| throw new IllegalArgumentException("Unknown element type: " + name); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Enum representing the physical storage format backing a vector. | ||
| */ | ||
| public enum StorageBacking { | ||
| FIXED_BYTES; | ||
|
|
||
| /** | ||
| * Converts a string to StorageBacking enum. | ||
| * | ||
| * @param name the storage backing name (e.g., "FIXED_BYTES") | ||
| * @return the corresponding enum value | ||
| * @throws IllegalArgumentException if name is unknown | ||
| */ | ||
| public static StorageBacking fromString(String name) { | ||
| for (StorageBacking b : values()) { | ||
| if (b.name().equalsIgnoreCase(name)) { | ||
| return b; | ||
| } | ||
| } | ||
| throw new IllegalArgumentException("Unknown storage backing: " + name); | ||
| } | ||
| } | ||
|
|
||
| private final int dimension; | ||
| private final VectorElementType elementType; | ||
| private final StorageBacking storageBacking; | ||
|
|
||
| /** | ||
| * Creates Vector from pre-built schema (used by factory methods). | ||
| * | ||
| * @param avroSchema the Avro schema to wrap, must be a valid Vector schema | ||
| * @throws IllegalArgumentException if avroSchema is null or not a valid Vector schema | ||
| */ | ||
| private Vector(Schema avroSchema) { | ||
| super(avroSchema); | ||
|
|
||
| // Extract properties from LogicalType | ||
| LogicalType logicalType = avroSchema.getLogicalType(); | ||
| if (!(logicalType instanceof VectorLogicalType)) { | ||
| throw new IllegalArgumentException( | ||
| "Schema must have VectorLogicalType, got: " + logicalType); | ||
| } | ||
|
|
||
| VectorLogicalType vectorLogicalType = (VectorLogicalType) logicalType; | ||
| this.dimension = vectorLogicalType.getDimension(); | ||
| this.elementType = VectorElementType.fromString(vectorLogicalType.getElementType()); | ||
| this.storageBacking = StorageBacking.fromString(vectorLogicalType.getStorageBacking()); | ||
|
|
||
| // Validate schema structure | ||
| validateVectorSchema(avroSchema); | ||
| } | ||
|
|
||
| @Override | ||
| public String getName() { | ||
| return VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME; | ||
| } | ||
|
|
||
| @Override | ||
| public HoodieSchemaType getType() { | ||
| return HoodieSchemaType.VECTOR; | ||
| } | ||
|
|
||
| /** | ||
| * Creates vector schema with specified dimension and element type. | ||
| * | ||
| * @param name fixed type name (not null) | ||
| * @param dimension vector dimension (must be > 0) | ||
|
Comment on lines
+1720
to
+1722
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💅 NIT: Unnecessary static method This method just delegates to
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On second look i agree will remove this
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay. lets resolve when done |
||
| * @param elementType element type (defaults to FLOAT if null) | ||
| * @return new Vector schema | ||
| */ | ||
| private static Schema createSchema(String name, int dimension, VectorElementType elementType) { | ||
| ValidationUtils.checkArgument(dimension > 0, | ||
| () -> "Vector dimension must be positive: " + dimension); | ||
|
|
||
| // Validate elementType | ||
| VectorElementType resolvedElementType = elementType != null ? elementType : VectorElementType.FLOAT; | ||
|
|
||
| // Calculate fixed size: dimension × element size in bytes | ||
| int elementSize = resolvedElementType.getElementSize(); | ||
| int fixedSize = dimension * elementSize; | ||
|
|
||
| // Create fixed Schema | ||
| Schema vectorSchema = Schema.createFixed(name, null, null, fixedSize); | ||
|
|
||
| // Apply logical type with properties directly to FIXED | ||
| VectorLogicalType vectorLogicalType = new VectorLogicalType(dimension, resolvedElementType.name(), StorageBacking.FIXED_BYTES.name()); | ||
| vectorLogicalType.addToSchema(vectorSchema); | ||
|
|
||
| return vectorSchema; | ||
| } | ||
|
|
||
| /** | ||
| * Validates that the given Avro schema conforms to Vector specification. | ||
| * | ||
| * @param avroSchema the schema to validate | ||
| * @throws IllegalArgumentException if schema is invalid | ||
| */ | ||
| private void validateVectorSchema(Schema avroSchema) { | ||
| // Verify FIXED size matches: dimension × elementSize | ||
| int expectedSize = dimension * elementType.getElementSize(); | ||
| int actualSize = avroSchema.getFixedSize(); | ||
| ValidationUtils.checkArgument(actualSize == expectedSize, | ||
| () -> "Vector FIXED size mismatch: expected " + expectedSize | ||
| + " bytes (dimension=" + dimension + " × elementSize=" | ||
| + elementType.getElementSize() + "), got " + actualSize); | ||
| } | ||
|
|
||
| /** | ||
| * Returns the dimension of this vector. | ||
| * | ||
| * @return vector dimension (always > 0) | ||
| */ | ||
| public int getDimension() { | ||
| return dimension; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the element type of this vector. | ||
| * | ||
| * @return element type enum (e.g., {@link VectorElementType#FLOAT}, {@link VectorElementType#DOUBLE}, {@link VectorElementType#INT8}) | ||
| */ | ||
| public VectorElementType getVectorElementType() { | ||
| return elementType; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the storage backing type. | ||
| * | ||
| * @return storage backing enum value | ||
| */ | ||
| public StorageBacking getStorageBacking() { | ||
| return storageBacking; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the size of the fixed bytes backing this vector. | ||
| * | ||
| * @return size in bytes (dimension × elementSize) | ||
| */ | ||
| public int getFixedSize() { | ||
| return getAvroSchema().getFixedSize(); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) { | ||
| return true; | ||
| } | ||
| if (o == null || getClass() != o.getClass()) { | ||
| return false; | ||
| } | ||
| if (!super.equals(o)) { | ||
| return false; | ||
| } | ||
| Vector vector = (Vector) o; | ||
| return dimension == vector.dimension | ||
| && Objects.equals(elementType, vector.elementType) | ||
| && Objects.equals(storageBacking, vector.storageBacking); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(super.hashCode(), dimension, elementType, storageBacking); | ||
| } | ||
| } | ||
|
|
||
| public static class Timestamp extends HoodieSchema { | ||
| private final boolean isUtcAdjusted; | ||
| private final TimePrecision precision; | ||
|
|
@@ -1719,6 +1987,81 @@ public void validate(Schema schema) { | |
| } | ||
| } | ||
|
|
||
| static class VectorLogicalType extends LogicalType { | ||
| private static final String VECTOR_LOGICAL_TYPE_NAME = "vector"; | ||
| private static final String PROP_DIMENSION = "dimension"; | ||
| private static final String PROP_ELEMENT_TYPE = "elementType"; | ||
| private static final String PROP_STORAGE_BACKING = "storageBacking"; | ||
|
|
||
| private final int dimension; | ||
| private final String elementType; | ||
| private final String storageBacking; | ||
|
|
||
| public VectorLogicalType(int dimension, String elementType, String storageBacking) { | ||
| super(VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME); | ||
| ValidationUtils.checkArgument(dimension > 0, | ||
| () -> "Vector dimension must be positive: " + dimension); | ||
| ValidationUtils.checkArgument(elementType != null && !elementType.isEmpty(), | ||
| () -> "Element type cannot be null or empty"); | ||
| ValidationUtils.checkArgument(storageBacking != null && !storageBacking.isEmpty(), | ||
| () -> "Storage backing cannot be null or empty"); | ||
|
|
||
| this.dimension = dimension; | ||
| this.elementType = elementType; | ||
| this.storageBacking = storageBacking; | ||
| } | ||
|
|
||
| public int getDimension() { | ||
| return dimension; | ||
| } | ||
|
|
||
| public String getElementType() { | ||
| return elementType; | ||
| } | ||
|
|
||
| public String getStorageBacking() { | ||
| return storageBacking; | ||
| } | ||
|
|
||
| @Override | ||
| public Schema addToSchema(Schema schema) { | ||
| super.addToSchema(schema); | ||
| schema.addProp(PROP_DIMENSION, dimension); | ||
| schema.addProp(PROP_ELEMENT_TYPE, elementType); | ||
| schema.addProp(PROP_STORAGE_BACKING, storageBacking); | ||
| return schema; | ||
| } | ||
rahil-c marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| /** | ||
| * Factory for creating VectorLogicalType instances. | ||
| */ | ||
| private static class VectorLogicalTypeFactory implements LogicalTypes.LogicalTypeFactory { | ||
| @Override | ||
| public LogicalType fromSchema(Schema schema) { | ||
| // Extract properties from schema | ||
| Object dimObj = schema.getObjectProp(VectorLogicalType.PROP_DIMENSION); | ||
| int dimension = dimObj instanceof Number ? ((Number) dimObj).intValue() : 0; | ||
vinothchandar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| String elementType = schema.getProp(VectorLogicalType.PROP_ELEMENT_TYPE); | ||
| if (elementType == null) { | ||
| elementType = Vector.VectorElementType.FLOAT.name(); | ||
| } | ||
|
|
||
| String storageBacking = schema.getProp(VectorLogicalType.PROP_STORAGE_BACKING); | ||
| if (storageBacking == null) { | ||
| storageBacking = Vector.StorageBacking.FIXED_BYTES.name(); // default | ||
| } | ||
|
|
||
| return new VectorLogicalType(dimension, elementType, storageBacking); | ||
| } | ||
|
|
||
| @Override | ||
| public String getTypeName() { | ||
| return VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Factory for creating VariantLogicalType instances. | ||
| */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💅 NIT rename createFloatVector() ? (assuming we anticipate, vectors of couple different types)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually think we should keep the naming
createVectoracross the apis, however maybe I can add the following java doc comment, to make this more clear?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see you have a overload below, that passes the type. I was flagging since there was no docs, and it was silently assuming float. So, with the elementType overload and the doc. We can resolve