diff --git a/.github/patches/enable-binaryview-integration-tests.patch b/.github/patches/enable-binaryview-integration-tests.patch new file mode 100644 index 00000000..ac5c17e1 --- /dev/null +++ b/.github/patches/enable-binaryview-integration-tests.patch @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py +index 83913dc379..7ace28e1be 100644 +--- a/dev/archery/archery/integration/datagen.py ++++ b/dev/archery/archery/integration/datagen.py +@@ -2003,7 +2003,6 @@ def get_generated_json_files(tempdir=None): + .skip_tester('Rust'), + + generate_binary_view_case() +- .skip_tester('JS') + # TODO(https://github.com/apache/arrow-nanoarrow/issues/618) + .skip_tester('nanoarrow') + .skip_tester('Rust'), diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5e96bc17..344942a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -193,6 +193,9 @@ jobs: uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3 + - name: Patch Archery to enable BinaryView tests + run: | + patch -p1 < js/.github/patches/enable-binaryview-integration-tests.patch - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/scripts/update_flatbuffers.sh b/scripts/update_flatbuffers.sh new file mode 100755 index 00000000..a439593a --- /dev/null +++ b/scripts/update_flatbuffers.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Regenerate the FlatBuffers helper files used by arrow-js. Requires a sibling +# checkout of apache/arrow (../arrow) if not provided in env and a working flatc on PATH. + +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +FORMAT_DIR="${PROJECT_ROOT}/../arrow/format" + +if [[ ! -d "${FORMAT_DIR}" ]]; then + echo "error: expected FlatBuffers schemas in ${FORMAT_DIR}" >&2 + exit 1 +fi + +if ! command -v flatc >/dev/null 2>&1; then + echo "error: flatc not found on PATH" >&2 + exit 1 +fi + +TMPDIR="$(mktemp -d "${PROJECT_ROOT}/.flatc.XXXXXX")" +cleanup() { + rm -rf "${TMPDIR}" +} +trap cleanup EXIT + +schemas=(File Schema Message Tensor SparseTensor) + +for schema in "${schemas[@]}"; do + cp "${FORMAT_DIR}/${schema}.fbs" "${TMPDIR}/${schema}.fbs" + sed \ + -e 's/namespace org.apache.arrow.flatbuf;//g' \ + -e 's/org\.apache\.arrow\.flatbuf\.//g' \ + "${FORMAT_DIR}/${schema}.fbs" >"${TMPDIR}/${schema}.fbs" +done + +flatc --ts --ts-flat-files --ts-omit-entrypoint \ + -o "${TMPDIR}" \ + "${TMPDIR}"/{File,Schema,Message,Tensor,SparseTensor}.fbs + +generated_files=( + binary-view.ts + list-view.ts + large-list-view.ts + message.ts + record-batch.ts + schema.ts + type.ts + utf8-view.ts +) + +for file in "${generated_files[@]}"; do + if [[ ! -f "${TMPDIR}/${file}" ]]; then + echo "error: expected generated file ${file} not found" >&2 + exit 1 + fi + install -m 0644 "${TMPDIR}/${file}" "${PROJECT_ROOT}/src/fb/${file}" +done diff --git a/src/Arrow.dom.ts b/src/Arrow.dom.ts index e0cd681c..30feeb83 100644 --- a/src/Arrow.dom.ts +++ b/src/Arrow.dom.ts @@ -49,8 +49,8 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, @@ -81,7 +81,7 @@ export { } from './Arrow.js'; export { - BinaryBuilder, LargeBinaryBuilder, + BinaryBuilder, BinaryViewBuilder, LargeBinaryBuilder, BoolBuilder, DateBuilder, DateDayBuilder, DateMillisecondBuilder, DecimalBuilder, @@ -99,5 +99,5 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, LargeUtf8Builder + Utf8Builder, Utf8ViewBuilder, LargeUtf8Builder } from './Arrow.js'; diff --git a/src/Arrow.ts b/src/Arrow.ts index 8321026f..20495838 100644 --- a/src/Arrow.ts +++ b/src/Arrow.ts @@ -37,8 +37,8 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, @@ -79,8 +79,10 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { Utf8ViewBuilder } from './builder/utf8view.js'; export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; +export { BinaryViewBuilder } from './builder/binaryview.js'; export { LargeBinaryBuilder } from './builder/largebinary.js'; export { ListBuilder } from './builder/list.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; diff --git a/src/builder/binaryview.ts b/src/builder/binaryview.ts new file mode 100644 index 00000000..002bfc36 --- /dev/null +++ b/src/builder/binaryview.ts @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BinaryView, Utf8View } from '../type.js'; +import { Builder, BuilderOptions } from '../builder.js'; +import { BufferBuilder } from './buffer.js'; +import { toUint8Array } from '../util/buffer.js'; +import { makeData } from '../data.js'; +import type { DataProps } from '../data.js'; + +/** @ignore */ +export class BinaryViewBuilder< + TType extends BinaryView | Utf8View = BinaryView, + TNull = any +> extends Builder { + protected _views: BufferBuilder; + protected _variadicBuffers: Uint8Array[] = []; + protected _currentBuffer: BufferBuilder | null = null; + protected _currentBufferIndex = 0; + protected _currentBufferOffset = 0; + protected readonly _bufferSize = 32 * 1024 * 1024; // 32MB per buffer as per spec recommendation + + constructor(opts: BuilderOptions) { + super(opts); + this._views = new BufferBuilder(Uint8Array); + } + + public get byteLength(): number { + let size = 0; + this._views && (size += this._views.byteLength); + this._nulls && (size += this._nulls.byteLength); + for (const buffer of this._variadicBuffers) { + size += buffer.byteLength; + } + this._currentBuffer && (size += this._currentBuffer.byteLength); + return size; + } + + public setValue(index: number, value: TType['TValue']) { + return this.writeBinaryValue(index, this.encodeValue(value)); + } + + protected writeBinaryValue(index: number, data: Uint8Array) { + const length = data.length; + + // Ensure views buffer has space up to this index (similar to FixedWidthBuilder) + const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH; + const currentBytes = this._views.length; + if (bytesNeeded > currentBytes) { + this._views.reserve(bytesNeeded - currentBytes); + } + + const viewBuffer = this._views.buffer; + const viewOffset = index * BinaryView.ELEMENT_WIDTH; + const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH); + + // Write length (4 bytes, little-endian) + view.setInt32(BinaryView.LENGTH_OFFSET, length, true); + + if (length <= BinaryView.INLINE_CAPACITY) { + // Inline: store data directly in view struct (up to 12 bytes) + viewBuffer.set(data, viewOffset + BinaryView.INLINE_OFFSET); + // Zero out remaining bytes + for (let i = length; i < BinaryView.INLINE_CAPACITY; i++) { + viewBuffer[viewOffset + BinaryView.INLINE_OFFSET + i] = 0; + } + } else { + // Out-of-line: store in variadic buffer + // Write prefix (first 4 bytes of data) + const prefix = new DataView(data.buffer, data.byteOffset, Math.min(4, length)); + view.setUint32(BinaryView.INLINE_OFFSET, prefix.getUint32(0, true), true); + + // Allocate space in variadic buffer + if (!this._currentBuffer || this._currentBufferOffset + length > this._bufferSize) { + // Start a new buffer + if (this._currentBuffer) { + this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset)); + } + this._currentBuffer = new BufferBuilder(Uint8Array); + this._currentBufferIndex = this._variadicBuffers.length; + this._currentBufferOffset = 0; + } + + // Write data to current buffer + const bufferData = this._currentBuffer.reserve(length).buffer; + bufferData.set(data, this._currentBufferOffset); + + // Write buffer index and offset to view struct + view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true); + view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true); + + this._currentBufferOffset += length; + } + + return this; + } + + protected encodeValue(value: TType['TValue']): Uint8Array { + return toUint8Array(value as unknown as Uint8Array); + } + + public setValid(index: number, isValid: boolean) { + // Ensure space is allocated in the views buffer for this index + const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH; + const currentBytes = this._views.length; + if (bytesNeeded > currentBytes) { + this._views.reserve(bytesNeeded - currentBytes); + } + + const result = super.setValid(index, isValid); + + if (!result) { + // For null values, zero out the view struct + const viewBuffer = this._views.buffer; + const viewOffset = index * BinaryView.ELEMENT_WIDTH; + for (let i = 0; i < BinaryView.ELEMENT_WIDTH; i++) { + viewBuffer[viewOffset + i] = 0; + } + } + + return result; + } + + public clear() { + this._variadicBuffers = []; + this._currentBuffer = null; + this._currentBufferIndex = 0; + this._currentBufferOffset = 0; + this._views.clear(); + return super.clear(); + } + + public flush() { + const { type, length, nullCount, _views, _nulls } = this; + + // Finalize current buffer if it exists + if (this._currentBuffer && this._currentBufferOffset > 0) { + this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset)); + this._currentBuffer = null; + this._currentBufferOffset = 0; + } + + const views = _views.flush(length * BinaryView.ELEMENT_WIDTH); + const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined; + const variadicBuffers = this._variadicBuffers.slice(); + + // Reset variadic buffers for next batch + this._variadicBuffers = []; + this._currentBufferIndex = 0; + + this.clear(); + + const props = { + type, + length, + nullCount, + nullBitmap, + ['views']: views, + ['variadicBuffers']: variadicBuffers + }; + + return makeData(props as unknown as DataProps); + } + + public finish() { + this.finished = true; + return this; + } +} diff --git a/src/builder/utf8view.ts b/src/builder/utf8view.ts new file mode 100644 index 00000000..0fea14cb --- /dev/null +++ b/src/builder/utf8view.ts @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Utf8View } from '../type.js'; +import { BuilderOptions } from '../builder.js'; +import { BinaryViewBuilder } from './binaryview.js'; +import { encodeUtf8 } from '../util/utf8.js'; + +/** @ignore */ +export class Utf8ViewBuilder extends BinaryViewBuilder { + constructor(opts: BuilderOptions) { + super(opts); + } + + public override setValue(index: number, value: Utf8View['TValue']) { + return this.writeBinaryValue(index, encodeUtf8(value)); + } +} diff --git a/src/data.ts b/src/data.ts index 45fcc35d..b5edff8a 100644 --- a/src/data.ts +++ b/src/data.ts @@ -68,6 +68,7 @@ export class Data { declare public readonly typeIds: Buffers[BufferType.TYPE]; declare public readonly nullBitmap: Buffers[BufferType.VALIDITY]; declare public readonly valueOffsets: Buffers[BufferType.OFFSET]; + declare public readonly variadicBuffers: ReadonlyArray; public get typeId(): T['TType'] { return this.type.typeId; } @@ -97,6 +98,7 @@ export class Data { values && (byteLength += values.byteLength); nullBitmap && (byteLength += nullBitmap.byteLength); typeIds && (byteLength += typeIds.byteLength); + byteLength += this.variadicBuffers.reduce((size, data) => size + (data?.byteLength ?? 0), 0); return this.children.reduce((byteLength, child) => byteLength + child.byteLength, byteLength); } @@ -117,7 +119,16 @@ export class Data { return nullCount; } - constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial> | Data, children: Data[] = [], dictionary?: Vector) { + constructor( + type: T, + offset: number, + length: number, + nullCount?: number, + buffers?: Partial> | Data, + children: Data[] = [], + dictionary?: Vector, + variadicBuffers: ReadonlyArray = [] + ) { this.type = type; this.children = children; this.dictionary = dictionary; @@ -131,6 +142,7 @@ export class Data { this.typeIds = buffers.typeIds; this.nullBitmap = buffers.nullBitmap; this.valueOffsets = buffers.valueOffsets; + this.variadicBuffers = buffers.variadicBuffers; } else { this.stride = strideForType(type); if (buffers) { @@ -139,15 +151,22 @@ export class Data { (buffer = (buffers as Buffers)[2]) && (this.nullBitmap = buffer); (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); } + this.variadicBuffers = variadicBuffers; } + this.variadicBuffers ??= []; } public getValid(index: number): boolean { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? Number(valueOffsets[index]) + : index; return child.getValid(indexInChild); } if (this.nullable && this.nullCount > 0) { @@ -163,8 +182,13 @@ export class Data { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? Number(valueOffsets[index]) + : index; prev = child.getValid(indexInChild); child.setValid(indexInChild, value); } else { @@ -200,8 +224,16 @@ export class Data { return value; } - public clone(type: R = this.type as any, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers = this, children: Data[] = this.children) { - return new Data(type, offset, length, nullCount, buffers, children, this.dictionary); + public clone( + type: R = this.type as any, + offset = this.offset, + length = this.length, + nullCount = this._nullCount, + buffers: Buffers = this, + children: Data[] = this.children, + variadicBuffers: ReadonlyArray = this.variadicBuffers + ) { + return new Data(type, offset, length, nullCount, buffers, children, this.dictionary, variadicBuffers); } public slice(offset: number, length: number): Data { @@ -214,12 +246,13 @@ export class Data { const buffers = this._sliceBuffers(offset, length, stride, typeId); return this.clone(this.type, this.offset + offset, length, nullCount, buffers, // Don't slice children if we have value offsets (the variable-width types) - (children.length === 0 || this.valueOffsets) ? children : this._sliceChildren(children, childStride * offset, childStride * length)); + (children.length === 0 || this.valueOffsets) ? children : this._sliceChildren(children, childStride * offset, childStride * length), + this.variadicBuffers); } public _changeLengthAndBackfillNullBitmap(newLength: number): Data { if (this.typeId === Type.Null) { - return this.clone(this.type, 0, newLength, 0); + return this.clone(this.type, 0, newLength, 0, this.buffers, this.children, this.variadicBuffers); } const { length, nullCount } = this; // start initialized with 0s (nulls), then fill from 0 to length with 1s (not null) @@ -232,7 +265,7 @@ export class Data { } const buffers = this.buffers; buffers[BufferType.VALIDITY] = bitmap; - return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers); + return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers, this.children, this.variadicBuffers); } protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers { @@ -240,10 +273,15 @@ export class Data { const { buffers } = this; // If typeIds exist, slice the typeIds buffer (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); - // If offsets exist, only slice the offsets buffer - (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || - // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes - (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? arr : arr.subarray(stride * offset, stride * (offset + length))); + if (DataType.isBinaryView(this.type) || DataType.isUtf8View(this.type)) { + const width = BinaryView.ELEMENT_WIDTH; + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = arr.subarray(offset * width, (offset + length) * width)); + } else { + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? arr : arr.subarray(stride * offset, stride * (offset + length))); + } return buffers; } @@ -256,7 +294,7 @@ export class Data { import { Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -311,6 +349,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitUtf8View(props: Utf8ViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / Utf8View.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? -1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeUtf8(props: LargeUtf8DataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -327,6 +374,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitBinaryView(props: BinaryViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / BinaryView.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? -1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeBinary(props: LargeBinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -455,8 +511,10 @@ interface IntervalDataProps extends DataProps_ { data?: D interface DurationDataProps extends DataProps_ { data?: DataBuffer } interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface BinaryViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeBinaryDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface Utf8ViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } @@ -481,8 +539,10 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends LargeBinary /* */ ? LargeBinaryDataProps : + T extends BinaryView /* */ ? BinaryViewDataProps : T extends Utf8 /* */ ? Utf8DataProps : T extends LargeUtf8 /* */ ? LargeUtf8DataProps : + T extends Utf8View /* */ ? Utf8ViewDataProps : T extends List /* */ ? ListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : @@ -507,10 +567,12 @@ export function makeData(props: TimestampDataProps): Dat export function makeData(props: IntervalDataProps): Data; export function makeData(props: DurationDataProps): Data; export function makeData(props: FixedSizeBinaryDataProps): Data; +export function makeData(props: BinaryViewDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: LargeBinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; +export function makeData(props: Utf8ViewDataProps): Data; export function makeData(props: ListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; diff --git a/src/enum.ts b/src/enum.ts index 73d95538..514a8168 100644 --- a/src/enum.ts +++ b/src/enum.ts @@ -70,6 +70,8 @@ export enum Type { Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ + BinaryView = 23, /** Variable-length binary values backed by inline-or-referenced views */ + Utf8View = 24, /** Variable-length UTF8 string values backed by inline-or-referenced views */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, diff --git a/src/fb/File.ts b/src/fb/File.ts new file mode 100644 index 00000000..12c6f822 --- /dev/null +++ b/src/fb/File.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +export { Binary } from './binary.js'; +export { BinaryView } from './binary-view.js'; +export { Block } from './block.js'; +export { Bool } from './bool.js'; +export { Buffer } from './buffer.js'; +export { Date } from './date.js'; +export { DateUnit } from './date-unit.js'; +export { Decimal } from './decimal.js'; +export { DictionaryEncoding } from './dictionary-encoding.js'; +export { DictionaryKind } from './dictionary-kind.js'; +export { Duration } from './duration.js'; +export { Endianness } from './endianness.js'; +export { Feature } from './feature.js'; +export { Field } from './field.js'; +export { FixedSizeBinary } from './fixed-size-binary.js'; +export { FixedSizeList } from './fixed-size-list.js'; +export { FloatingPoint } from './floating-point.js'; +export { Footer } from './footer.js'; +export { Int } from './int.js'; +export { Interval } from './interval.js'; +export { IntervalUnit } from './interval-unit.js'; +export { KeyValue } from './key-value.js'; +export { LargeBinary } from './large-binary.js'; +export { LargeList } from './large-list.js'; +export { LargeListView } from './large-list-view.js'; +export { LargeUtf8 } from './large-utf8.js'; +export { List } from './list.js'; +export { ListView } from './list-view.js'; +export { Map } from './map.js'; +export { MetadataVersion } from './metadata-version.js'; +export { Null } from './null.js'; +export { Precision } from './precision.js'; +export { RunEndEncoded } from './run-end-encoded.js'; +export { Schema } from './schema.js'; +export { Struct_ } from './struct-.js'; +export { Time } from './time.js'; +export { TimeUnit } from './time-unit.js'; +export { Timestamp } from './timestamp.js'; +export { Type } from './type.js'; +export { Union } from './union.js'; +export { UnionMode } from './union-mode.js'; +export { Utf8 } from './utf8.js'; +export { Utf8View } from './utf8-view.js'; diff --git a/src/fb/binary-view.ts b/src/fb/binary-view.ts new file mode 100644 index 00000000..f91f910f --- /dev/null +++ b/src/fb/binary-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Binary, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. + */ +export class BinaryView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):BinaryView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startBinaryView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + BinaryView.startBinaryView(builder); + return BinaryView.endBinaryView(builder); +} +} diff --git a/src/fb/large-list-view.ts b/src/fb/large-list-view.ts new file mode 100644 index 00000000..5785cd3f --- /dev/null +++ b/src/fb/large-list-view.ts @@ -0,0 +1,42 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Same as ListView, but with 64-bit offsets and sizes, allowing to represent + * extremely large data values. + */ +export class LargeListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):LargeListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startLargeListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + LargeListView.startLargeListView(builder); + return LargeListView.endLargeListView(builder); +} +} diff --git a/src/fb/list-view.ts b/src/fb/list-view.ts new file mode 100644 index 00000000..f9afae01 --- /dev/null +++ b/src/fb/list-view.ts @@ -0,0 +1,43 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Represents the same logical types that List can, but contains offsets and + * sizes allowing for writes in any order and sharing of child values among + * list values. + */ +export class ListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):ListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createListView(builder:flatbuffers.Builder):flatbuffers.Offset { + ListView.startListView(builder); + return ListView.endListView(builder); +} +} diff --git a/src/fb/message.ts b/src/fb/message.ts index d752b91b..d3518599 100644 --- a/src/fb/message.ts +++ b/src/fb/message.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { KeyValue } from './key-value.js'; diff --git a/src/fb/record-batch.ts b/src/fb/record-batch.ts index 00681999..e6f41d02 100644 --- a/src/fb/record-batch.ts +++ b/src/fb/record-batch.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { BodyCompression } from './body-compression.js'; @@ -78,8 +80,34 @@ compression(obj?:BodyCompression):BodyCompression|null { return offset ? (obj || new BodyCompression()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) : null; } +/** + * Some types such as Utf8View are represented using a variable number of buffers. + * For each such Field in the pre-ordered flattened logical schema, there will be + * an entry in variadicBufferCounts to indicate the number of number of variadic + * buffers which belong to that Field in the current RecordBatch. + * + * For example, the schema + * col1: Struct + * col2: Utf8View + * contains two Fields with variadic buffers so variadicBufferCounts will have + * two entries, the first counting the variadic buffers of `col1.beta` and the + * second counting `col2`'s. + * + * This field may be omitted if and only if the schema contains no Fields with + * a variable number of buffers, such as BinaryView and Utf8View. + */ +variadicBufferCounts(index: number):bigint|null { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? this.bb!.readInt64(this.bb!.__vector(this.bb_pos + offset) + index * 8) : BigInt(0); +} + +variadicBufferCountsLength():number { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; +} + static startRecordBatch(builder:flatbuffers.Builder) { - builder.startObject(4); + builder.startObject(5); } static addLength(builder:flatbuffers.Builder, length:bigint) { @@ -106,6 +134,22 @@ static addCompression(builder:flatbuffers.Builder, compressionOffset:flatbuffers builder.addFieldOffset(3, compressionOffset, 0); } +static addVariadicBufferCounts(builder:flatbuffers.Builder, variadicBufferCountsOffset:flatbuffers.Offset) { + builder.addFieldOffset(4, variadicBufferCountsOffset, 0); +} + +static createVariadicBufferCountsVector(builder:flatbuffers.Builder, data:bigint[]):flatbuffers.Offset { + builder.startVector(8, data.length, 8); + for (let i = data.length - 1; i >= 0; i--) { + builder.addInt64(data[i]!); + } + return builder.endVector(); +} + +static startVariadicBufferCountsVector(builder:flatbuffers.Builder, numElems:number) { + builder.startVector(8, numElems, 8); +} + static endRecordBatch(builder:flatbuffers.Builder):flatbuffers.Offset { const offset = builder.endObject(); return offset; diff --git a/src/fb/schema.ts b/src/fb/schema.ts index 394883eb..daae447e 100644 --- a/src/fb/schema.ts +++ b/src/fb/schema.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { Endianness } from './endianness.js'; @@ -133,14 +135,6 @@ static endSchema(builder:flatbuffers.Builder):flatbuffers.Offset { return offset; } -static finishSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset); -} - -static finishSizePrefixedSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset, undefined, true); -} - static createSchema(builder:flatbuffers.Builder, endianness:Endianness, fieldsOffset:flatbuffers.Offset, customMetadataOffset:flatbuffers.Offset, featuresOffset:flatbuffers.Offset):flatbuffers.Offset { Schema.startSchema(builder); Schema.addEndianness(builder, endianness); diff --git a/src/fb/type.ts b/src/fb/type.ts index 8eb87042..8f913d01 100644 --- a/src/fb/type.ts +++ b/src/fb/type.ts @@ -1,6 +1,9 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import { Binary } from './binary.js'; +import { BinaryView } from './binary-view.js'; import { Bool } from './bool.js'; import { Date } from './date.js'; import { Decimal } from './decimal.js'; @@ -12,8 +15,10 @@ import { Int } from './int.js'; import { Interval } from './interval.js'; import { LargeBinary } from './large-binary.js'; import { LargeList } from './large-list.js'; +import { LargeListView } from './large-list-view.js'; import { LargeUtf8 } from './large-utf8.js'; import { List } from './list.js'; +import { ListView } from './list-view.js'; import { Map } from './map.js'; import { Null } from './null.js'; import { RunEndEncoded } from './run-end-encoded.js'; @@ -22,6 +27,7 @@ import { Time } from './time.js'; import { Timestamp } from './timestamp.js'; import { Union } from './union.js'; import { Utf8 } from './utf8.js'; +import { Utf8View } from './utf8-view.js'; /** @@ -52,15 +58,19 @@ export enum Type { LargeBinary = 19, LargeUtf8 = 20, LargeList = 21, - RunEndEncoded = 22 + RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26 } export function unionToType( type: Type, - accessor: (obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { + accessor: (obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(new Null())! as Null; case 'Int': return accessor(new Int())! as Int; case 'FloatingPoint': return accessor(new FloatingPoint())! as FloatingPoint; @@ -83,17 +93,21 @@ export function unionToType( case 'LargeUtf8': return accessor(new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(new Utf8View())! as Utf8View; + case 'ListView': return accessor(new ListView())! as ListView; + case 'LargeListView': return accessor(new LargeListView())! as LargeListView; default: return null; } } export function unionListToType( - type: Type, - accessor: (index: number, obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null, + type: Type, + accessor: (index: number, obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null, index: number -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(index, new Null())! as Null; case 'Int': return accessor(index, new Int())! as Int; case 'FloatingPoint': return accessor(index, new FloatingPoint())! as FloatingPoint; @@ -116,6 +130,10 @@ export function unionListToType( case 'LargeUtf8': return accessor(index, new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(index, new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(index, new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(index, new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(index, new Utf8View())! as Utf8View; + case 'ListView': return accessor(index, new ListView())! as ListView; + case 'LargeListView': return accessor(index, new LargeListView())! as LargeListView; default: return null; } } diff --git a/src/fb/utf8-view.ts b/src/fb/utf8-view.ts new file mode 100644 index 00000000..886a9df7 --- /dev/null +++ b/src/fb/utf8-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Utf8, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. + */ +export class Utf8View { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):Utf8View { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startUtf8View(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + Utf8View.startUtf8View(builder); + return Utf8View.endUtf8View(builder); +} +} diff --git a/src/interfaces.ts b/src/interfaces.ts index 0645753b..eea88bd4 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -212,6 +212,7 @@ export type TypeToDataType = { [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.LargeBinary]: type.LargeBinary; + [Type.BinaryView]: type.BinaryView; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; [Type.DateDay]: type.DateDay; @@ -244,6 +245,7 @@ export type TypeToDataType = { [Type.Struct]: type.Struct; [Type.Dictionary]: type.Dictionary; [Type.FixedSizeList]: type.FixedSizeList; + [Type.Utf8View]: type.Utf8View; }[T]; /** @ignore */ @@ -268,6 +270,7 @@ type TypeToBuilder = { [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.LargeBinary]: LargeBinaryBuilder; + [Type.BinaryView]: Builder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; [Type.DateDay]: DateDayBuilder; @@ -300,6 +303,7 @@ type TypeToBuilder = { [Type.Struct]: StructBuilder; [Type.Dictionary]: DictionaryBuilder; [Type.FixedSizeList]: FixedSizeListBuilder; + [Type.Utf8View]: Builder; }[T]; /** @ignore */ @@ -324,6 +328,7 @@ type DataTypeToBuilder = { [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder : never; + [Type.BinaryView]: T extends type.BinaryView ? Builder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; [Type.DateDay]: T extends type.DateDay ? DateDayBuilder : never; @@ -356,4 +361,5 @@ type DataTypeToBuilder = { [Type.Struct]: T extends type.Struct ? StructBuilder : never; [Type.Dictionary]: T extends type.Dictionary ? DictionaryBuilder : never; [Type.FixedSizeList]: T extends type.FixedSizeList ? FixedSizeListBuilder : never; + [Type.Utf8View]: T extends type.Utf8View ? Builder : never; }[T['TType']]; diff --git a/src/ipc/message.ts b/src/ipc/message.ts index 3dc86252..40a65439 100644 --- a/src/ipc/message.ts +++ b/src/ipc/message.ts @@ -204,6 +204,8 @@ export class JSONMessageReader extends MessageReader { ...(column['TYPE_ID'] && [column['TYPE_ID']] || []), ...(column['OFFSET'] && [column['OFFSET']] || []), ...(column['DATA'] && [column['DATA']] || []), + ...(column['VIEWS'] && [column['VIEWS']] || []), + ...(column['VARIADIC_DATA_BUFFERS'] || []), ...flattenDataSources(column['children']) ], [] as any[][]); } diff --git a/src/ipc/metadata/json.ts b/src/ipc/metadata/json.ts index 15f87189..8aed54ec 100644 --- a/src/ipc/metadata/json.ts +++ b/src/ipc/metadata/json.ts @@ -18,7 +18,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -41,7 +41,8 @@ export function recordBatchFromJSON(b: any) { b['count'], fieldNodesFromJSON(b['columns']), buffersFromJSON(b['columns']), - null + null, + variadicBufferCountsFromJSON(b['columns']) ); } @@ -83,6 +84,13 @@ function buffersFromJSON(xs: any[], buffers: BufferRegion[] = []): BufferRegion[ column['TYPE_ID'] && buffers.push(new BufferRegion(buffers.length, column['TYPE_ID'].length)); column['OFFSET'] && buffers.push(new BufferRegion(buffers.length, column['OFFSET'].length)); column['DATA'] && buffers.push(new BufferRegion(buffers.length, column['DATA'].length)); + column['VIEWS'] && buffers.push(new BufferRegion(buffers.length, column['VIEWS'].length)); + // Handle variadic buffers for view types (BinaryView, Utf8View) + if (column['VARIADIC_DATA_BUFFERS']) { + for (const buf of column['VARIADIC_DATA_BUFFERS']) { + buffers.push(new BufferRegion(buffers.length, buf.length)); + } + } buffers = buffersFromJSON(column['children'], buffers); } return buffers; @@ -93,6 +101,15 @@ function nullCountFromJSON(validity: number[]) { return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); } +/** @ignore */ +function variadicBufferCountsFromJSON(xs: any[]): number[] { + return (xs || []).reduce((counts, column: any) => [ + ...counts, + ...(column['VARIADIC_DATA_BUFFERS'] ? [column['VARIADIC_DATA_BUFFERS'].length] : []), + ...variadicBufferCountsFromJSON(column['children']) + ], [] as number[]); +} + /** @ignore */ export function fieldFromJSON(_field: any, dictionaries?: Map) { @@ -149,8 +166,10 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'largebinary': return new LargeBinary(); + case 'binaryview': return new BinaryView(); case 'utf8': return new Utf8(); case 'largeutf8': return new LargeUtf8(); + case 'utf8view': return new Utf8View(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/src/ipc/metadata/message.ts b/src/ipc/metadata/message.ts index 17e8897b..b41ec4a5 100644 --- a/src/ipc/metadata/message.ts +++ b/src/ipc/metadata/message.ts @@ -57,7 +57,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -156,20 +156,24 @@ export class RecordBatch { protected _nodes: FieldNode[]; protected _buffers: BufferRegion[]; protected _compression: BodyCompression | null; + protected _variadicBufferCounts: number[]; public get nodes() { return this._nodes; } public get length() { return this._length; } public get buffers() { return this._buffers; } public get compression() { return this._compression; } + public get variadicBufferCounts() { return this._variadicBufferCounts; } constructor( length: bigint | number, nodes: FieldNode[], buffers: BufferRegion[], - compression: BodyCompression | null + compression: BodyCompression | null, + variadicBufferCounts: number[] = [] ) { this._nodes = nodes; this._buffers = buffers; this._length = bigIntToNumber(length); this._compression = compression; + this._variadicBufferCounts = variadicBufferCounts; } } @@ -334,7 +338,8 @@ function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V5) { batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version), - decodeBodyCompression(batch.compression()) + decodeBodyCompression(batch.compression()), + decodeVariadicBufferCounts(batch) ); return recordBatch; } @@ -382,6 +387,16 @@ function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) { return bufferRegions; } +/** @ignore */ +function decodeVariadicBufferCounts(batch: _RecordBatch) { + const counts = [] as number[]; + const length = Math.trunc(batch.variadicBufferCountsLength()); + for (let i = 0; i < length; ++i) { + counts.push(bigIntToNumber(batch.variadicBufferCounts(i)!)); + } + return counts; +} + /** @ignore */ function decodeSchemaFields(schema: _Schema, dictionaries?: Map) { const fields = [] as Field[]; @@ -468,8 +483,10 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['LargeBinary']: return new LargeBinary(); + case Type['BinaryView']: return new BinaryView(); case Type['Utf8']: return new Utf8(); case Type['LargeUtf8']: return new LargeUtf8(); + case Type['Utf8View']: return new Utf8View(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); case Type['Struct_']: return new Struct(children || []); @@ -614,6 +631,7 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { const nodes = recordBatch.nodes || []; const buffers = recordBatch.buffers || []; + const variadicBufferCounts = recordBatch.variadicBufferCounts || []; _RecordBatch.startNodesVector(b, nodes.length); for (const n of nodes.slice().reverse()) FieldNode.encode(b, n); @@ -630,6 +648,11 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { bodyCompressionOffset = encodeBodyCompression(b, recordBatch.compression); } + let variadicBufferCountsOffset = -1; + if (variadicBufferCounts.length > 0) { + variadicBufferCountsOffset = _RecordBatch.createVariadicBufferCountsVector(b, variadicBufferCounts.map(BigInt)); + } + _RecordBatch.startRecordBatch(b); _RecordBatch.addLength(b, BigInt(recordBatch.length)); _RecordBatch.addNodes(b, nodesVectorOffset); @@ -637,6 +660,9 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { if (recordBatch.compression !== null && bodyCompressionOffset) { _RecordBatch.addCompression(b, bodyCompressionOffset); } + if (variadicBufferCountsOffset !== -1) { + _RecordBatch.addVariadicBufferCounts(b, variadicBufferCountsOffset); + } return _RecordBatch.endRecordBatch(b); } diff --git a/src/ipc/reader.ts b/src/ipc/reader.ts index e36eeb52..af49f372 100644 --- a/src/ipc/reader.ts +++ b/src/ipc/reader.ts @@ -397,7 +397,8 @@ abstract class RecordBatchReaderImpl implements RecordB header.data.length, header.data.nodes, buffers, - null + null, + header.data.variadicBufferCounts ), id, isDelta) } else { throw new Error('Dictionary batch is compressed but codec not found'); @@ -412,11 +413,11 @@ abstract class RecordBatchReaderImpl implements RecordB } protected _loadVectors(header: metadata.RecordBatch, body: Uint8Array, types: (Field | DataType)[]) { - return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } protected _loadCompressedVectors(header: metadata.RecordBatch, body: Uint8Array[], types: (Field | DataType)[]) { - return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } private _decompressBuffers(header: metadata.RecordBatch, body: Uint8Array, codec: Codec): { decommpressedBody: Uint8Array[]; buffers: metadata.BufferRegion[] } { @@ -757,7 +758,7 @@ class RecordBatchJSONReaderImpl extends RecordBatchStre super(source, dictionaries); } protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { - return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } } diff --git a/src/ipc/writer.ts b/src/ipc/writer.ts index 17c8f0b6..0b13fdfc 100644 --- a/src/ipc/writer.ts +++ b/src/ipc/writer.ts @@ -274,8 +274,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeRecordBatch(batch: RecordBatch) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(batch); - const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(batch); + const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression, variadicBufferCounts); const message = Message.from(recordBatch, byteLength); return this ._writeDictionaries(batch) @@ -284,11 +284,11 @@ export class RecordBatchWriter extends ReadableInterop< } protected _assembleRecordBatch(batch: RecordBatch | Vector) { - let { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(batch); + let { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = VectorAssembler.assemble(batch); if (this._compression != null) { ({ byteLength, bufferRegions, buffers } = this._compressBodyBuffers(buffers)); } - return { byteLength, nodes, bufferRegions, buffers }; + return { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts }; } protected _compressBodyBuffers(buffers: ArrayBufferView[]) { @@ -337,8 +337,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(new Vector([dictionary])); - const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(new Vector([dictionary])); + const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression, variadicBufferCounts); const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta); const message = Message.from(dictionaryBatch, byteLength); return this diff --git a/src/type.ts b/src/type.ts index ea5e24fa..f1fc3fcc 100644 --- a/src/type.ts +++ b/src/type.ts @@ -58,8 +58,10 @@ export abstract class DataType { })(Binary.prototype); } +/** @ignore */ +export interface BinaryView extends DataType { + TArray: Uint8Array; + TValue: Uint8Array; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class BinaryView extends DataType { + public static readonly ELEMENT_WIDTH = 16; + public static readonly INLINE_CAPACITY = 12; + public static readonly LENGTH_OFFSET = 0; + public static readonly INLINE_OFFSET = 4; + public static readonly BUFFER_INDEX_OFFSET = 8; + public static readonly BUFFER_OFFSET_OFFSET = 12; + constructor() { + super(Type.BinaryView); + } + public toString() { return `BinaryView`; } + protected static [Symbol.toStringTag] = ((proto: BinaryView) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'BinaryView'; + })(BinaryView.prototype); +} + /** @ignore */ export interface LargeBinary extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -269,6 +297,7 @@ export class LargeBinary extends DataType { })(LargeBinary.prototype); } +/** @ignore */ /** @ignore */ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ @@ -283,6 +312,26 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface Utf8View extends DataType { + TArray: Uint8Array; + TValue: string; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class Utf8View extends DataType { + public static readonly ELEMENT_WIDTH = BinaryView.ELEMENT_WIDTH; + public static readonly INLINE_CAPACITY = BinaryView.INLINE_CAPACITY; + constructor() { + super(Type.Utf8View); + } + public toString() { return `Utf8View`; } + protected static [Symbol.toStringTag] = ((proto: Utf8View) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8View'; + })(Utf8View.prototype); +} + /** @ignore */ export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -298,6 +347,7 @@ export class LargeUtf8 extends DataType { })(LargeUtf8.prototype); } +/** @ignore */ /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -759,6 +809,8 @@ export function strideForType(type: DataType) { } // case Type.Int: return 1 + +((t as Int_).bitWidth > 32); // case Type.Time: return 1 + +((t as Time_).bitWidth > 32); + case Type.BinaryView: + case Type.Utf8View: return 16; case Type.FixedSizeList: return (t as FixedSizeList).listSize; case Type.FixedSizeBinary: return (t as FixedSizeBinary).byteWidth; default: return 1; diff --git a/src/visitor.ts b/src/visitor.ts index 977e0a4e..a6d27a76 100644 --- a/src/visitor.ts +++ b/src/visitor.ts @@ -37,8 +37,10 @@ export abstract class Visitor { public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } + public visitUtf8View(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitLargeBinary(_node: any, ..._args: any[]): any { return null; } + public visitBinaryView(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } public visitTimestamp(_node: any, ..._args: any[]): any { return null; } @@ -92,8 +94,10 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; + case Type.Utf8View: fn = visitor.visitUtf8View || visitor.visitUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.LargeBinary: fn = visitor.visitLargeBinary; break; + case Type.BinaryView: fn = visitor.visitBinaryView || visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; case Type.DateDay: fn = visitor.visitDateDay || visitor.visitDate; break; @@ -157,8 +161,10 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.LargeBinary: return Type.LargeBinary; + case Type.BinaryView: return Type.BinaryView; case Type.Utf8: return Type.Utf8; case Type.LargeUtf8: return Type.LargeUtf8; + case Type.Utf8View: return Type.Utf8View; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: diff --git a/src/visitor/builderctor.ts b/src/visitor/builderctor.ts index 791576b0..ca7669a8 100644 --- a/src/visitor/builderctor.ts +++ b/src/visitor/builderctor.ts @@ -42,6 +42,8 @@ import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecond import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; import { LargeUtf8Builder } from '../builder/largeutf8.js'; +import { BinaryViewBuilder } from '../builder/binaryview.js'; +import { Utf8ViewBuilder } from '../builder/utf8view.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -104,6 +106,8 @@ export class GetBuilderCtor extends Visitor { public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } + public visitBinaryView() { return BinaryViewBuilder; } + public visitUtf8View() { return Utf8ViewBuilder; } } /** @ignore */ diff --git a/src/visitor/get.ts b/src/visitor/get.ts index a5502dd3..b914624e 100644 --- a/src/visitor/get.ts +++ b/src/visitor/get.ts @@ -28,7 +28,7 @@ import { uint16ToFloat64 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -63,8 +63,10 @@ export interface GetVisitor extends Visitor { visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; visitLargeUtf8(data: Data, index: number): T['TValue'] | null; + visitUtf8View(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitLargeBinary(data: Data, index: number): T['TValue'] | null; + visitBinaryView(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; visitDateDay(data: Data, index: number): T['TValue'] | null; @@ -109,6 +111,9 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */const epochDaysToMs = (data: Int32Array, index: number) => 86400000 * data[index]; +const BINARY_VIEW_SIZE = 16; +const BINARY_VIEW_INLINE_CAPACITY = 12; + /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ @@ -149,10 +154,52 @@ const getFixedSizeBinary = ({ stride, values }: Data< /** @ignore */ const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ +const getBinaryViewBytes = (data: Data, index: number): Uint8Array => { + const values = data.values as Uint8Array; + if (!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const viewOffset = index * BINARY_VIEW_SIZE; + const end = viewOffset + BINARY_VIEW_SIZE; + if (viewOffset < 0 || end > values.length) { + throw new Error(`BinaryView data buffer is too short: expected ${BINARY_VIEW_SIZE} bytes, got ${Math.max(0, values.length - viewOffset)}`); + } + // Get the 16-byte view struct from the values array + const viewStruct = values.subarray(viewOffset, end); + if (viewStruct.length < BINARY_VIEW_SIZE) { + throw new Error(`BinaryView data buffer is too short: expected ${BINARY_VIEW_SIZE} bytes, got ${viewStruct.length}`); + } + const view = new DataView(values.buffer, viewStruct.byteOffset, BINARY_VIEW_SIZE); + const size = view.getInt32(0, true); + if (size <= 0) { + return new Uint8Array(0); + } + if (size <= BINARY_VIEW_INLINE_CAPACITY) { + // Inline data is in bytes 4-15 of the view struct + return viewStruct.subarray(4, 4 + size); + } + const bufferIndex = view.getInt32(8, true); + const offset = view.getInt32(12, true); + const variadicBuffer = data.variadicBuffers?.[bufferIndex]; + if (!variadicBuffer) { + throw new Error(`BinaryView variadic buffer ${bufferIndex} is missing`); + } + return variadicBuffer.subarray(offset, offset + size); +}; +/** @ignore */ +const getBinaryViewValue = (data: Data, index: number): T['TValue'] => { + return getBinaryViewBytes(data, index) as T['TValue']; +}; +/** @ignore */ const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getUtf8ViewValue = (data: Data, index: number): T['TValue'] => { + const bytes = getBinaryViewBytes(data, index); + return decodeUtf8(bytes); +}; /* istanbul ignore next */ /** @ignore */ @@ -332,8 +379,10 @@ GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitUtf8View = wrapGet(getUtf8ViewValue); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitLargeBinary = wrapGet(getBinary); +GetVisitor.prototype.visitBinaryView = wrapGet(getBinaryViewValue); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); GetVisitor.prototype.visitDateDay = wrapGet(getDateDay); diff --git a/src/visitor/indexof.ts b/src/visitor/indexof.ts index 3a4d1171..6881f99f 100644 --- a/src/visitor/indexof.ts +++ b/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -59,8 +59,10 @@ export interface IndexOfVisitor extends Visitor { visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitUtf8View(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeBinary(data: Data, value: T['TValue'] | null, index?: number): number; + visitBinaryView(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; visitDateDay(data: Data, value: T['TValue'] | null, index?: number): number; @@ -177,8 +179,10 @@ IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitUtf8View = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitLargeBinary = indexOfValue; +IndexOfVisitor.prototype.visitBinaryView = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; IndexOfVisitor.prototype.visitDateDay = indexOfValue; diff --git a/src/visitor/iterator.ts b/src/visitor/iterator.ts index 9f2844b3..ef54504c 100644 --- a/src/visitor/iterator.ts +++ b/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,8 +57,10 @@ export interface IteratorVisitor extends Visitor { visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; visitLargeUtf8(vector: Vector): IterableIterator; + visitUtf8View(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitLargeBinary(vector: Vector): IterableIterator; + visitBinaryView(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; visitDateDay(vector: Vector): IterableIterator; @@ -164,8 +166,10 @@ IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; +IteratorVisitor.prototype.visitUtf8View = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitLargeBinary = vectorIterator; +IteratorVisitor.prototype.visitBinaryView = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; IteratorVisitor.prototype.visitDateDay = vectorIterator; diff --git a/src/visitor/jsontypeassembler.ts b/src/visitor/jsontypeassembler.ts index 823b1dea..cf110038 100644 --- a/src/visitor/jsontypeassembler.ts +++ b/src/visitor/jsontypeassembler.ts @@ -45,6 +45,9 @@ export class JSONTypeAssembler extends Visitor { public visitLargeBinary({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitBinaryView({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitBool({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } @@ -54,6 +57,9 @@ export class JSONTypeAssembler extends Visitor { public visitLargeUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitUtf8View({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index 6841b39d..2f4973ad 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -28,7 +28,7 @@ import { toIntervalDayTimeObjects, toIntervalMonthDayNanoObjects } from '../util import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, } from '../type.js'; /** @ignore */ @@ -46,6 +46,8 @@ export interface JSONVectorAssembler extends Visitor { visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitLargeBinary(data: Data): { DATA: string[]; OFFSET: string[] }; + visitBinaryView(data: Data): { VIEWS: any[]; VARIADIC_DATA_BUFFERS: string[] }; + visitUtf8View(data: Data): { VIEWS: any[]; VARIADIC_DATA_BUFFERS: string[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; visitTimestamp(data: Data): { DATA: string[] }; @@ -112,6 +114,15 @@ export class JSONVectorAssembler extends Visitor { public visitLargeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } + public visitBinaryView(data: Data) { + return binaryViewDataToJSON(data, (bytes) => Array.from(bytes) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase()); + } + public visitUtf8View(data: Data) { + return binaryViewDataToJSON(data, (bytes) => Array.from(bytes).map(b => String.fromCodePoint(b)).join('')); + } public visitFixedSizeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))] }; } @@ -195,3 +206,46 @@ function* bigNumsToStrings(values: BigUint64Array | BigInt64Array | Uint32Array yield `${BN.new(u32s.subarray((i + 0) * stride, (i + 1) * stride), false)}`; } } + +/** @ignore */ +function binaryViewDataToJSON(data: Data | Data, formatInlined: (bytes: Uint8Array) => string) { + const INLINE_SIZE = 12; + const viewsData = data.values; + const dataView = new DataView(viewsData.buffer, viewsData.byteOffset, viewsData.byteLength); + const numViews = viewsData.byteLength / 16; + const bytesToHex = (bytes: Uint8Array) => + Array.from(bytes) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase(); + const parsedViews = Array.from({ length: numViews }, (_, i) => { + const offset = i * 16; + const size = dataView.getInt32(offset, true); + return [offset, size]; + }).map(([offset, size]) => (size > INLINE_SIZE) ? { + 'SIZE': size, + 'PREFIX_HEX': bytesToHex(viewsData.subarray(offset + 4, offset + 8)), + 'BUFFER_INDEX': dataView.getInt32(offset + 8, true), + 'OFFSET': dataView.getInt32(offset + 12, true) + } : { + 'SIZE': size, + 'INLINED': formatInlined(viewsData.subarray(offset + 4, offset + 4 + size)) + }); + const uniqueBufferIndices = [...new Set( + parsedViews + .map(v => v['BUFFER_INDEX']) + .filter((idx): idx is number => idx !== undefined) + )]; + const variadicBuffers = uniqueBufferIndices.map(bufferIndex => + bytesToHex(data.variadicBuffers[bufferIndex]) + ); + const bufferIndexMap = new Map( + uniqueBufferIndices.map((bufferIndex, outputIndex) => [bufferIndex, outputIndex]) + ); + // Remap buffer indices in views + const views = parsedViews.map(v => v['BUFFER_INDEX'] !== undefined + ? { ...v, 'BUFFER_INDEX': bufferIndexMap.get(v['BUFFER_INDEX']) } + : v + ); + return { 'VIEWS': views, 'VARIADIC_DATA_BUFFERS': variadicBuffers }; +} diff --git a/src/visitor/set.ts b/src/visitor/set.ts index 4bf632ba..90501937 100644 --- a/src/visitor/set.ts +++ b/src/visitor/set.ts @@ -26,7 +26,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -61,8 +61,10 @@ export interface SetVisitor extends Visitor { visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; + visitUtf8View(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitLargeBinary(data: Data, index: number, value: T['TValue']): void; + visitBinaryView(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; visitDateDay(data: Data, index: number, value: T['TValue']): void; @@ -155,7 +157,63 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ +const ensureWritableVariadicBuffers = (data: Data): Uint8Array[] => { + let buffers = data.variadicBuffers as unknown as Uint8Array[]; + if (!Array.isArray(buffers) || Object.isFrozen(buffers)) { + buffers = Array.from(buffers) as Uint8Array[]; + (data as any).variadicBuffers = buffers; + } + return buffers; +}; +/** @ignore */ +const setBinaryViewBytes = (data: Data, index: number, bytes: Uint8Array) => { + const views = data.values as Uint8Array | undefined; + if (!views) { + throw new Error('BinaryView data is missing view buffer'); + } + const elementWidth = BinaryView.ELEMENT_WIDTH; + const viewOffset = index * elementWidth; + const end = viewOffset + elementWidth; + if (viewOffset < 0 || end > views.length) { + throw new RangeError(`BinaryView index ${index} out of bounds`); + } + + views.fill(0, viewOffset, end); + + const view = new DataView(views.buffer, views.byteOffset + viewOffset, elementWidth); + const length = bytes.length; + view.setInt32(BinaryView.LENGTH_OFFSET, length, true); + + if (length <= BinaryView.INLINE_CAPACITY) { + views.set(bytes, viewOffset + BinaryView.INLINE_OFFSET); + return; + } + + const prefix = + (bytes[0] ?? 0) | + ((bytes[1] ?? 0) << 8) | + ((bytes[2] ?? 0) << 16) | + ((bytes[3] ?? 0) << 24); + view.setUint32(BinaryView.INLINE_OFFSET, prefix >>> 0, true); + + const buffers = ensureWritableVariadicBuffers(data); + const copy = bytes.slice(); + const bufferIndex = buffers.push(copy) - 1; + view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, bufferIndex, true); + view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, 0, true); +}; +/** @ignore */ +const setBinaryView = (data: Data, index: number, value: T['TValue']) => { + const bytes = value instanceof Uint8Array ? value : new Uint8Array(value); + setBinaryViewBytes(data as unknown as Data, index, bytes); +}; +/** @ignore */ const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +/** @ignore */ +const setUtf8View = (data: Data, index: number, value: T['TValue']) => { + const bytes = encodeUtf8(value); + setBinaryViewBytes(data as unknown as Data, index, bytes); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -359,8 +417,10 @@ SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitUtf8View = wrapSet(setUtf8View); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitLargeBinary = wrapSet(setBinary); +SetVisitor.prototype.visitBinaryView = wrapSet(setBinaryView); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); SetVisitor.prototype.visitDateDay = wrapSet(setDateDay); diff --git a/src/visitor/typeassembler.ts b/src/visitor/typeassembler.ts index 169f3627..d997f6cf 100644 --- a/src/visitor/typeassembler.ts +++ b/src/visitor/typeassembler.ts @@ -25,9 +25,11 @@ import { Null } from '../fb/null.js'; import { Int } from '../fb/int.js'; import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; +import { BinaryView } from '../fb/binary-view.js'; import { LargeBinary } from '../fb/large-binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { Utf8View } from '../fb/utf8-view.js'; import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; @@ -72,6 +74,10 @@ export class TypeAssembler extends Visitor { Binary.startBinary(b); return Binary.endBinary(b); } + public visitBinaryView(_node: T, b: Builder) { + BinaryView.startBinaryView(b); + return BinaryView.endBinaryView(b); + } public visitLargeBinary(_node: T, b: Builder) { LargeBinary.startLargeBinary(b); return LargeBinary.endLargeBinary(b); @@ -84,6 +90,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitUtf8View(_node: T, b: Builder) { + Utf8View.startUtf8View(b); + return Utf8View.endUtf8View(b); + } public visitLargeUtf8(_node: T, b: Builder) { LargeUtf8.startLargeUtf8(b); return LargeUtf8.endLargeUtf8(b); diff --git a/src/visitor/typecomparator.ts b/src/visitor/typecomparator.ts index 65413ccd..5c1d60a9 100644 --- a/src/visitor/typecomparator.ts +++ b/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,8 +55,10 @@ export interface TypeComparator extends Visitor { visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; visitLargeUtf8(type: T, other?: DataType | null): other is T; + visitUtf8View(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitLargeBinary(type: T, other?: DataType | null): other is T; + visitBinaryView(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; visitDateDay(type: T, other?: DataType | null): other is T; @@ -254,8 +256,10 @@ TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; TypeComparator.prototype.visitLargeUtf8 = compareAny; +TypeComparator.prototype.visitUtf8View = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitLargeBinary = compareAny; +TypeComparator.prototype.visitBinaryView = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; TypeComparator.prototype.visitDateDay = compareDate; diff --git a/src/visitor/vectorassembler.ts b/src/visitor/vectorassembler.ts index 7dc36955..2ac6f8fa 100644 --- a/src/visitor/vectorassembler.ts +++ b/src/visitor/vectorassembler.ts @@ -27,7 +27,7 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -115,11 +115,13 @@ export class VectorAssembler extends Visitor { public get buffers() { return this._buffers; } public get byteLength() { return this._byteLength; } public get bufferRegions() { return this._bufferRegions; } + public get variadicBufferCounts() { return this._variadicBufferCounts; } protected _byteLength = 0; protected _nodes: FieldNode[] = []; protected _buffers: ArrayBufferView[] = []; protected _bufferRegions: BufferRegion[] = []; + protected _variadicBufferCounts: number[] = []; } /** @ignore */ @@ -215,6 +217,22 @@ function assembleFlatListVector(this: VectorAssembler, data: Data) { + const { offset, length, stride, values, variadicBuffers = [] } = data; + if (!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const start = offset * stride; + const end = start + length * stride; + addBuffer.call(this, values.subarray(start, end)); + for (const buffer of variadicBuffers) { + addBuffer.call(this, buffer); + } + this._variadicBufferCounts.push(variadicBuffers.length); + return this; +} + /** @ignore */ function assembleListVector(this: VectorAssembler, data: Data) { const { length, valueOffsets } = data; @@ -239,8 +257,10 @@ VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitUtf8View = assembleBinaryViewVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitLargeBinary = assembleFlatListVector; +VectorAssembler.prototype.visitBinaryView = assembleBinaryViewVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; VectorAssembler.prototype.visitTimestamp = assembleFlatVector; diff --git a/src/visitor/vectorloader.ts b/src/visitor/vectorloader.ts index 7c82e7ab..a6892b28 100644 --- a/src/visitor/vectorloader.ts +++ b/src/visitor/vectorloader.ts @@ -44,13 +44,16 @@ export class VectorLoader extends Visitor { protected buffersIndex = -1; private dictionaries: Map>; private readonly metadataVersion: MetadataVersion; - constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5) { + private variadicBufferCounts: number[]; + private variadicBufferIndex = -1; + constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5, variadicBufferCounts: number[] = []) { super(); this.bytes = bytes; this.nodes = nodes; this.buffers = buffers; this.dictionaries = dictionaries; this.metadataVersion = metadataVersion; + this.variadicBufferCounts = variadicBufferCounts; } public visit(node: Field | T): Data { @@ -75,12 +78,38 @@ export class VectorLoader extends Visitor { public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitUtf8View(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ + type, + length, + nullCount, + nullBitmap, + ['views']: views, + ['variadicBuffers']: variadicBuffers + }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } public visitLargeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitBinaryView(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ + type, + length, + nullCount, + nullBitmap, + ['views']: views, + ['variadicBuffers']: variadicBuffers + }); + } public visitFixedSizeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); } @@ -142,6 +171,12 @@ export class VectorLoader extends Visitor { protected readData(_type: T, { length, offset } = this.nextBufferRange()) { return this.bytes.subarray(offset, offset + length); } + protected readVariadicBuffers(length: number) { + return Array.from({ length }, () => this.readData(null as any)); + } + protected nextVariadicBufferCount() { + return this.variadicBufferCounts[++this.variadicBufferIndex] ?? 0; + } protected readDictionary(type: T): Vector { return this.dictionaries.get(type.id)!; } @@ -150,8 +185,8 @@ export class VectorLoader extends Visitor { /** @ignore */ export class JSONVectorLoader extends VectorLoader { private sources: any[][]; - constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.sources = sources; } protected readNullBitmap(_type: T, nullCount: number, { offset } = this.nextBufferRange()) { @@ -175,6 +210,10 @@ export class JSONVectorLoader extends VectorLoader { return toArrayBufferView(Uint8Array, Int128.convertArray(sources[offset] as string[])); } else if (DataType.isBinary(type) || DataType.isLargeBinary(type) || DataType.isFixedSizeBinary(type)) { return binaryDataFromJSON(sources[offset] as string[]); + } else if (DataType.isBinaryView(type)) { + return binaryViewDataFromJSON(sources[offset] as any[]); + } else if (DataType.isUtf8View(type)) { + return utf8ViewDataFromJSON(sources[offset] as any[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { @@ -191,25 +230,107 @@ export class JSONVectorLoader extends VectorLoader { } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); } + protected readVariadicBuffers(length: number) { + // Per Arrow C++ reference implementation (cpp/src/arrow/ipc/reader.cc), + // each variadic buffer is stored as a separate buffer region, matching + // the IPC format where each is accessed via separate GetBuffer() calls. + // VARIADIC_DATA_BUFFERS in JSON is an array, but flattenDataSources spreads + // it so each hex string gets its own sources entry, maintaining 1:1 + // correspondence with BufferRegion entries. + const buffers: Uint8Array[] = []; + for (let i = 0; i < length; i++) { + const { offset } = this.nextBufferRange(); + // sources[offset] is 'any[]' but for variadic buffers it's actually a string + // after spreading in flattenDataSources. Cast necessary due to heterogeneous + // sources array structure (most fields are arrays, variadic elements are strings). + const hexString = this.sources[offset] as unknown as string; + buffers.push(hexStringToBytes(hexString)); + } + return buffers; + } +} + +/** @ignore */ +function hexStringToBytes(hexString: string): Uint8Array { + // Parse hex string per Arrow JSON integration format (uppercase hex encoding). + // Used for: VARIADIC_DATA_BUFFERS elements, Binary DATA (after join), + // BinaryView PREFIX_HEX and INLINED fields. + const data = new Uint8Array(hexString.length / 2); + for (let i = 0; i < hexString.length; i += 2) { + data[i >> 1] = Number.parseInt(hexString.slice(i, i + 2), 16); + } + return data; +} + +/** @ignore */ +function binaryDataFromJSON(values: string[]): Uint8Array { + // Arrow JSON Binary/LargeBinary/FixedSizeBinary format: + // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] (array of hex strings, one per value) + // Join all values into one continuous hex string, then parse to bytes. + return hexStringToBytes(values.join('')); } /** @ignore */ -function binaryDataFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. - const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = Number.parseInt(joined.slice(i, i + 2), 16); +function parseViewDataFromJSON(views: any[], parseInlined: (inlined: string) => Uint8Array) { + // Each view is a 16-byte struct: [length: i32, prefix/inlined: 12 bytes, buffer_index: i32, offset: i32] + const data = new Uint8Array(views.length * 16); + const dataView = new DataView(data.buffer); + + for (const [i, view] of views.entries()) { + const offset = i * 16; + const size = view['SIZE']; + + // Write size (int32 at byte 0) + dataView.setInt32(offset, size, true); + + if (view['INLINED'] !== undefined) { + // Inline view: parse INLINED field using provided callback + const bytes = parseInlined(view['INLINED']); + for (let j = 0; j < bytes.length && j < 12; j++) { + data[offset + 4 + j] = bytes[j]; + } + } else { + // Out-of-line view: write prefix, buffer_index, offset + const prefix = view['PREFIX_HEX']; + // Write 4-byte prefix at bytes 4-7 + for (let j = 0; j < 8 && j < prefix.length; j += 2) { + data[offset + 4 + (j >> 1)] = Number.parseInt(prefix.slice(j, j + 2), 16); + } + // Write buffer_index (int32 at byte 8) + dataView.setInt32(offset + 8, view['BUFFER_INDEX'], true); + // Write offset (int32 at byte 12) + dataView.setInt32(offset + 12, view['OFFSET'], true); + } } + return data; } +/** @ignore */ +function binaryViewDataFromJSON(views: any[]) { + return parseViewDataFromJSON(views, (inlined: string) => { + // BinaryView: INLINED is hex-encoded string + const bytes = new Uint8Array(inlined.length / 2); + for (let i = 0; i < inlined.length; i += 2) { + bytes[i >> 1] = Number.parseInt(inlined.slice(i, i + 2), 16); + } + return bytes; + }); +} + +/** @ignore */ +function utf8ViewDataFromJSON(views: any[]) { + return parseViewDataFromJSON(views, (inlined: string) => { + // Utf8View: INLINED is UTF-8 string - encode to bytes + const encoder = new TextEncoder(); + return encoder.encode(inlined); + }); +} + export class CompressedVectorLoader extends VectorLoader { private bodyChunks: Uint8Array[]; - constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.bodyChunks = bodyChunks; } protected readData(_type: T, _buffer = this.nextBufferRange()) { diff --git a/test/data/tables.ts b/test/data/tables.ts index e9674d9b..80950a5e 100644 --- a/test/data/tables.ts +++ b/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'largeBinary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'utf8View', 'binary', 'largeBinary', 'binaryView', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', 'intervalMonthDayNano', diff --git a/test/generate-test-data.ts b/test/generate-test-data.ts index de4a8269..f173633b 100644 --- a/test/generate-test-data.ts +++ b/test/generate-test-data.ts @@ -16,14 +16,14 @@ // under the License. import { - makeData, Vector, Visitor, DataType, TypeMap, + makeData, Vector, vectorFromArray, Visitor, DataType, TypeMap, Table, Schema, Field, RecordBatch, Null, Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, @@ -79,8 +79,10 @@ interface TestDataVectorGenerator extends Visitor { visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; visitLargeUtf8: typeof generateLargeUtf8; + visitUtf8View: typeof generateUtf8View; visitBinary: typeof generateBinary; visitLargeBinary: typeof generateLargeBinary; + visitBinaryView: typeof generateBinaryView; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; visitTimestamp: typeof generateTimestamp; @@ -106,8 +108,10 @@ TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; +TestDataVectorGenerator.prototype.visitUtf8View = generateUtf8View; TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitLargeBinary = generateLargeBinary; +TestDataVectorGenerator.prototype.visitBinaryView = generateBinaryView; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; TestDataVectorGenerator.prototype.visitTimestamp = generateTimestamp; @@ -222,8 +226,10 @@ export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); +export const utf8View = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8View(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const largeBinary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeBinary(), length, nullCount); +export const binaryView = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new BinaryView(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); export const dateMillisecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateMillisecond(), length, nullCount); @@ -252,7 +258,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, largeBinary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, utf8View, binary, largeBinary, binaryView, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -364,6 +370,13 @@ function generateLargeUtf8(this: TestDataVectorGenerator, t return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateUtf8View(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const values = Array.from({ length }, (_, i) => isValid(nullBitmap, i) ? randomString(Math.trunc(Math.random() * 20)) : null); + const vector = vectorFromArray(values, type); + return { values: () => values, vector }; +} + function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); @@ -384,6 +397,13 @@ function generateLargeBinary(this: TestDataVectorGenerato return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateBinaryView(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const values = Array.from({ length }, (_, i) => isValid(nullBitmap, i) ? randomBytes(Math.trunc(Math.random() * 20)) : null); + const vector = vectorFromArray(values, type); + return { values: () => values, vector }; +} + function generateFixedSizeBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const data = fillRandom(Uint8Array, length * type.byteWidth); diff --git a/test/unit/builders/view-builders-tests.ts b/test/unit/builders/view-builders-tests.ts new file mode 100644 index 00000000..88ee28fe --- /dev/null +++ b/test/unit/builders/view-builders-tests.ts @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BinaryView, Utf8View } from '../../../src/type.js'; +import { makeBuilder, vectorFromArray } from '../../../src/factories.js'; + +describe('BinaryViewBuilder', () => { + it('should build inline binary values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const values = [ + new Uint8Array([1, 2, 3]), + new Uint8Array([4, 5, 6, 7, 8, 9, 10, 11, 12]), + new Uint8Array([13]) + ]; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(values[0]); + expect(vector.get(1)).toEqual(values[1]); + expect(vector.get(2)).toEqual(values[2]); + }); + + it('should build out-of-line binary values (>12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const value = new Uint8Array(100); + for (let i = 0; i < 100; i++) { + value[i] = i % 256; + } + + builder.append(value); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toEqual(value); + }); + + it('should build mixed inline and out-of-line values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const small = new Uint8Array([1, 2, 3]); + const large = new Uint8Array(50); + for (let i = 0; i < 50; i++) { + large[i] = i % 256; + } + + builder.append(small); + builder.append(large); + builder.append(small); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(small); + expect(vector.get(1)).toEqual(large); + expect(vector.get(2)).toEqual(small); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new BinaryView(), nullValues: [null] }); + + builder.append(new Uint8Array([1, 2, 3])); + builder.append(null); + builder.append(new Uint8Array([4, 5, 6])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(new Uint8Array([1, 2, 3])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toEqual(new Uint8Array([4, 5, 6])); + }); + + it('should handle empty values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([])); + builder.append(new Uint8Array([1])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(new Uint8Array([])); + expect(vector.get(1)).toEqual(new Uint8Array([1])); + }); + + it('should handle exactly 12-byte boundary values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const exactly12 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + const exactly13 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]); + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(exactly12); + expect(vector.get(1)).toEqual(exactly13); + }); + + it('should handle multiple flushes', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([1, 2])); + const data1 = builder.flush(); + expect(data1).toHaveLength(1); + + builder.append(new Uint8Array([3, 4])); + builder.append(new Uint8Array([5, 6])); + const data2 = builder.flush(); + expect(data2).toHaveLength(2); + }); +}); + +describe('Utf8ViewBuilder', () => { + it('should build inline string values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['hello', 'world', 'foo']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBe('foo'); + }); + + it('should build out-of-line string values (>12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const longString = 'This is a long string that exceeds 12 bytes'; + + builder.append(longString); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toBe(longString); + }); + + it('should build mixed inline and out-of-line strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const short = 'hi'; + const long = 'This is a very long string that definitely exceeds the 12 byte inline capacity'; + + builder.append(short); + builder.append(long); + builder.append(short); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe(short); + expect(vector.get(1)).toBe(long); + expect(vector.get(2)).toBe(short); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new Utf8View(), nullValues: [null] }); + + builder.append('hello'); + builder.append(null); + builder.append('world'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toBe('world'); + }); + + it('should handle empty strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + + builder.append(''); + builder.append('a'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(''); + expect(vector.get(1)).toBe('a'); + }); + + it('should handle UTF-8 multibyte characters', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['🚀', '你好', 'Ñoño', 'emoji: 🎉']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('🚀'); + expect(vector.get(1)).toBe('你好'); + expect(vector.get(2)).toBe('Ñoño'); + expect(vector.get(3)).toBe('emoji: 🎉'); + }); + + it('should handle exactly 12-byte boundary strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const exactly12 = 'twelve bytes'; // ASCII: 12 bytes + const exactly13 = 'thirteen byte'; // ASCII: 13 bytes + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(exactly12); + expect(vector.get(1)).toBe(exactly13); + }); + + it('should build from vectorFromArray', () => { + const values = ['hello', 'world', null, 'foo']; + const vector = vectorFromArray(values, new Utf8View()); + + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBeNull(); + expect(vector.get(3)).toBe('foo'); + }); + + it('should handle large batch of values', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const count = 1000; + const values: string[] = []; + + for (let i = 0; i < count; i++) { + const value = i % 2 === 0 + ? `short_${i}` // inline + : `this_is_a_long_string_that_goes_out_of_line_${i}`; // out-of-line + values.push(value); + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(count); + + for (let i = 0; i < count; i++) { + expect(vector.get(i)).toBe(values[i]); + } + }); +}); diff --git a/test/unit/generated-data-tests.ts b/test/unit/generated-data-tests.ts index 9affe5f6..0d3a760e 100644 --- a/test/unit/generated-data-tests.ts +++ b/test/unit/generated-data-tests.ts @@ -39,8 +39,10 @@ describe('Generated Test Data', () => { describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); + describe('Utf8View', () => { validateVector(generate.utf8View()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('LargeBinary', () => { validateVector(generate.largeBinary()); }); + describe('BinaryView', () => { validateVector(generate.binaryView()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); describe('DateMillisecond', () => { validateVector(generate.dateMillisecond()); }); diff --git a/test/unit/vector/vector-tests.ts b/test/unit/vector/vector-tests.ts index 73c9cdbb..d8199e57 100644 --- a/test/unit/vector/vector-tests.ts +++ b/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData, FixedSizeList, Field, + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, LargeUtf8, Utf8View, BinaryView, util, Vector, vectorFromArray, makeData, FixedSizeList, Field, } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -256,6 +256,50 @@ describe(`LargeUtf8Vector`, () => { }); }); +describe(`Utf8ViewVector`, () => { + const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; + const vector = vectorFromArray(values, new Utf8View); + + test(`has utf8View type`, () => { + expect(vector.type).toBeInstanceOf(Utf8View); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, ['abc', '123']); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), ['foo', 'abc']); + }); +}); + +describe(`BinaryViewVector`, () => { + const values = [new Uint8Array([1, 2, 3]), new Uint8Array([4, 5]), new Uint8Array([6, 7, 8, 9])]; + const vector = vectorFromArray(values, new BinaryView); + + test(`has binaryView type`, () => { + expect(vector.type).toBeInstanceOf(BinaryView); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, [new Uint8Array([10, 11]), new Uint8Array([12])]); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), [new Uint8Array([1, 2]), new Uint8Array([3, 4])]); + }); +}); + describe(`ListVector`, () => { const values = [[1, 2], [1, 2, 3]]; const vector = vectorFromArray(values);