diff --git a/scripts/update_flatbuffers.sh b/scripts/update_flatbuffers.sh
new file mode 100755
index 00000000..d81dfbc3
--- /dev/null
+++ b/scripts/update_flatbuffers.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Regenerate the FlatBuffers helper files used by arrow-js.
+#
+# Schemas are read from $ARROW_FORMAT_DIR when it is set, otherwise from a
+# sibling checkout of apache/arrow (../arrow/format). Requires a working flatc
+# on PATH.
+
+set -euo pipefail
+
+PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+# Honor an explicit schema location from the environment; default to ../arrow.
+FORMAT_DIR="${ARROW_FORMAT_DIR:-${PROJECT_ROOT}/../arrow/format}"
+
+if [[ ! -d "${FORMAT_DIR}" ]]; then
+  echo "error: expected FlatBuffers schemas in ${FORMAT_DIR}" >&2
+  exit 1
+fi
+
+if ! command -v flatc >/dev/null 2>&1; then
+  echo "error: flatc not found on PATH" >&2
+  exit 1
+fi
+
+# Scratch directory inside the project. It is not called TMPDIR so that the
+# standard TMPDIR environment variable is left untouched for child processes.
+WORK_DIR="$(mktemp -d "${PROJECT_ROOT}/.flatc.XXXXXX")"
+cleanup() {
+  rm -rf "${WORK_DIR}"
+}
+trap cleanup EXIT
+
+schemas=(File Schema Message Tensor SparseTensor)
+schema_files=()
+
+for schema in "${schemas[@]}"; do
+  # Strip the org.apache.arrow.flatbuf namespace while copying. Redirecting
+  # sed's output avoids "sed -i", whose argument syntax differs between BSD
+  # sed (-i '') and GNU sed (-i with no separate argument).
+  sed \
+    -e 's/namespace org\.apache\.arrow\.flatbuf;//g' \
+    -e 's/org\.apache\.arrow\.flatbuf\.//g' \
+    "${FORMAT_DIR}/${schema}.fbs" > "${WORK_DIR}/${schema}.fbs"
+  schema_files+=("${WORK_DIR}/${schema}.fbs")
+done
+
+flatc --ts --ts-flat-files --ts-omit-entrypoint \
+  -o "${WORK_DIR}" \
+  "${schema_files[@]}"
+
+# Remove the rewritten schemas so only flatc output remains in the scratch dir.
+rm -f "${schema_files[@]}"
+
+# Generated files that are installed into src/fb.
+generated_files=(
+  binary-view.ts
+  list-view.ts
+  large-list-view.ts
+  message.ts
+  record-batch.ts
+  schema.ts
+  type.ts
+  utf8-view.ts
+)
+
+for file in "${generated_files[@]}"; do
+  if [[ ! -f "${WORK_DIR}/${file}" ]]; then
+    echo "error: expected generated file ${file} not found" >&2
+    exit 1
+  fi
+  install -m 0644 "${WORK_DIR}/${file}" "${PROJECT_ROOT}/src/fb/${file}"
+done
diff --git a/src/Arrow.dom.ts b/src/Arrow.dom.ts index e0cd681c..512e761e 100644 --- a/src/Arrow.dom.ts +++ b/src/Arrow.dom.ts @@ -49,14 +49,14 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, - List, + List, ListView, LargeListView, Struct, StructRow, Union, DenseUnion, SparseUnion, Dictionary, @@ -81,7 +81,7 @@ export { } from './Arrow.js'; export { - BinaryBuilder, LargeBinaryBuilder, + BinaryBuilder, BinaryViewBuilder, LargeBinaryBuilder, BoolBuilder, DateBuilder, DateDayBuilder, DateMillisecondBuilder, 
DecimalBuilder, @@ -92,12 +92,12 @@ export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder, DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder, IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder, - ListBuilder, + ListBuilder, ListViewBuilder, LargeListViewBuilder, MapBuilder, NullBuilder, StructBuilder, TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, LargeUtf8Builder + Utf8Builder, Utf8ViewBuilder, LargeUtf8Builder } from './Arrow.js'; diff --git a/src/Arrow.ts b/src/Arrow.ts index 8321026f..73edbd42 100644 --- a/src/Arrow.ts +++ b/src/Arrow.ts @@ -37,14 +37,14 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, - List, + List, ListView, LargeListView, Struct, Union, DenseUnion, SparseUnion, Dictionary, @@ -79,10 +79,13 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; 
+export { Utf8ViewBuilder } from './builder/utf8view.js'; export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; +export { BinaryViewBuilder } from './builder/binaryview.js'; export { LargeBinaryBuilder } from './builder/largebinary.js'; export { ListBuilder } from './builder/list.js'; +export { ListViewBuilder, LargeListViewBuilder } from './builder/listview.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; export { MapBuilder } from './builder/map.js'; export { StructBuilder } from './builder/struct.js'; diff --git a/src/builder/binaryview.ts b/src/builder/binaryview.ts new file mode 100644 index 00000000..80e5930f --- /dev/null +++ b/src/builder/binaryview.ts @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+import { BinaryView } from '../type.js';
+import { Builder, BuilderOptions } from '../builder.js';
+import { BufferBuilder } from './buffer.js';
+import { toUint8Array } from '../util/buffer.js';
+import { makeData } from '../data.js';
+
+/**
+ * Builder for `BinaryView` columns.
+ *
+ * Every element is a fixed-width view struct of `BinaryView.ELEMENT_WIDTH`
+ * bytes. Values of at most `BinaryView.INLINE_CAPACITY` bytes are stored inside
+ * the struct; longer values are appended to variadic data buffers and the
+ * struct records a 4-byte prefix, the buffer index and the offset.
+ * @ignore
+ */
+export class BinaryViewBuilder<TNull = any> extends Builder<BinaryView, TNull> {
+    /** Packed view structs, `BinaryView.ELEMENT_WIDTH` bytes per element. */
+    protected _views: BufferBuilder<Uint8Array>;
+    /** Completed out-of-line data buffers, in buffer-index order. */
+    protected _variadicBuffers: Uint8Array[] = [];
+    /** Out-of-line buffer currently being filled, if any. */
+    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
+    protected _currentBufferIndex = 0;
+    protected _currentBufferOffset = 0;
+    protected readonly _bufferSize = 32 * 1024 * 1024; // 32MB per buffer as per spec recommendation
+
+    constructor(opts: BuilderOptions<BinaryView, TNull>) {
+        super(opts);
+        this._views = new BufferBuilder(Uint8Array);
+    }
+
+    public get byteLength(): number {
+        let size = 0;
+        this._views && (size += this._views.byteLength);
+        this._nulls && (size += this._nulls.byteLength);
+        for (const buffer of this._variadicBuffers) {
+            size += buffer.byteLength;
+        }
+        this._currentBuffer && (size += this._currentBuffer.byteLength);
+        return size;
+    }
+
+    /** Write the view struct (and, if needed, the out-of-line bytes) for `value` at `index`. */
+    public setValue(index: number, value: Uint8Array) {
+        const data = toUint8Array(value);
+        const length = data.length;
+        const viewOffset = this._reserveView(index);
+        // Read the buffer only after reserving: growing may reallocate it.
+        const viewBuffer = this._views.buffer;
+        const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH);
+
+        // Write length (4 bytes, little-endian)
+        view.setInt32(BinaryView.LENGTH_OFFSET, length, true);
+
+        if (length <= BinaryView.INLINE_CAPACITY) {
+            // Inline: copy the bytes into the view struct and zero the padding.
+            const inlineStart = viewOffset + BinaryView.INLINE_OFFSET;
+            viewBuffer.set(data, inlineStart);
+            viewBuffer.fill(0, inlineStart + length, inlineStart + BinaryView.INLINE_CAPACITY);
+        } else {
+            // Out-of-line: keep the first 4 bytes as a prefix in the view struct.
+            viewBuffer.set(data.subarray(0, 4), viewOffset + BinaryView.INLINE_OFFSET);
+
+            if (!this._currentBuffer || this._currentBufferOffset + length > this._bufferSize) {
+                // Seal the buffer being filled (if any) and start a new one.
+                if (this._currentBuffer) {
+                    this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
+                }
+                this._currentBuffer = new BufferBuilder(Uint8Array);
+                this._currentBufferIndex = this._variadicBuffers.length;
+                this._currentBufferOffset = 0;
+            }
+
+            // Append the value to the current out-of-line buffer.
+            const bufferData = this._currentBuffer.reserve(length).buffer;
+            bufferData.set(data, this._currentBufferOffset);
+
+            // Record where the value lives: buffer index, then offset in that buffer.
+            view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
+            view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true);
+
+            this._currentBufferOffset += length;
+        }
+
+        return this;
+    }
+
+    /** Record validity; a null slot gets an all-zero (zero-length) view struct. */
+    public setValid(index: number, isValid: boolean) {
+        if (!super.setValid(index, isValid)) {
+            const viewOffset = this._reserveView(index);
+            this._views.buffer.fill(0, viewOffset, viewOffset + BinaryView.ELEMENT_WIDTH);
+            return false;
+        }
+        return true;
+    }
+
+    public clear() {
+        this._variadicBuffers = [];
+        this._currentBuffer = null;
+        this._currentBufferIndex = 0;
+        this._currentBufferOffset = 0;
+        this._views.clear();
+        return super.clear();
+    }
+
+    /** Emit the accumulated values as `Data` and reset the builder. */
+    public flush() {
+        const { type, length, nullCount, _views, _nulls } = this;
+
+        // Seal the partially filled out-of-line buffer, if any.
+        if (this._currentBuffer && this._currentBufferOffset > 0) {
+            this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
+        }
+
+        const views = _views.flush(length * BinaryView.ELEMENT_WIDTH);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        const variadicBuffers = this._variadicBuffers;
+
+        // clear() installs a fresh _variadicBuffers array and resets all
+        // current-buffer state, so nothing else has to be reset here.
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            views,
+            variadicBuffers
+        });
+    }
+
+    public finish() {
+        this.finished = true;
+        return this;
+    }
+
+    /**
+     * Grow the views buffer so the view struct for `index` exists.
+     * @returns the byte offset of that view struct within the views buffer
+     */
+    protected _reserveView(index: number): number {
+        const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH;
+        const currentBytes = this._views.length;
+        if (bytesNeeded > currentBytes) {
+            this._views.reserve(bytesNeeded - currentBytes);
+        }
+        return index * BinaryView.ELEMENT_WIDTH;
+    }
+}
diff --git a/src/builder/listview.ts b/src/builder/listview.ts new file mode 100644 index 00000000..82766775 --- /dev/null +++ b/src/builder/listview.ts @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+import { Field } from '../schema.js';
+import { DataType, ListView, LargeListView } from '../type.js';
+import { DataBufferBuilder } from './buffer.js';
+import { Builder, BuilderOptions } from '../builder.js';
+import { makeData } from '../data.js';
+
+/**
+ * Builder for `ListView` columns. Values are buffered in `_pending` until the
+ * builder is flushed or finished; each list is then described by an offset
+ * into the child builder and a size, kept in two parallel Int32 buffers.
+ * @ignore
+ */
+export class ListViewBuilder<T extends DataType = any, TNull = any> extends Builder<ListView<T>, TNull> {
+    protected _offsets: DataBufferBuilder<Int32Array>;
+    protected _sizes: DataBufferBuilder<Int32Array>;
+    /** Values set since the last flush, keyed by element index (`undefined` marks a null slot). */
+    protected _pending: Map<number, T['TValue'] | undefined> | undefined;
+    /** Next free position in the child builder. */
+    protected _writeIndex = 0;
+
+    constructor(opts: BuilderOptions<ListView<T>, TNull>) {
+        super(opts);
+        this._offsets = new DataBufferBuilder(Int32Array, 0);
+        this._sizes = new DataBufferBuilder(Int32Array, 0);
+    }
+
+    public addChild(child: Builder<T>, name = '0') {
+        if (this.numChildren > 0) {
+            throw new Error('ListViewBuilder can only have one child.');
+        }
+        this.children[this.numChildren] = child;
+        this.type = new ListView(new Field(name, child.type, true));
+        return this.numChildren - 1;
+    }
+
+    public setValue(index: number, value: T['TValue']) {
+        const pending = this._pending || (this._pending = new Map());
+        pending.set(index, value);
+    }
+
+    public setValid(index: number, isValid: boolean) {
+        if (!super.setValid(index, isValid)) {
+            (this._pending || (this._pending = new Map())).set(index, undefined);
+            return false;
+        }
+        return true;
+    }
+
+    public clear() {
+        this._pending = undefined;
+        this._writeIndex = 0;
+        // _sizes is declared by this class, so it is reset here explicitly.
+        this._sizes.clear();
+        return super.clear();
+    }
+
+    /** Write out pending values, emit the result as `Data` and reset the builder. */
+    public flush() {
+        this._flush();
+
+        const { type, length, nullCount, _offsets, _sizes, _nulls } = this;
+
+        const valueOffsets = _offsets.flush(length);
+        const sizes = _sizes.flush(length);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        const children = this.children.map((child) => child.flush());
+
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            valueOffsets,
+            sizes,
+            child: children[0]
+        });
+    }
+
+    public finish() {
+        this._flush();
+        return super.finish();
+    }
+
+    /** Move everything buffered in `_pending` into the offsets, sizes and child builders. */
+    protected _flush() {
+        const pending = this._pending;
+        this._pending = undefined;
+        if (pending && pending.size > 0) {
+            this._flushPending(pending);
+        }
+    }
+
+    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
+        const offsets = this._offsets;
+        const sizes = this._sizes;
+        const [child] = this.children;
+
+        for (const [index, value] of pending) {
+            this._reserveSlot(index);
+
+            if (typeof value === 'undefined') {
+                // Null slot: describe it as an empty range.
+                offsets.buffer[index] = 0;
+                sizes.buffer[index] = 0;
+            } else {
+                const v = value as T['TValue'];
+                const n = v.length;
+                const offset = this._writeIndex;
+
+                offsets.buffer[index] = offset;
+                sizes.buffer[index] = n;
+
+                // Append the list's elements to the child builder.
+                for (let i = 0; i < n; i++) {
+                    child.set(offset + i, v[i]);
+                }
+
+                this._writeIndex += n;
+            }
+        }
+    }
+
+    /**
+     * Grow the offsets and sizes buffers so that slot `index` exists, reserving
+     * only the missing elements instead of `index + 1` on every call.
+     * NOTE(review): assumes `reserve(n)` adds `n` elements to the builder's
+     * `length`, which is how the view builders in this change call it — confirm.
+     */
+    protected _reserveSlot(index: number) {
+        const missingOffsets = index + 1 - this._offsets.length;
+        if (missingOffsets > 0) {
+            this._offsets.reserve(missingOffsets);
+        }
+        const missingSizes = index + 1 - this._sizes.length;
+        if (missingSizes > 0) {
+            this._sizes.reserve(missingSizes);
+        }
+    }
+}
+
+/**
+ * Builder for `LargeListView` columns. Same approach as `ListViewBuilder`, but
+ * offsets and sizes are kept as BigInt values in BigInt64 buffers.
+ * @ignore
+ */
+export class LargeListViewBuilder<T extends DataType = any, TNull = any> extends Builder<LargeListView<T>, TNull> {
+    protected _offsets: DataBufferBuilder<BigInt64Array>;
+    protected _sizes: DataBufferBuilder<BigInt64Array>;
+    /** Values set since the last flush, keyed by element index (`undefined` marks a null slot). */
+    protected _pending: Map<number, T['TValue'] | undefined> | undefined;
+    /** Next free position in the child builder. */
+    protected _writeIndex = BigInt(0); // BigInt for LargeListView
+
+    constructor(opts: BuilderOptions<LargeListView<T>, TNull>) {
+        super(opts);
+        this._offsets = new DataBufferBuilder(BigInt64Array, 0);
+        this._sizes = new DataBufferBuilder(BigInt64Array, 0);
+    }
+
+    public addChild(child: Builder<T>, name = '0') {
+        if (this.numChildren > 0) {
+            throw new Error('LargeListViewBuilder can only have one child.');
+        }
+        this.children[this.numChildren] = child;
+        this.type = new LargeListView(new Field(name, child.type, true));
+        return this.numChildren - 1;
+    }
+
+    public setValue(index: number, value: T['TValue']) {
+        const pending = this._pending || (this._pending = new Map());
+        pending.set(index, value);
+    }
+
+    public setValid(index: number, isValid: boolean) {
+        if (!super.setValid(index, isValid)) {
+            (this._pending || (this._pending = new Map())).set(index, undefined);
+            return false;
+        }
+        return true;
+    }
+
+    public clear() {
+        this._pending = undefined;
+        this._writeIndex = BigInt(0);
+        // _sizes is declared by this class, so it is reset here explicitly.
+        this._sizes.clear();
+        return super.clear();
+    }
+
+    /** Write out pending values, emit the result as `Data` and reset the builder. */
+    public flush() {
+        this._flush();
+
+        const { type, length, nullCount, _offsets, _sizes, _nulls } = this;
+
+        const valueOffsets = _offsets.flush(length);
+        const sizes = _sizes.flush(length);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        const children = this.children.map((child) => child.flush());
+
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            valueOffsets,
+            sizes,
+            child: children[0]
+        });
+    }
+
+    public finish() {
+        this._flush();
+        return super.finish();
+    }
+
+    /** Move everything buffered in `_pending` into the offsets, sizes and child builders. */
+    protected _flush() {
+        const pending = this._pending;
+        this._pending = undefined;
+        if (pending && pending.size > 0) {
+            this._flushPending(pending);
+        }
+    }
+
+    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
+        const offsets = this._offsets;
+        const sizes = this._sizes;
+        const [child] = this.children;
+
+        for (const [index, value] of pending) {
+            this._reserveSlot(index);
+
+            if (typeof value === 'undefined') {
+                // Null slot: describe it as an empty range.
+                offsets.buffer[index] = BigInt(0);
+                sizes.buffer[index] = BigInt(0);
+            } else {
+                const v = value as T['TValue'];
+                const n = v.length;
+                const offset = this._writeIndex;
+
+                // Offsets and sizes are stored as BigInt for LargeListView.
+                offsets.buffer[index] = offset;
+                sizes.buffer[index] = BigInt(n);
+
+                // Append the list's elements to the child builder, which is indexed by number.
+                for (let i = 0; i < n; i++) {
+                    child.set(Number(offset) + i, v[i]);
+                }
+
+                this._writeIndex += BigInt(n);
+            }
+        }
+    }
+
+    /**
+     * Grow the offsets and sizes buffers so that slot `index` exists, reserving
+     * only the missing elements instead of `index + 1` on every call.
+     * NOTE(review): assumes `reserve(n)` adds `n` elements to the builder's
+     * `length`, which is how the view builders in this change call it — confirm.
+     */
+    protected _reserveSlot(index: number) {
+        const missingOffsets = index + 1 - this._offsets.length;
+        if (missingOffsets > 0) {
+            this._offsets.reserve(missingOffsets);
+        }
+        const missingSizes = index + 1 - this._sizes.length;
+        if (missingSizes > 0) {
+            this._sizes.reserve(missingSizes);
+        }
+    }
+}
diff --git a/src/builder/utf8view.ts b/src/builder/utf8view.ts new file mode 100644 index 
00000000..299743e1
--- /dev/null
+++ b/src/builder/utf8view.ts
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Utf8View, BinaryView } from '../type.js';
+import { encodeUtf8 } from '../util/utf8.js';
+import { BuilderOptions, Builder } from '../builder.js';
+import { BufferBuilder } from './buffer.js';
+import { makeData } from '../data.js';
+
+/**
+ * Builder for `Utf8View` columns. Strings are UTF-8 encoded and stored with the
+ * same view layout as `BinaryView`: values of at most
+ * `BinaryView.INLINE_CAPACITY` bytes live inside the view struct, longer values
+ * are appended to variadic data buffers and referenced by index and offset.
+ * @ignore
+ */
+export class Utf8ViewBuilder<TNull = any> extends Builder<Utf8View, TNull> {
+    /** Packed view structs, `BinaryView.ELEMENT_WIDTH` bytes per element. */
+    protected _views: BufferBuilder<Uint8Array>;
+    /** Completed out-of-line data buffers, in buffer-index order. */
+    protected _variadicBuffers: Uint8Array[] = [];
+    /** Out-of-line buffer currently being filled, if any. */
+    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
+    protected _currentBufferIndex = 0;
+    protected _currentBufferOffset = 0;
+    /** Size threshold after which a new out-of-line buffer is started. */
+    protected readonly _bufferSize = 32 * 1024 * 1024;
+
+    constructor(opts: BuilderOptions<Utf8View, TNull>) {
+        super(opts);
+        this._views = new BufferBuilder(Uint8Array);
+    }
+
+    public get byteLength(): number {
+        let size = 0;
+        this._views && (size += this._views.byteLength);
+        this._nulls && (size += this._nulls.byteLength);
+        size += this._variadicBuffers.reduce((acc, buffer) => acc + buffer.byteLength, 0);
+        this._currentBuffer && (size += this._currentBuffer.byteLength);
+        return size;
+    }
+
+    /** Encode `value` as UTF-8 and write its view struct at `index`. */
+    public setValue(index: number, value: string) {
+        const data = encodeUtf8(value);
+        const length = data.length;
+        const viewOffset = this._reserveView(index);
+        // Read the buffer only after reserving: growing may reallocate it.
+        const viewBuffer = this._views.buffer;
+        const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH);
+
+        // Length is a little-endian int32.
+        view.setInt32(BinaryView.LENGTH_OFFSET, length, true);
+
+        if (length <= BinaryView.INLINE_CAPACITY) {
+            // Inline: copy the bytes into the view struct and zero the padding.
+            const inlineStart = viewOffset + BinaryView.INLINE_OFFSET;
+            viewBuffer.set(data, inlineStart);
+            viewBuffer.fill(0, inlineStart + length, inlineStart + BinaryView.INLINE_CAPACITY);
+        } else {
+            // Out-of-line: keep the first 4 bytes as a prefix in the view struct.
+            viewBuffer.set(data.subarray(0, 4), viewOffset + BinaryView.INLINE_OFFSET);
+
+            if (!this._currentBuffer || this._currentBufferOffset + length > this._bufferSize) {
+                // Seal the buffer being filled (if any) and start a new one.
+                if (this._currentBuffer) {
+                    this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
+                }
+                this._currentBuffer = new BufferBuilder(Uint8Array);
+                this._currentBufferIndex = this._variadicBuffers.length;
+                this._currentBufferOffset = 0;
+            }
+
+            const bufferData = this._currentBuffer.reserve(length).buffer;
+            bufferData.set(data, this._currentBufferOffset);
+
+            view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
+            view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true);
+
+            this._currentBufferOffset += length;
+        }
+
+        return this;
+    }
+
+    /** Record validity; a null slot gets an all-zero view struct. */
+    public setValid(index: number, isValid: boolean) {
+        if (!super.setValid(index, isValid)) {
+            const viewOffset = this._reserveView(index);
+            this._views.buffer.fill(0, viewOffset, viewOffset + BinaryView.ELEMENT_WIDTH);
+            return false;
+        }
+        return true;
+    }
+
+    public clear() {
+        this._variadicBuffers = [];
+        this._currentBuffer = null;
+        this._currentBufferIndex = 0;
+        this._currentBufferOffset = 0;
+        this._views.clear();
+        return super.clear();
+    }
+
+    /** Emit the accumulated values as `Data` and reset the builder. */
+    public flush() {
+        const { type, length, nullCount, _views, _nulls } = this;
+
+        // Seal the partially filled out-of-line buffer, if any.
+        if (this._currentBuffer && this._currentBufferOffset > 0) {
+            this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
+        }
+
+        const views = _views.flush(length * BinaryView.ELEMENT_WIDTH);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        const variadicBuffers = this._variadicBuffers;
+
+        // clear() installs a fresh _variadicBuffers array and resets all
+        // current-buffer state, so nothing else has to be reset here.
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            views,
+            variadicBuffers
+        });
+    }
+
+    public finish() {
+        this.finished = true;
+        return this;
+    }
+
+    /**
+     * Grow the views buffer so the view struct for `index` exists.
+     * @returns the byte offset of that view struct within the views buffer
+     */
+    protected _reserveView(index: number): number {
+        const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH;
+        const currentBytes = this._views.length;
+        if (bytesNeeded > currentBytes) {
+            this._views.reserve(bytesNeeded - currentBytes);
+        }
+        return index * BinaryView.ELEMENT_WIDTH;
+    }
+}
diff --git a/src/data.ts b/src/data.ts index 45fcc35d..3dc26e38 100644 --- a/src/data.ts +++ b/src/data.ts @@ -68,6 +68,7 @@ export class Data { declare public readonly typeIds: Buffers[BufferType.TYPE]; declare public readonly nullBitmap: Buffers[BufferType.VALIDITY]; declare public readonly valueOffsets: Buffers[BufferType.OFFSET]; + declare public readonly variadicBuffers: ReadonlyArray; public get typeId(): T['TType'] { return this.type.typeId; } @@ -97,6 +98,7 @@ export class Data { values && (byteLength += values.byteLength); nullBitmap && (byteLength += nullBitmap.byteLength); typeIds && (byteLength += typeIds.byteLength); + byteLength += this.variadicBuffers.reduce((size, data) => size + (data?.byteLength ?? 
0), 0); return this.children.reduce((byteLength, child) => byteLength + child.byteLength, byteLength); } @@ -117,7 +119,16 @@ export class Data { return nullCount; } - constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial> | Data, children: Data[] = [], dictionary?: Vector) { + constructor( + type: T, + offset: number, + length: number, + nullCount?: number, + buffers?: Partial> | Data, + children: Data[] = [], + dictionary?: Vector, + variadicBuffers: ReadonlyArray = [] + ) { this.type = type; this.children = children; this.dictionary = dictionary; @@ -131,6 +142,7 @@ export class Data { this.typeIds = buffers.typeIds; this.nullBitmap = buffers.nullBitmap; this.valueOffsets = buffers.valueOffsets; + this.variadicBuffers = buffers.variadicBuffers; } else { this.stride = strideForType(type); if (buffers) { @@ -139,15 +151,22 @@ export class Data { (buffer = (buffers as Buffers)[2]) && (this.nullBitmap = buffer); (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); } + this.variadicBuffers = variadicBuffers; } + this.variadicBuffers ??= []; } public getValid(index: number): boolean { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? 
Number(valueOffsets[index]) + : index; return child.getValid(indexInChild); } if (this.nullable && this.nullCount > 0) { @@ -163,8 +182,13 @@ export class Data { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? Number(valueOffsets[index]) + : index; prev = child.getValid(indexInChild); child.setValid(indexInChild, value); } else { @@ -200,8 +224,16 @@ export class Data { return value; } - public clone(type: R = this.type as any, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers = this, children: Data[] = this.children) { - return new Data(type, offset, length, nullCount, buffers, children, this.dictionary); + public clone( + type: R = this.type as any, + offset = this.offset, + length = this.length, + nullCount = this._nullCount, + buffers: Buffers = this, + children: Data[] = this.children, + variadicBuffers: ReadonlyArray = this.variadicBuffers + ) { + return new Data(type, offset, length, nullCount, buffers, children, this.dictionary, variadicBuffers); } public slice(offset: number, length: number): Data { @@ -214,12 +246,13 @@ export class Data { const buffers = this._sliceBuffers(offset, length, stride, typeId); return this.clone(this.type, this.offset + offset, length, nullCount, buffers, // Don't slice children if we have value offsets (the variable-width types) - (children.length === 0 || this.valueOffsets) ? 
children : this._sliceChildren(children, childStride * offset, childStride * length)); + (children.length === 0 || this.valueOffsets) ? children : this._sliceChildren(children, childStride * offset, childStride * length), + this.variadicBuffers); } public _changeLengthAndBackfillNullBitmap(newLength: number): Data { if (this.typeId === Type.Null) { - return this.clone(this.type, 0, newLength, 0); + return this.clone(this.type, 0, newLength, 0, this.buffers, this.children, this.variadicBuffers); } const { length, nullCount } = this; // start initialized with 0s (nulls), then fill from 0 to length with 1s (not null) @@ -232,7 +265,7 @@ export class Data { } const buffers = this.buffers; buffers[BufferType.VALIDITY] = bitmap; - return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers); + return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers, this.children, this.variadicBuffers); } protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers { @@ -240,10 +273,15 @@ export class Data { const { buffers } = this; // If typeIds exist, slice the typeIds buffer (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); - // If offsets exist, only slice the offsets buffer - (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || - // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes - (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? 
arr : arr.subarray(stride * offset, stride * (offset + length))); + if (DataType.isBinaryView(this.type) || DataType.isUtf8View(this.type)) { + const width = BinaryView.ELEMENT_WIDTH; + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = arr.subarray(offset * width, (offset + length) * width)); + } else { + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? arr : arr.subarray(stride * offset, stride * (offset + length))); + } return buffers; } @@ -256,7 +294,7 @@ export class Data { import { Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -311,6 +349,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitUtf8View(props: Utf8ViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / Utf8View.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? 
-1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeUtf8(props: LargeUtf8DataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -327,6 +374,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitBinaryView(props: BinaryViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / BinaryView.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? -1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeBinary(props: LargeBinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -377,6 +433,22 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap], [child]); } + public visitListView(props: ListViewDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toInt32Array(props['valueOffsets']); + const sizes = toInt32Array(props['sizes']); + const { ['length']: length = sizes.length, ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, sizes, nullBitmap], [child]); + } + public visitLargeListView(props: LargeListViewDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const sizes = toBigInt64Array(props['sizes']); + const { ['length']: length = Number(sizes.length), ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, sizes, nullBitmap], [child]); + } public visitStruct(props: StructDataProps) { const { ['type']: type, ['offset']: offset = 0, ['children']: children = [] } = props; const nullBitmap = toUint8Array(props['nullBitmap']); @@ -455,10 +527,14 @@ interface IntervalDataProps extends DataProps_ { data?: D interface DurationDataProps extends DataProps_ { data?: DataBuffer } interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface BinaryViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeBinaryDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface Utf8ViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } +interface ListViewDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; sizes: 
ValueOffsetsBuffer; child: Data } +interface LargeListViewDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; sizes: LargeValueOffsetsBuffer | ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } interface Map_DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } @@ -481,9 +557,13 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends LargeBinary /* */ ? LargeBinaryDataProps : + T extends BinaryView /* */ ? BinaryViewDataProps : T extends Utf8 /* */ ? Utf8DataProps : T extends LargeUtf8 /* */ ? LargeUtf8DataProps : + T extends Utf8View /* */ ? Utf8ViewDataProps : T extends List /* */ ? ListDataProps : + T extends ListView /* */ ? ListViewDataProps : + T extends LargeListView /* */ ? LargeListViewDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : T extends Map_ /* */ ? 
Map_DataProps : @@ -507,11 +587,15 @@ export function makeData(props: TimestampDataProps): Dat export function makeData(props: IntervalDataProps): Data; export function makeData(props: DurationDataProps): Data; export function makeData(props: FixedSizeBinaryDataProps): Data; +export function makeData(props: BinaryViewDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: LargeBinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; +export function makeData(props: Utf8ViewDataProps): Data; export function makeData(props: ListDataProps): Data; +export function makeData(props: ListViewDataProps): Data; +export function makeData(props: LargeListViewDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; export function makeData(props: Map_DataProps): Data; diff --git a/src/enum.ts b/src/enum.ts index 73d95538..facb2184 100644 --- a/src/enum.ts +++ b/src/enum.ts @@ -70,6 +70,10 @@ export enum Type { Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ + BinaryView = 23, /** Variable-length binary values backed by inline-or-referenced views */ + Utf8View = 24, /** Variable-length UTF8 string values backed by inline-or-referenced views */ + ListView = 25, /** Variable-length list values backed by entry views */ + LargeListView = 26, /** Large variable-length list values backed by entry views */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, diff --git a/src/fb/File.ts b/src/fb/File.ts new file mode 100644 index 00000000..12c6f822 --- /dev/null +++ b/src/fb/File.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + 
+/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +export { Binary } from './binary.js'; +export { BinaryView } from './binary-view.js'; +export { Block } from './block.js'; +export { Bool } from './bool.js'; +export { Buffer } from './buffer.js'; +export { Date } from './date.js'; +export { DateUnit } from './date-unit.js'; +export { Decimal } from './decimal.js'; +export { DictionaryEncoding } from './dictionary-encoding.js'; +export { DictionaryKind } from './dictionary-kind.js'; +export { Duration } from './duration.js'; +export { Endianness } from './endianness.js'; +export { Feature } from './feature.js'; +export { Field } from './field.js'; +export { FixedSizeBinary } from './fixed-size-binary.js'; +export { FixedSizeList } from './fixed-size-list.js'; +export { FloatingPoint } from './floating-point.js'; +export { Footer } from './footer.js'; +export { Int } from './int.js'; +export { Interval } from './interval.js'; +export { IntervalUnit } from './interval-unit.js'; +export { KeyValue } from './key-value.js'; +export { LargeBinary } from './large-binary.js'; +export { LargeList } from './large-list.js'; +export { LargeListView } from './large-list-view.js'; +export { LargeUtf8 } from './large-utf8.js'; +export { List } from './list.js'; +export { ListView } from './list-view.js'; +export { Map } from './map.js'; +export { MetadataVersion } from './metadata-version.js'; +export { Null } from './null.js'; +export { Precision } from './precision.js'; +export { RunEndEncoded } from './run-end-encoded.js'; +export { Schema } from './schema.js'; +export { Struct_ } from './struct-.js'; +export { Time } from './time.js'; +export { TimeUnit } from './time-unit.js'; +export { Timestamp } from './timestamp.js'; +export { Type } from './type.js'; +export { Union } from './union.js'; +export { UnionMode } from './union-mode.js'; +export { Utf8 } from './utf8.js'; +export { Utf8View 
} from './utf8-view.js'; diff --git a/src/fb/binary-view.ts b/src/fb/binary-view.ts new file mode 100644 index 00000000..f91f910f --- /dev/null +++ b/src/fb/binary-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Binary, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. + */ +export class BinaryView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):BinaryView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startBinaryView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + BinaryView.startBinaryView(builder); + return BinaryView.endBinaryView(builder); +} +} diff --git a/src/fb/large-list-view.ts 
b/src/fb/large-list-view.ts new file mode 100644 index 00000000..5785cd3f --- /dev/null +++ b/src/fb/large-list-view.ts @@ -0,0 +1,42 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Same as ListView, but with 64-bit offsets and sizes, allowing to represent + * extremely large data values. + */ +export class LargeListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):LargeListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startLargeListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + LargeListView.startLargeListView(builder); + return LargeListView.endLargeListView(builder); +} +} diff --git a/src/fb/list-view.ts b/src/fb/list-view.ts new file mode 100644 index 00000000..f9afae01 --- /dev/null +++ b/src/fb/list-view.ts @@ -0,0 +1,43 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; 
+ +/** + * Represents the same logical types that List can, but contains offsets and + * sizes allowing for writes in any order and sharing of child values among + * list values. + */ +export class ListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):ListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createListView(builder:flatbuffers.Builder):flatbuffers.Offset { + ListView.startListView(builder); + return ListView.endListView(builder); +} +} diff --git a/src/fb/message.ts b/src/fb/message.ts index d752b91b..d3518599 100644 --- a/src/fb/message.ts +++ b/src/fb/message.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { KeyValue } from './key-value.js'; diff --git a/src/fb/record-batch.ts b/src/fb/record-batch.ts index 00681999..e6f41d02 100644 --- a/src/fb/record-batch.ts +++ b/src/fb/record-batch.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as 
flatbuffers from 'flatbuffers'; import { BodyCompression } from './body-compression.js'; @@ -78,8 +80,34 @@ compression(obj?:BodyCompression):BodyCompression|null { return offset ? (obj || new BodyCompression()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) : null; } +/** + * Some types such as Utf8View are represented using a variable number of buffers. + * For each such Field in the pre-ordered flattened logical schema, there will be + * an entry in variadicBufferCounts to indicate the number of number of variadic + * buffers which belong to that Field in the current RecordBatch. + * + * For example, the schema + * col1: Struct + * col2: Utf8View + * contains two Fields with variadic buffers so variadicBufferCounts will have + * two entries, the first counting the variadic buffers of `col1.beta` and the + * second counting `col2`'s. + * + * This field may be omitted if and only if the schema contains no Fields with + * a variable number of buffers, such as BinaryView and Utf8View. + */ +variadicBufferCounts(index: number):bigint|null { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? this.bb!.readInt64(this.bb!.__vector(this.bb_pos + offset) + index * 8) : BigInt(0); +} + +variadicBufferCountsLength():number { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? 
this.bb!.__vector_len(this.bb_pos + offset) : 0; +} + static startRecordBatch(builder:flatbuffers.Builder) { - builder.startObject(4); + builder.startObject(5); } static addLength(builder:flatbuffers.Builder, length:bigint) { @@ -106,6 +134,22 @@ static addCompression(builder:flatbuffers.Builder, compressionOffset:flatbuffers builder.addFieldOffset(3, compressionOffset, 0); } +static addVariadicBufferCounts(builder:flatbuffers.Builder, variadicBufferCountsOffset:flatbuffers.Offset) { + builder.addFieldOffset(4, variadicBufferCountsOffset, 0); +} + +static createVariadicBufferCountsVector(builder:flatbuffers.Builder, data:bigint[]):flatbuffers.Offset { + builder.startVector(8, data.length, 8); + for (let i = data.length - 1; i >= 0; i--) { + builder.addInt64(data[i]!); + } + return builder.endVector(); +} + +static startVariadicBufferCountsVector(builder:flatbuffers.Builder, numElems:number) { + builder.startVector(8, numElems, 8); +} + static endRecordBatch(builder:flatbuffers.Builder):flatbuffers.Offset { const offset = builder.endObject(); return offset; diff --git a/src/fb/schema.ts b/src/fb/schema.ts index 394883eb..daae447e 100644 --- a/src/fb/schema.ts +++ b/src/fb/schema.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { Endianness } from './endianness.js'; @@ -133,14 +135,6 @@ static endSchema(builder:flatbuffers.Builder):flatbuffers.Offset { return offset; } -static finishSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset); -} - -static finishSizePrefixedSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset, undefined, true); -} - static createSchema(builder:flatbuffers.Builder, endianness:Endianness, fieldsOffset:flatbuffers.Offset, 
customMetadataOffset:flatbuffers.Offset, featuresOffset:flatbuffers.Offset):flatbuffers.Offset { Schema.startSchema(builder); Schema.addEndianness(builder, endianness); diff --git a/src/fb/type.ts b/src/fb/type.ts index 8eb87042..8f913d01 100644 --- a/src/fb/type.ts +++ b/src/fb/type.ts @@ -1,6 +1,9 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import { Binary } from './binary.js'; +import { BinaryView } from './binary-view.js'; import { Bool } from './bool.js'; import { Date } from './date.js'; import { Decimal } from './decimal.js'; @@ -12,8 +15,10 @@ import { Int } from './int.js'; import { Interval } from './interval.js'; import { LargeBinary } from './large-binary.js'; import { LargeList } from './large-list.js'; +import { LargeListView } from './large-list-view.js'; import { LargeUtf8 } from './large-utf8.js'; import { List } from './list.js'; +import { ListView } from './list-view.js'; import { Map } from './map.js'; import { Null } from './null.js'; import { RunEndEncoded } from './run-end-encoded.js'; @@ -22,6 +27,7 @@ import { Time } from './time.js'; import { Timestamp } from './timestamp.js'; import { Union } from './union.js'; import { Utf8 } from './utf8.js'; +import { Utf8View } from './utf8-view.js'; /** @@ -52,15 +58,19 @@ export enum Type { LargeBinary = 19, LargeUtf8 = 20, LargeList = 21, - RunEndEncoded = 22 + RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26 } export function unionToType( type: Type, - accessor: (obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => 
Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { + accessor: (obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(new Null())! as Null; case 'Int': return accessor(new Int())! as Int; case 'FloatingPoint': return accessor(new FloatingPoint())! as FloatingPoint; @@ -83,17 +93,21 @@ export function unionToType( case 'LargeUtf8': return accessor(new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(new Utf8View())! as Utf8View; + case 'ListView': return accessor(new ListView())! as ListView; + case 'LargeListView': return accessor(new LargeListView())! 
as LargeListView; default: return null; } } export function unionListToType( - type: Type, - accessor: (index: number, obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null, + type: Type, + accessor: (index: number, obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null, index: number -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(index, new Null())! as Null; case 'Int': return accessor(index, new Int())! as Int; case 'FloatingPoint': return accessor(index, new FloatingPoint())! as FloatingPoint; @@ -116,6 +130,10 @@ export function unionListToType( case 'LargeUtf8': return accessor(index, new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(index, new LargeList())! 
as LargeList; case 'RunEndEncoded': return accessor(index, new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(index, new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(index, new Utf8View())! as Utf8View; + case 'ListView': return accessor(index, new ListView())! as ListView; + case 'LargeListView': return accessor(index, new LargeListView())! as LargeListView; default: return null; } } diff --git a/src/fb/utf8-view.ts b/src/fb/utf8-view.ts new file mode 100644 index 00000000..886a9df7 --- /dev/null +++ b/src/fb/utf8-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Utf8, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. 
+ */ +export class Utf8View { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):Utf8View { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startUtf8View(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + Utf8View.startUtf8View(builder); + return Utf8View.endUtf8View(builder); +} +} diff --git a/src/interfaces.ts b/src/interfaces.ts index 0645753b..eea88bd4 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -212,6 +212,7 @@ export type TypeToDataType = { [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.LargeBinary]: type.LargeBinary; + [Type.BinaryView]: type.BinaryView; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; [Type.DateDay]: type.DateDay; @@ -244,6 +245,7 @@ export type TypeToDataType = { [Type.Struct]: type.Struct; [Type.Dictionary]: type.Dictionary; [Type.FixedSizeList]: type.FixedSizeList; + [Type.Utf8View]: type.Utf8View; }[T]; /** @ignore */ @@ -268,6 +270,7 @@ type TypeToBuilder = { [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.LargeBinary]: LargeBinaryBuilder; + [Type.BinaryView]: Builder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; [Type.DateDay]: DateDayBuilder; @@ -300,6 +303,7 @@ type TypeToBuilder = { [Type.Struct]: StructBuilder; 
[Type.Dictionary]: DictionaryBuilder; [Type.FixedSizeList]: FixedSizeListBuilder; + [Type.Utf8View]: Builder; }[T]; /** @ignore */ @@ -324,6 +328,7 @@ type DataTypeToBuilder = { [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder : never; + [Type.BinaryView]: T extends type.BinaryView ? Builder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; [Type.DateDay]: T extends type.DateDay ? DateDayBuilder : never; @@ -356,4 +361,5 @@ type DataTypeToBuilder = { [Type.Struct]: T extends type.Struct ? StructBuilder : never; [Type.Dictionary]: T extends type.Dictionary ? DictionaryBuilder : never; [Type.FixedSizeList]: T extends type.FixedSizeList ? FixedSizeListBuilder : never; + [Type.Utf8View]: T extends type.Utf8View ? Builder : never; }[T['TType']]; diff --git a/src/ipc/metadata/message.ts b/src/ipc/metadata/message.ts index 17e8897b..b41ec4a5 100644 --- a/src/ipc/metadata/message.ts +++ b/src/ipc/metadata/message.ts @@ -57,7 +57,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -156,20 +156,24 @@ export class RecordBatch { protected _nodes: FieldNode[]; protected _buffers: BufferRegion[]; protected _compression: BodyCompression | null; + protected _variadicBufferCounts: number[]; public get nodes() { return this._nodes; } public get length() { return this._length; } public get buffers() { return this._buffers; } public get compression() { return this._compression; } 
+ public get variadicBufferCounts() { return this._variadicBufferCounts; } constructor( length: bigint | number, nodes: FieldNode[], buffers: BufferRegion[], - compression: BodyCompression | null + compression: BodyCompression | null, + variadicBufferCounts: number[] = [] ) { this._nodes = nodes; this._buffers = buffers; this._length = bigIntToNumber(length); this._compression = compression; + this._variadicBufferCounts = variadicBufferCounts; } } @@ -334,7 +338,8 @@ function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V5) { batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version), - decodeBodyCompression(batch.compression()) + decodeBodyCompression(batch.compression()), + decodeVariadicBufferCounts(batch) ); return recordBatch; } @@ -382,6 +387,16 @@ function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) { return bufferRegions; } +/** @ignore */ +function decodeVariadicBufferCounts(batch: _RecordBatch) { + const counts = [] as number[]; + const length = Math.trunc(batch.variadicBufferCountsLength()); + for (let i = 0; i < length; ++i) { + counts.push(bigIntToNumber(batch.variadicBufferCounts(i)!)); + } + return counts; +} + /** @ignore */ function decodeSchemaFields(schema: _Schema, dictionaries?: Map) { const fields = [] as Field[]; @@ -468,8 +483,10 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['LargeBinary']: return new LargeBinary(); + case Type['BinaryView']: return new BinaryView(); case Type['Utf8']: return new Utf8(); case Type['LargeUtf8']: return new LargeUtf8(); + case Type['Utf8View']: return new Utf8View(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); case Type['Struct_']: return new Struct(children || []); @@ -614,6 +631,7 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { const nodes = recordBatch.nodes || []; const 
buffers = recordBatch.buffers || []; + const variadicBufferCounts = recordBatch.variadicBufferCounts || []; _RecordBatch.startNodesVector(b, nodes.length); for (const n of nodes.slice().reverse()) FieldNode.encode(b, n); @@ -630,6 +648,11 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { bodyCompressionOffset = encodeBodyCompression(b, recordBatch.compression); } + let variadicBufferCountsOffset = -1; + if (variadicBufferCounts.length > 0) { + variadicBufferCountsOffset = _RecordBatch.createVariadicBufferCountsVector(b, variadicBufferCounts.map(BigInt)); + } + _RecordBatch.startRecordBatch(b); _RecordBatch.addLength(b, BigInt(recordBatch.length)); _RecordBatch.addNodes(b, nodesVectorOffset); @@ -637,6 +660,9 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { if (recordBatch.compression !== null && bodyCompressionOffset) { _RecordBatch.addCompression(b, bodyCompressionOffset); } + if (variadicBufferCountsOffset !== -1) { + _RecordBatch.addVariadicBufferCounts(b, variadicBufferCountsOffset); + } return _RecordBatch.endRecordBatch(b); } diff --git a/src/ipc/reader.ts b/src/ipc/reader.ts index e36eeb52..da5b3cb3 100644 --- a/src/ipc/reader.ts +++ b/src/ipc/reader.ts @@ -397,7 +397,8 @@ abstract class RecordBatchReaderImpl implements RecordB header.data.length, header.data.nodes, buffers, - null + null, + header.data.variadicBufferCounts ), id, isDelta) } else { throw new Error('Dictionary batch is compressed but codec not found'); @@ -412,11 +413,11 @@ abstract class RecordBatchReaderImpl implements RecordB } protected _loadVectors(header: metadata.RecordBatch, body: Uint8Array, types: (Field | DataType)[]) { - return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } protected _loadCompressedVectors(header: 
metadata.RecordBatch, body: Uint8Array[], types: (Field | DataType)[]) { - return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } private _decompressBuffers(header: metadata.RecordBatch, body: Uint8Array, codec: Codec): { decommpressedBody: Uint8Array[]; buffers: metadata.BufferRegion[] } { diff --git a/src/ipc/writer.ts b/src/ipc/writer.ts index 17c8f0b6..0b13fdfc 100644 --- a/src/ipc/writer.ts +++ b/src/ipc/writer.ts @@ -274,8 +274,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeRecordBatch(batch: RecordBatch) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(batch); - const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(batch); + const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression, variadicBufferCounts); const message = Message.from(recordBatch, byteLength); return this ._writeDictionaries(batch) @@ -284,11 +284,11 @@ export class RecordBatchWriter extends ReadableInterop< } protected _assembleRecordBatch(batch: RecordBatch | Vector) { - let { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(batch); + let { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = VectorAssembler.assemble(batch); if (this._compression != null) { ({ byteLength, bufferRegions, buffers } = this._compressBodyBuffers(buffers)); } - return { byteLength, nodes, bufferRegions, buffers }; + return { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts }; } protected _compressBodyBuffers(buffers: ArrayBufferView[]) { @@ -337,8 +337,8 @@ 
export class RecordBatchWriter extends ReadableInterop< } protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(new Vector([dictionary])); - const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(new Vector([dictionary])); + const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression, variadicBufferCounts); const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta); const message = Message.from(dictionaryBatch, byteLength); return this diff --git a/src/type.ts b/src/type.ts index ea5e24fa..1475d668 100644 --- a/src/type.ts +++ b/src/type.ts @@ -58,8 +58,10 @@ export abstract class DataType { })(Binary.prototype); } +/** @ignore */ +export interface BinaryView extends DataType { + TArray: Uint8Array; + TValue: Uint8Array; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class BinaryView extends DataType { + public static readonly ELEMENT_WIDTH = 16; + public static readonly INLINE_CAPACITY = 12; + public static readonly LENGTH_OFFSET = 0; + public static readonly INLINE_OFFSET = 4; + public static readonly BUFFER_INDEX_OFFSET = 8; + public static readonly BUFFER_OFFSET_OFFSET = 12; + constructor() { + super(Type.BinaryView); + } + public toString() { return `BinaryView`; } + protected static [Symbol.toStringTag] = ((proto: BinaryView) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'BinaryView'; + })(BinaryView.prototype); +} + /** @ignore */ export interface LargeBinary extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -269,6 +297,7 @@ export class LargeBinary extends 
DataType { })(LargeBinary.prototype); } +/** @ignore */ /** @ignore */ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ @@ -283,6 +312,26 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface Utf8View extends DataType { + TArray: Uint8Array; + TValue: string; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class Utf8View extends DataType { + public static readonly ELEMENT_WIDTH = BinaryView.ELEMENT_WIDTH; + public static readonly INLINE_CAPACITY = BinaryView.INLINE_CAPACITY; + constructor() { + super(Type.Utf8View); + } + public toString() { return `Utf8View`; } + protected static [Symbol.toStringTag] = ((proto: Utf8View) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8View'; + })(Utf8View.prototype); +} + /** @ignore */ export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -298,6 +347,7 @@ export class LargeUtf8 extends DataType { })(LargeUtf8.prototype); } +/** @ignore */ /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -546,6 +596,40 @@ export class List extends DataType extends DataType { + constructor(child: Field) { + super(Type.ListView); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `ListView<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: ListView) => { + (proto).children = null; + return 
proto[Symbol.toStringTag] = 'ListView'; + })(ListView.prototype); +} + +/** @ignore */ +export class LargeListView extends DataType { + constructor(child: Field) { + super(Type.LargeListView); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `LargeListView<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: LargeListView) => { + (proto).children = null; + return proto[Symbol.toStringTag] = 'LargeListView'; + })(LargeListView.prototype); +} + /** @ignore */ export interface Struct extends DataType { TArray: Array>; @@ -759,6 +843,8 @@ export function strideForType(type: DataType) { } // case Type.Int: return 1 + +((t as Int_).bitWidth > 32); // case Type.Time: return 1 + +((t as Time_).bitWidth > 32); + case Type.BinaryView: + case Type.Utf8View: return 16; case Type.FixedSizeList: return (t as FixedSizeList).listSize; case Type.FixedSizeBinary: return (t as FixedSizeBinary).byteWidth; default: return 1; diff --git a/src/visitor.ts b/src/visitor.ts index 977e0a4e..177384ba 100644 --- a/src/visitor.ts +++ b/src/visitor.ts @@ -37,8 +37,10 @@ export abstract class Visitor { public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } + public visitUtf8View(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitLargeBinary(_node: any, ..._args: any[]): any { return null; } + public visitBinaryView(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, 
..._args: any[]): any { return null; } public visitTimestamp(_node: any, ..._args: any[]): any { return null; } @@ -52,6 +54,8 @@ export abstract class Visitor { public visitDuration(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeList(_node: any, ..._args: any[]): any { return null; } public visitMap(_node: any, ..._args: any[]): any { return null; } + public visitListView(_node: any, ..._args: any[]): any { return null; } + public visitLargeListView(_node: any, ..._args: any[]): any { return null; } } /** @ignore */ @@ -92,8 +96,10 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; + case Type.Utf8View: fn = visitor.visitUtf8View || visitor.visitUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.LargeBinary: fn = visitor.visitLargeBinary; break; + case Type.BinaryView: fn = visitor.visitBinaryView || visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; case Type.DateDay: fn = visitor.visitDateDay || visitor.visitDate; break; @@ -126,6 +132,8 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.DurationNanosecond: fn = visitor.visitDurationNanosecond || visitor.visitDuration; break; case Type.FixedSizeList: fn = visitor.visitFixedSizeList; break; case Type.Map: fn = visitor.visitMap; break; + case Type.ListView: fn = visitor.visitListView; break; + case Type.LargeListView: fn = visitor.visitLargeListView; break; } if (typeof fn === 'function') return fn; if (!throwIfNotFound) return () => null; @@ -157,8 +165,10 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.LargeBinary: return Type.LargeBinary; + case Type.BinaryView: 
return Type.BinaryView; case Type.Utf8: return Type.Utf8; case Type.LargeUtf8: return Type.LargeUtf8; + case Type.Utf8View: return Type.Utf8View; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -216,6 +226,8 @@ function inferDType(type: T): Type { case Type.FixedSizeBinary: return Type.FixedSizeBinary; case Type.FixedSizeList: return Type.FixedSizeList; case Type.Dictionary: return Type.Dictionary; + case Type.ListView: return Type.ListView; + case Type.LargeListView: return Type.LargeListView; } throw new Error(`Unrecognized type '${Type[type.typeId]}'`); } @@ -272,6 +284,8 @@ export interface Visitor { visitDurationNanosecond(node: any, ...args: any[]): any; visitFixedSizeList(node: any, ...args: any[]): any; visitMap(node: any, ...args: any[]): any; + visitListView(node: any, ...args: any[]): any; + visitLargeListView(node: any, ...args: any[]): any; } // Add these here so they're picked up by the externs creator diff --git a/src/visitor/builderctor.ts b/src/visitor/builderctor.ts index 791576b0..eda77abb 100644 --- a/src/visitor/builderctor.ts +++ b/src/visitor/builderctor.ts @@ -34,6 +34,7 @@ import { IntervalBuilder, IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder, I import { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from '../builder/duration.js'; import { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from '../builder/int.js'; import { ListBuilder } from '../builder/list.js'; +import { ListViewBuilder, LargeListViewBuilder } from '../builder/listview.js'; import { MapBuilder } from '../builder/map.js'; import { NullBuilder } from '../builder/null.js'; import { StructBuilder } from '../builder/struct.js'; @@ -42,6 +43,8 @@ import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecond import { UnionBuilder, DenseUnionBuilder, 
SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; import { LargeUtf8Builder } from '../builder/largeutf8.js'; +import { BinaryViewBuilder } from '../builder/binaryview.js'; +import { Utf8ViewBuilder } from '../builder/utf8view.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -88,6 +91,8 @@ export class GetBuilderCtor extends Visitor { public visitTimeNanosecond() { return TimeNanosecondBuilder; } public visitDecimal() { return DecimalBuilder; } public visitList() { return ListBuilder; } + public visitListView() { return ListViewBuilder; } + public visitLargeListView() { return LargeListViewBuilder; } public visitStruct() { return StructBuilder; } public visitUnion() { return UnionBuilder; } public visitDenseUnion() { return DenseUnionBuilder; } @@ -104,6 +109,8 @@ export class GetBuilderCtor extends Visitor { public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } + public visitBinaryView() { return BinaryViewBuilder; } + public visitUtf8View() { return Utf8ViewBuilder; } } /** @ignore */ diff --git a/src/visitor/get.ts b/src/visitor/get.ts index a5502dd3..bea4a005 100644 --- a/src/visitor/get.ts +++ b/src/visitor/get.ts @@ -28,7 +28,7 @@ import { uint16ToFloat64 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -63,8 +63,10 @@ export interface GetVisitor extends Visitor { 
visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; visitLargeUtf8(data: Data, index: number): T['TValue'] | null; + visitUtf8View(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitLargeBinary(data: Data, index: number): T['TValue'] | null; + visitBinaryView(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; visitDateDay(data: Data, index: number): T['TValue'] | null; @@ -81,6 +83,8 @@ export interface GetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number): T['TValue'] | null; visitDecimal(data: Data, index: number): T['TValue'] | null; visitList(data: Data, index: number): T['TValue'] | null; + visitListView(data: Data, index: number): T['TValue'] | null; + visitLargeListView(data: Data, index: number): T['TValue'] | null; visitStruct(data: Data, index: number): T['TValue'] | null; visitUnion(data: Data, index: number): T['TValue'] | null; visitDenseUnion(data: Data, index: number): T['TValue'] | null; @@ -109,6 +113,9 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */const epochDaysToMs = (data: Int32Array, index: number) => 86400000 * data[index]; +const BINARY_VIEW_SIZE = 16; +const BINARY_VIEW_INLINE_CAPACITY = 12; + /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ @@ -149,10 +156,43 @@ const getFixedSizeBinary = ({ stride, values }: Data< /** @ignore */ const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ +const getBinaryViewBytes = (data: Data, index: number): Uint8Array => { + const values = data.values as Uint8Array; + if (!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const start = 
(data.offset + index) * BINARY_VIEW_SIZE; + const baseOffset = values.byteOffset + start; + const view = new DataView(values.buffer, baseOffset, BINARY_VIEW_SIZE); + const size = view.getInt32(0, true); + if (size <= 0) { + return new Uint8Array(0); + } + if (size <= BINARY_VIEW_INLINE_CAPACITY) { + return new Uint8Array(values.buffer, baseOffset + 4, size); + } + const bufferIndex = view.getInt32(8, true); + const offset = view.getInt32(12, true); + const variadicBuffer = data.variadicBuffers?.[bufferIndex]; + if (!variadicBuffer) { + throw new Error(`BinaryView variadic buffer ${bufferIndex} is missing`); + } + return variadicBuffer.subarray(offset, offset + size); +}; +/** @ignore */ +const getBinaryViewValue = (data: Data, index: number): T['TValue'] => { + return getBinaryViewBytes(data, index) as T['TValue']; +}; +/** @ignore */ const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? 
decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getUtf8ViewValue = (data: Data, index: number): T['TValue'] => { + const bytes = getBinaryViewBytes(data, index); + return decodeUtf8(bytes); +}; /* istanbul ignore next */ /** @ignore */ @@ -222,6 +262,26 @@ const getList = (data: Data, index: number): T['TValue'] => { return new Vector([slice]) as T['TValue']; }; +/** @ignore */ +const getListView = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, values: sizes, children } = data; + const offset = bigIntToNumber(valueOffsets[index]); + const size = bigIntToNumber(sizes[index]); + const child: Data = children[0]; + const slice = child.slice(offset, size); + return new Vector([slice]) as T['TValue']; +}; + +/** @ignore */ +const getLargeListView = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, values: sizes, children } = data; + const offset = bigIntToNumber(valueOffsets[index]); + const size = bigIntToNumber(sizes[index]); + const child: Data = children[0]; + const slice = child.slice(offset, size); + return new Vector([slice]) as T['TValue']; +}; + /** @ignore */ const getMap = (data: Data, index: number): T['TValue'] => { const { valueOffsets, children } = data; @@ -332,8 +392,10 @@ GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitUtf8View = wrapGet(getUtf8ViewValue); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitLargeBinary = wrapGet(getBinary); +GetVisitor.prototype.visitBinaryView = wrapGet(getBinaryViewValue); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); GetVisitor.prototype.visitDateDay = wrapGet(getDateDay); @@ -350,6 +412,8 @@ GetVisitor.prototype.visitTimeMicrosecond = 
wrapGet(getTimeMicrosecond); GetVisitor.prototype.visitTimeNanosecond = wrapGet(getTimeNanosecond); GetVisitor.prototype.visitDecimal = wrapGet(getDecimal); GetVisitor.prototype.visitList = wrapGet(getList); +GetVisitor.prototype.visitListView = wrapGet(getListView); +GetVisitor.prototype.visitLargeListView = wrapGet(getLargeListView); GetVisitor.prototype.visitStruct = wrapGet(getStruct); GetVisitor.prototype.visitUnion = wrapGet(getUnion); GetVisitor.prototype.visitDenseUnion = wrapGet(getDenseUnion); diff --git a/src/visitor/indexof.ts b/src/visitor/indexof.ts index 3a4d1171..6881f99f 100644 --- a/src/visitor/indexof.ts +++ b/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -59,8 +59,10 @@ export interface IndexOfVisitor extends Visitor { visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitUtf8View(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeBinary(data: Data, value: T['TValue'] | null, index?: number): number; + visitBinaryView(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; visitDateDay(data: 
Data, value: T['TValue'] | null, index?: number): number; @@ -177,8 +179,10 @@ IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitUtf8View = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitLargeBinary = indexOfValue; +IndexOfVisitor.prototype.visitBinaryView = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; IndexOfVisitor.prototype.visitDateDay = indexOfValue; diff --git a/src/visitor/iterator.ts b/src/visitor/iterator.ts index 9f2844b3..ef54504c 100644 --- a/src/visitor/iterator.ts +++ b/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,8 +57,10 @@ export interface IteratorVisitor extends Visitor { visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; visitLargeUtf8(vector: Vector): IterableIterator; + visitUtf8View(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitLargeBinary(vector: Vector): IterableIterator; + visitBinaryView(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; visitDateDay(vector: Vector): IterableIterator; @@ -164,8 +166,10 @@ IteratorVisitor.prototype.visitFloat32 = 
vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; +IteratorVisitor.prototype.visitUtf8View = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitLargeBinary = vectorIterator; +IteratorVisitor.prototype.visitBinaryView = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; IteratorVisitor.prototype.visitDateDay = vectorIterator; diff --git a/src/visitor/set.ts b/src/visitor/set.ts index 4bf632ba..65b1022f 100644 --- a/src/visitor/set.ts +++ b/src/visitor/set.ts @@ -26,7 +26,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -61,8 +61,10 @@ export interface SetVisitor extends Visitor { visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; + visitUtf8View(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitLargeBinary(data: Data, index: number, value: T['TValue']): void; + visitBinaryView(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; 
visitDateDay(data: Data, index: number, value: T['TValue']): void; @@ -155,7 +157,15 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ +const setBinaryView = (_data: Data, _index: number, _value: T['TValue']) => { + throw new Error('BinaryView values are immutable in the current implementation'); +}; +/** @ignore */ const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +/** @ignore */ +const setUtf8View = (_data: Data, _index: number, _value: T['TValue']) => { + throw new Error('Utf8View values are immutable in the current implementation'); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -359,8 +369,10 @@ SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitUtf8View = wrapSet(setUtf8View); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitLargeBinary = wrapSet(setBinary); +SetVisitor.prototype.visitBinaryView = wrapSet(setBinaryView); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); SetVisitor.prototype.visitDateDay = wrapSet(setDateDay); diff --git a/src/visitor/typeassembler.ts b/src/visitor/typeassembler.ts index 169f3627..066d65e1 100644 --- a/src/visitor/typeassembler.ts +++ b/src/visitor/typeassembler.ts @@ -25,9 +25,11 @@ import { Null } from '../fb/null.js'; import { Int } from '../fb/int.js'; import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; +import { BinaryView } 
from '../fb/binary-view.js'; import { LargeBinary } from '../fb/large-binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { Utf8View } from '../fb/utf8-view.js'; import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; @@ -36,6 +38,8 @@ import { Timestamp } from '../fb/timestamp.js'; import { Interval } from '../fb/interval.js'; import { Duration } from '../fb/duration.js'; import { List } from '../fb/list.js'; +import { ListView } from '../fb/list-view.js'; +import { LargeListView } from '../fb/large-list-view.js'; import { Struct_ as Struct } from '../fb/struct-.js'; import { Union } from '../fb/union.js'; import { DictionaryEncoding } from '../fb/dictionary-encoding.js'; @@ -72,6 +76,10 @@ export class TypeAssembler extends Visitor { Binary.startBinary(b); return Binary.endBinary(b); } + public visitBinaryView(_node: T, b: Builder) { + BinaryView.startBinaryView(b); + return BinaryView.endBinaryView(b); + } public visitLargeBinary(_node: T, b: Builder) { LargeBinary.startLargeBinary(b); return LargeBinary.endLargeBinary(b); @@ -84,6 +92,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitUtf8View(_node: T, b: Builder) { + Utf8View.startUtf8View(b); + return Utf8View.endUtf8View(b); + } public visitLargeUtf8(_node: T, b: Builder) { LargeUtf8.startLargeUtf8(b); return LargeUtf8.endLargeUtf8(b); @@ -129,6 +141,14 @@ export class TypeAssembler extends Visitor { List.startList(b); return List.endList(b); } + public visitListView(_node: T, b: Builder) { + ListView.startListView(b); + return ListView.endListView(b); + } + public visitLargeListView(_node: T, b: Builder) { + LargeListView.startLargeListView(b); + return LargeListView.endLargeListView(b); + } public visitStruct(_node: T, b: Builder) { Struct.startStruct_(b); return Struct.endStruct_(b); diff --git a/src/visitor/typecomparator.ts 
b/src/visitor/typecomparator.ts index 65413ccd..5c1d60a9 100644 --- a/src/visitor/typecomparator.ts +++ b/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,8 +55,10 @@ export interface TypeComparator extends Visitor { visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; visitLargeUtf8(type: T, other?: DataType | null): other is T; + visitUtf8View(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitLargeBinary(type: T, other?: DataType | null): other is T; + visitBinaryView(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; visitDateDay(type: T, other?: DataType | null): other is T; @@ -254,8 +256,10 @@ TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; TypeComparator.prototype.visitLargeUtf8 = compareAny; +TypeComparator.prototype.visitUtf8View = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitLargeBinary = compareAny; +TypeComparator.prototype.visitBinaryView = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; TypeComparator.prototype.visitDateDay = compareDate; diff --git 
a/src/visitor/typector.ts b/src/visitor/typector.ts index 2aab6d3d..7fc45b3e 100644 --- a/src/visitor/typector.ts +++ b/src/visitor/typector.ts @@ -84,6 +84,10 @@ export class GetDataTypeConstructor extends Visitor { public visitDurationNanosecond() { return type.DurationNanosecond; } public visitFixedSizeList() { return type.FixedSizeList; } public visitMap() { return type.Map_; } + public visitBinaryView() { return type.BinaryView; } + public visitUtf8View() { return type.Utf8View; } + public visitListView() { return type.ListView; } + public visitLargeListView() { return type.LargeListView; } } /** @ignore */ diff --git a/src/visitor/vectorassembler.ts b/src/visitor/vectorassembler.ts index 7dc36955..2ac6f8fa 100644 --- a/src/visitor/vectorassembler.ts +++ b/src/visitor/vectorassembler.ts @@ -27,7 +27,7 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -115,11 +115,13 @@ export class VectorAssembler extends Visitor { public get buffers() { return this._buffers; } public get byteLength() { return this._byteLength; } public get bufferRegions() { return this._bufferRegions; } + public get variadicBufferCounts() { return this._variadicBufferCounts; } protected _byteLength = 0; protected _nodes: FieldNode[] = []; protected _buffers: ArrayBufferView[] = []; protected _bufferRegions: BufferRegion[] = []; + protected _variadicBufferCounts: number[] = []; } /** @ignore */ @@ -215,6 +217,22 @@ function assembleFlatListVector(this: VectorAssembler, data: Data) { + const { offset, length, stride, values, variadicBuffers = [] } = data; + if 
(!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const start = offset * stride; + const end = start + length * stride; + addBuffer.call(this, values.subarray(start, end)); + for (const buffer of variadicBuffers) { + addBuffer.call(this, buffer); + } + this._variadicBufferCounts.push(variadicBuffers.length); + return this; +} + /** @ignore */ function assembleListVector(this: VectorAssembler, data: Data) { const { length, valueOffsets } = data; @@ -239,8 +257,10 @@ VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitUtf8View = assembleBinaryViewVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitLargeBinary = assembleFlatListVector; +VectorAssembler.prototype.visitBinaryView = assembleBinaryViewVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; VectorAssembler.prototype.visitTimestamp = assembleFlatVector; diff --git a/src/visitor/vectorloader.ts b/src/visitor/vectorloader.ts index 7c82e7ab..9f4db6b5 100644 --- a/src/visitor/vectorloader.ts +++ b/src/visitor/vectorloader.ts @@ -44,13 +44,16 @@ export class VectorLoader extends Visitor { protected buffersIndex = -1; private dictionaries: Map>; private readonly metadataVersion: MetadataVersion; - constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5) { + private variadicBufferCounts: number[]; + private variadicBufferIndex = -1; + constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5, variadicBufferCounts: number[] = []) { super(); this.bytes = bytes; this.nodes = nodes; 
this.buffers = buffers; this.dictionaries = dictionaries; this.metadataVersion = metadataVersion; + this.variadicBufferCounts = variadicBufferCounts; } public visit(node: Field | T): Data { @@ -75,12 +78,24 @@ export class VectorLoader extends Visitor { public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitUtf8View(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ type, length, nullCount, nullBitmap, views, variadicBuffers }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } public visitLargeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitBinaryView(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ type, length, nullCount, nullBitmap, views, variadicBuffers }); + } public visitFixedSizeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); } @@ -142,6 +157,12 @@ export class VectorLoader extends Visitor { protected readData(_type: T, { length, offset } 
= this.nextBufferRange()) { return this.bytes.subarray(offset, offset + length); } + protected readVariadicBuffers(length: number) { + return Array.from({ length }, () => this.readData(null as any)); + } + protected nextVariadicBufferCount() { + return this.variadicBufferCounts[++this.variadicBufferIndex] ?? 0; + } protected readDictionary(type: T): Vector { return this.dictionaries.get(type.id)!; } @@ -208,8 +229,8 @@ function binaryDataFromJSON(values: string[]) { export class CompressedVectorLoader extends VectorLoader { private bodyChunks: Uint8Array[]; - constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.bodyChunks = bodyChunks; } protected readData(_type: T, _buffer = this.nextBufferRange()) { diff --git a/test/unit/builders/listview-tests.ts b/test/unit/builders/listview-tests.ts new file mode 100644 index 00000000..69a908b1 --- /dev/null +++ b/test/unit/builders/listview-tests.ts @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { ListView, LargeListView, Int32 } from '../../../src/type.js'; +import { Field } from '../../../src/schema.js'; +import { ListViewBuilder, LargeListViewBuilder } from '../../../src/builder/listview.js'; +import { Int32Builder } from '../../../src/builder/int.js'; +import { Vector } from '../../../src/vector.js'; + +describe('ListViewBuilder', () => { + it('should build ListView with basic values', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2, 3]); + builder.append([4, 5]); + builder.append([6]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2, 3])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([4, 5])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([6])); + }); + + it('should handle null values', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type, nullValues: [null] }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append(null); + builder.append([3, 4, 5]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([3, 4, 5])); + }); + + 
it('should handle empty lists', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([]); + builder.append([1, 2]); + builder.append([]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([])); + }); + + it('should handle multiple flushes', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + const data1 = builder.flush(); + builder.append([3, 4]); + const data2 = builder.flush(); + + builder.finish(); + + const vector1 = new Vector([data1]); + const vector2 = new Vector([data2]); + + expect(vector1).toHaveLength(1); + expect(vector1.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector2).toHaveLength(1); + expect(vector2.get(0)?.toArray()).toEqual(new Int32Array([3, 4])); + }); + + it('should build ListView with varying list sizes', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1]); + builder.append([2, 3]); + builder.append([4, 5, 6]); + builder.append([7, 8, 9, 10]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(4); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([2, 3])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([4, 5, 6])); + expect(vector.get(3)?.toArray()).toEqual(new Int32Array([7, 8, 9, 10])); + }); +}); + 
+describe('LargeListViewBuilder', () => { + it('should build LargeListView with basic values', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2, 3]); + builder.append([4, 5]); + builder.append([6]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2, 3])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([4, 5])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([6])); + }); + + it('should handle null values', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type, nullValues: [null] }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append(null); + builder.append([3, 4, 5]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([3, 4, 5])); + }); + + it('should handle empty lists', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([]); + builder.append([1, 2]); + builder.append([]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([])); + }); + + it('should use BigInt offsets internally', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new 
LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append([3, 4, 5]); + + const data = builder.finish().flush(); + + // Verify that offsets and sizes are BigInt64Array + expect(data.valueOffsets).toBeInstanceOf(BigInt64Array); + expect(data.values).toBeInstanceOf(BigInt64Array); // sizes buffer + }); +}); + +describe('ListView type properties', () => { + it('should correctly report type name', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + expect(builder.type.toString()).toBe('ListView'); + }); + + it('should correctly report LargeListView type name', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + expect(builder.type.toString()).toBe('LargeListView'); + }); +}); diff --git a/test/unit/builders/view-builders-tests.ts b/test/unit/builders/view-builders-tests.ts new file mode 100644 index 00000000..88ee28fe --- /dev/null +++ b/test/unit/builders/view-builders-tests.ts @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BinaryView, Utf8View } from '../../../src/type.js'; +import { makeBuilder, vectorFromArray } from '../../../src/factories.js'; + +describe('BinaryViewBuilder', () => { + it('should build inline binary values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const values = [ + new Uint8Array([1, 2, 3]), + new Uint8Array([4, 5, 6, 7, 8, 9, 10, 11, 12]), + new Uint8Array([13]) + ]; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(values[0]); + expect(vector.get(1)).toEqual(values[1]); + expect(vector.get(2)).toEqual(values[2]); + }); + + it('should build out-of-line binary values (>12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const value = new Uint8Array(100); + for (let i = 0; i < 100; i++) { + value[i] = i % 256; + } + + builder.append(value); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toEqual(value); + }); + + it('should build mixed inline and out-of-line values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const small = new Uint8Array([1, 2, 3]); + const large = new Uint8Array(50); + for (let i = 0; i < 50; i++) { + large[i] = i % 256; + } + + builder.append(small); + builder.append(large); + builder.append(small); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(small); + expect(vector.get(1)).toEqual(large); + expect(vector.get(2)).toEqual(small); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new BinaryView(), nullValues: [null] }); + + builder.append(new Uint8Array([1, 2, 3])); + builder.append(null); + builder.append(new Uint8Array([4, 5, 6])); + + const 
vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(new Uint8Array([1, 2, 3])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toEqual(new Uint8Array([4, 5, 6])); + }); + + it('should handle empty values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([])); + builder.append(new Uint8Array([1])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(new Uint8Array([])); + expect(vector.get(1)).toEqual(new Uint8Array([1])); + }); + + it('should handle exactly 12-byte boundary values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const exactly12 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + const exactly13 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]); + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(exactly12); + expect(vector.get(1)).toEqual(exactly13); + }); + + it('should handle multiple flushes', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([1, 2])); + const data1 = builder.flush(); + expect(data1).toHaveLength(1); + + builder.append(new Uint8Array([3, 4])); + builder.append(new Uint8Array([5, 6])); + const data2 = builder.flush(); + expect(data2).toHaveLength(2); + }); +}); + +describe('Utf8ViewBuilder', () => { + it('should build inline string values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['hello', 'world', 'foo']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBe('foo'); + }); + + it('should build 
out-of-line string values (>12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const longString = 'This is a long string that exceeds 12 bytes'; + + builder.append(longString); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toBe(longString); + }); + + it('should build mixed inline and out-of-line strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const short = 'hi'; + const long = 'This is a very long string that definitely exceeds the 12 byte inline capacity'; + + builder.append(short); + builder.append(long); + builder.append(short); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe(short); + expect(vector.get(1)).toBe(long); + expect(vector.get(2)).toBe(short); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new Utf8View(), nullValues: [null] }); + + builder.append('hello'); + builder.append(null); + builder.append('world'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toBe('world'); + }); + + it('should handle empty strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + + builder.append(''); + builder.append('a'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(''); + expect(vector.get(1)).toBe('a'); + }); + + it('should handle UTF-8 multibyte characters', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['🚀', '你好', 'Ñoño', 'emoji: 🎉']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('🚀'); + expect(vector.get(1)).toBe('你好'); + expect(vector.get(2)).toBe('Ñoño'); + 
expect(vector.get(3)).toBe('emoji: 🎉'); + }); + + it('should handle exactly 12-byte boundary strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const exactly12 = 'twelve bytes'; // ASCII: 12 bytes + const exactly13 = 'thirteen byte'; // ASCII: 13 bytes + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(exactly12); + expect(vector.get(1)).toBe(exactly13); + }); + + it('should build from vectorFromArray', () => { + const values = ['hello', 'world', null, 'foo']; + const vector = vectorFromArray(values, new Utf8View()); + + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBeNull(); + expect(vector.get(3)).toBe('foo'); + }); + + it('should handle large batch of values', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const count = 1000; + const values: string[] = []; + + for (let i = 0; i < count; i++) { + const value = i % 2 === 0 + ? `short_${i}` // inline + : `this_is_a_long_string_that_goes_out_of_line_${i}`; // out-of-line + values.push(value); + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(count); + + for (let i = 0; i < count; i++) { + expect(vector.get(i)).toBe(values[i]); + } + }); +}); diff --git a/test/unit/ipc/list-view-tests.ts b/test/unit/ipc/list-view-tests.ts new file mode 100644 index 00000000..da09c6d1 --- /dev/null +++ b/test/unit/ipc/list-view-tests.ts @@ -0,0 +1,262 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { makeData } from 'apache-arrow/data'; +import { ListView, LargeListView, Int8 } from 'apache-arrow/type'; +import { Vector } from 'apache-arrow/vector'; +import { Field } from 'apache-arrow/schema'; + +describe('ListView and LargeListView integration', () => { + describe('ListView', () => { + // Test case from Arrow spec documentation: + // [[12, -7, 25], null, [0, -127, 127, 50], []] + it('reads ListView values with in-order offsets', () => { + const childData = makeData({ + type: new Int8(), + length: 7, + nullCount: 0, + data: new Int8Array([12, -7, 25, 0, -127, 127, 50]) + }); + + const offsets = new Int32Array([0, 7, 3, 0]); + const sizes = new Int32Array([3, 0, 4, 0]); + const nullBitmap = new Uint8Array([0b00001101]); // bits: [1,0,1,1] = valid, null, valid, valid + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 4, + nullCount: 1, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([12, -7, 25])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([0, -127, 127, 50])); + expect(vector.get(3)?.toArray()).toEqual(new Int8Array([])); + }); + + // Test case from Arrow spec showing out-of-order offsets and value sharing: + // [[12, -7, 25], null, [0, -127, 
127, 50], [], [50, 12]] + it('reads ListView values with out-of-order offsets and value sharing', () => { + const childData = makeData({ + type: new Int8(), + length: 7, + nullCount: 0, + data: new Int8Array([0, -127, 127, 50, 12, -7, 25]) + }); + + // Out of order offsets: [4, 7, 0, 0, 3] + const offsets = new Int32Array([4, 7, 0, 0, 3]); + const sizes = new Int32Array([3, 0, 4, 0, 2]); + const nullBitmap = new Uint8Array([0b00011101]); // [1,0,1,1,1] = valid, null, valid, valid, valid + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 5, + nullCount: 1, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + // List 0: offset=4, size=3 -> [12, -7, 25] + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([12, -7, 25])); + // List 1: null + expect(vector.get(1)).toBeNull(); + // List 2: offset=0, size=4 -> [0, -127, 127, 50] + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([0, -127, 127, 50])); + // List 3: offset=0, size=0 -> [] + expect(vector.get(3)?.toArray()).toEqual(new Int8Array([])); + // List 4: offset=3, size=2 -> [50, 12] (shares values with list 2) + expect(vector.get(4)?.toArray()).toEqual(new Int8Array([50, 12])); + }); + + it('handles all null ListView', () => { + const childData = makeData({ + type: new Int8(), + length: 0, + nullCount: 0, + data: new Int8Array([]) + }); + + const offsets = new Int32Array([0, 0, 0]); + const sizes = new Int32Array([0, 0, 0]); + const nullBitmap = new Uint8Array([0b00000000]); // all null + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 3, + nullCount: 3, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)).toBeNull(); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toBeNull(); + }); + + it('handles ListView with all empty lists', () => { 
+ const childData = makeData({ + type: new Int8(), + length: 0, + nullCount: 0, + data: new Int8Array([]) + }); + + const offsets = new Int32Array([0, 0, 0]); + const sizes = new Int32Array([0, 0, 0]); + const nullBitmap = new Uint8Array([0b00000111]); // all valid + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 3, + nullCount: 0, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int8Array([])); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([])); + }); + + it('handles ListView with single element lists', () => { + const childData = makeData({ + type: new Int8(), + length: 3, + nullCount: 0, + data: new Int8Array([42, -1, 100]) + }); + + const offsets = new Int32Array([0, 1, 2]); + const sizes = new Int32Array([1, 1, 1]); + const nullBitmap = new Uint8Array([0b00000111]); + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 3, + nullCount: 0, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([42])); + expect(vector.get(1)?.toArray()).toEqual(new Int8Array([-1])); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([100])); + }); + }); + + describe('LargeListView', () => { + it('reads LargeListView values with BigInt offsets', () => { + const childData = makeData({ + type: new Int8(), + length: 7, + nullCount: 0, + data: new Int8Array([12, -7, 25, 0, -127, 127, 50]) + }); + + const offsets = new BigInt64Array([0n, 7n, 3n, 0n]); + const sizes = new BigInt64Array([3n, 0n, 4n, 0n]); + const nullBitmap = new Uint8Array([0b00001101]); + + const largeListViewData = makeData({ + type: new LargeListView(new Field('item', new Int8())), + length: 4, + 
nullCount: 1, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([largeListViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([12, -7, 25])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([0, -127, 127, 50])); + expect(vector.get(3)?.toArray()).toEqual(new Int8Array([])); + }); + + it('reads LargeListView with out-of-order offsets', () => { + const childData = makeData({ + type: new Int8(), + length: 5, + nullCount: 0, + data: new Int8Array([10, 20, 30, 40, 50]) + }); + + // Out of order: list 0 starts at 2, list 1 starts at 0 + const offsets = new BigInt64Array([2n, 0n]); + const sizes = new BigInt64Array([3n, 2n]); + const nullBitmap = new Uint8Array([0b00000011]); + + const largeListViewData = makeData({ + type: new LargeListView(new Field('item', new Int8())), + length: 2, + nullCount: 0, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([largeListViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([30, 40, 50])); + expect(vector.get(1)?.toArray()).toEqual(new Int8Array([10, 20])); + }); + }); + + describe('ListView properties', () => { + it('has correct type properties', () => { + const listViewType = new ListView(new Field('item', new Int8())); + expect(listViewType.typeId).toBe(25); // Type.ListView + expect(listViewType.toString()).toBe('ListView'); + expect(listViewType.valueType).toBeInstanceOf(Int8); + expect(listViewType.valueField.name).toBe('item'); + }); + + it('has correct type properties for LargeListView', () => { + const largeListViewType = new LargeListView(new Field('item', new Int8())); + expect(largeListViewType.typeId).toBe(26); // Type.LargeListView + expect(largeListViewType.toString()).toBe('LargeListView'); + expect(largeListViewType.valueType).toBeInstanceOf(Int8); + expect(largeListViewType.valueField.name).toBe('item'); + }); + }); +}); diff --git 
a/test/unit/ipc/view-types-tests.ts b/test/unit/ipc/view-types-tests.ts new file mode 100644 index 00000000..d0b5a7a9 --- /dev/null +++ b/test/unit/ipc/view-types-tests.ts @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { makeData } from 'apache-arrow/data'; +import { BinaryView, Utf8View } from 'apache-arrow/type'; +import { Vector } from 'apache-arrow/vector'; + +const BINARY_VIEW_SIZE = 16; + +function createInlineView(value: Uint8Array) { + const view = new Uint8Array(BINARY_VIEW_SIZE); + const dv = new DataView(view.buffer, view.byteOffset, view.byteLength); + dv.setInt32(0, value.length, true); + view.set(value, 4); + return view; +} + +function createReferencedView(value: Uint8Array, bufferIndex: number, offset: number) { + const view = new Uint8Array(BINARY_VIEW_SIZE); + const dv = new DataView(view.buffer, view.byteOffset, view.byteLength); + dv.setInt32(0, value.length, true); + view.set(value.subarray(0, Math.min(4, value.length)), 4); + dv.setInt32(8, bufferIndex, true); + dv.setInt32(12, offset, true); + return view; +} + +describe('BinaryView and Utf8View integration', () => { + const inlineBinary = new Uint8Array([1, 2, 3, 4, 5]); + const referencedBinary = new 
Uint8Array(Array.from({ length: 20 }, (_, i) => i)); + const referencedUtf8 = 'View types are fun!'; + + const inlineUtf8 = 'hi'; + + const binaryViews = new Uint8Array(BINARY_VIEW_SIZE * 3); + binaryViews.set(createInlineView(inlineBinary), 0); + binaryViews.set(createReferencedView(referencedBinary, 0, 0), BINARY_VIEW_SIZE); + binaryViews.set(createReferencedView(new Uint8Array(0), 0, referencedBinary.length), 2 * BINARY_VIEW_SIZE); + + const utf8Payload = new TextEncoder().encode(referencedUtf8); + const utf8Views = new Uint8Array(BINARY_VIEW_SIZE * 2); + utf8Views.set(createInlineView(new TextEncoder().encode(inlineUtf8)), 0); + utf8Views.set(createReferencedView(utf8Payload, 0, 0), BINARY_VIEW_SIZE); + + const nullBitmap = new Uint8Array([0b00000011]); + + const binaryData = makeData({ + type: new BinaryView(), + length: 3, + nullBitmap, + views: binaryViews, + variadicBuffers: [referencedBinary] + }); + + const utf8Data = makeData({ + type: new Utf8View(), + length: 2, + nullBitmap: new Uint8Array([0b00000011]), + views: utf8Views, + variadicBuffers: [utf8Payload] + }); + + it('reads BinaryView values via Vector', () => { + const vector = new Vector([binaryData]); + expect(vector.get(0)).toEqual(inlineBinary); + expect(vector.get(1)).toEqual(referencedBinary); + expect(vector.get(2)).toBeNull(); + }); + + it('reads Utf8View values via Vector', () => { + const vector = new Vector([utf8Data]); + expect(vector.get(0)).toBe(inlineUtf8); + expect(vector.get(1)).toBe(referencedUtf8); + }); + +});