diff --git a/.github/patches/enable-binaryview-integration-tests.patch b/.github/patches/enable-binaryview-integration-tests.patch new file mode 100644 index 00000000..ac5c17e1 --- /dev/null +++ b/.github/patches/enable-binaryview-integration-tests.patch @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py +index 83913dc379..7ace28e1be 100644 +--- a/dev/archery/archery/integration/datagen.py ++++ b/dev/archery/archery/integration/datagen.py +@@ -2003,7 +2003,6 @@ def get_generated_json_files(tempdir=None): + .skip_tester('Rust'), + + generate_binary_view_case() +- .skip_tester('JS') + # TODO(https://github.com/apache/arrow-nanoarrow/issues/618) + .skip_tester('nanoarrow') + .skip_tester('Rust'), diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5e96bc17..344942a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -193,6 +193,9 @@ jobs: uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3 + - name: Patch Archery to enable BinaryView tests + run: | + patch -p1 < js/.github/patches/enable-binaryview-integration-tests.patch - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build
diff --git a/scripts/update_flatbuffers.sh b/scripts/update_flatbuffers.sh
new file mode 100755
index 00000000..d81dfbc3
--- /dev/null
+++ b/scripts/update_flatbuffers.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Regenerate the FlatBuffers helper files used by arrow-js. Requires a working
+# flatc on PATH and the Arrow FlatBuffers schemas, which are read from
+# ARROW_FORMAT_DIR if it is set and otherwise from a sibling checkout of
+# apache/arrow (../arrow/format).
+
+set -euo pipefail
+
+PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+FORMAT_DIR="${ARROW_FORMAT_DIR:-${PROJECT_ROOT}/../arrow/format}"
+
+if [[ ! -d "${FORMAT_DIR}" ]]; then
+  echo "error: expected FlatBuffers schemas in ${FORMAT_DIR}" >&2
+  exit 1
+fi
+
+if ! command -v flatc >/dev/null 2>&1; then
+  echo "error: flatc not found on PATH" >&2
+  exit 1
+fi
+
+# Scratch directory for the rewritten schemas and flatc output. It is not called
+# TMPDIR so the conventional TMPDIR environment variable stays untouched.
+WORK_DIR="$(mktemp -d "${PROJECT_ROOT}/.flatc.XXXXXX")"
+cleanup() {
+  rm -rf "${WORK_DIR}"
+}
+trap cleanup EXIT
+
+schemas=(File Schema Message Tensor SparseTensor)
+schema_files=()
+
+for schema in "${schemas[@]}"; do
+  # Copy each schema with the org.apache.arrow.flatbuf namespace removed. The
+  # output is redirected rather than edited in place because `sed -i ''` is
+  # BSD syntax; GNU sed reads the empty string as an input file name and fails.
+  sed \
+    -e 's/namespace org\.apache\.arrow\.flatbuf;//g' \
+    -e 's/org\.apache\.arrow\.flatbuf\.//g' \
+    "${FORMAT_DIR}/${schema}.fbs" > "${WORK_DIR}/${schema}.fbs"
+  schema_files+=("${WORK_DIR}/${schema}.fbs")
+done
+
+flatc --ts --ts-flat-files --ts-omit-entrypoint \
+  -o "${WORK_DIR}" \
+  "${schema_files[@]}"
+
+rm -f "${schema_files[@]}"
+
+generated_files=(
+  binary-view.ts
+  list-view.ts
+  large-list-view.ts
+  message.ts
+  record-batch.ts
+  schema.ts
+  type.ts
+  utf8-view.ts
+)
+
+for file in "${generated_files[@]}"; do
+  if [[ ! -f "${WORK_DIR}/${file}" ]]; then
+    echo "error: expected generated file ${file} not found" >&2
+    exit 1
+  fi
+  install -m 0644 "${WORK_DIR}/${file}" "${PROJECT_ROOT}/src/fb/${file}"
+done
diff --git a/src/Arrow.dom.ts b/src/Arrow.dom.ts index e0cd681c..20977a7d 100644 --- a/src/Arrow.dom.ts +++ b/src/Arrow.dom.ts @@ -49,15 +49,16 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, - List, + List, LargeList, ListView, LargeListView, Struct, StructRow, + RunEndEncoded, Union, DenseUnion, SparseUnion, Dictionary, Interval, IntervalDayTime, IntervalYearMonth, IntervalMonthDayNano, @@ -81,7 +82,7 @@ export { } from './Arrow.js'; export { - BinaryBuilder, LargeBinaryBuilder, + BinaryBuilder, BinaryViewBuilder, LargeBinaryBuilder, BoolBuilder, DateBuilder, DateDayBuilder, DateMillisecondBuilder, DecimalBuilder, @@ -92,12 +93,12 @@ export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder, DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder, IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder, - ListBuilder, + ListBuilder, LargeListBuilder, ListViewBuilder, LargeListViewBuilder, MapBuilder, NullBuilder, StructBuilder, TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, 
LargeUtf8Builder + Utf8Builder, Utf8ViewBuilder, LargeUtf8Builder } from './Arrow.js'; diff --git a/src/Arrow.ts b/src/Arrow.ts index 8321026f..474480ef 100644 --- a/src/Arrow.ts +++ b/src/Arrow.ts @@ -37,21 +37,22 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, - List, + List, LargeList, ListView, LargeListView, Struct, Union, DenseUnion, SparseUnion, Dictionary, Interval, IntervalDayTime, IntervalYearMonth, IntervalMonthDayNano, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, FixedSizeList, - Map_ + Map_, + RunEndEncoded } from './type.js'; export { Table, makeTable, tableFromArrays } from './table.js'; @@ -79,10 +80,14 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { Utf8ViewBuilder } from './builder/utf8view.js'; export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; +export { BinaryViewBuilder } from './builder/binaryview.js'; export { LargeBinaryBuilder } from './builder/largebinary.js'; export { ListBuilder } from './builder/list.js'; +export { LargeListBuilder } from './builder/largelist.js'; +export { ListViewBuilder, LargeListViewBuilder } from './builder/listview.js'; export { 
FixedSizeListBuilder } from './builder/fixedsizelist.js'; export { MapBuilder } from './builder/map.js'; export { StructBuilder } from './builder/struct.js'; diff --git a/src/builder.ts b/src/builder.ts index 5ae43a88..3516ca9f 100644 --- a/src/builder.ts +++ b/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, LargeUtf8, Binary, LargeBinary, List, Map_, + Utf8, LargeUtf8, Binary, LargeBinary, List, LargeList, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -357,7 +357,7 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; diff --git a/src/builder/binaryview.ts b/src/builder/binaryview.ts new file mode 100644 index 00000000..80e5930f --- /dev/null +++ b/src/builder/binaryview.ts @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+import { BinaryView } from '../type.js';
+import { Builder, BuilderOptions } from '../builder.js';
+import { BufferBuilder } from './buffer.js';
+import { toUint8Array } from '../util/buffer.js';
+import { makeData } from '../data.js';
+
+/**
+ * Builds `BinaryView` columns. Each element gets one fixed-width view struct
+ * in `_views`; values longer than `BinaryView.INLINE_CAPACITY` bytes also have
+ * their bytes appended to a variadic data buffer that the view struct references.
+ * @ignore
+ */
+export class BinaryViewBuilder<TNull = any> extends Builder<BinaryView, TNull> {
+    /** View structs, `BinaryView.ELEMENT_WIDTH` bytes per element. */
+    protected _views: BufferBuilder<Uint8Array>;
+    /** Sealed variadic data buffers, in the order view structs index them. */
+    protected _variadicBuffers: Uint8Array[] = [];
+    /** Variadic data buffer currently being filled, if any. */
+    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
+    /** Position `_currentBuffer` takes in `_variadicBuffers` once it is sealed. */
+    protected _currentBufferIndex = 0;
+    /** Number of bytes already written to `_currentBuffer`. */
+    protected _currentBufferOffset = 0;
+    /** A new data buffer is started when the current one would exceed this size (32 MiB). */
+    protected readonly _bufferSize = 32 * 1024 * 1024;
+
+    constructor(opts: BuilderOptions<BinaryView, TNull>) {
+        super(opts);
+        this._views = new BufferBuilder(Uint8Array);
+    }
+
+    /** Bytes held by the views, the validity bitmap and every variadic data buffer. */
+    public get byteLength(): number {
+        let size = 0;
+        this._views && (size += this._views.byteLength);
+        this._nulls && (size += this._nulls.byteLength);
+        for (const buffer of this._variadicBuffers) {
+            size += buffer.byteLength;
+        }
+        this._currentBuffer && (size += this._currentBuffer.byteLength);
+        return size;
+    }
+
+    /**
+     * Writes the view struct for `value` at `index`, keeping the bytes inline
+     * when they fit and appending them to a variadic data buffer otherwise.
+     */
+    public setValue(index: number, value: Uint8Array) {
+        const data = toUint8Array(value);
+        const length = data.length;
+        const viewOffset = this._reserveView(index);
+        // Fetch the backing array only after reserving, in case reserve() replaced it.
+        const viewBuffer = this._views.buffer;
+        const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH);
+        const inlineStart = viewOffset + BinaryView.INLINE_OFFSET;
+
+        view.setInt32(BinaryView.LENGTH_OFFSET, length, true);
+
+        if (length <= BinaryView.INLINE_CAPACITY) {
+            // Inline: the bytes live in the view struct itself, zero-padded.
+            viewBuffer.set(data, inlineStart);
+            viewBuffer.fill(0, inlineStart + length, inlineStart + BinaryView.INLINE_CAPACITY);
+            return this;
+        }
+
+        // Out-of-line: the view keeps the first four bytes as a prefix, followed by
+        // the index of the data buffer and the value's byte offset inside it.
+        viewBuffer.set(data.subarray(0, 4), inlineStart);
+
+        if (!this._currentBuffer || this._currentBufferOffset + length > this._bufferSize) {
+            this._sealCurrentBuffer();
+            this._currentBuffer = new BufferBuilder(Uint8Array);
+            this._currentBufferIndex = this._variadicBuffers.length;
+        }
+
+        this._currentBuffer.reserve(length).buffer.set(data, this._currentBufferOffset);
+        view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
+        view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true);
+        this._currentBufferOffset += length;
+
+        return this;
+    }
+
+    /** Marks `index` as valid or null; a null slot gets an all-zero view struct. */
+    public setValid(index: number, isValid: boolean) {
+        if (super.setValid(index, isValid)) {
+            return true;
+        }
+        const viewOffset = this._reserveView(index);
+        this._views.buffer.fill(0, viewOffset, viewOffset + BinaryView.ELEMENT_WIDTH);
+        return false;
+    }
+
+    /** Discards all buffered views and data so the builder can be reused. */
+    public clear() {
+        this._variadicBuffers = [];
+        this._currentBuffer = null;
+        this._currentBufferIndex = 0;
+        this._currentBufferOffset = 0;
+        this._views.clear();
+        return super.clear();
+    }
+
+    /** Emits everything written so far as one `Data` chunk and resets the builder. */
+    public flush() {
+        const { type, length, nullCount, _views, _nulls } = this;
+        this._sealCurrentBuffer();
+
+        const views = _views.flush(length * BinaryView.ELEMENT_WIDTH);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        // clear() below installs a fresh array, so this one can be handed over as is.
+        const variadicBuffers = this._variadicBuffers;
+
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            views,
+            variadicBuffers
+        });
+    }
+
+    public finish() {
+        this.finished = true;
+        return this;
+    }
+
+    /**
+     * Grows the views buffer, if needed, so the slot for `index` exists.
+     * @returns the byte offset of that slot within the views buffer
+     */
+    protected _reserveView(index: number): number {
+        const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH;
+        const currentBytes = this._views.length;
+        if (bytesNeeded > currentBytes) {
+            this._views.reserve(bytesNeeded - currentBytes);
+        }
+        return index * BinaryView.ELEMENT_WIDTH;
+    }
+
+    /** Moves the bytes written to `_currentBuffer`, if any, into `_variadicBuffers`. */
+    protected _sealCurrentBuffer() {
+        if (this._currentBuffer && this._currentBufferOffset > 0) {
+            this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
+        }
+        this._currentBuffer = null;
+        this._currentBufferOffset = 0;
+    }
+}
diff --git a/src/builder/largelist.ts b/src/builder/largelist.ts
new file mode 100644
index 00000000..409b1a68
--- /dev/null
+++ b/src/builder/largelist.ts
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Field } from '../schema.js';
+import { DataType, LargeList } from '../type.js';
+import { OffsetsBufferBuilder } from './buffer.js';
+import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js';
+import { bigIntToNumber } from '../util/bigint.js';
+
+/** @ignore */
+export class LargeListBuilder<T extends DataType = any, TNull = any> extends VariableWidthBuilder<LargeList<T>, TNull> {
+    protected _offsets: OffsetsBufferBuilder<LargeList<T>>;
+    constructor(opts: BuilderOptions<LargeList<T>, TNull>) {
+        super(opts);
+        this._offsets = new OffsetsBufferBuilder(opts.type);
+    }
+    public addChild(child: Builder<T>, name = '0') {
+        if (this.numChildren > 0) {
+            throw new Error('LargeListBuilder can only have one child.');
+        }
+        this.children[this.numChildren] = child;
+        this.type = new LargeList(new Field(name, child.type, true));
+        return this.numChildren - 1;
+    }
+    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
+        const offsets = this._offsets;
+        const [child] = this.children;
+        for (const [index, value] of pending) {
+            if (typeof value === 'undefined') {
+                offsets.set(index, BigInt(0)); // null slot: add a zero-length entry
+            } else {
+                const v = value as T['TValue'];
+                const n = v.length;
+                const start = bigIntToNumber(offsets.set(index, BigInt(n)).buffer[index]);
+                for (let i = -1; ++i < n;) {
+                    child.set(start + i, v[i]);
+                }
+            }
+        }
+    }
+}
diff --git a/src/builder/listview.ts b/src/builder/listview.ts
new file mode 100644
index 00000000..82766775
--- /dev/null
+++ b/src/builder/listview.ts
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Field } from '../schema.js';
+import { DataType, ListView, LargeListView } from '../type.js';
+import { DataBufferBuilder } from './buffer.js';
+import { Builder, BuilderOptions } from '../builder.js';
+import { makeData } from '../data.js';
+
+/**
+ * Builds `ListView` columns. Every list is described by an int32 offset and an
+ * int32 size into a single child builder; values are staged in `_pending` and
+ * written out when the builder is flushed or finished.
+ * @ignore
+ */
+export class ListViewBuilder<T extends DataType = any, TNull = any> extends Builder<ListView<T>, TNull> {
+    /** Start position of each list within the child builder. */
+    protected _offsets: DataBufferBuilder<Int32Array>;
+    /** Number of child values in each list. */
+    protected _sizes: DataBufferBuilder<Int32Array>;
+    /** Values set since the last flush, keyed by index; `undefined` marks a null slot. */
+    protected _pending: Map<number, T['TValue'] | undefined> | undefined;
+    /** Next unused position in the child builder. */
+    protected _writeIndex = 0;
+
+    constructor(opts: BuilderOptions<ListView<T>, TNull>) {
+        super(opts);
+        this._offsets = new DataBufferBuilder(Int32Array, 0);
+        this._sizes = new DataBufferBuilder(Int32Array, 0);
+    }
+
+    /** Registers the single child builder and derives the list type from it. */
+    public addChild(child: Builder<T>, name = '0') {
+        if (this.numChildren > 0) {
+            throw new Error('ListViewBuilder can only have one child.');
+        }
+        this.children[this.numChildren] = child;
+        this.type = new ListView(new Field(name, child.type, true));
+        return this.numChildren - 1;
+    }
+
+    /** Stages `value` for `index`; it is written to the buffers on flush/finish. */
+    public setValue(index: number, value: T['TValue']) {
+        const pending = this._pending || (this._pending = new Map());
+        pending.set(index, value);
+    }
+
+    /** Records validity and stages `undefined` for null slots. */
+    public setValid(index: number, isValid: boolean) {
+        if (!super.setValid(index, isValid)) {
+            (this._pending || (this._pending = new Map())).set(index, undefined);
+            return false;
+        }
+        return true;
+    }
+
+    /** Discards staged values and buffered offsets/sizes so the builder can be reused. */
+    public clear() {
+        this._pending = undefined;
+        this._writeIndex = 0;
+        // Reset the view buffers explicitly rather than relying on the base class to know about them.
+        this._offsets.clear();
+        this._sizes.clear();
+        return super.clear();
+    }
+
+    /** Writes staged values, emits one `Data` chunk and resets the builder. */
+    public flush() {
+        this._flush();
+
+        const { type, length, nullCount, _offsets, _sizes, _nulls } = this;
+
+        const valueOffsets = _offsets.flush(length);
+        const sizes = _sizes.flush(length);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        const children = this.children.map((child) => child.flush());
+
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            valueOffsets,
+            sizes,
+            child: children[0]
+        });
+    }
+
+    /** Writes staged values, then finishes this builder via the base class. */
+    public finish() {
+        this._flush();
+        return super.finish();
+    }
+
+    /** Hands the staged values, if any, to `_flushPending` and empties the stage. */
+    protected _flush() {
+        const pending = this._pending;
+        this._pending = undefined;
+        if (pending && pending.size > 0) {
+            this._flushPending(pending);
+        }
+    }
+
+    /**
+     * Appends each staged list's elements to the child builder and records the
+     * list's start offset and size; null slots get offset 0 and size 0.
+     */
+    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
+        const offsets = this._offsets;
+        const sizes = this._sizes;
+        const [child] = this.children;
+
+        for (const [index, value] of pending) {
+            // reserve() takes a count of *additional* elements, so request only the
+            // slots that are still missing. Passing `index + 1` on every iteration
+            // would grow the buffers quadratically with the number of lists.
+            if (index >= offsets.length) { offsets.reserve(index + 1 - offsets.length); }
+            if (index >= sizes.length) { sizes.reserve(index + 1 - sizes.length); }
+
+            if (typeof value === 'undefined') {
+                // Null slot
+                offsets.buffer[index] = 0;
+                sizes.buffer[index] = 0;
+            } else {
+                const v = value as T['TValue'];
+                const n = v.length;
+                const offset = this._writeIndex;
+                offsets.buffer[index] = offset;
+                sizes.buffer[index] = n;
+                for (let i = 0; i < n; i++) {
+                    child.set(offset + i, v[i]);
+                }
+                this._writeIndex += n;
+            }
+        }
+    }
+}
+
+/**
+ * Builds `LargeListView` columns. Same scheme as `ListViewBuilder`, but offsets
+ * and sizes are kept as BigInt values in `BigInt64Array` buffers; values are
+ * staged in `_pending` and written out when the builder is flushed or finished.
+ * @ignore
+ */
+export class LargeListViewBuilder<T extends DataType = any, TNull = any> extends Builder<LargeListView<T>, TNull> {
+    /** Start position of each list within the child builder. */
+    protected _offsets: DataBufferBuilder<BigInt64Array>;
+    /** Number of child values in each list. */
+    protected _sizes: DataBufferBuilder<BigInt64Array>;
+    /** Values set since the last flush, keyed by index; `undefined` marks a null slot. */
+    protected _pending: Map<number, T['TValue'] | undefined> | undefined;
+    /** Next unused position in the child builder, kept as a BigInt. */
+    protected _writeIndex = BigInt(0);
+
+    constructor(opts: BuilderOptions<LargeListView<T>, TNull>) {
+        super(opts);
+        this._offsets = new DataBufferBuilder(BigInt64Array, 0);
+        this._sizes = new DataBufferBuilder(BigInt64Array, 0);
+    }
+
+    /** Registers the single child builder and derives the list type from it. */
+    public addChild(child: Builder<T>, name = '0') {
+        if (this.numChildren > 0) {
+            throw new Error('LargeListViewBuilder can only have one child.');
+        }
+        this.children[this.numChildren] = child;
+        this.type = new LargeListView(new Field(name, child.type, true));
+        return this.numChildren - 1;
+    }
+
+    /** Stages `value` for `index`; it is written to the buffers on flush/finish. */
+    public setValue(index: number, value: T['TValue']) {
+        const pending = this._pending || (this._pending = new Map());
+        pending.set(index, value);
+    }
+
+    /** Records validity and stages `undefined` for null slots. */
+    public setValid(index: number, isValid: boolean) {
+        if (!super.setValid(index, isValid)) {
+            (this._pending || (this._pending = new Map())).set(index, undefined);
+            return false;
+        }
+        return true;
+    }
+
+    /** Discards staged values and buffered offsets/sizes so the builder can be reused. */
+    public clear() {
+        this._pending = undefined;
+        this._writeIndex = BigInt(0);
+        // Reset the view buffers explicitly rather than relying on the base class to know about them.
+        this._offsets.clear();
+        this._sizes.clear();
+        return super.clear();
+    }
+
+    /** Writes staged values, emits one `Data` chunk and resets the builder. */
+    public flush() {
+        this._flush();
+
+        const { type, length, nullCount, _offsets, _sizes, _nulls } = this;
+
+        const valueOffsets = _offsets.flush(length);
+        const sizes = _sizes.flush(length);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        const children = this.children.map((child) => child.flush());
+
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            valueOffsets,
+            sizes,
+            child: children[0]
+        });
+    }
+
+    /** Writes staged values, then finishes this builder via the base class. */
+    public finish() {
+        this._flush();
+        return super.finish();
+    }
+
+    /** Hands the staged values, if any, to `_flushPending` and empties the stage. */
+    protected _flush() {
+        const pending = this._pending;
+        this._pending = undefined;
+        if (pending && pending.size > 0) {
+            this._flushPending(pending);
+        }
+    }
+
+    /**
+     * Appends each staged list's elements to the child builder and records the
+     * list's start offset and size; null slots get offset 0 and size 0.
+     */
+    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
+        const offsets = this._offsets;
+        const sizes = this._sizes;
+        const [child] = this.children;
+
+        for (const [index, value] of pending) {
+            // reserve() takes a count of *additional* elements, so request only the
+            // slots that are still missing. Passing `index + 1` on every iteration
+            // would grow the buffers quadratically with the number of lists.
+            if (index >= offsets.length) { offsets.reserve(index + 1 - offsets.length); }
+            if (index >= sizes.length) { sizes.reserve(index + 1 - sizes.length); }
+
+            if (typeof value === 'undefined') {
+                // Null slot
+                offsets.buffer[index] = BigInt(0);
+                sizes.buffer[index] = BigInt(0);
+            } else {
+                const v = value as T['TValue'];
+                const n = v.length;
+                const offset = this._writeIndex;
+                offsets.buffer[index] = offset;
+                sizes.buffer[index] = BigInt(n);
+                for (let i = 0; i < n; i++) {
+                    child.set(Number(offset) + i, v[i]);
+                }
+                this._writeIndex += BigInt(n);
+            }
+        }
+    }
+}
diff --git a/src/builder/utf8view.ts b/src/builder/utf8view.ts
new file mode 100644
index 00000000..299743e1
--- /dev/null
+++ b/src/builder/utf8view.ts
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Utf8View, BinaryView } from '../type.js';
+import { encodeUtf8 } from '../util/utf8.js';
+import { BuilderOptions, Builder } from '../builder.js';
+import { BufferBuilder } from './buffer.js';
+import { makeData } from '../data.js';
+
+/**
+ * Builds `Utf8View` columns. Strings are UTF-8 encoded and then stored as view
+ * structs in `_views`; encodings longer than `BinaryView.INLINE_CAPACITY` bytes
+ * are also appended to a variadic data buffer that the view struct references.
+ * @ignore
+ */
+export class Utf8ViewBuilder<TNull = any> extends Builder<Utf8View, TNull> {
+    /** View structs, `BinaryView.ELEMENT_WIDTH` bytes per element. */
+    protected _views: BufferBuilder<Uint8Array>;
+    /** Sealed variadic data buffers, in the order view structs index them. */
+    protected _variadicBuffers: Uint8Array[] = [];
+    /** Variadic data buffer currently being filled, if any. */
+    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
+    /** Position `_currentBuffer` takes in `_variadicBuffers` once it is sealed. */
+    protected _currentBufferIndex = 0;
+    /** Number of bytes already written to `_currentBuffer`. */
+    protected _currentBufferOffset = 0;
+    /** A new data buffer is started when the current one would exceed this size (32 MiB). */
+    protected readonly _bufferSize = 32 * 1024 * 1024;
+
+    constructor(opts: BuilderOptions<Utf8View, TNull>) {
+        super(opts);
+        this._views = new BufferBuilder(Uint8Array);
+    }
+
+    /** Bytes held by the views, the validity bitmap and every variadic data buffer. */
+    public get byteLength(): number {
+        let size = 0;
+        this._views && (size += this._views.byteLength);
+        this._nulls && (size += this._nulls.byteLength);
+        for (const buffer of this._variadicBuffers) {
+            size += buffer.byteLength;
+        }
+        this._currentBuffer && (size += this._currentBuffer.byteLength);
+        return size;
+    }
+
+    /**
+     * Encodes `value` as UTF-8 and writes its view struct at `index`, keeping the
+     * bytes inline when they fit and appending them to a data buffer otherwise.
+     */
+    public setValue(index: number, value: string) {
+        const data = encodeUtf8(value);
+        const length = data.length;
+        const viewOffset = this._reserveView(index);
+        // Fetch the backing array only after reserving, in case reserve() replaced it.
+        const viewBuffer = this._views.buffer;
+        const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH);
+        const inlineStart = viewOffset + BinaryView.INLINE_OFFSET;
+
+        view.setInt32(BinaryView.LENGTH_OFFSET, length, true);
+
+        if (length <= BinaryView.INLINE_CAPACITY) {
+            // Inline: the bytes live in the view struct itself, zero-padded.
+            viewBuffer.set(data, inlineStart);
+            viewBuffer.fill(0, inlineStart + length, inlineStart + BinaryView.INLINE_CAPACITY);
+            return this;
+        }
+
+        // Out-of-line: the view keeps the first four bytes as a prefix, followed by
+        // the index of the data buffer and the value's byte offset inside it.
+        viewBuffer.set(data.subarray(0, 4), inlineStart);
+
+        if (!this._currentBuffer || this._currentBufferOffset + length > this._bufferSize) {
+            this._sealCurrentBuffer();
+            this._currentBuffer = new BufferBuilder(Uint8Array);
+            this._currentBufferIndex = this._variadicBuffers.length;
+        }
+
+        this._currentBuffer.reserve(length).buffer.set(data, this._currentBufferOffset);
+        view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
+        view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true);
+        this._currentBufferOffset += length;
+
+        return this;
+    }
+
+    /** Marks `index` as valid or null; a null slot gets an all-zero view struct. */
+    public setValid(index: number, isValid: boolean) {
+        if (super.setValid(index, isValid)) {
+            return true;
+        }
+        const viewOffset = this._reserveView(index);
+        this._views.buffer.fill(0, viewOffset, viewOffset + BinaryView.ELEMENT_WIDTH);
+        return false;
+    }
+
+    /** Discards all buffered views and data so the builder can be reused. */
+    public clear() {
+        this._variadicBuffers = [];
+        this._currentBuffer = null;
+        this._currentBufferIndex = 0;
+        this._currentBufferOffset = 0;
+        this._views.clear();
+        return super.clear();
+    }
+
+    /** Emits everything written so far as one `Data` chunk and resets the builder. */
+    public flush() {
+        const { type, length, nullCount, _views, _nulls } = this;
+        this._sealCurrentBuffer();
+
+        const views = _views.flush(length * BinaryView.ELEMENT_WIDTH);
+        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
+        // clear() below installs a fresh array, so this one can be handed over as is.
+        const variadicBuffers = this._variadicBuffers;
+
+        this.clear();
+
+        return makeData({
+            type,
+            length,
+            nullCount,
+            nullBitmap,
+            views,
+            variadicBuffers
+        });
+    }
+
+    public finish() {
+        this.finished = true;
+        return this;
+    }
+
+    /**
+     * Grows the views buffer, if needed, so the slot for `index` exists.
+     * @returns the byte offset of that slot within the views buffer
+     */
+    protected _reserveView(index: number): number {
+        const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH;
+        const currentBytes = this._views.length;
+        if (bytesNeeded > currentBytes) {
+            this._views.reserve(bytesNeeded - currentBytes);
+        }
+        return index * BinaryView.ELEMENT_WIDTH;
+    }
+
+    /** Moves the bytes written to `_currentBuffer`, if any, into `_variadicBuffers`. */
+    protected _sealCurrentBuffer() {
+        if (this._currentBuffer && this._currentBufferOffset > 0) {
+            this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
+        }
+        this._currentBuffer = null;
+        this._currentBufferOffset = 0;
+    }
+}
diff --git a/src/data.ts b/src/data.ts index 45fcc35d..b79b5311 100644 --- a/src/data.ts +++ b/src/data.ts @@ -68,6 +68,7 @@ export class Data { declare public readonly typeIds: Buffers[BufferType.TYPE]; declare public readonly nullBitmap: Buffers[BufferType.VALIDITY]; declare public readonly valueOffsets: Buffers[BufferType.OFFSET]; + declare public readonly variadicBuffers: ReadonlyArray; public get typeId(): T['TType'] { return this.type.typeId; } @@ -97,6 +98,7 @@ export class Data { values && (byteLength += values.byteLength); nullBitmap && (byteLength += nullBitmap.byteLength); typeIds && (byteLength += typeIds.byteLength); + byteLength += this.variadicBuffers.reduce((size, data) => size + (data?.byteLength ?? 
0), 0); return this.children.reduce((byteLength, child) => byteLength + child.byteLength, byteLength); } @@ -117,7 +119,16 @@ export class Data { return nullCount; } - constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial> | Data, children: Data[] = [], dictionary?: Vector) { + constructor( + type: T, + offset: number, + length: number, + nullCount?: number, + buffers?: Partial> | Data, + children: Data[] = [], + dictionary?: Vector, + variadicBuffers: ReadonlyArray = [] + ) { this.type = type; this.children = children; this.dictionary = dictionary; @@ -131,6 +142,7 @@ export class Data { this.typeIds = buffers.typeIds; this.nullBitmap = buffers.nullBitmap; this.valueOffsets = buffers.valueOffsets; + this.variadicBuffers = buffers.variadicBuffers; } else { this.stride = strideForType(type); if (buffers) { @@ -139,15 +151,22 @@ export class Data { (buffer = (buffers as Buffers)[2]) && (this.nullBitmap = buffer); (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); } + this.variadicBuffers = variadicBuffers; } + this.variadicBuffers ??= []; } public getValid(index: number): boolean { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? 
Number(valueOffsets[index]) + : index; return child.getValid(indexInChild); } if (this.nullable && this.nullCount > 0) { @@ -163,8 +182,13 @@ export class Data { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? Number(valueOffsets[index]) + : index; prev = child.getValid(indexInChild); child.setValid(indexInChild, value); } else { @@ -200,8 +224,16 @@ export class Data { return value; } - public clone(type: R = this.type as any, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers = this, children: Data[] = this.children) { - return new Data(type, offset, length, nullCount, buffers, children, this.dictionary); + public clone( + type: R = this.type as any, + offset = this.offset, + length = this.length, + nullCount = this._nullCount, + buffers: Buffers = this, + children: Data[] = this.children, + variadicBuffers: ReadonlyArray = this.variadicBuffers + ) { + return new Data(type, offset, length, nullCount, buffers, children, this.dictionary, variadicBuffers); } public slice(offset: number, length: number): Data { @@ -214,12 +246,13 @@ export class Data { const buffers = this._sliceBuffers(offset, length, stride, typeId); return this.clone(this.type, this.offset + offset, length, nullCount, buffers, // Don't slice children if we have value offsets (the variable-width types) - (children.length === 0 || this.valueOffsets) ? 
children : this._sliceChildren(children, childStride * offset, childStride * length)); + (children.length === 0 || this.valueOffsets) ? children : this._sliceChildren(children, childStride * offset, childStride * length), + this.variadicBuffers); } public _changeLengthAndBackfillNullBitmap(newLength: number): Data { if (this.typeId === Type.Null) { - return this.clone(this.type, 0, newLength, 0); + return this.clone(this.type, 0, newLength, 0, this.buffers, this.children, this.variadicBuffers); } const { length, nullCount } = this; // start initialized with 0s (nulls), then fill from 0 to length with 1s (not null) @@ -232,7 +265,7 @@ export class Data { } const buffers = this.buffers; buffers[BufferType.VALIDITY] = bitmap; - return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers); + return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers, this.children, this.variadicBuffers); } protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers { @@ -240,10 +273,15 @@ export class Data { const { buffers } = this; // If typeIds exist, slice the typeIds buffer (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); - // If offsets exist, only slice the offsets buffer - (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || - // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes - (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? 
arr : arr.subarray(stride * offset, stride * (offset + length))); + if (DataType.isBinaryView(this.type) || DataType.isUtf8View(this.type)) { + const width = BinaryView.ELEMENT_WIDTH; + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = arr.subarray(offset * width, (offset + length) * width)); + } else { + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? arr : arr.subarray(stride * offset, stride * (offset + length))); + } return buffers; } @@ -256,7 +294,7 @@ export class Data { import { Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -265,6 +303,7 @@ import { Time, Timestamp, Union, DenseUnion, SparseUnion, + RunEndEncoded, } from './type.js'; import { Visitor } from './visitor.js'; @@ -311,6 +350,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitUtf8View(props: Utf8ViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? 
Math.trunc(views.length / Utf8View.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? -1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeUtf8(props: LargeUtf8DataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -327,6 +375,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitBinaryView(props: BinaryViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / BinaryView.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? -1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeBinary(props: LargeBinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -377,6 +434,36 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap], [child]); } + public visitLargeList(props: LargeListDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap], [child]); + } + public visitListView(props: ListViewDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toInt32Array(props['valueOffsets']); + const sizes = toInt32Array(props['sizes']); + const { ['length']: length = sizes.length, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, sizes, nullBitmap], [child]); + } + public visitLargeListView(props: LargeListViewDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const sizes = toBigInt64Array(props['sizes']); + const { ['length']: length = Number(sizes.length), ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, sizes, nullBitmap], [child]); + } + public visitRunEndEncoded(props: RunEndEncodedDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['children']: children } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const length = children[0].length; + const { ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0 } = props; + return new Data(type, offset, length, nullCount, [undefined, undefined, nullBitmap], children); + } public visitStruct(props: StructDataProps) { const { ['type']: type, ['offset']: offset = 0, ['children']: children = [] } = props; const nullBitmap = toUint8Array(props['nullBitmap']); @@ -455,11 +542,17 @@ interface IntervalDataProps extends DataProps_ { data?: D interface DurationDataProps extends DataProps_ { data?: DataBuffer } interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface BinaryViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeBinaryDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface Utf8ViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } +interface LargeListDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; child: Data } +interface ListViewDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; sizes: ValueOffsetsBuffer; child: Data } +interface LargeListViewDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; sizes: LargeValueOffsetsBuffer | ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } +interface RunEndEncodedDataProps extends DataProps_ { children: [Data, Data] } interface StructDataProps extends 
DataProps_ { children: Data[] } interface Map_DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface SparseUnionDataProps extends DataProps_ { nullBitmap: never; typeIds: TypeIdsBuffer; children: Data[] } @@ -481,10 +574,16 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends LargeBinary /* */ ? LargeBinaryDataProps : + T extends BinaryView /* */ ? BinaryViewDataProps : T extends Utf8 /* */ ? Utf8DataProps : T extends LargeUtf8 /* */ ? LargeUtf8DataProps : + T extends Utf8View /* */ ? Utf8ViewDataProps : T extends List /* */ ? ListDataProps : + T extends LargeList /* */ ? LargeListDataProps : + T extends ListView /* */ ? ListViewDataProps : + T extends LargeListView /* */ ? LargeListViewDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : + T extends RunEndEncoded /* */ ? RunEndEncodedDataProps : T extends Struct /* */ ? StructDataProps : T extends Map_ /* */ ? Map_DataProps : T extends SparseUnion /* */ ? 
SparseUnionDataProps : @@ -507,12 +606,18 @@ export function makeData(props: TimestampDataProps): Dat export function makeData(props: IntervalDataProps): Data; export function makeData(props: DurationDataProps): Data; export function makeData(props: FixedSizeBinaryDataProps): Data; +export function makeData(props: BinaryViewDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: LargeBinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; +export function makeData(props: Utf8ViewDataProps): Data; export function makeData(props: ListDataProps): Data; +export function makeData(props: LargeListDataProps): Data; +export function makeData(props: ListViewDataProps): Data; +export function makeData(props: LargeListViewDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; +export function makeData(props: RunEndEncodedDataProps): Data; export function makeData(props: StructDataProps): Data; export function makeData(props: Map_DataProps): Data; export function makeData(props: SparseUnionDataProps): Data; diff --git a/src/enum.ts b/src/enum.ts index 73d95538..f68854e1 100644 --- a/src/enum.ts +++ b/src/enum.ts @@ -70,6 +70,12 @@ export enum Type { Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ + LargeList = 21, /** Large variable-length list as LargeList */ + RunEndEncoded = 22, /** Run-end encoded array with run_ends and values children */ + BinaryView = 23, /** Variable-length binary values backed by inline-or-referenced views */ + Utf8View = 24, /** Variable-length UTF8 string values backed by inline-or-referenced views */ + ListView = 25, /** Variable-length list values backed by entry views */ + LargeListView = 26, /** 
Large variable-length list values backed by entry views */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, diff --git a/src/fb/File.ts b/src/fb/File.ts new file mode 100644 index 00000000..12c6f822 --- /dev/null +++ b/src/fb/File.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +export { Binary } from './binary.js'; +export { BinaryView } from './binary-view.js'; +export { Block } from './block.js'; +export { Bool } from './bool.js'; +export { Buffer } from './buffer.js'; +export { Date } from './date.js'; +export { DateUnit } from './date-unit.js'; +export { Decimal } from './decimal.js'; +export { DictionaryEncoding } from './dictionary-encoding.js'; +export { DictionaryKind } from './dictionary-kind.js'; +export { Duration } from './duration.js'; +export { Endianness } from './endianness.js'; +export { Feature } from './feature.js'; +export { Field } from './field.js'; +export { FixedSizeBinary } from './fixed-size-binary.js'; +export { FixedSizeList } from './fixed-size-list.js'; +export { FloatingPoint } from './floating-point.js'; +export { Footer } from './footer.js'; +export { Int } from './int.js'; +export { Interval } from './interval.js'; +export { IntervalUnit } from './interval-unit.js'; +export { KeyValue } from './key-value.js'; +export { LargeBinary } from './large-binary.js'; +export { LargeList } from './large-list.js'; +export { LargeListView } from './large-list-view.js'; +export { LargeUtf8 } from './large-utf8.js'; +export { List } from './list.js'; +export { ListView } from './list-view.js'; +export { Map } from './map.js'; +export { MetadataVersion } from './metadata-version.js'; +export { Null } from './null.js'; +export { Precision } from './precision.js'; +export { RunEndEncoded } from './run-end-encoded.js'; +export { Schema } from 
'./schema.js'; +export { Struct_ } from './struct-.js'; +export { Time } from './time.js'; +export { TimeUnit } from './time-unit.js'; +export { Timestamp } from './timestamp.js'; +export { Type } from './type.js'; +export { Union } from './union.js'; +export { UnionMode } from './union-mode.js'; +export { Utf8 } from './utf8.js'; +export { Utf8View } from './utf8-view.js'; diff --git a/src/fb/binary-view.ts b/src/fb/binary-view.ts new file mode 100644 index 00000000..f91f910f --- /dev/null +++ b/src/fb/binary-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Binary, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. 
+ */ +export class BinaryView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):BinaryView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startBinaryView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + BinaryView.startBinaryView(builder); + return BinaryView.endBinaryView(builder); +} +} diff --git a/src/fb/large-list-view.ts b/src/fb/large-list-view.ts new file mode 100644 index 00000000..5785cd3f --- /dev/null +++ b/src/fb/large-list-view.ts @@ -0,0 +1,42 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Same as ListView, but with 64-bit offsets and sizes, allowing to represent + * extremely large data values. 
+ */ +export class LargeListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):LargeListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startLargeListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + LargeListView.startLargeListView(builder); + return LargeListView.endLargeListView(builder); +} +} diff --git a/src/fb/list-view.ts b/src/fb/list-view.ts new file mode 100644 index 00000000..f9afae01 --- /dev/null +++ b/src/fb/list-view.ts @@ -0,0 +1,43 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Represents the same logical types that List can, but contains offsets and + * sizes allowing for writes in any order and sharing of child values among + * list values. 
+ */ +export class ListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):ListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createListView(builder:flatbuffers.Builder):flatbuffers.Offset { + ListView.startListView(builder); + return ListView.endListView(builder); +} +} diff --git a/src/fb/message.ts b/src/fb/message.ts index d752b91b..d3518599 100644 --- a/src/fb/message.ts +++ b/src/fb/message.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { KeyValue } from './key-value.js'; diff --git a/src/fb/record-batch.ts b/src/fb/record-batch.ts index 00681999..e6f41d02 100644 --- a/src/fb/record-batch.ts +++ b/src/fb/record-batch.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { BodyCompression } from './body-compression.js'; @@ -78,8 +80,34 @@ compression(obj?:BodyCompression):BodyCompression|null { return offset ? 
(obj || new BodyCompression()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) : null; } +/** + * Some types such as Utf8View are represented using a variable number of buffers. + * For each such Field in the pre-ordered flattened logical schema, there will be + * an entry in variadicBufferCounts to indicate the number of variadic + * buffers which belong to that Field in the current RecordBatch. + * + * For example, the schema + * col1: Struct<alpha: Int32, beta: BinaryView, gamma: Float64> + * col2: Utf8View + * contains two Fields with variadic buffers so variadicBufferCounts will have + * two entries, the first counting the variadic buffers of `col1.beta` and the + * second counting `col2`'s. + * + * This field may be omitted if and only if the schema contains no Fields with + * a variable number of buffers, such as BinaryView and Utf8View. + */ +variadicBufferCounts(index: number):bigint|null { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? this.bb!.readInt64(this.bb!.__vector(this.bb_pos + offset) + index * 8) : BigInt(0); +} + +variadicBufferCountsLength():number { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? 
this.bb!.__vector_len(this.bb_pos + offset) : 0; +} + static startRecordBatch(builder:flatbuffers.Builder) { - builder.startObject(4); + builder.startObject(5); } static addLength(builder:flatbuffers.Builder, length:bigint) { @@ -106,6 +134,22 @@ static addCompression(builder:flatbuffers.Builder, compressionOffset:flatbuffers builder.addFieldOffset(3, compressionOffset, 0); } +static addVariadicBufferCounts(builder:flatbuffers.Builder, variadicBufferCountsOffset:flatbuffers.Offset) { + builder.addFieldOffset(4, variadicBufferCountsOffset, 0); +} + +static createVariadicBufferCountsVector(builder:flatbuffers.Builder, data:bigint[]):flatbuffers.Offset { + builder.startVector(8, data.length, 8); + for (let i = data.length - 1; i >= 0; i--) { + builder.addInt64(data[i]!); + } + return builder.endVector(); +} + +static startVariadicBufferCountsVector(builder:flatbuffers.Builder, numElems:number) { + builder.startVector(8, numElems, 8); +} + static endRecordBatch(builder:flatbuffers.Builder):flatbuffers.Offset { const offset = builder.endObject(); return offset; diff --git a/src/fb/schema.ts b/src/fb/schema.ts index 394883eb..daae447e 100644 --- a/src/fb/schema.ts +++ b/src/fb/schema.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { Endianness } from './endianness.js'; @@ -133,14 +135,6 @@ static endSchema(builder:flatbuffers.Builder):flatbuffers.Offset { return offset; } -static finishSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset); -} - -static finishSizePrefixedSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset, undefined, true); -} - static createSchema(builder:flatbuffers.Builder, endianness:Endianness, fieldsOffset:flatbuffers.Offset, 
customMetadataOffset:flatbuffers.Offset, featuresOffset:flatbuffers.Offset):flatbuffers.Offset { Schema.startSchema(builder); Schema.addEndianness(builder, endianness); diff --git a/src/fb/type.ts b/src/fb/type.ts index 8eb87042..8f913d01 100644 --- a/src/fb/type.ts +++ b/src/fb/type.ts @@ -1,6 +1,9 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import { Binary } from './binary.js'; +import { BinaryView } from './binary-view.js'; import { Bool } from './bool.js'; import { Date } from './date.js'; import { Decimal } from './decimal.js'; @@ -12,8 +15,10 @@ import { Int } from './int.js'; import { Interval } from './interval.js'; import { LargeBinary } from './large-binary.js'; import { LargeList } from './large-list.js'; +import { LargeListView } from './large-list-view.js'; import { LargeUtf8 } from './large-utf8.js'; import { List } from './list.js'; +import { ListView } from './list-view.js'; import { Map } from './map.js'; import { Null } from './null.js'; import { RunEndEncoded } from './run-end-encoded.js'; @@ -22,6 +27,7 @@ import { Time } from './time.js'; import { Timestamp } from './timestamp.js'; import { Union } from './union.js'; import { Utf8 } from './utf8.js'; +import { Utf8View } from './utf8-view.js'; /** @@ -52,15 +58,19 @@ export enum Type { LargeBinary = 19, LargeUtf8 = 20, LargeList = 21, - RunEndEncoded = 22 + RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26 } export function unionToType( type: Type, - accessor: (obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => 
Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { + accessor: (obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(new Null())! as Null; case 'Int': return accessor(new Int())! as Int; case 'FloatingPoint': return accessor(new FloatingPoint())! as FloatingPoint; @@ -83,17 +93,21 @@ export function unionToType( case 'LargeUtf8': return accessor(new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(new Utf8View())! as Utf8View; + case 'ListView': return accessor(new ListView())! as ListView; + case 'LargeListView': return accessor(new LargeListView())! 
as LargeListView; default: return null; } } export function unionListToType( - type: Type, - accessor: (index: number, obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null, + type: Type, + accessor: (index: number, obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null, index: number -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(index, new Null())! as Null; case 'Int': return accessor(index, new Int())! as Int; case 'FloatingPoint': return accessor(index, new FloatingPoint())! as FloatingPoint; @@ -116,6 +130,10 @@ export function unionListToType( case 'LargeUtf8': return accessor(index, new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(index, new LargeList())! 
as LargeList; case 'RunEndEncoded': return accessor(index, new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(index, new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(index, new Utf8View())! as Utf8View; + case 'ListView': return accessor(index, new ListView())! as ListView; + case 'LargeListView': return accessor(index, new LargeListView())! as LargeListView; default: return null; } } diff --git a/src/fb/utf8-view.ts b/src/fb/utf8-view.ts new file mode 100644 index 00000000..886a9df7 --- /dev/null +++ b/src/fb/utf8-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Utf8, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. 
+ */ +export class Utf8View { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):Utf8View { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startUtf8View(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + Utf8View.startUtf8View(builder); + return Utf8View.endUtf8View(builder); +} +} diff --git a/src/interfaces.ts b/src/interfaces.ts index 0645753b..eea88bd4 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -212,6 +212,7 @@ export type TypeToDataType = { [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.LargeBinary]: type.LargeBinary; + [Type.BinaryView]: type.BinaryView; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; [Type.DateDay]: type.DateDay; @@ -244,6 +245,7 @@ export type TypeToDataType = { [Type.Struct]: type.Struct; [Type.Dictionary]: type.Dictionary; [Type.FixedSizeList]: type.FixedSizeList; + [Type.Utf8View]: type.Utf8View; }[T]; /** @ignore */ @@ -268,6 +270,7 @@ type TypeToBuilder = { [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.LargeBinary]: LargeBinaryBuilder; + [Type.BinaryView]: Builder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; [Type.DateDay]: DateDayBuilder; @@ -300,6 +303,7 @@ type TypeToBuilder = { [Type.Struct]: StructBuilder; 
[Type.Dictionary]: DictionaryBuilder; [Type.FixedSizeList]: FixedSizeListBuilder; + [Type.Utf8View]: Builder; }[T]; /** @ignore */ @@ -324,6 +328,7 @@ type DataTypeToBuilder = { [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder : never; + [Type.BinaryView]: T extends type.BinaryView ? Builder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; [Type.DateDay]: T extends type.DateDay ? DateDayBuilder : never; @@ -356,4 +361,5 @@ type DataTypeToBuilder = { [Type.Struct]: T extends type.Struct ? StructBuilder : never; [Type.Dictionary]: T extends type.Dictionary ? DictionaryBuilder : never; [Type.FixedSizeList]: T extends type.FixedSizeList ? FixedSizeListBuilder : never; + [Type.Utf8View]: T extends type.Utf8View ? Builder : never; }[T['TType']]; diff --git a/src/ipc/message.ts b/src/ipc/message.ts index 3dc86252..40a65439 100644 --- a/src/ipc/message.ts +++ b/src/ipc/message.ts @@ -204,6 +204,8 @@ export class JSONMessageReader extends MessageReader { ...(column['TYPE_ID'] && [column['TYPE_ID']] || []), ...(column['OFFSET'] && [column['OFFSET']] || []), ...(column['DATA'] && [column['DATA']] || []), + ...(column['VIEWS'] && [column['VIEWS']] || []), + ...(column['VARIADIC_DATA_BUFFERS'] || []), ...flattenDataSources(column['children']) ], [] as any[][]); } diff --git a/src/ipc/metadata/json.ts b/src/ipc/metadata/json.ts index 15f87189..8aed54ec 100644 --- a/src/ipc/metadata/json.ts +++ b/src/ipc/metadata/json.ts @@ -18,7 +18,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, 
Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -41,7 +41,8 @@ export function recordBatchFromJSON(b: any) { b['count'], fieldNodesFromJSON(b['columns']), buffersFromJSON(b['columns']), - null + null, + variadicBufferCountsFromJSON(b['columns']) ); } @@ -83,6 +84,13 @@ function buffersFromJSON(xs: any[], buffers: BufferRegion[] = []): BufferRegion[ column['TYPE_ID'] && buffers.push(new BufferRegion(buffers.length, column['TYPE_ID'].length)); column['OFFSET'] && buffers.push(new BufferRegion(buffers.length, column['OFFSET'].length)); column['DATA'] && buffers.push(new BufferRegion(buffers.length, column['DATA'].length)); + column['VIEWS'] && buffers.push(new BufferRegion(buffers.length, column['VIEWS'].length)); + // Handle variadic buffers for view types (BinaryView, Utf8View) + if (column['VARIADIC_DATA_BUFFERS']) { + for (const buf of column['VARIADIC_DATA_BUFFERS']) { + buffers.push(new BufferRegion(buffers.length, buf.length)); + } + } buffers = buffersFromJSON(column['children'], buffers); } return buffers; @@ -93,6 +101,15 @@ function nullCountFromJSON(validity: number[]) { return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); } +/** @ignore */ +function variadicBufferCountsFromJSON(xs: any[]): number[] { + return (xs || []).reduce((counts, column: any) => [ + ...counts, + ...(column['VARIADIC_DATA_BUFFERS'] ? 
[column['VARIADIC_DATA_BUFFERS'].length] : []), + ...variadicBufferCountsFromJSON(column['children']) + ], [] as number[]); +} + /** @ignore */ export function fieldFromJSON(_field: any, dictionaries?: Map) { @@ -149,8 +166,10 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'largebinary': return new LargeBinary(); + case 'binaryview': return new BinaryView(); case 'utf8': return new Utf8(); case 'largeutf8': return new LargeUtf8(); + case 'utf8view': return new Utf8View(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/src/ipc/metadata/message.ts b/src/ipc/metadata/message.ts index 17e8897b..b41ec4a5 100644 --- a/src/ipc/metadata/message.ts +++ b/src/ipc/metadata/message.ts @@ -57,7 +57,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -156,20 +156,24 @@ export class RecordBatch { protected _nodes: FieldNode[]; protected _buffers: BufferRegion[]; protected _compression: BodyCompression | null; + protected _variadicBufferCounts: number[]; public get nodes() { return this._nodes; } public get length() { return this._length; } public get buffers() { return this._buffers; } public get compression() { return this._compression; } + public get variadicBufferCounts() { return this._variadicBufferCounts; } constructor( length: bigint | number, nodes: FieldNode[], buffers: BufferRegion[], - compression: BodyCompression | null + compression: BodyCompression | null, + variadicBufferCounts: number[] = [] ) { this._nodes = nodes; 
this._buffers = buffers; this._length = bigIntToNumber(length); this._compression = compression; + this._variadicBufferCounts = variadicBufferCounts; } } @@ -334,7 +338,8 @@ function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V5) { batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version), - decodeBodyCompression(batch.compression()) + decodeBodyCompression(batch.compression()), + decodeVariadicBufferCounts(batch) ); return recordBatch; } @@ -382,6 +387,16 @@ function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) { return bufferRegions; } +/** @ignore */ +function decodeVariadicBufferCounts(batch: _RecordBatch) { + const counts = [] as number[]; + const length = Math.trunc(batch.variadicBufferCountsLength()); + for (let i = 0; i < length; ++i) { + counts.push(bigIntToNumber(batch.variadicBufferCounts(i)!)); + } + return counts; +} + /** @ignore */ function decodeSchemaFields(schema: _Schema, dictionaries?: Map) { const fields = [] as Field[]; @@ -468,8 +483,10 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['LargeBinary']: return new LargeBinary(); + case Type['BinaryView']: return new BinaryView(); case Type['Utf8']: return new Utf8(); case Type['LargeUtf8']: return new LargeUtf8(); + case Type['Utf8View']: return new Utf8View(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); case Type['Struct_']: return new Struct(children || []); @@ -614,6 +631,7 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { const nodes = recordBatch.nodes || []; const buffers = recordBatch.buffers || []; + const variadicBufferCounts = recordBatch.variadicBufferCounts || []; _RecordBatch.startNodesVector(b, nodes.length); for (const n of nodes.slice().reverse()) FieldNode.encode(b, n); @@ -630,6 +648,11 @@ function encodeRecordBatch(b: Builder, recordBatch: 
RecordBatch) { bodyCompressionOffset = encodeBodyCompression(b, recordBatch.compression); } + let variadicBufferCountsOffset = -1; + if (variadicBufferCounts.length > 0) { + variadicBufferCountsOffset = _RecordBatch.createVariadicBufferCountsVector(b, variadicBufferCounts.map(BigInt)); + } + _RecordBatch.startRecordBatch(b); _RecordBatch.addLength(b, BigInt(recordBatch.length)); _RecordBatch.addNodes(b, nodesVectorOffset); @@ -637,6 +660,9 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { if (recordBatch.compression !== null && bodyCompressionOffset) { _RecordBatch.addCompression(b, bodyCompressionOffset); } + if (variadicBufferCountsOffset !== -1) { + _RecordBatch.addVariadicBufferCounts(b, variadicBufferCountsOffset); + } return _RecordBatch.endRecordBatch(b); } diff --git a/src/ipc/reader.ts b/src/ipc/reader.ts index e36eeb52..af49f372 100644 --- a/src/ipc/reader.ts +++ b/src/ipc/reader.ts @@ -397,7 +397,8 @@ abstract class RecordBatchReaderImpl implements RecordB header.data.length, header.data.nodes, buffers, - null + null, + header.data.variadicBufferCounts ), id, isDelta) } else { throw new Error('Dictionary batch is compressed but codec not found'); @@ -412,11 +413,11 @@ abstract class RecordBatchReaderImpl implements RecordB } protected _loadVectors(header: metadata.RecordBatch, body: Uint8Array, types: (Field | DataType)[]) { - return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } protected _loadCompressedVectors(header: metadata.RecordBatch, body: Uint8Array[], types: (Field | DataType)[]) { - return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new CompressedVectorLoader(body, header.nodes, header.buffers, 
this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } private _decompressBuffers(header: metadata.RecordBatch, body: Uint8Array, codec: Codec): { decommpressedBody: Uint8Array[]; buffers: metadata.BufferRegion[] } { @@ -757,7 +758,7 @@ class RecordBatchJSONReaderImpl extends RecordBatchStre super(source, dictionaries); } protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { - return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } } diff --git a/src/ipc/writer.ts b/src/ipc/writer.ts index 17c8f0b6..0b13fdfc 100644 --- a/src/ipc/writer.ts +++ b/src/ipc/writer.ts @@ -274,8 +274,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeRecordBatch(batch: RecordBatch) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(batch); - const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(batch); + const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression, variadicBufferCounts); const message = Message.from(recordBatch, byteLength); return this ._writeDictionaries(batch) @@ -284,11 +284,11 @@ export class RecordBatchWriter extends ReadableInterop< } protected _assembleRecordBatch(batch: RecordBatch | Vector) { - let { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(batch); + let { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = VectorAssembler.assemble(batch); if (this._compression != null) { ({ byteLength, bufferRegions, buffers } = this._compressBodyBuffers(buffers)); } - 
return { byteLength, nodes, bufferRegions, buffers }; + return { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts }; } protected _compressBodyBuffers(buffers: ArrayBufferView[]) { @@ -337,8 +337,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(new Vector([dictionary])); - const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(new Vector([dictionary])); + const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression, variadicBufferCounts); const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta); const message = Message.from(dictionaryBatch, byteLength); return this diff --git a/src/type.ts b/src/type.ts index ea5e24fa..73b78cbe 100644 --- a/src/type.ts +++ b/src/type.ts @@ -58,8 +58,10 @@ export abstract class DataType { })(Binary.prototype); } +/** @ignore */ +export interface BinaryView extends DataType { + TArray: Uint8Array; + TValue: Uint8Array; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class BinaryView extends DataType { + public static readonly ELEMENT_WIDTH = 16; + public static readonly INLINE_CAPACITY = 12; + public static readonly LENGTH_OFFSET = 0; + public static readonly INLINE_OFFSET = 4; + public static readonly BUFFER_INDEX_OFFSET = 8; + public static readonly BUFFER_OFFSET_OFFSET = 12; + constructor() { + super(Type.BinaryView); + } + public toString() { return `BinaryView`; } + protected static [Symbol.toStringTag] = ((proto: BinaryView) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'BinaryView'; + })(BinaryView.prototype); +} + /** @ignore */ export interface LargeBinary extends DataType { 
TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -269,6 +299,7 @@ export class LargeBinary extends DataType { })(LargeBinary.prototype); } +/** @ignore */ /** @ignore */ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ @@ -283,6 +314,26 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface Utf8View extends DataType { + TArray: Uint8Array; + TValue: string; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class Utf8View extends DataType { + public static readonly ELEMENT_WIDTH = BinaryView.ELEMENT_WIDTH; + public static readonly INLINE_CAPACITY = BinaryView.INLINE_CAPACITY; + constructor() { + super(Type.Utf8View); + } + public toString() { return `Utf8View`; } + protected static [Symbol.toStringTag] = ((proto: Utf8View) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8View'; + })(Utf8View.prototype); +} + /** @ignore */ export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -298,6 +349,7 @@ export class LargeUtf8 extends DataType { })(LargeUtf8.prototype); } +/** @ignore */ /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -546,6 +598,94 @@ export class List extends DataType extends DataType { + TArray: Array; + TValue: Vector; + TOffsetArray: BigInt64Array; + OffsetArrayType: BigIntArrayConstructor; +} + +/** @ignore */ +export class LargeList extends DataType { + constructor(child: Field) { + super(Type.LargeList); + this.children = [child]; + } + public declare readonly children: 
Field[]; + public toString() { return `LargeList<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: LargeList) => { + (proto).children = null; + (proto).OffsetArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'LargeList'; + })(LargeList.prototype); +} + +/** @ignore */ +export class ListView extends DataType { + constructor(child: Field) { + super(Type.ListView); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `ListView<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: ListView) => { + (proto).children = null; + return proto[Symbol.toStringTag] = 'ListView'; + })(ListView.prototype); +} + +/** @ignore */ +export class LargeListView extends DataType { + constructor(child: Field) { + super(Type.LargeListView); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `LargeListView<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: LargeListView) => { + (proto).children = null; + return proto[Symbol.toStringTag] = 'LargeListView'; + })(LargeListView.prototype); +} + +/** @ignore */ +export type TRunEnds = Int16 | Int32 | Int64; + +/** @ignore */ +export interface RunEndEncoded extends DataType { + TArray: TValue['TArray']; + TValue: 
TValue['TValue']; +} + +/** @ignore */ +export class RunEndEncoded extends DataType { + constructor(runEnds: Field, values: Field) { + super(Type.RunEndEncoded); + this.children = [runEnds, values]; + } + public declare readonly children: [Field, Field]; + public toString() { return `RunEndEncoded<${this.runEndsType}, ${this.valueType}>`; } + public get runEndsType(): TRunEnds { return this.children[0].type as TRunEnds; } + public get valueType(): TValue { return this.children[1].type as TValue; } + public get runEndsField(): Field { return this.children[0] as Field; } + public get valueField(): Field { return this.children[1] as Field; } + public get ArrayType(): TValue['ArrayType'] { return this.valueType.ArrayType; } + protected static [Symbol.toStringTag] = ((proto: RunEndEncoded) => { + (proto).children = null; + return proto[Symbol.toStringTag] = 'RunEndEncoded'; + })(RunEndEncoded.prototype); +} + /** @ignore */ export interface Struct extends DataType { TArray: Array>; @@ -759,6 +899,8 @@ export function strideForType(type: DataType) { } // case Type.Int: return 1 + +((t as Int_).bitWidth > 32); // case Type.Time: return 1 + +((t as Time_).bitWidth > 32); + case Type.BinaryView: + case Type.Utf8View: return 16; case Type.FixedSizeList: return (t as FixedSizeList).listSize; case Type.FixedSizeBinary: return (t as FixedSizeBinary).byteWidth; default: return 1; diff --git a/src/visitor.ts b/src/visitor.ts index 977e0a4e..752e6352 100644 --- a/src/visitor.ts +++ b/src/visitor.ts @@ -37,14 +37,17 @@ export abstract class Visitor { public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } + public visitUtf8View(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitLargeBinary(_node: any, ..._args: any[]): any { return null; } + public 
visitBinaryView(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } public visitTimestamp(_node: any, ..._args: any[]): any { return null; } public visitTime(_node: any, ..._args: any[]): any { return null; } public visitDecimal(_node: any, ..._args: any[]): any { return null; } public visitList(_node: any, ..._args: any[]): any { return null; } + public visitLargeList(_node: any, ..._args: any[]): any { return null; } public visitStruct(_node: any, ..._args: any[]): any { return null; } public visitUnion(_node: any, ..._args: any[]): any { return null; } public visitDictionary(_node: any, ..._args: any[]): any { return null; } @@ -52,6 +55,9 @@ export abstract class Visitor { public visitDuration(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeList(_node: any, ..._args: any[]): any { return null; } public visitMap(_node: any, ..._args: any[]): any { return null; } + public visitListView(_node: any, ..._args: any[]): any { return null; } + public visitLargeListView(_node: any, ..._args: any[]): any { return null; } + public visitRunEndEncoded(_node: any, ..._args: any[]): any { return null; } } /** @ignore */ @@ -92,8 +98,10 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; + case Type.Utf8View: fn = visitor.visitUtf8View || visitor.visitUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.LargeBinary: fn = visitor.visitLargeBinary; break; + case Type.BinaryView: fn = visitor.visitBinaryView || visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; case Type.DateDay: fn = 
visitor.visitDateDay || visitor.visitDate; break; @@ -110,6 +118,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.TimeNanosecond: fn = visitor.visitTimeNanosecond || visitor.visitTime; break; case Type.Decimal: fn = visitor.visitDecimal; break; case Type.List: fn = visitor.visitList; break; + case Type.LargeList: fn = visitor.visitLargeList; break; case Type.Struct: fn = visitor.visitStruct; break; case Type.Union: fn = visitor.visitUnion; break; case Type.DenseUnion: fn = visitor.visitDenseUnion || visitor.visitUnion; break; @@ -126,6 +135,9 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.DurationNanosecond: fn = visitor.visitDurationNanosecond || visitor.visitDuration; break; case Type.FixedSizeList: fn = visitor.visitFixedSizeList; break; case Type.Map: fn = visitor.visitMap; break; + case Type.ListView: fn = visitor.visitListView; break; + case Type.LargeListView: fn = visitor.visitLargeListView; break; + case Type.RunEndEncoded: fn = visitor.visitRunEndEncoded; break; } if (typeof fn === 'function') return fn; if (!throwIfNotFound) return () => null; @@ -157,8 +169,10 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.LargeBinary: return Type.LargeBinary; + case Type.BinaryView: return Type.BinaryView; case Type.Utf8: return Type.Utf8; case Type.LargeUtf8: return Type.LargeUtf8; + case Type.Utf8View: return Type.Utf8View; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -205,6 +219,7 @@ function inferDType(type: T): Type { return Type.Duration; case Type.Map: return Type.Map; case Type.List: return Type.List; + case Type.LargeList: return Type.LargeList; case Type.Struct: return Type.Struct; case Type.Union: switch ((type as any as Union).mode) { @@ -216,6 +231,9 @@ function inferDType(type: T): Type { case Type.FixedSizeBinary: return Type.FixedSizeBinary; case 
Type.FixedSizeList: return Type.FixedSizeList; case Type.Dictionary: return Type.Dictionary; + case Type.ListView: return Type.ListView; + case Type.LargeListView: return Type.LargeListView; + case Type.RunEndEncoded: return Type.RunEndEncoded; } throw new Error(`Unrecognized type '${Type[type.typeId]}'`); } @@ -256,6 +274,7 @@ export interface Visitor { visitTimeNanosecond?(node: any, ...args: any[]): any; visitDecimal(node: any, ...args: any[]): any; visitList(node: any, ...args: any[]): any; + visitLargeList(node: any, ...args: any[]): any; visitStruct(node: any, ...args: any[]): any; visitUnion(node: any, ...args: any[]): any; visitDenseUnion?(node: any, ...args: any[]): any; @@ -272,6 +291,9 @@ export interface Visitor { visitDurationNanosecond(node: any, ...args: any[]): any; visitFixedSizeList(node: any, ...args: any[]): any; visitMap(node: any, ...args: any[]): any; + visitListView(node: any, ...args: any[]): any; + visitLargeListView(node: any, ...args: any[]): any; + visitRunEndEncoded(node: any, ...args: any[]): any; } // Add these here so they're picked up by the externs creator diff --git a/src/visitor/builderctor.ts b/src/visitor/builderctor.ts index 791576b0..c7c1f289 100644 --- a/src/visitor/builderctor.ts +++ b/src/visitor/builderctor.ts @@ -34,6 +34,8 @@ import { IntervalBuilder, IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder, I import { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from '../builder/duration.js'; import { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from '../builder/int.js'; import { ListBuilder } from '../builder/list.js'; +import { LargeListBuilder } from '../builder/largelist.js'; +import { ListViewBuilder, LargeListViewBuilder } from '../builder/listview.js'; import { MapBuilder } from '../builder/map.js'; import { NullBuilder } from '../builder/null.js'; import { 
StructBuilder } from '../builder/struct.js'; @@ -42,6 +44,8 @@ import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecond import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; import { LargeUtf8Builder } from '../builder/largeutf8.js'; +import { BinaryViewBuilder } from '../builder/binaryview.js'; +import { Utf8ViewBuilder } from '../builder/utf8view.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -88,6 +92,9 @@ export class GetBuilderCtor extends Visitor { public visitTimeNanosecond() { return TimeNanosecondBuilder; } public visitDecimal() { return DecimalBuilder; } public visitList() { return ListBuilder; } + public visitLargeList() { return LargeListBuilder; } + public visitListView() { return ListViewBuilder; } + public visitLargeListView() { return LargeListViewBuilder; } public visitStruct() { return StructBuilder; } public visitUnion() { return UnionBuilder; } public visitDenseUnion() { return DenseUnionBuilder; } @@ -104,6 +111,8 @@ export class GetBuilderCtor extends Visitor { public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } + public visitBinaryView() { return BinaryViewBuilder; } + public visitUtf8View() { return Utf8ViewBuilder; } } /** @ignore */ diff --git a/src/visitor/get.ts b/src/visitor/get.ts index a5502dd3..6b96a69f 100644 --- a/src/visitor/get.ts +++ b/src/visitor/get.ts @@ -28,7 +28,7 @@ import { uint16ToFloat64 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, 
ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -38,6 +38,7 @@ import { Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, IntervalMonthDayNano, + RunEndEncoded, } from '../type.js'; /** @ignore */ @@ -63,8 +64,10 @@ export interface GetVisitor extends Visitor { visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; visitLargeUtf8(data: Data, index: number): T['TValue'] | null; + visitUtf8View(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitLargeBinary(data: Data, index: number): T['TValue'] | null; + visitBinaryView(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; visitDateDay(data: Data, index: number): T['TValue'] | null; @@ -81,6 +84,10 @@ export interface GetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number): T['TValue'] | null; visitDecimal(data: Data, index: number): T['TValue'] | null; visitList(data: Data, index: number): T['TValue'] | null; + visitLargeList(data: Data, index: number): T['TValue'] | null; + visitListView(data: Data, index: number): T['TValue'] | null; + visitLargeListView(data: Data, index: number): T['TValue'] | null; + visitRunEndEncoded(data: Data, index: number): T['TValue'] | null; visitStruct(data: Data, index: number): T['TValue'] | null; visitUnion(data: Data, index: number): T['TValue'] | null; visitDenseUnion(data: Data, index: number): T['TValue'] | null; @@ -109,6 +116,9 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */const epochDaysToMs = (data: Int32Array, index: number) => 86400000 * data[index]; +const BINARY_VIEW_SIZE = 16; 
+const BINARY_VIEW_INLINE_CAPACITY = 12; + /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ @@ -149,10 +159,43 @@ const getFixedSizeBinary = ({ stride, values }: Data< /** @ignore */ const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ +const getBinaryViewBytes = (data: Data, index: number): Uint8Array => { + const values = data.values as Uint8Array; + if (!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const start = (data.offset + index) * BINARY_VIEW_SIZE; + const baseOffset = values.byteOffset + start; + const view = new DataView(values.buffer, baseOffset, BINARY_VIEW_SIZE); + const size = view.getInt32(0, true); + if (size <= 0) { + return new Uint8Array(0); + } + if (size <= BINARY_VIEW_INLINE_CAPACITY) { + return new Uint8Array(values.buffer, baseOffset + 4, size); + } + const bufferIndex = view.getInt32(8, true); + const offset = view.getInt32(12, true); + const variadicBuffer = data.variadicBuffers?.[bufferIndex]; + if (!variadicBuffer) { + throw new Error(`BinaryView variadic buffer ${bufferIndex} is missing`); + } + return variadicBuffer.subarray(offset, offset + size); +}; +/** @ignore */ +const getBinaryViewValue = (data: Data, index: number): T['TValue'] => { + return getBinaryViewBytes(data, index) as T['TValue']; +}; +/** @ignore */ const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? 
decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getUtf8ViewValue = (data: Data, index: number): T['TValue'] => { + const bytes = getBinaryViewBytes(data, index); + return decodeUtf8(bytes); +}; /* istanbul ignore next */ /** @ignore */ @@ -222,6 +265,60 @@ const getList = (data: Data, index: number): T['TValue'] => { return new Vector([slice]) as T['TValue']; }; +/** @ignore */ +const getLargeList = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, stride, children } = data; + const begin = bigIntToNumber(valueOffsets[index * stride]); + const end = bigIntToNumber(valueOffsets[index * stride + 1]); + const child: Data = children[0]; + const slice = child.slice(begin, end - begin); + return new Vector([slice]) as T['TValue']; +}; + +/** @ignore */ +const getListView = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, values: sizes, children } = data; + const offset = bigIntToNumber(valueOffsets[index]); + const size = bigIntToNumber(sizes[index]); + const child: Data = children[0]; + const slice = child.slice(offset, size); + return new Vector([slice]) as T['TValue']; +}; + +/** @ignore */ +const getLargeListView = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, values: sizes, children } = data; + const offset = bigIntToNumber(valueOffsets[index]); + const size = bigIntToNumber(sizes[index]); + const child: Data = children[0]; + const slice = child.slice(offset, size); + return new Vector([slice]) as T['TValue']; +}; + +/** @ignore */ +const getRunEndEncoded = (data: Data, index: number): T['TValue'] => { + const { children } = data; + const runEnds = children[0] as Data; + const values = children[1] as Data; + const getRunEnd = instance.getVisitFn(runEnds); + const get = instance.getVisitFn(values); + + // Binary search to find the run that contains this index + let low = 0; + let high = runEnds.length - 1; + while (low < high) { + const mid = (low + high) >>> 1; + const runEnd = 
bigIntToNumber(getRunEnd(runEnds, mid) as number | bigint); + if (index < runEnd) { + high = mid; + } else { + low = mid + 1; + } + } + + return get(values, low); +}; + /** @ignore */ const getMap = (data: Data, index: number): T['TValue'] => { const { valueOffsets, children } = data; @@ -332,8 +429,10 @@ GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitUtf8View = wrapGet(getUtf8ViewValue); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitLargeBinary = wrapGet(getBinary); +GetVisitor.prototype.visitBinaryView = wrapGet(getBinaryViewValue); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); GetVisitor.prototype.visitDateDay = wrapGet(getDateDay); @@ -350,6 +449,10 @@ GetVisitor.prototype.visitTimeMicrosecond = wrapGet(getTimeMicrosecond); GetVisitor.prototype.visitTimeNanosecond = wrapGet(getTimeNanosecond); GetVisitor.prototype.visitDecimal = wrapGet(getDecimal); GetVisitor.prototype.visitList = wrapGet(getList); +GetVisitor.prototype.visitLargeList = wrapGet(getLargeList); +GetVisitor.prototype.visitListView = wrapGet(getListView); +GetVisitor.prototype.visitLargeListView = wrapGet(getLargeListView); +GetVisitor.prototype.visitRunEndEncoded = wrapGet(getRunEndEncoded); GetVisitor.prototype.visitStruct = wrapGet(getStruct); GetVisitor.prototype.visitUnion = wrapGet(getUnion); GetVisitor.prototype.visitDenseUnion = wrapGet(getDenseUnion); diff --git a/src/visitor/indexof.ts b/src/visitor/indexof.ts index 3a4d1171..e0f07894 100644 --- a/src/visitor/indexof.ts +++ b/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, 
LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -34,6 +34,7 @@ import { Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, IntervalMonthDayNano, + RunEndEncoded, } from '../type.js'; /** @ignore */ @@ -59,8 +60,10 @@ export interface IndexOfVisitor extends Visitor { visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitUtf8View(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeBinary(data: Data, value: T['TValue'] | null, index?: number): number; + visitBinaryView(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; visitDateDay(data: Data, value: T['TValue'] | null, index?: number): number; @@ -77,6 +80,7 @@ export interface IndexOfVisitor extends Visitor { visitTimeNanosecond(data: Data, value: T['TValue'] | null, index?: number): number; visitDecimal(data: Data, value: T['TValue'] | null, index?: number): number; visitList(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeList(data: Data, value: T['TValue'] | null, index?: number): number; visitStruct(data: Data, value: T['TValue'] | null, index?: number): number; visitUnion(data: Data, value: T['TValue'] | null, index?: number): number; 
visitDenseUnion(data: Data, value: T['TValue'] | null, index?: number): number; @@ -93,6 +97,7 @@ export interface IndexOfVisitor extends Visitor { visitDurationNanosecond(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeList(data: Data, value: T['TValue'] | null, index?: number): number; visitMap(data: Data, value: T['TValue'] | null, index?: number): number; + visitRunEndEncoded(data: Data, value: T['TValue'] | null, index?: number): number; } /** @ignore */ @@ -177,8 +182,10 @@ IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitUtf8View = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitLargeBinary = indexOfValue; +IndexOfVisitor.prototype.visitBinaryView = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; IndexOfVisitor.prototype.visitDateDay = indexOfValue; @@ -195,6 +202,7 @@ IndexOfVisitor.prototype.visitTimeMicrosecond = indexOfValue; IndexOfVisitor.prototype.visitTimeNanosecond = indexOfValue; IndexOfVisitor.prototype.visitDecimal = indexOfValue; IndexOfVisitor.prototype.visitList = indexOfValue; +IndexOfVisitor.prototype.visitLargeList = indexOfValue; IndexOfVisitor.prototype.visitStruct = indexOfValue; IndexOfVisitor.prototype.visitUnion = indexOfValue; IndexOfVisitor.prototype.visitDenseUnion = indexOfUnion; @@ -211,6 +219,7 @@ IndexOfVisitor.prototype.visitDurationMicrosecond = indexOfValue; IndexOfVisitor.prototype.visitDurationNanosecond = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeList = indexOfValue; IndexOfVisitor.prototype.visitMap = indexOfValue; +IndexOfVisitor.prototype.visitRunEndEncoded = indexOfValue; /** @ignore */ export const instance = new IndexOfVisitor(); diff --git 
a/src/visitor/iterator.ts b/src/visitor/iterator.ts index 9f2844b3..5111dd5c 100644 --- a/src/visitor/iterator.ts +++ b/src/visitor/iterator.ts @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. +import { Data } from '../data.js'; import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; +import { instance as getVisitor } from './get.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -31,6 +33,7 @@ import { Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, IntervalMonthDayNano, + RunEndEncoded, } from '../type.js'; import { ChunkedIterator } from '../util/chunk.js'; @@ -57,8 +60,10 @@ export interface IteratorVisitor extends Visitor { visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; visitLargeUtf8(vector: Vector): IterableIterator; + visitUtf8View(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitLargeBinary(vector: Vector): IterableIterator; + visitBinaryView(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; visitDateDay(vector: Vector): IterableIterator; @@ -75,6 +80,7 @@ export interface IteratorVisitor extends Visitor { visitTimeNanosecond(vector: Vector): IterableIterator; visitDecimal(vector: Vector): IterableIterator; visitList(vector: Vector): IterableIterator; + visitLargeList(vector: 
Vector): IterableIterator; visitStruct(vector: Vector): IterableIterator; visitUnion(vector: Vector): IterableIterator; visitDenseUnion(vector: Vector): IterableIterator; @@ -91,6 +97,7 @@ export interface IteratorVisitor extends Visitor { visitDurationNanosecond(vector: Vector): IterableIterator; visitFixedSizeList(vector: Vector): IterableIterator; visitMap(vector: Vector): IterableIterator; + visitRunEndEncoded(vector: Vector): IterableIterator; } /** @ignore */ @@ -126,6 +133,19 @@ function vectorIterator(vector: Vector): IterableIterator }); } +/** @ignore */ +function runEndEncodedIterator(vector: Vector): IterableIterator { + // Use specialized iterator with O(1) amortized sequential access + let offset = 0; + return new ChunkedIterator(vector.data.length, (chunkIndex) => { + const data = vector.data[chunkIndex]; + const length = data.length; + const inner = vector.slice(offset, offset + length); + offset += length; + return new RunEndEncodedIterator(inner); + }); +} + /** @ignore */ class VectorIterator implements IterableIterator { private index = 0; @@ -147,6 +167,89 @@ class VectorIterator implements IterableIterator implements IterableIterator { + private index = 0; + private lastPhysicalIndex = 0; + private readonly runEnds: Data; + private readonly values: Data; + private readonly getRunEnd: (data: Data, index: number) => T['runEndsType']['TValue'] | null; + private readonly getValue: (data: Data, index: number) => T['TValue'] | null; + + constructor(private vector: Vector) { + const data = vector.data[0]; + this.runEnds = data.children[0] as Data; + this.values = data.children[1] as Data; + this.getRunEnd = getVisitor.getVisitFn(this.runEnds); + this.getValue = getVisitor.getVisitFn(this.values); + } + + next(): IteratorResult { + if (this.index < this.vector.length) { + const value = this.getValueAtIndex(this.index++); + return { value }; + } + return { done: true, value: null }; + } + + private getValueAtIndex(logicalIndex: number): T['TValue'] { + 
const physicalIndex = this.findPhysicalIndex(logicalIndex); + return this.getValue(this.values, physicalIndex); + } + + private findPhysicalIndex(i: number): number { + const runEndsLength = this.runEnds.length; + const offset = this.vector.data[0].offset; + + // Fast path: check if the cached physical index is still valid + const cachedRunEnd = Number(this.getRunEnd(this.runEnds, this.lastPhysicalIndex)); + if (offset + i < cachedRunEnd) { + // Cached value is an upper bound, but is it the least upper bound? + if (this.lastPhysicalIndex === 0) { + return this.lastPhysicalIndex; + } + const prevRunEnd = Number(this.getRunEnd(this.runEnds, this.lastPhysicalIndex - 1)); + if (offset + i >= prevRunEnd) { + // Cache hit - same run as before + return this.lastPhysicalIndex; + } + // Search in the range before the cached index + this.lastPhysicalIndex = this.binarySearchRange(0, this.lastPhysicalIndex, i, offset); + return this.lastPhysicalIndex; + } + + // Cached index is not an upper bound, search after it + const minPhysicalIndex = this.lastPhysicalIndex + 1; + const relativeIndex = this.binarySearchRange( + minPhysicalIndex, + runEndsLength, + i, + offset + ); + this.lastPhysicalIndex = relativeIndex; + return this.lastPhysicalIndex; + } + + private binarySearchRange(start: number, end: number, i: number, offset: number): number { + let low = start; + let high = end - 1; + while (low < high) { + const mid = (low + high) >>> 1; + const runEnd = Number(this.getRunEnd(this.runEnds, mid)); + if (offset + i < runEnd) { + high = mid; + } else { + low = mid + 1; + } + } + return low; + } + + [Symbol.iterator]() { + return this; + } +} + IteratorVisitor.prototype.visitNull = vectorIterator; IteratorVisitor.prototype.visitBool = vectorIterator; IteratorVisitor.prototype.visitInt = vectorIterator; @@ -164,8 +267,10 @@ IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = 
vectorIterator; IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; +IteratorVisitor.prototype.visitUtf8View = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitLargeBinary = vectorIterator; +IteratorVisitor.prototype.visitBinaryView = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; IteratorVisitor.prototype.visitDateDay = vectorIterator; @@ -182,6 +287,7 @@ IteratorVisitor.prototype.visitTimeMicrosecond = vectorIterator; IteratorVisitor.prototype.visitTimeNanosecond = vectorIterator; IteratorVisitor.prototype.visitDecimal = vectorIterator; IteratorVisitor.prototype.visitList = vectorIterator; +IteratorVisitor.prototype.visitLargeList = vectorIterator; IteratorVisitor.prototype.visitStruct = vectorIterator; IteratorVisitor.prototype.visitUnion = vectorIterator; IteratorVisitor.prototype.visitDenseUnion = vectorIterator; @@ -198,6 +304,7 @@ IteratorVisitor.prototype.visitDurationMicrosecond = vectorIterator; IteratorVisitor.prototype.visitDurationNanosecond = vectorIterator; IteratorVisitor.prototype.visitFixedSizeList = vectorIterator; IteratorVisitor.prototype.visitMap = vectorIterator; +IteratorVisitor.prototype.visitRunEndEncoded = runEndEncodedIterator; /** @ignore */ export const instance = new IteratorVisitor(); diff --git a/src/visitor/jsontypeassembler.ts b/src/visitor/jsontypeassembler.ts index 823b1dea..2f87b260 100644 --- a/src/visitor/jsontypeassembler.ts +++ b/src/visitor/jsontypeassembler.ts @@ -45,6 +45,9 @@ export class JSONTypeAssembler extends Visitor { public visitLargeBinary({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitBinaryView({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitBool({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } @@ -54,6 +57,9 @@ export class JSONTypeAssembler extends Visitor { public 
visitLargeUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitUtf8View({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } @@ -75,6 +81,9 @@ export class JSONTypeAssembler extends Visitor { public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeList({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitStruct({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } @@ -97,4 +106,7 @@ export class JSONTypeAssembler extends Visitor { public visitMap({ typeId, keysSorted }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'keysSorted': keysSorted }; } + public visitRunEndEncoded({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } } diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index 6841b39d..ab36d325 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -28,7 +28,8 @@ import { toIntervalDayTimeObjects, toIntervalMonthDayNanoObjects } from '../util import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, IntArray, + RunEndEncoded, } from '../type.js'; /** @ignore */ @@ -46,18 +47,22 @@ export interface JSONVectorAssembler extends Visitor { visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitLargeBinary(data: Data): { DATA: string[]; OFFSET: string[] }; + 
visitBinaryView(data: Data): { VIEWS: any[]; VARIADIC_DATA_BUFFERS: string[] }; + visitUtf8View(data: Data): { VIEWS: any[]; VARIADIC_DATA_BUFFERS: string[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; visitTimestamp(data: Data): { DATA: string[] }; visitTime(data: Data): { DATA: number[] }; visitDecimal(data: Data): { DATA: string[] }; visitList(data: Data): { children: any[]; OFFSET: number[] }; + visitLargeList(data: Data): { children: any[]; OFFSET: string[] }; visitStruct(data: Data): { children: any[] }; visitUnion(data: Data): { children: any[]; TYPE_ID: number[] }; visitInterval(data: Data): { DATA: number[] }; visitDuration(data: Data): { DATA: string[] }; visitFixedSizeList(data: Data): { children: any[] }; visitMap(data: Data): { children: any[] }; + visitRunEndEncoded(data: Data): { children: any[] }; } /** @ignore */ @@ -112,6 +117,12 @@ export class JSONVectorAssembler extends Visitor { public visitLargeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } + public visitBinaryView(data: Data) { + return viewDataToJSON(data, true); + } + public visitUtf8View(data: Data) { + return viewDataToJSON(data, false); + } public visitFixedSizeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))] }; } @@ -141,6 +152,12 @@ export class JSONVectorAssembler extends Visitor { 'children': this.visitMany(data.type.children, data.children) }; } + public visitLargeList(data: Data) { + return { + 'OFFSET': [...data.valueOffsets].map(x => `${x}`), + 'children': this.visitMany(data.type.children, data.children) + }; + } public visitStruct(data: Data) { return { 'children': this.visitMany(data.type.children, data.children) @@ -177,6 +194,11 @@ export class JSONVectorAssembler extends Visitor { 'children': this.visitMany(data.type.children, data.children) }; } + public visitRunEndEncoded(data: Data) { + return { + 
'children': this.visitMany(data.type.children, data.children) + }; + } } /** @ignore */ @@ -195,3 +217,67 @@ function* bigNumsToStrings(values: BigUint64Array | BigInt64Array | Uint32Array yield `${BN.new(u32s.subarray((i + 0) * stride, (i + 1) * stride), false)}`; } } + +/** @ignore */ +function viewDataToJSON(data: Data | Data, isBinary: boolean) { + const INLINE_SIZE = 12; + const views: any[] = []; + const variadicBuffers: string[] = []; + const variadicBuffersMap = new Map(); // buffer index in data -> index in output array + + // Read view structs from the views buffer (16 bytes each) + const viewsData = data.values; + const dataView = new DataView(viewsData.buffer, viewsData.byteOffset, viewsData.byteLength); + const numViews = viewsData.byteLength / 16; + + for (let i = 0; i < numViews; i++) { + const offset = i * 16; + const size = dataView.getInt32(offset, true); + + if (size <= INLINE_SIZE) { + // Inline view: read the inlined data (bytes 4-15, up to 12 bytes) + const inlined = viewsData.subarray(offset + 4, offset + 4 + size); + const inlinedHex = Array.from(inlined) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase(); + + views.push({ + 'SIZE': size, + 'INLINED': isBinary ? 
inlinedHex : Array.from(inlined).map(b => String.fromCodePoint(b)).join('') + }); + } else { + // Out-of-line view: read prefix (4 bytes at offset 4-7), buffer_index, offset + const prefix = viewsData.subarray(offset + 4, offset + 8); + const prefixHex = Array.from(prefix) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase(); + const bufferIndex = dataView.getInt32(offset + 8, true); + const bufferOffset = dataView.getInt32(offset + 12, true); + + // Track which variadic buffers we're using and map to output indices + if (!variadicBuffersMap.has(bufferIndex)) { + const outputIndex = variadicBuffers.length; + variadicBuffersMap.set(bufferIndex, outputIndex); + + // Get the actual buffer data and convert to hex + const buffer = data.variadicBuffers[bufferIndex]; + const hex = Array.from(buffer) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase(); + variadicBuffers.push(hex); + } + + views.push({ + 'SIZE': size, + 'PREFIX_HEX': prefixHex, + 'BUFFER_INDEX': variadicBuffersMap.get(bufferIndex), + 'OFFSET': bufferOffset + }); + } + } + + return { 'VIEWS': views, 'VARIADIC_DATA_BUFFERS': variadicBuffers }; +} diff --git a/src/visitor/set.ts b/src/visitor/set.ts index 4bf632ba..6615fd25 100644 --- a/src/visitor/set.ts +++ b/src/visitor/set.ts @@ -26,7 +26,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -36,6 +36,7 @@ import { Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, 
DurationNanosecond, Union, DenseUnion, SparseUnion, IntervalMonthDayNano, + RunEndEncoded, } from '../type.js'; /** @ignore */ @@ -61,8 +62,10 @@ export interface SetVisitor extends Visitor { visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; + visitUtf8View(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitLargeBinary(data: Data, index: number, value: T['TValue']): void; + visitBinaryView(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; visitDateDay(data: Data, index: number, value: T['TValue']): void; @@ -79,6 +82,8 @@ export interface SetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number, value: T['TValue']): void; visitDecimal(data: Data, index: number, value: T['TValue']): void; visitList(data: Data, index: number, value: T['TValue']): void; + visitLargeList(data: Data, index: number, value: T['TValue']): void; + visitRunEndEncoded(data: Data, index: number, value: T['TValue']): void; visitStruct(data: Data, index: number, value: T['TValue']): void; visitUnion(data: Data, index: number, value: T['TValue']): void; visitDenseUnion(data: Data, index: number, value: T['TValue']): void; @@ -155,7 +160,15 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ +const setBinaryView = (_data: Data, _index: number, _value: T['TValue']) => { + throw new Error('BinaryView values are immutable in the current implementation'); +}; +/** @ignore */ const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: 
T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +/** @ignore */ +const setUtf8View = (_data: Data, _index: number, _value: T['TValue']) => { + throw new Error('Utf8View values are immutable in the current implementation'); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -221,6 +234,24 @@ const setList = (data: Data, index: number, value: T['TValue' } }; +/** @ignore */ +const setLargeList = (data: Data, index: number, value: T['TValue']): void => { + const values = data.children[0]; + const valueOffsets = data.valueOffsets; + const set = instance.getVisitFn(values); + const begin = bigIntToNumber(valueOffsets[index]); + const end = bigIntToNumber(valueOffsets[index + 1]); + if (Array.isArray(value)) { + for (let idx = -1, itr = begin; itr < end;) { + set(values, itr++, value[++idx]); + } + } else { + for (let idx = -1, itr = begin; itr < end;) { + set(values, itr++, value.get(++idx)); + } + } +}; + /** @ignore */ const setMap = (data: Data, index: number, value: T['TValue']) => { const values = data.children[0]; @@ -248,6 +279,11 @@ const setMap = (data: Data, index: number, value: T['TValue'] /** @ignore */ const _setStructObjectValue = (o: number, v: { [key: string]: any }) => (set: SetFunc, c: Data, f: Field, _: number) => c && set(c, o, v[f.name]); +/** @ignore */ +const setRunEndEncoded = (_data: Data, _index: number, _value: T['TValue']) => { + throw new Error('RunEndEncoded is immutable'); +}; + /** @ignore */ const setStruct = (data: Data, index: number, value: T['TValue']) => { @@ -359,8 +395,10 @@ SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitUtf8View = wrapSet(setUtf8View); SetVisitor.prototype.visitBinary = wrapSet(setBinary); 
SetVisitor.prototype.visitLargeBinary = wrapSet(setBinary); +SetVisitor.prototype.visitBinaryView = wrapSet(setBinaryView); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); SetVisitor.prototype.visitDateDay = wrapSet(setDateDay); @@ -377,6 +415,8 @@ SetVisitor.prototype.visitTimeMicrosecond = wrapSet(setTimeMicrosecond); SetVisitor.prototype.visitTimeNanosecond = wrapSet(setTimeNanosecond); SetVisitor.prototype.visitDecimal = wrapSet(setDecimal); SetVisitor.prototype.visitList = wrapSet(setList); +SetVisitor.prototype.visitLargeList = wrapSet(setLargeList); +SetVisitor.prototype.visitRunEndEncoded = wrapSet(setRunEndEncoded); SetVisitor.prototype.visitStruct = wrapSet(setStruct); SetVisitor.prototype.visitUnion = wrapSet(setUnion); SetVisitor.prototype.visitDenseUnion = wrapSet(setDenseUnion); diff --git a/src/visitor/typeassembler.ts b/src/visitor/typeassembler.ts index 169f3627..4934b32f 100644 --- a/src/visitor/typeassembler.ts +++ b/src/visitor/typeassembler.ts @@ -25,9 +25,11 @@ import { Null } from '../fb/null.js'; import { Int } from '../fb/int.js'; import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; +import { BinaryView } from '../fb/binary-view.js'; import { LargeBinary } from '../fb/large-binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { Utf8View } from '../fb/utf8-view.js'; import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; @@ -36,6 +38,10 @@ import { Timestamp } from '../fb/timestamp.js'; import { Interval } from '../fb/interval.js'; import { Duration } from '../fb/duration.js'; import { List } from '../fb/list.js'; +import { LargeList } from '../fb/large-list.js'; +import { ListView } from '../fb/list-view.js'; +import { LargeListView } from '../fb/large-list-view.js'; +import { RunEndEncoded } from 
'../fb/run-end-encoded.js'; import { Struct_ as Struct } from '../fb/struct-.js'; import { Union } from '../fb/union.js'; import { DictionaryEncoding } from '../fb/dictionary-encoding.js'; @@ -72,6 +78,10 @@ export class TypeAssembler extends Visitor { Binary.startBinary(b); return Binary.endBinary(b); } + public visitBinaryView(_node: T, b: Builder) { + BinaryView.startBinaryView(b); + return BinaryView.endBinaryView(b); + } public visitLargeBinary(_node: T, b: Builder) { LargeBinary.startLargeBinary(b); return LargeBinary.endLargeBinary(b); @@ -84,6 +94,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitUtf8View(_node: T, b: Builder) { + Utf8View.startUtf8View(b); + return Utf8View.endUtf8View(b); + } public visitLargeUtf8(_node: T, b: Builder) { LargeUtf8.startLargeUtf8(b); return LargeUtf8.endLargeUtf8(b); @@ -129,6 +143,22 @@ export class TypeAssembler extends Visitor { List.startList(b); return List.endList(b); } + public visitLargeList(_node: T, b: Builder) { + LargeList.startLargeList(b); + return LargeList.endLargeList(b); + } + public visitListView(_node: T, b: Builder) { + ListView.startListView(b); + return ListView.endListView(b); + } + public visitLargeListView(_node: T, b: Builder) { + LargeListView.startLargeListView(b); + return LargeListView.endLargeListView(b); + } + public visitRunEndEncoded(_node: T, b: Builder) { + RunEndEncoded.startRunEndEncoded(b); + return RunEndEncoded.endRunEndEncoded(b); + } public visitStruct(_node: T, b: Builder) { Struct.startStruct_(b); return Struct.endStruct_(b); diff --git a/src/visitor/typecomparator.ts b/src/visitor/typecomparator.ts index 65413ccd..07e8b249 100644 --- a/src/visitor/typecomparator.ts +++ b/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, 
FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -31,6 +31,7 @@ import { Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, IntervalMonthDayNano, + RunEndEncoded, } from '../type.js'; /** @ignore */ @@ -55,8 +56,10 @@ export interface TypeComparator extends Visitor { visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; visitLargeUtf8(type: T, other?: DataType | null): other is T; + visitUtf8View(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitLargeBinary(type: T, other?: DataType | null): other is T; + visitBinaryView(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; visitDateDay(type: T, other?: DataType | null): other is T; @@ -89,6 +92,7 @@ export interface TypeComparator extends Visitor { visitDurationNanosecond(type: T, other?: DataType | null): other is T; visitFixedSizeList(type: T, other?: DataType | null): other is T; visitMap(type: T, other?: DataType | null): other is T; + visitRunEndEncoded(type: T, other?: DataType | null): other is T; } /** @ignore */ @@ -237,6 +241,14 @@ function compareMap(type: T, other?: DataType | null): other is ); } +function compareRunEndEncoded(type: T, other?: DataType | null): other is T { + return (type === other) || ( + compareConstructor(type, other) && + type.children.length === other.children.length && + instance.compareManyFields(type.children, other.children) + ); +} + TypeComparator.prototype.visitNull = compareAny; 
TypeComparator.prototype.visitBool = compareAny; TypeComparator.prototype.visitInt = compareInt; @@ -254,8 +266,10 @@ TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; TypeComparator.prototype.visitLargeUtf8 = compareAny; +TypeComparator.prototype.visitUtf8View = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitLargeBinary = compareAny; +TypeComparator.prototype.visitBinaryView = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; TypeComparator.prototype.visitDateDay = compareDate; @@ -288,6 +302,7 @@ TypeComparator.prototype.visitDurationMicrosecond = compareDuration; TypeComparator.prototype.visitDurationNanosecond = compareDuration; TypeComparator.prototype.visitFixedSizeList = compareFixedSizeList; TypeComparator.prototype.visitMap = compareMap; +TypeComparator.prototype.visitRunEndEncoded = compareRunEndEncoded; /** @ignore */ export const instance = new TypeComparator(); diff --git a/src/visitor/typector.ts b/src/visitor/typector.ts index 2aab6d3d..323f1459 100644 --- a/src/visitor/typector.ts +++ b/src/visitor/typector.ts @@ -68,6 +68,7 @@ export class GetDataTypeConstructor extends Visitor { public visitTimeNanosecond() { return type.TimeNanosecond; } public visitDecimal() { return type.Decimal; } public visitList() { return type.List; } + public visitLargeList() { return type.LargeList; } public visitStruct() { return type.Struct; } public visitUnion() { return type.Union; } public visitDenseUnion() { return type.DenseUnion; } @@ -84,6 +85,11 @@ export class GetDataTypeConstructor extends Visitor { public visitDurationNanosecond() { return type.DurationNanosecond; } public visitFixedSizeList() { return type.FixedSizeList; } public visitMap() { return type.Map_; } + public visitBinaryView() { return type.BinaryView; } + public 
visitUtf8View() { return type.Utf8View; } + public visitListView() { return type.ListView; } + public visitLargeListView() { return type.LargeListView; } + public visitRunEndEncoded() { return type.RunEndEncoded; } } /** @ignore */ diff --git a/src/visitor/vectorassembler.ts b/src/visitor/vectorassembler.ts index 7dc36955..51816a1e 100644 --- a/src/visitor/vectorassembler.ts +++ b/src/visitor/vectorassembler.ts @@ -27,7 +27,8 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct, + RunEndEncoded, } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -51,12 +52,14 @@ export interface VectorAssembler extends Visitor { visitTime(data: Data): this; visitDecimal(data: Data): this; visitList(data: Data): this; + visitLargeList(data: Data): this; visitStruct(data: Data): this; visitUnion(data: Data): this; visitInterval(data: Data): this; visitDuration(data: Data): this; visitFixedSizeList(data: Data): this; visitMap(data: Data): this; + visitRunEndEncoded(data: Data): this; } /** @ignore */ @@ -115,11 +118,13 @@ export class VectorAssembler extends Visitor { public get buffers() { return this._buffers; } public get byteLength() { return this._byteLength; } public get bufferRegions() { return this._bufferRegions; } + public get variadicBufferCounts() { return this._variadicBufferCounts; } protected _byteLength = 0; protected _nodes: FieldNode[] = []; protected _buffers: ArrayBufferView[] = []; protected _bufferRegions: BufferRegion[] = []; + protected _variadicBufferCounts: number[] = []; } /** @ignore */ @@ -216,11 +221,28 @@ function assembleFlatListVector(this: VectorAssembler, data: 
Data) { +function assembleBinaryViewVector(this: VectorAssembler, data: Data) { + const { offset, length, stride, values, variadicBuffers = [] } = data; + if (!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const start = offset * stride; + const end = start + length * stride; + addBuffer.call(this, values.subarray(start, end)); + for (const buffer of variadicBuffers) { + addBuffer.call(this, buffer); + } + this._variadicBufferCounts.push(variadicBuffers.length); + return this; +} + +/** @ignore */ +function assembleListVector(this: VectorAssembler, data: Data) { const { length, valueOffsets } = data; - // If we have valueOffsets (MapVector, ListVector), push that buffer first + // If we have valueOffsets (MapVector, ListVector, LargeListVector), push that buffer first if (valueOffsets) { - const { [0]: begin, [length]: end } = valueOffsets; + const begin = typeof valueOffsets[0] === 'bigint' ? bigIntToNumber(valueOffsets[0]) : valueOffsets[0]; + const end = typeof valueOffsets[length] === 'bigint' ? 
bigIntToNumber(valueOffsets[length]) : valueOffsets[length]; addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // Then insert the List's values child return this.visit(data.children[0].slice(begin, end - begin)); @@ -230,7 +252,7 @@ function assembleListVector(this: VectorA } /** @ignore */ -function assembleNestedVector(this: VectorAssembler, data: Data) { +function assembleNestedVector(this: VectorAssembler, data: Data) { return this.visitMany(data.type.children.map((_, i) => data.children[i]).filter(Boolean))[0]; } @@ -239,17 +261,21 @@ VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitUtf8View = assembleBinaryViewVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitLargeBinary = assembleFlatListVector; +VectorAssembler.prototype.visitBinaryView = assembleBinaryViewVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; VectorAssembler.prototype.visitTimestamp = assembleFlatVector; VectorAssembler.prototype.visitTime = assembleFlatVector; VectorAssembler.prototype.visitDecimal = assembleFlatVector; VectorAssembler.prototype.visitList = assembleListVector; +VectorAssembler.prototype.visitLargeList = assembleListVector; VectorAssembler.prototype.visitStruct = assembleNestedVector; VectorAssembler.prototype.visitUnion = assembleUnion; VectorAssembler.prototype.visitInterval = assembleFlatVector; VectorAssembler.prototype.visitDuration = assembleFlatVector; VectorAssembler.prototype.visitFixedSizeList = assembleListVector; VectorAssembler.prototype.visitMap = assembleListVector; +VectorAssembler.prototype.visitRunEndEncoded = assembleNestedVector; diff --git a/src/visitor/vectorloader.ts 
b/src/visitor/vectorloader.ts index 7c82e7ab..9a406db4 100644 --- a/src/visitor/vectorloader.ts +++ b/src/visitor/vectorloader.ts @@ -44,13 +44,16 @@ export class VectorLoader extends Visitor { protected buffersIndex = -1; private dictionaries: Map>; private readonly metadataVersion: MetadataVersion; - constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5) { + private variadicBufferCounts: number[]; + private variadicBufferIndex = -1; + constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5, variadicBufferCounts: number[] = []) { super(); this.bytes = bytes; this.nodes = nodes; this.buffers = buffers; this.dictionaries = dictionaries; this.metadataVersion = metadataVersion; + this.variadicBufferCounts = variadicBufferCounts; } public visit(node: Field | T): Data { @@ -75,12 +78,24 @@ export class VectorLoader extends Visitor { public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitUtf8View(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ type, length, nullCount, nullBitmap, views, variadicBuffers }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } public visitLargeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), 
valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitBinaryView(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ type, length, nullCount, nullBitmap, views, variadicBuffers }); + } public visitFixedSizeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); } @@ -99,6 +114,9 @@ export class VectorLoader extends Visitor { public visitList(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), 'child': this.visit(type.children[0]) }); } + public visitLargeList(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), 'child': this.visit(type.children[0]) }); + } public visitStruct(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), children: this.visitMany(type.children) }); } @@ -131,6 +149,10 @@ export class VectorLoader extends Visitor { public visitMap(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), 'child': this.visit(type.children[0]) }); } + public visitRunEndEncoded(type: T, { length, nullCount } = this.nextFieldNode()) { + const children = this.visitMany(type.children) as [Data, Data]; + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), children }); + } 
protected nextFieldNode() { return this.nodes[++this.nodesIndex]; } protected nextBufferRange() { return this.buffers[++this.buffersIndex]; } @@ -142,6 +164,12 @@ export class VectorLoader extends Visitor { protected readData(_type: T, { length, offset } = this.nextBufferRange()) { return this.bytes.subarray(offset, offset + length); } + protected readVariadicBuffers(length: number) { + return Array.from({ length }, () => this.readData(null as any)); + } + protected nextVariadicBufferCount() { + return this.variadicBufferCounts[++this.variadicBufferIndex] ?? 0; + } protected readDictionary(type: T): Vector { return this.dictionaries.get(type.id)!; } @@ -150,8 +178,8 @@ export class VectorLoader extends Visitor { /** @ignore */ export class JSONVectorLoader extends VectorLoader { private sources: any[][]; - constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.sources = sources; } protected readNullBitmap(_type: T, nullCount: number, { offset } = this.nextBufferRange()) { @@ -175,6 +203,8 @@ export class JSONVectorLoader extends VectorLoader { return toArrayBufferView(Uint8Array, Int128.convertArray(sources[offset] as string[])); } else if (DataType.isBinary(type) || DataType.isLargeBinary(type) || DataType.isFixedSizeBinary(type)) { return binaryDataFromJSON(sources[offset] as string[]); + } else if (DataType.isBinaryView(type) || DataType.isUtf8View(type)) { + return viewDataFromJSON(sources[offset] as any[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); } else if (DataType.isUtf8(type) || 
DataType.isLargeUtf8(type)) { @@ -191,25 +221,102 @@ export class JSONVectorLoader extends VectorLoader { } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); } + protected readVariadicBuffers(length: number) { + // Per Arrow C++ reference implementation (cpp/src/arrow/ipc/reader.cc), + // each variadic buffer is stored as a separate buffer region, matching + // the IPC format where each is accessed via separate GetBuffer() calls. + // VARIADIC_DATA_BUFFERS in JSON is an array, but flattenDataSources spreads + // it so each hex string gets its own sources entry, maintaining 1:1 + // correspondence with BufferRegion entries. + const buffers: Uint8Array[] = []; + for (let i = 0; i < length; i++) { + const { offset } = this.nextBufferRange(); + // sources[offset] is 'any[]' but for variadic buffers it's actually a string + // after spreading in flattenDataSources. Cast necessary due to heterogeneous + // sources array structure (most fields are arrays, variadic elements are strings). + const hexString = this.sources[offset] as unknown as string; + buffers.push(hexStringToBytes(hexString)); + } + return buffers; + } +} + +/** @ignore */ +function hexStringToBytes(hexString: string): Uint8Array { + // Parse hex string per Arrow JSON integration format (uppercase hex encoding). + // Used for: VARIADIC_DATA_BUFFERS elements, Binary DATA (after join), + // BinaryView PREFIX_HEX and INLINED fields. + const data = new Uint8Array(hexString.length / 2); + for (let i = 0; i < hexString.length; i += 2) { + data[i >> 1] = Number.parseInt(hexString.slice(i, i + 2), 16); + } + return data; +} + +/** @ignore */ +function binaryDataFromJSON(values: string[]): Uint8Array { + // Arrow JSON Binary/LargeBinary/FixedSizeBinary format: + // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] (array of hex strings, one per value) + // Join all values into one continuous hex string, then parse to bytes. 
+ return hexStringToBytes(values.join('')); } /** @ignore */ -function binaryDataFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. - const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = Number.parseInt(joined.slice(i, i + 2), 16); +function viewDataFromJSON(views: any[]) { + // Each view is a 16-byte struct: length (i32), then either 12 inlined data bytes or a 4-byte prefix + buffer_index (i32) + offset (i32) + const data = new Uint8Array(views.length * 16); + const dataView = new DataView(data.buffer); + + for (const [i, view] of views.entries()) { + const offset = i * 16; + const size = view.SIZE; + + // Write size (int32 at byte 0) + dataView.setInt32(offset, size, true); + + if (view.INLINED !== undefined) { + // Inline view: INLINED can be hex string (BinaryView) or UTF-8 string (Utf8View) + const inlined = view.INLINED; + + // Hex (BinaryView) spells SIZE bytes as exactly 2 * SIZE digits; UTF-8 text of SIZE bytes never does for SIZE > 0, so hex-looking strings such as "cafe" stay text + const isHex = typeof inlined === 'string' && + inlined.length === size * 2 && + /^[0-9A-Fa-f]*$/.test(inlined); + + if (isHex) { + // BinaryView: hex-encoded string + for (let j = 0; j < inlined.length && j < 24; j += 2) { + data[offset + 4 + (j >> 1)] = Number.parseInt(inlined.slice(j, j + 2), 16); + } + } else { + // Utf8View: UTF-8 string - encode to bytes + const encoder = new TextEncoder(); + const bytes = encoder.encode(inlined); + for (let j = 0; j < bytes.length && j < 12; j++) { + data[offset + 4 + j] = bytes[j]; + } + } + } else { + // Out-of-line view: write prefix, buffer_index, offset + const prefix = view.PREFIX_HEX; + // Write 4-byte prefix at bytes 4-7 + for (let j = 0; j < 8 && j < prefix.length; j += 2) { + data[offset + 4 + (j >> 1)] = Number.parseInt(prefix.slice(j, j + 2), 16); + } + // Write buffer_index (int32 at byte 8) + dataView.setInt32(offset + 8, 
view.BUFFER_INDEX, true); + // Write offset (int32 at byte 12) + dataView.setInt32(offset + 12, view.OFFSET, true); + } } + return data; } export class CompressedVectorLoader extends VectorLoader { private bodyChunks: Uint8Array[]; - constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.bodyChunks = bodyChunks; } protected readData(_type: T, _buffer = this.nextBufferRange()) { diff --git a/test/unit/builders/listview-tests.ts b/test/unit/builders/listview-tests.ts new file mode 100644 index 00000000..69a908b1 --- /dev/null +++ b/test/unit/builders/listview-tests.ts @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { ListView, LargeListView, Int32 } from '../../../src/type.js'; +import { Field } from '../../../src/schema.js'; +import { ListViewBuilder, LargeListViewBuilder } from '../../../src/builder/listview.js'; +import { Int32Builder } from '../../../src/builder/int.js'; +import { Vector } from '../../../src/vector.js'; + +describe('ListViewBuilder', () => { + it('should build ListView with basic values', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2, 3]); + builder.append([4, 5]); + builder.append([6]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2, 3])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([4, 5])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([6])); + }); + + it('should handle null values', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type, nullValues: [null] }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append(null); + builder.append([3, 4, 5]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([3, 4, 5])); + }); + + it('should handle empty lists', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([]); + builder.append([1, 2]); + builder.append([]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new 
Int32Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([])); + }); + + it('should handle multiple flushes', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + const data1 = builder.flush(); + builder.append([3, 4]); + const data2 = builder.flush(); + + builder.finish(); + + const vector1 = new Vector([data1]); + const vector2 = new Vector([data2]); + + expect(vector1).toHaveLength(1); + expect(vector1.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector2).toHaveLength(1); + expect(vector2.get(0)?.toArray()).toEqual(new Int32Array([3, 4])); + }); + + it('should build ListView with varying list sizes', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1]); + builder.append([2, 3]); + builder.append([4, 5, 6]); + builder.append([7, 8, 9, 10]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(4); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([2, 3])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([4, 5, 6])); + expect(vector.get(3)?.toArray()).toEqual(new Int32Array([7, 8, 9, 10])); + }); +}); + +describe('LargeListViewBuilder', () => { + it('should build LargeListView with basic values', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2, 3]); + builder.append([4, 5]); + builder.append([6]); + + const vector = builder.finish().toVector(); + + 
expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2, 3])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([4, 5])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([6])); + }); + + it('should handle null values', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type, nullValues: [null] }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append(null); + builder.append([3, 4, 5]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([3, 4, 5])); + }); + + it('should handle empty lists', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([]); + builder.append([1, 2]); + builder.append([]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([])); + }); + + it('should use BigInt offsets internally', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append([3, 4, 5]); + + const data = builder.finish().flush(); + + // Verify that offsets and sizes are BigInt64Array + expect(data.valueOffsets).toBeInstanceOf(BigInt64Array); + expect(data.values).toBeInstanceOf(BigInt64Array); // sizes buffer + }); +}); + +describe('ListView type 
properties', () => { + it('should correctly report type name', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + expect(builder.type.toString()).toBe('ListView'); + }); + + it('should correctly report LargeListView type name', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + expect(builder.type.toString()).toBe('LargeListView'); + }); +}); diff --git a/test/unit/builders/view-builders-tests.ts b/test/unit/builders/view-builders-tests.ts new file mode 100644 index 00000000..88ee28fe --- /dev/null +++ b/test/unit/builders/view-builders-tests.ts @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { BinaryView, Utf8View } from '../../../src/type.js'; +import { makeBuilder, vectorFromArray } from '../../../src/factories.js'; + +describe('BinaryViewBuilder', () => { + it('should build inline binary values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const values = [ + new Uint8Array([1, 2, 3]), + new Uint8Array([4, 5, 6, 7, 8, 9, 10, 11, 12]), + new Uint8Array([13]) + ]; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(values[0]); + expect(vector.get(1)).toEqual(values[1]); + expect(vector.get(2)).toEqual(values[2]); + }); + + it('should build out-of-line binary values (>12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const value = new Uint8Array(100); + for (let i = 0; i < 100; i++) { + value[i] = i % 256; + } + + builder.append(value); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toEqual(value); + }); + + it('should build mixed inline and out-of-line values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const small = new Uint8Array([1, 2, 3]); + const large = new Uint8Array(50); + for (let i = 0; i < 50; i++) { + large[i] = i % 256; + } + + builder.append(small); + builder.append(large); + builder.append(small); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(small); + expect(vector.get(1)).toEqual(large); + expect(vector.get(2)).toEqual(small); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new BinaryView(), nullValues: [null] }); + + builder.append(new Uint8Array([1, 2, 3])); + builder.append(null); + builder.append(new Uint8Array([4, 5, 6])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(new 
Uint8Array([1, 2, 3])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toEqual(new Uint8Array([4, 5, 6])); + }); + + it('should handle empty values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([])); + builder.append(new Uint8Array([1])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(new Uint8Array([])); + expect(vector.get(1)).toEqual(new Uint8Array([1])); + }); + + it('should handle exactly 12-byte boundary values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const exactly12 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + const exactly13 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]); + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(exactly12); + expect(vector.get(1)).toEqual(exactly13); + }); + + it('should handle multiple flushes', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([1, 2])); + const data1 = builder.flush(); + expect(data1).toHaveLength(1); + + builder.append(new Uint8Array([3, 4])); + builder.append(new Uint8Array([5, 6])); + const data2 = builder.flush(); + expect(data2).toHaveLength(2); + }); +}); + +describe('Utf8ViewBuilder', () => { + it('should build inline string values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['hello', 'world', 'foo']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBe('foo'); + }); + + it('should build out-of-line string values (>12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + 
const longString = 'This is a long string that exceeds 12 bytes'; + + builder.append(longString); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toBe(longString); + }); + + it('should build mixed inline and out-of-line strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const short = 'hi'; + const long = 'This is a very long string that definitely exceeds the 12 byte inline capacity'; + + builder.append(short); + builder.append(long); + builder.append(short); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe(short); + expect(vector.get(1)).toBe(long); + expect(vector.get(2)).toBe(short); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new Utf8View(), nullValues: [null] }); + + builder.append('hello'); + builder.append(null); + builder.append('world'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toBe('world'); + }); + + it('should handle empty strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + + builder.append(''); + builder.append('a'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(''); + expect(vector.get(1)).toBe('a'); + }); + + it('should handle UTF-8 multibyte characters', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['🚀', '你好', 'Ñoño', 'emoji: 🎉']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('🚀'); + expect(vector.get(1)).toBe('你好'); + expect(vector.get(2)).toBe('Ñoño'); + expect(vector.get(3)).toBe('emoji: 🎉'); + }); + + it('should handle exactly 12-byte boundary strings', () => { + const 
builder = makeBuilder({ type: new Utf8View() }); + const exactly12 = 'twelve bytes'; // ASCII: 12 bytes + const exactly13 = 'thirteen byte'; // ASCII: 13 bytes + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(exactly12); + expect(vector.get(1)).toBe(exactly13); + }); + + it('should build from vectorFromArray', () => { + const values = ['hello', 'world', null, 'foo']; + const vector = vectorFromArray(values, new Utf8View()); + + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBeNull(); + expect(vector.get(3)).toBe('foo'); + }); + + it('should handle large batch of values', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const count = 1000; + const values: string[] = []; + + for (let i = 0; i < count; i++) { + const value = i % 2 === 0 + ? `short_${i}` // inline + : `this_is_a_long_string_that_goes_out_of_line_${i}`; // out-of-line + values.push(value); + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(count); + + for (let i = 0; i < count; i++) { + expect(vector.get(i)).toBe(values[i]); + } + }); +}); diff --git a/test/unit/ipc/list-view-tests.ts b/test/unit/ipc/list-view-tests.ts new file mode 100644 index 00000000..da09c6d1 --- /dev/null +++ b/test/unit/ipc/list-view-tests.ts @@ -0,0 +1,262 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { makeData } from 'apache-arrow/data'; +import { ListView, LargeListView, Int8 } from 'apache-arrow/type'; +import { Vector } from 'apache-arrow/vector'; +import { Field } from 'apache-arrow/schema'; + +describe('ListView and LargeListView integration', () => { + describe('ListView', () => { + // Test case from Arrow spec documentation: + // [[12, -7, 25], null, [0, -127, 127, 50], []] + it('reads ListView values with in-order offsets', () => { + const childData = makeData({ + type: new Int8(), + length: 7, + nullCount: 0, + data: new Int8Array([12, -7, 25, 0, -127, 127, 50]) + }); + + const offsets = new Int32Array([0, 7, 3, 0]); + const sizes = new Int32Array([3, 0, 4, 0]); + const nullBitmap = new Uint8Array([0b00001101]); // bits: [1,0,1,1] = valid, null, valid, valid + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 4, + nullCount: 1, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([12, -7, 25])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([0, -127, 127, 50])); + expect(vector.get(3)?.toArray()).toEqual(new Int8Array([])); + }); + + // Test case from Arrow spec showing out-of-order offsets and value sharing: + // [[12, -7, 25], null, [0, -127, 127, 50], [], [50, 12]] + it('reads ListView values with out-of-order offsets and value sharing', () => { + const childData = makeData({ + type: new Int8(), + length: 
7, + nullCount: 0, + data: new Int8Array([0, -127, 127, 50, 12, -7, 25]) + }); + + // Out of order offsets: [4, 7, 0, 0, 3] + const offsets = new Int32Array([4, 7, 0, 0, 3]); + const sizes = new Int32Array([3, 0, 4, 0, 2]); + const nullBitmap = new Uint8Array([0b00011101]); // [1,0,1,1,1] = valid, null, valid, valid, valid + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 5, + nullCount: 1, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + // List 0: offset=4, size=3 -> [12, -7, 25] + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([12, -7, 25])); + // List 1: null + expect(vector.get(1)).toBeNull(); + // List 2: offset=0, size=4 -> [0, -127, 127, 50] + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([0, -127, 127, 50])); + // List 3: offset=0, size=0 -> [] + expect(vector.get(3)?.toArray()).toEqual(new Int8Array([])); + // List 4: offset=3, size=2 -> [50, 12] (shares values with list 2) + expect(vector.get(4)?.toArray()).toEqual(new Int8Array([50, 12])); + }); + + it('handles all null ListView', () => { + const childData = makeData({ + type: new Int8(), + length: 0, + nullCount: 0, + data: new Int8Array([]) + }); + + const offsets = new Int32Array([0, 0, 0]); + const sizes = new Int32Array([0, 0, 0]); + const nullBitmap = new Uint8Array([0b00000000]); // all null + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 3, + nullCount: 3, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)).toBeNull(); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toBeNull(); + }); + + it('handles ListView with all empty lists', () => { + const childData = makeData({ + type: new Int8(), + length: 0, + nullCount: 0, + data: new Int8Array([]) + }); + + const offsets = new Int32Array([0, 0, 0]); + const 
sizes = new Int32Array([0, 0, 0]); + const nullBitmap = new Uint8Array([0b00000111]); // all valid + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 3, + nullCount: 0, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int8Array([])); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([])); + }); + + it('handles ListView with single element lists', () => { + const childData = makeData({ + type: new Int8(), + length: 3, + nullCount: 0, + data: new Int8Array([42, -1, 100]) + }); + + const offsets = new Int32Array([0, 1, 2]); + const sizes = new Int32Array([1, 1, 1]); + const nullBitmap = new Uint8Array([0b00000111]); + + const listViewData = makeData({ + type: new ListView(new Field('item', new Int8())), + length: 3, + nullCount: 0, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([listViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([42])); + expect(vector.get(1)?.toArray()).toEqual(new Int8Array([-1])); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([100])); + }); + }); + + describe('LargeListView', () => { + it('reads LargeListView values with BigInt offsets', () => { + const childData = makeData({ + type: new Int8(), + length: 7, + nullCount: 0, + data: new Int8Array([12, -7, 25, 0, -127, 127, 50]) + }); + + const offsets = new BigInt64Array([0n, 7n, 3n, 0n]); + const sizes = new BigInt64Array([3n, 0n, 4n, 0n]); + const nullBitmap = new Uint8Array([0b00001101]); + + const largeListViewData = makeData({ + type: new LargeListView(new Field('item', new Int8())), + length: 4, + nullCount: 1, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([largeListViewData]); + + 
expect(vector.get(0)?.toArray()).toEqual(new Int8Array([12, -7, 25])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int8Array([0, -127, 127, 50])); + expect(vector.get(3)?.toArray()).toEqual(new Int8Array([])); + }); + + it('reads LargeListView with out-of-order offsets', () => { + const childData = makeData({ + type: new Int8(), + length: 5, + nullCount: 0, + data: new Int8Array([10, 20, 30, 40, 50]) + }); + + // Out of order: list 0 starts at 2, list 1 starts at 0 + const offsets = new BigInt64Array([2n, 0n]); + const sizes = new BigInt64Array([3n, 2n]); + const nullBitmap = new Uint8Array([0b00000011]); + + const largeListViewData = makeData({ + type: new LargeListView(new Field('item', new Int8())), + length: 2, + nullCount: 0, + nullBitmap, + valueOffsets: offsets, + sizes, + child: childData + }); + + const vector = new Vector([largeListViewData]); + + expect(vector.get(0)?.toArray()).toEqual(new Int8Array([30, 40, 50])); + expect(vector.get(1)?.toArray()).toEqual(new Int8Array([10, 20])); + }); + }); + + describe('ListView properties', () => { + it('has correct type properties', () => { + const listViewType = new ListView(new Field('item', new Int8())); + expect(listViewType.typeId).toBe(25); // Type.ListView + expect(listViewType.toString()).toBe('ListView'); + expect(listViewType.valueType).toBeInstanceOf(Int8); + expect(listViewType.valueField.name).toBe('item'); + }); + + it('has correct type properties for LargeListView', () => { + const largeListViewType = new LargeListView(new Field('item', new Int8())); + expect(largeListViewType.typeId).toBe(26); // Type.LargeListView + expect(largeListViewType.toString()).toBe('LargeListView'); + expect(largeListViewType.valueType).toBeInstanceOf(Int8); + expect(largeListViewType.valueField.name).toBe('item'); + }); + }); +}); diff --git a/test/unit/ipc/view-types-tests.ts b/test/unit/ipc/view-types-tests.ts new file mode 100644 index 00000000..d0b5a7a9 --- /dev/null +++ 
b/test/unit/ipc/view-types-tests.ts @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { makeData } from 'apache-arrow/data'; +import { BinaryView, Utf8View } from 'apache-arrow/type'; +import { Vector } from 'apache-arrow/vector'; + +const BINARY_VIEW_SIZE = 16; + +function createInlineView(value: Uint8Array) { + const view = new Uint8Array(BINARY_VIEW_SIZE); + const dv = new DataView(view.buffer, view.byteOffset, view.byteLength); + dv.setInt32(0, value.length, true); + view.set(value, 4); + return view; +} + +function createReferencedView(value: Uint8Array, bufferIndex: number, offset: number) { + const view = new Uint8Array(BINARY_VIEW_SIZE); + const dv = new DataView(view.buffer, view.byteOffset, view.byteLength); + dv.setInt32(0, value.length, true); + view.set(value.subarray(0, Math.min(4, value.length)), 4); + dv.setInt32(8, bufferIndex, true); + dv.setInt32(12, offset, true); + return view; +} + +describe('BinaryView and Utf8View integration', () => { + const inlineBinary = new Uint8Array([1, 2, 3, 4, 5]); + const referencedBinary = new Uint8Array(Array.from({ length: 20 }, (_, i) => i)); + const referencedUtf8 = 'View types are fun!'; + + const inlineUtf8 = 'hi'; + + const 
binaryViews = new Uint8Array(BINARY_VIEW_SIZE * 3); + binaryViews.set(createInlineView(inlineBinary), 0); + binaryViews.set(createReferencedView(referencedBinary, 0, 0), BINARY_VIEW_SIZE); + binaryViews.set(createReferencedView(new Uint8Array(0), 0, referencedBinary.length), 2 * BINARY_VIEW_SIZE); + + const utf8Payload = new TextEncoder().encode(referencedUtf8); + const utf8Views = new Uint8Array(BINARY_VIEW_SIZE * 2); + utf8Views.set(createInlineView(new TextEncoder().encode(inlineUtf8)), 0); + utf8Views.set(createReferencedView(utf8Payload, 0, 0), BINARY_VIEW_SIZE); + + const nullBitmap = new Uint8Array([0b00000011]); + + const binaryData = makeData({ + type: new BinaryView(), + length: 3, + nullBitmap, + views: binaryViews, + variadicBuffers: [referencedBinary] + }); + + const utf8Data = makeData({ + type: new Utf8View(), + length: 2, + nullBitmap: new Uint8Array([0b00000011]), + views: utf8Views, + variadicBuffers: [utf8Payload] + }); + + it('reads BinaryView values via Vector', () => { + const vector = new Vector([binaryData]); + expect(vector.get(0)).toEqual(inlineBinary); + expect(vector.get(1)).toEqual(referencedBinary); + expect(vector.get(2)).toBeNull(); + }); + + it('reads Utf8View values via Vector', () => { + const vector = new Vector([utf8Data]); + expect(vector.get(0)).toBe(inlineUtf8); + expect(vector.get(1)).toBe(referencedUtf8); + }); + +}); diff --git a/test/unit/ipc/writer/view-json-tests.ts b/test/unit/ipc/writer/view-json-tests.ts new file mode 100644 index 00000000..f594740b --- /dev/null +++ b/test/unit/ipc/writer/view-json-tests.ts @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { + BinaryView, + Utf8View, + RecordBatchJSONWriter, + RecordBatchReader, + Table, + tableFromArrays, + vectorFromArray +} from 'apache-arrow'; + +describe('BinaryView and Utf8View JSON serialization', () => { + test('Utf8View with inline data (≤12 bytes) round-trips through JSON', async () => { + // Create test data with strings that fit inline (≤12 bytes) + const strings = ['Hello', 'World', 'Arrow', 'JS', '', 'Test123456']; + const vector = vectorFromArray(strings, new Utf8View()); + const table = new Table({ data: vector }); + + // Serialize to JSON + const writer = RecordBatchJSONWriter.writeAll(table); + const jsonString = await writer.toString(); + const json = JSON.parse(jsonString); + + // Deserialize from JSON + const result = new Table(RecordBatchReader.from(json)); + + // Verify round-trip + expect(result.numRows).toBe(table.numRows); + expect(result.getChild('data')?.toArray()).toEqual(strings); + }); + + test('Utf8View with out-of-line data (>12 bytes) round-trips through JSON', async () => { + // Create test data with strings that require external buffers (>12 bytes) + const strings = [ + 'This is a longer string', + 'Another long string value', + 'Short', + 'Yet another string that exceeds 12 bytes', + null + ]; + const vector = vectorFromArray(strings, new Utf8View()); + const table = new Table({ data: vector }); + + // Serialize to JSON + const writer = RecordBatchJSONWriter.writeAll(table); + const jsonString = await writer.toString(); + const json = JSON.parse(jsonString); + + // Verify JSON structure has VIEWS and 
VARIADIC_DATA_BUFFERS + const batch = json.batches[0]; + const column = batch.columns[0]; + expect(column.VIEWS).toBeDefined(); + expect(column.VARIADIC_DATA_BUFFERS).toBeDefined(); + + // Deserialize from JSON + const result = new Table(RecordBatchReader.from(json)); + + // Verify round-trip + expect(result.numRows).toBe(table.numRows); + expect(result.getChild('data')?.toArray()).toEqual(strings); + }); + + test('BinaryView with inline data round-trips through JSON', async () => { + // Create test data with binary values that fit inline + const binaries = [ + new Uint8Array([1, 2, 3, 4]), + new Uint8Array([5, 6, 7]), + new Uint8Array([]), + new Uint8Array([0xFF, 0xAB, 0xCD, 0xEF, 0x12, 0x34]) + ]; + const vector = vectorFromArray(binaries, new BinaryView()); + const table = new Table({ data: vector }); + + // Serialize to JSON + const writer = RecordBatchJSONWriter.writeAll(table); + const jsonString = await writer.toString(); + const json = JSON.parse(jsonString); + + // Verify JSON structure + const batch = json.batches[0]; + const column = batch.columns[0]; + expect(column.VIEWS).toBeDefined(); + expect(Array.isArray(column.VIEWS)).toBe(true); + + // Deserialize from JSON + const result = new Table(RecordBatchReader.from(json)); + + // Verify round-trip + expect(result.numRows).toBe(table.numRows); + + const resultArray = result.getChild('data')?.toArray() || []; + for (const [i, binary] of binaries.entries()) { + expect(resultArray[i]).toEqual(binary); + } + }); + + test('BinaryView with out-of-line data round-trips through JSON', async () => { + // Create test data with binary values that require external buffers (>12 bytes) + const binaries = [ + new Uint8Array(Array.from({ length: 20 }, (_, i) => i)), + new Uint8Array([1, 2, 3, 4, 5]), + new Uint8Array(Array.from({ length: 50 }, (_, i) => i * 2)), + null + ]; + const vector = vectorFromArray(binaries, new BinaryView()); + const table = new Table({ data: vector }); + + // Serialize to JSON + const writer = 
RecordBatchJSONWriter.writeAll(table); + const jsonString = await writer.toString(); + const json = JSON.parse(jsonString); + + // Verify JSON structure has VARIADIC_DATA_BUFFERS + const batch = json.batches[0]; + const column = batch.columns[0]; + expect(column.VIEWS).toBeDefined(); + expect(column.VARIADIC_DATA_BUFFERS).toBeDefined(); + expect(column.VARIADIC_DATA_BUFFERS.length).toBeGreaterThan(0); + + // Deserialize from JSON + const result = new Table(RecordBatchReader.from(json)); + + // Verify round-trip + expect(result.numRows).toBe(table.numRows); + + const resultArray = result.getChild('data')?.toArray() || []; + for (const [i, binary] of binaries.entries()) { + if (binary === null) { + expect(resultArray[i]).toBeNull(); + } else { + expect(resultArray[i]).toEqual(binary); + } + } + }); + + test('Utf8View JSON distinguishes between inline hex (BinaryView) and UTF-8 strings', async () => { + // This test verifies the bug fix: Utf8View INLINED should be UTF-8 strings, not hex + const strings = ['Hello', 'World']; + const vector = vectorFromArray(strings, new Utf8View()); + const table = new Table({ data: vector }); + + // Serialize to JSON + const writer = RecordBatchJSONWriter.writeAll(table); + const jsonString = await writer.toString(); + const json = JSON.parse(jsonString); + + // Check that INLINED values are UTF-8 strings, not hex + const views = json.batches[0].columns[0].VIEWS; + expect(views[0].INLINED).toBe('Hello'); + expect(views[1].INLINED).toBe('World'); + + // NOT hex strings like "48656C6C6F" + expect(views[0].INLINED).not.toMatch(/^[0-9A-F]+$/); + }); +});