Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
a35ea1e
WIP: add binaryview and utf8view support
GeorgeLeePatterson Oct 29, 2025
5c5640a
feat: Add support for BinaryView and Utf8View types
GeorgeLeePatterson Oct 30, 2025
675b2f2
Add Apache license headers to fix RAT check
GeorgeLeePatterson Nov 1, 2025
73bda86
Fix Jest dynamic import errors by removing moduleResolution: NodeNext…
GeorgeLeePatterson Nov 1, 2025
456f85d
chore: Trigger CI validation on fork
GeorgeLeePatterson Nov 1, 2025
dfe9d56
fix: Add new files to RAT exclusion list
GeorgeLeePatterson Nov 1, 2025
21a778f
Revert "fix: Add new files to RAT exclusion list"
GeorgeLeePatterson Nov 1, 2025
e9d180b
fix: Correct license header format in update_flatbuffers.sh
GeorgeLeePatterson Nov 1, 2025
8d5bf77
fix: Export BinaryView and Utf8View types
GeorgeLeePatterson Nov 1, 2025
41f2d3e
fix: Export BinaryView and Utf8View in Arrow.dom.ts
GeorgeLeePatterson Nov 1, 2025
7cfb4dc
Address code review feedback
GeorgeLeePatterson Nov 2, 2025
2b3396e
Add BinaryView/Utf8View builders with comprehensive tests
GeorgeLeePatterson Oct 31, 2025
a28f69f
fix: Use toHaveLength() for jest length assertions
GeorgeLeePatterson Nov 1, 2025
5b312d5
Add BinaryViewBuilder and Utf8ViewBuilder exports
GeorgeLeePatterson Nov 1, 2025
5344b8f
Simplify byteLength calculation in view builders
GeorgeLeePatterson Nov 2, 2025
0576c00
ci: Enable BinaryView integration tests in Archery
GeorgeLeePatterson Nov 4, 2025
9502316
fix: Add Apache license header to patch file
GeorgeLeePatterson Nov 4, 2025
38bbee6
fix: Add BinaryView and Utf8View support to JSON type parser
GeorgeLeePatterson Nov 4, 2025
f174463
fix: Add readVariadicBuffers method to JSONVectorLoader
GeorgeLeePatterson Nov 4, 2025
86b58d8
feat: Add JSON format support for BinaryView/Utf8View variadic buffers
GeorgeLeePatterson Nov 4, 2025
f3817f5
feat: Add JSONVectorLoader support for BinaryView/Utf8View VIEWS buffer
GeorgeLeePatterson Nov 4, 2025
c664a79
feat: Add JSONVectorAssembler support for BinaryView/Utf8View (JSON w…
GeorgeLeePatterson Nov 4, 2025
00bb3c9
Merge remote-tracking branch 'origin/feat/binary-utf8-view-builders' …
GeorgeLeePatterson Nov 4, 2025
fe417a6
fix: Complete BinaryView/Utf8View JSON format support
GeorgeLeePatterson Nov 4, 2025
4c399d0
refactor: Extract hexStringToBytes helper and improve documentation
GeorgeLeePatterson Nov 4, 2025
3101acf
fix: Prevent DataView length overflow in getBinaryViewBytes
GeorgeLeePatterson Nov 4, 2025
810975d
fix: Closure Compiler property mangling for BinaryView/Utf8View
GeorgeLeePatterson Nov 5, 2025
c956dc4
fix: Use vectorFromArray in BinaryView/Utf8View integration tests
GeorgeLeePatterson Nov 5, 2025
8a1af64
Address review feedback: revert slice() to subarray() and fix propert…
GeorgeLeePatterson Nov 5, 2025
1ceb320
Address review feedback: revert slice() to subarray() and fix propert…
GeorgeLeePatterson Nov 5, 2025
d665777
Finalize view implementation across all IPC formats.
GeorgeLeePatterson Nov 6, 2025
dfa54d7
fix: addresses script updates from PR
GeorgeLeePatterson Nov 12, 2025
fb42846
refactor utf8 view builder
GeorgeLeePatterson Nov 13, 2025
8e736d3
refactor utf8 view builder
GeorgeLeePatterson Nov 13, 2025
445cdd0
fix: uses base class in utf8view
GeorgeLeePatterson Nov 14, 2025
434f307
fix utf8 view builder override
GeorgeLeePatterson Nov 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/patches/enable-binaryview-integration-tests.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index 83913dc379..7ace28e1be 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -2003,7 +2003,6 @@ def get_generated_json_files(tempdir=None):
.skip_tester('Rust'),

generate_binary_view_case()
- .skip_tester('JS')
# TODO(https://github.com/apache/arrow-nanoarrow/issues/618)
.skip_tester('nanoarrow')
.skip_tester('Rust'),
3 changes: 3 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ jobs:
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: 3
- name: Patch Archery to enable BinaryView tests
run: |
patch -p1 < js/.github/patches/enable-binaryview-integration-tests.patch
- name: Setup Archery
run: pip install -e dev/archery[docker]
- name: Execute Docker Build
Expand Down
75 changes: 75 additions & 0 deletions scripts/update_flatbuffers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Regenerate the FlatBuffers helper files used by arrow-js.
#
# Requirements:
#   * The Arrow FlatBuffers schemas (*.fbs). By default they are read from a
#     sibling checkout of apache/arrow (../arrow/format); set FORMAT_DIR in the
#     environment to read them from a different directory.
#   * A working flatc on PATH.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Honour a caller-supplied FORMAT_DIR; otherwise fall back to the sibling checkout.
FORMAT_DIR="${FORMAT_DIR:-${PROJECT_ROOT}/../arrow/format}"

if [[ ! -d "${FORMAT_DIR}" ]]; then
  echo "error: expected FlatBuffers schemas in ${FORMAT_DIR}" >&2
  exit 1
fi

if ! command -v flatc >/dev/null 2>&1; then
  echo "error: flatc not found on PATH" >&2
  exit 1
fi

# Scratch directory for the rewritten schemas and the flatc output. It is kept
# in a dedicated variable (not TMPDIR) so the standard temp-dir environment
# variable seen by child processes is left untouched.
WORK_DIR="$(mktemp -d "${PROJECT_ROOT}/.flatc.XXXXXX")"
cleanup() {
  rm -rf "${WORK_DIR}"
}
trap cleanup EXIT

schemas=(File Schema Message Tensor SparseTensor)
schema_files=()

# Copy each schema into the scratch directory with the
# org.apache.arrow.flatbuf namespace declaration and qualifiers stripped.
for schema in "${schemas[@]}"; do
  source_schema="${FORMAT_DIR}/${schema}.fbs"
  if [[ ! -f "${source_schema}" ]]; then
    echo "error: expected schema ${source_schema} not found" >&2
    exit 1
  fi
  sed \
    -e 's/namespace org.apache.arrow.flatbuf;//g' \
    -e 's/org\.apache\.arrow\.flatbuf\.//g' \
    "${source_schema}" >"${WORK_DIR}/${schema}.fbs"
  schema_files+=("${WORK_DIR}/${schema}.fbs")
done

flatc --ts --ts-flat-files --ts-omit-entrypoint \
  -o "${WORK_DIR}" \
  "${schema_files[@]}"

# Only these generated files are installed into src/fb.
generated_files=(
  binary-view.ts
  list-view.ts
  large-list-view.ts
  message.ts
  record-batch.ts
  schema.ts
  type.ts
  utf8-view.ts
)

for file in "${generated_files[@]}"; do
  if [[ ! -f "${WORK_DIR}/${file}" ]]; then
    echo "error: expected generated file ${file} not found" >&2
    exit 1
  fi
  install -m 0644 "${WORK_DIR}/${file}" "${PROJECT_ROOT}/src/fb/${file}"
done
8 changes: 4 additions & 4 deletions src/Arrow.dom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ export {
Bool,
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8, LargeUtf8,
Binary, LargeBinary,
Utf8, LargeUtf8, Utf8View,
Binary, LargeBinary, BinaryView,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
Expand Down Expand Up @@ -81,7 +81,7 @@ export {
} from './Arrow.js';

export {
BinaryBuilder, LargeBinaryBuilder,
BinaryBuilder, BinaryViewBuilder, LargeBinaryBuilder,
BoolBuilder,
DateBuilder, DateDayBuilder, DateMillisecondBuilder,
DecimalBuilder,
Expand All @@ -99,5 +99,5 @@ export {
TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder,
TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder,
UnionBuilder, DenseUnionBuilder, SparseUnionBuilder,
Utf8Builder, LargeUtf8Builder
Utf8Builder, Utf8ViewBuilder, LargeUtf8Builder
} from './Arrow.js';
6 changes: 4 additions & 2 deletions src/Arrow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ export {
Bool,
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8, LargeUtf8,
Binary, LargeBinary,
Utf8, LargeUtf8, Utf8View,
Binary, LargeBinary, BinaryView,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
Expand Down Expand Up @@ -79,8 +79,10 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder,
export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder } from './builder/interval.js';
export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js';
export { Utf8Builder } from './builder/utf8.js';
export { Utf8ViewBuilder } from './builder/utf8view.js';
export { LargeUtf8Builder } from './builder/largeutf8.js';
export { BinaryBuilder } from './builder/binary.js';
export { BinaryViewBuilder } from './builder/binaryview.js';
export { LargeBinaryBuilder } from './builder/largebinary.js';
export { ListBuilder } from './builder/list.js';
export { FixedSizeListBuilder } from './builder/fixedsizelist.js';
Expand Down
183 changes: 183 additions & 0 deletions src/builder/binaryview.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { BinaryView, Utf8View } from '../type.js';
import { Builder, BuilderOptions } from '../builder.js';
import { BufferBuilder } from './buffer.js';
import { toUint8Array } from '../util/buffer.js';
import { makeData } from '../data.js';
import type { DataProps } from '../data.js';

/**
 * Builder for view-encoded variable-length values (`BinaryView` by default,
 * or `Utf8View` through the `TType` parameter).
 *
 * Every element owns one fixed-width view struct of `BinaryView.ELEMENT_WIDTH`
 * bytes inside `_views`. Values of at most `BinaryView.INLINE_CAPACITY` bytes
 * are stored inside the struct itself; longer values are appended to a
 * variadic data buffer and the struct records a 4-byte prefix, the buffer
 * index and the offset within that buffer.
 *
 * @ignore
 */
export class BinaryViewBuilder<
    TType extends BinaryView | Utf8View = BinaryView,
    TNull = any
> extends Builder<TType, TNull> {
    /** Backing bytes of the view structs, one struct per element. */
    protected _views: BufferBuilder<Uint8Array>;
    /** Variadic data buffers that have already been sealed. */
    protected _variadicBuffers: Uint8Array[] = [];
    /** Variadic data buffer currently receiving out-of-line values, if any. */
    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
    /** Index that `_currentBuffer` will have in the variadic buffer list. */
    protected _currentBufferIndex = 0;
    /** Number of bytes already written into `_currentBuffer`. */
    protected _currentBufferOffset = 0;
    protected readonly _bufferSize = 32 * 1024 * 1024; // 32MB per buffer as per spec recommendation

    constructor(opts: BuilderOptions<TType, TNull>) {
        super(opts);
        this._views = new BufferBuilder(Uint8Array);
    }

    /**
     * Sum of the byte lengths of the views buffer, the validity bitmap, every
     * sealed variadic buffer and the variadic buffer still being filled.
     */
    public get byteLength(): number {
        let total = 0;
        if (this._views) {
            total += this._views.byteLength;
        }
        if (this._nulls) {
            total += this._nulls.byteLength;
        }
        for (const sealed of this._variadicBuffers) {
            total += sealed.byteLength;
        }
        if (this._currentBuffer) {
            total += this._currentBuffer.byteLength;
        }
        return total;
    }

    /** Encodes `value` with `encodeValue` and stores the bytes at `index`. */
    public setValue(index: number, value: TType['TValue']) {
        const bytes = this.encodeValue(value);
        return this.writeBinaryValue(index, bytes);
    }

    /**
     * Writes the view struct for `index` and, when `data` is too long to be
     * inlined, appends `data` to the current variadic buffer.
     *
     * @param index Element position whose view struct is written.
     * @param data Raw bytes of the value.
     * @returns This builder.
     */
    protected writeBinaryValue(index: number, data: Uint8Array) {
        const byteCount = data.length;

        this._reserveViewsThrough(index);

        const views = this._views.buffer;
        const structStart = index * BinaryView.ELEMENT_WIDTH;
        const inlineStart = structStart + BinaryView.INLINE_OFFSET;
        const struct = new DataView(views.buffer, views.byteOffset + structStart, BinaryView.ELEMENT_WIDTH);

        // Every view starts with the value length (little-endian int32).
        struct.setInt32(BinaryView.LENGTH_OFFSET, byteCount, true);

        if (byteCount <= BinaryView.INLINE_CAPACITY) {
            // Short value: the bytes live inside the struct, unused tail bytes are zeroed.
            views.set(data, inlineStart);
            views.fill(0, inlineStart + byteCount, inlineStart + BinaryView.INLINE_CAPACITY);
            return this;
        }

        // Long value: the struct keeps the first four data bytes as a prefix.
        views.set(data.subarray(0, 4), inlineStart);

        let target = this._currentBuffer;
        if (!target || this._currentBufferOffset + byteCount > this._bufferSize) {
            // No buffer is open, or the value would push it past `_bufferSize`:
            // seal the open buffer (if any) and start a fresh one.
            if (target) {
                this._variadicBuffers.push(target.buffer.slice(0, this._currentBufferOffset));
            }
            target = this._currentBuffer = new BufferBuilder(Uint8Array);
            this._currentBufferIndex = this._variadicBuffers.length;
            this._currentBufferOffset = 0;
        }

        const writeOffset = this._currentBufferOffset;
        target.reserve(byteCount).buffer.set(data, writeOffset);

        // Record where the bytes were stored.
        struct.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
        struct.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, writeOffset, true);

        this._currentBufferOffset = writeOffset + byteCount;
        return this;
    }

    /** Converts `value` to a `Uint8Array` with `toUint8Array`. */
    protected encodeValue(value: TType['TValue']): Uint8Array {
        return toUint8Array(value as unknown as Uint8Array);
    }

    /**
     * Records the validity of `index` via the base class. When the base class
     * reports the slot as not valid, the slot's view struct is zero-filled.
     *
     * @returns The result of the base class `setValid`.
     */
    public setValid(index: number, isValid: boolean) {
        this._reserveViewsThrough(index);

        const valid = super.setValid(index, isValid);
        if (!valid) {
            const structStart = index * BinaryView.ELEMENT_WIDTH;
            this._views.buffer.fill(0, structStart, structStart + BinaryView.ELEMENT_WIDTH);
        }
        return valid;
    }

    /** Drops all variadic-buffer and view state, then clears the base class state. */
    public clear() {
        this._currentBuffer = null;
        this._currentBufferOffset = 0;
        this._currentBufferIndex = 0;
        this._variadicBuffers = [];
        this._views.clear();
        return super.clear();
    }

    /**
     * Seals the variadic buffer being filled, packages the views, the validity
     * bitmap (only when there are nulls) and the variadic buffers into a
     * `Data` instance, and resets this builder.
     */
    public flush() {
        const { type, length, nullCount } = this;

        const open = this._currentBuffer;
        if (open && this._currentBufferOffset > 0) {
            this._variadicBuffers.push(open.buffer.slice(0, this._currentBufferOffset));
        }

        const views = this._views.flush(length * BinaryView.ELEMENT_WIDTH);
        const nullBitmap = nullCount > 0 ? this._nulls.flush(length) : undefined;

        // Hand the sealed buffers to the result and detach them from the builder;
        // clear() resets the remaining per-batch bookkeeping.
        const variadicBuffers = this._variadicBuffers;
        this._variadicBuffers = [];
        this.clear();

        // Quoted keys are kept as in the original so the property names stay literal.
        const props = {
            type,
            length,
            nullCount,
            nullBitmap,
            ['views']: views,
            ['variadicBuffers']: variadicBuffers
        };

        return makeData<TType>(props as unknown as DataProps<TType>);
    }

    /** Marks this builder as finished. */
    public finish() {
        this.finished = true;
        return this;
    }

    /** Grows `_views` so it covers the view struct at `index`. */
    protected _reserveViewsThrough(index: number) {
        const requiredBytes = (index + 1) * BinaryView.ELEMENT_WIDTH;
        const reservedBytes = this._views.length;
        if (requiredBytes > reservedBytes) {
            this._views.reserve(requiredBytes - reservedBytes);
        }
    }
}
32 changes: 32 additions & 0 deletions src/builder/utf8view.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Utf8View } from '../type.js';
import { BuilderOptions } from '../builder.js';
import { BinaryViewBuilder } from './binaryview.js';
import { encodeUtf8 } from '../util/utf8.js';

/**
 * Builder for `Utf8View` values. It reuses the `BinaryViewBuilder` storage
 * logic and only changes how a value is turned into bytes.
 *
 * @ignore
 */
export class Utf8ViewBuilder<TNull = any> extends BinaryViewBuilder<Utf8View, TNull> {
    constructor(opts: BuilderOptions<Utf8View, TNull>) {
        super(opts);
    }

    /**
     * Encodes `value` with `encodeUtf8` and writes the resulting bytes at `index`.
     *
     * @returns This builder, as returned by `writeBinaryValue`.
     */
    public override setValue(index: number, value: Utf8View['TValue']) {
        const utf8Bytes = encodeUtf8(value);
        return this.writeBinaryValue(index, utf8Bytes);
    }
}
Loading