Skip to content
Permalink
Browse files
Allow specifying custom alignment for buffer writing padding (#35)
* Allow specifying custom alignment for buffer writing padding

Implements #31. Pretty easy, just have to thread the new `alignment`
keyword down everywhere.

* Update docs and add test
  • Loading branch information
quinnj committed Oct 6, 2020
1 parent 315d39a commit 8b6bcf692f3f21107ac8798af693fa7a4f64b5c3
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 113 deletions.
@@ -122,6 +122,7 @@ record batches simultaneously (e.g. if julia is started with `julia -t 8`).

Supported keyword arguments to `Arrow.write` include:
* `compress`: possible values include `:lz4`, `:zstd`, or your own initialized `LZ4FrameCompressor` or `ZstdCompressor` objects; will cause all buffers in each record batch to use the respective compression encoding
* `alignment::Int=8`: specify the number of bytes to align buffers to when written in messages; strongly recommended to only use alignment values of 8 or 64 for modern memory cache line optimization
* `dictencode::Bool=false`: whether all columns should use dictionary encoding when being written
* `dictencodenested::Bool=false`: whether nested data type columns should also dict encode nested arrays/buffers; many other implementations don't support this
* `denseunions::Bool=true`: whether Julia `Vector{<:Union}` arrays should be written using the dense union layout; passing `false` will result in the sparse union layout
@@ -56,7 +56,6 @@ macro debug(level, msg)
end)
end

const ALIGNMENT = 8
const FILE_FORMAT_MAGIC_BYTES = b"ARROW1"
const CONTINUATION_INDICATOR_BYTES = 0xffffffff

@@ -34,7 +34,7 @@ nullcount(x::ArrowVector) = validitybitmap(x).nc
getmetadata(x::ArrowVector) = x.metadata

function toarrowvector(x, de=DictEncoding[], meta=getmetadata(x); compression::Union{Nothing, LZ4FrameCompressor, ZstdCompressor}=nothing, kw...)
@debug 2 "converting top-level column to arrow format: col = $(typeof(x)), compression = $compression, kw = $kw"
@debug 2 "converting top-level column to arrow format: col = $(typeof(x)), compression = $compression, kw = $(kw.data)"
@debug 3 x
A = arrowvector(x, de, meta; compression=compression, kw...)
if compression isa LZ4FrameCompressor
@@ -88,7 +88,7 @@ struct ValidityBitmap <: ArrowVector{Bool}
end

compress(Z::Meta.CompressionType, comp, v::ValidityBitmap) =
v.nc == 0 ? CompressedBuffer(UInt8[], 0) : compress(Z, comp, view(v.bytes, v.pos:(v.pos + bitpackedbytes(v.ℓ) - 1)))
v.nc == 0 ? CompressedBuffer(UInt8[], 0) : compress(Z, comp, view(v.bytes, v.pos:(v.pos + cld(v.ℓ, 8) - 1)))

Base.size(p::ValidityBitmap) = (p.ℓ,)
nullcount(x::ValidityBitmap) = x.nc
@@ -99,7 +99,7 @@ end

function ValidityBitmap(x)
len = length(x)
blen = bitpackedbytes(len)
blen = cld(len, 8)
bytes = Vector{UInt8}(undef, blen)
st = iterate(x)
i = 0
@@ -263,6 +263,7 @@ struct Batch
end

function Base.iterate(x::BatchIterator, (pos, id)=(x.startpos, 0))
@debug 1 "checking for next arrow message: pos = $pos"
if pos + 3 > length(x.bytes)
@debug 1 "not enough bytes left for another batch message"
return nothing
@@ -285,7 +286,7 @@ function Base.iterate(x::BatchIterator, (pos, id)=(x.startpos, 0))
msg = FlatBuffers.getrootas(Meta.Message, x.bytes, pos-1)
pos += msglen
# pos now points to message body
@debug 1 "parsing message: msglen = $msglen, bodyLength = $(msg.bodyLength)"
@debug 1 "parsing message: pos = $pos, msglen = $msglen, bodyLength = $(msg.bodyLength)"
return Batch(msg, x.bytes, pos, id), (pos + msg.bodyLength, id + 1)
end

@@ -5,9 +5,9 @@
Determines the total number of bytes needed to store `n` bytes with padding.
Note that the Arrow standard requires buffers to be aligned to 8-byte boundaries.
"""
padding(n::Integer) = ((n + ALIGNMENT - 1) ÷ ALIGNMENT) * ALIGNMENT
padding(n::Integer, alignment) = ((n + alignment - 1) ÷ alignment) * alignment

paddinglength(n::Integer) = padding(n) - n
paddinglength(n::Integer, alignment) = padding(n, alignment) - n

function writezeros(io::IO, n::Integer)
s = 0
@@ -61,13 +61,13 @@ function setbit(v::UInt8, b::Bool, n::Integer)
end

"""
bitpackedbytes(n[, pad=true])
bitpackedbytes(n)
Determines the number of bytes used by `n` bits, including the padding needed to reach the buffer alignment.
"""
function bitpackedbytes(n::Integer)
function bitpackedbytes(n::Integer, alignment)
ℓ = cld(n, 8)
return ℓ + paddinglength(ℓ)
return ℓ + paddinglength(ℓ, alignment)
end

# count # of missing elements in an iterable

0 comments on commit 8b6bcf6

Please sign in to comment.