Skip to content

Commit

Permalink
Introduce BoolVector for Bool column types (#40)
Browse files Browse the repository at this point in the history
Fixes #38. From back in the original feather days, I remember that bool
columns were always bitpacked. Unfortunately, the arrow spec doesn't
really point this bitpacking out very obviously (it's mentioned in
passing as a possibility). This PR introduces a new BoolVector type and
corresponding `ArrowTypes.BoolType` that ensures Bool columns will be
written bitpacked, and read similarly.
  • Loading branch information
quinnj committed Oct 14, 2020
1 parent a53da47 commit 8ba8002
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/arraytypes/arraytypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ end

include("compressed.jl")
include("primitive.jl")
include("bool.jl")
include("list.jl")
include("fixedsizelist.jl")
include("map.jl")
Expand Down
104 changes: 104 additions & 0 deletions src/arraytypes/bool.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Arrow array type for Bool columns, whose values are stored bit-packed
# (eight elements per byte) inside a byte buffer.
#
# Fields:
#   arrow    - byte buffer holding the bit-packed values
#   pos      - index into `arrow` of the byte holding the first element
#   validity - validity bitmap consulted for missing elements
#   ℓ        - number of elements in the vector
#   metadata - optional custom metadata attached to the column
struct BoolVector{T} <: ArrowVector{T}
    arrow::Vector{UInt8} # need to hold a reference to arrow memory blob
    pos::Int
    validity::ValidityBitmap
    ℓ::Int64
    metadata::Union{Nothing, Dict{String, String}}
end

# A BoolVector is one-dimensional; its single extent is the element count `ℓ`.
function Base.size(p::BoolVector)
    return (p.ℓ,)
end

# Read element `i` of a bit-packed Bool vector.
# Returns `missing` when the element type admits `Missing` and the validity
# bitmap marks position `i` as invalid; otherwise returns the result of
# `getbit` on the byte that holds element `i`.
@propagate_inbounds function Base.getindex(p::BoolVector{T}, i::Integer) where {T}
    @boundscheck checkbounds(p, i)
    if Missing <: T
        @inbounds valid = p.validity[i]
        valid || return missing
    end
    # 1-based byte number and 1-based bit position within that byte
    byteidx, bitidx = fldmod1(i, 8)
    @inbounds packed = p.arrow[p.pos + byteidx - 1]
    return getbit(packed, bitidx)
end

# Store `v` (converted to Bool) at element `i` by rewriting the single bit
# that represents it in the underlying byte buffer.
# Only the data bytes are written here; the validity bitmap is not modified.
# Returns `v` as passed in (not the converted value).
@propagate_inbounds function Base.setindex!(p::BoolVector, v, i::Integer)
    @boundscheck checkbounds(p, i)
    flag = convert(Bool, v)
    # 1-based byte number and 1-based bit position within that byte
    byteidx, bitidx = fldmod1(i, 8)
    slot = p.pos + byteidx - 1
    @inbounds current = p.arrow[slot]
    @inbounds p.arrow[slot] = setbit(current, flag, bitidx)
    return v
end

# Convert an iterable `x` of Bool (optionally missing) values into a
# `BoolVector` whose values are packed eight per byte.
# `x` must support `length` and the `iterate` protocol; the validity bitmap is
# built separately via `ValidityBitmap(x)`. Missing elements are packed as
# `false`. The resulting vector owns a fresh byte buffer starting at index 1.
function arrowvector(::BoolType, x, de, meta; kw...)
    validity = ValidityBitmap(x)
    len = length(x)
    nbytes = cld(len, 8)
    bytes = Vector{UInt8}(undef, nbytes)
    cursor = iterate(x)
    for k in 1:nbytes
        # the final byte may carry fewer than 8 elements
        nbits = min(8, len - 8 * (k - 1))
        packed = 0x00
        for bit in 1:nbits
            elem, state = cursor
            # a bit is set only for a non-missing `true` element
            isset = !(elem === missing || !elem)
            packed = setbit(packed, isset, bit)
            cursor = iterate(x, state)
        end
        @inbounds bytes[k] = packed
    end
    return BoolVector{eltype(x)}(bytes, 1, validity, len, meta)
end

# Compress a BoolVector column: the validity bitmap and the bit-packed data
# bytes are compressed independently and bundled, in that order, into a
# `Compressed` wrapper together with the column's length and null count.
function compress(Z::Meta.CompressionType, comp, p::P) where {P <: BoolVector}
    nelems = length(p)
    nulls = nullcount(p)
    packedvalidity = compress(Z, comp, p.validity)
    # only the bytes that actually hold this vector's elements
    lastbyte = p.pos + cld(p.ℓ, 8) - 1
    rawdata = view(p.arrow, p.pos:lastbyte)
    packeddata = compress(Z, comp, rawdata)
    return Compressed{Z, P}(p, [packedvalidity, packeddata], nelems, nulls, Compressed[])
end

# Register the field node and the two buffers (validity bitmap, then
# bit-packed data) for a BoolVector column.
# Pushes one `FieldNode` onto `fieldnodes` and two `Buffer`s onto
# `fieldbuffers`, starting at `bufferoffset`. The validity buffer is given
# length 0 when the column has no nulls. Returns the offset just past the
# data buffer, for the next column to use.
function makenodesbuffers!(col::BoolVector, fieldnodes, fieldbuffers, bufferoffset, alignment)
    nelems = length(col)
    nulls = nullcount(col)
    push!(fieldnodes, FieldNode(nelems, nulls))
    @debug 1 "made field node: nodeidx = $(length(fieldnodes)), col = $(typeof(col)), len = $(fieldnodes[end].length), nc = $(fieldnodes[end].null_count)"
    # validity bitmap buffer
    validitybytes = nulls == 0 ? 0 : bitpackedbytes(nelems, alignment)
    push!(fieldbuffers, Buffer(bufferoffset, validitybytes))
    @debug 1 "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
    # bit-packed data buffer follows directly after the validity buffer
    dataoffset = bufferoffset + validitybytes
    databytes = bitpackedbytes(nelems, alignment)
    push!(fieldbuffers, Buffer(dataoffset, databytes))
    @debug 1 "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
    return dataoffset + databytes
end

# Write a BoolVector column to `io`: first its validity bitmap (via
# `writebitmap`), then the bit-packed data bytes followed by zero padding up
# to `alignment`.
# Returns the number of data bytes written plus the result of `writezeros`;
# the result of `writebitmap` is not included.
function writebuffer(io, col::BoolVector, alignment)
    @debug 1 "writebuffer: col = $(typeof(col))"
    @debug 2 col
    writebitmap(io, col, alignment)
    # only the bytes that actually hold this vector's elements
    firstbyte = col.pos
    lastbyte = firstbyte + cld(col.ℓ, 8) - 1
    written = Base.write(io, view(col.arrow, firstbyte:lastbyte))
    padded = writezeros(io, paddinglength(written, alignment))
    return written + padded
end
6 changes: 4 additions & 2 deletions src/arrowtypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

module ArrowTypes

export ArrowType, NullType, PrimitiveType, ListType, FixedSizeListType, MapType, StructType, UnionType, DictEncodedType
export ArrowType, NullType, PrimitiveType, BoolType, ListType, FixedSizeListType, MapType, StructType, UnionType, DictEncodedType

abstract type ArrowType end

Expand All @@ -37,7 +37,9 @@ struct PrimitiveType <: ArrowType end

ArrowType(::Type{<:Integer}) = PrimitiveType()
ArrowType(::Type{<:AbstractFloat}) = PrimitiveType()
ArrowType(::Type{Bool}) = PrimitiveType()

struct BoolType <: ArrowType end
ArrowType(::Type{Bool}) = BoolType()

struct ListType <: ArrowType end

Expand Down
28 changes: 28 additions & 0 deletions src/table.jl
Original file line number Diff line number Diff line change
Expand Up @@ -517,3 +517,31 @@ function build(f::Meta.Field, ::L, batch, rb, de, nodeidx, bufferidx, convert) w
@debug 2 "final julia type for primitive: T = $T"
return Primitive(T, bytes, validity, A, len, meta), nodeidx + 1, bufferidx + 1
end

# Build a `BoolVector` for a Bool field from a record batch.
# Reads the validity bitmap at `bufferidx`, then the bit-packed data buffer at
# `bufferidx + 1`. With no compression the vector points directly into
# `batch.bytes` at the buffer's offset; otherwise the buffer is uncompressed
# into a fresh byte vector that is read from index 1.
# Returns `(vector, nodeidx + 1, bufferidx + 1)`, where the returned
# `bufferidx` is one past the data buffer.
function build(f::Meta.Field, L::Meta.Bool, batch, rb, de, nodeidx, bufferidx, convert)
    @debug 2 "building array: L = $L"
    validity = buildbitmap(batch, rb, nodeidx, bufferidx)
    # the data buffer follows the validity buffer
    bufferidx += 1
    buffer = rb.buffers[bufferidx]
    meta = buildmetadata(f.custom_metadata)
    # get storage type (non-converted); only reported in the debug log
    T = juliaeltype(f, nothing, false)
    @debug 2 "storage type for primitive: T = $T"
    voff = batch.pos + buffer.offset
    node = rb.nodes[nodeidx]
    if rb.compression === nothing
        decodedbytes = batch.bytes
        pos = voff
    else
        # compressed
        ptr = pointer(batch.bytes, voff)
        _, decodedbytes = uncompress(ptr, buffer, rb.compression)
        pos = 1
    end
    len = node.length
    # final element type, honoring metadata and the `convert` flag
    T = juliaeltype(f, meta, convert)
    return BoolVector{T}(decodedbytes, pos, validity, len, meta), nodeidx + 1, bufferidx + 1
end

0 comments on commit 8ba8002

Please sign in to comment.