Skip to content
Permalink
Browse files
Implement optimized copy for Primitive/DictEncoded ArrowVector types (#21)

* Implement optimized copy for Primitive/DictEncoded ArrowVector types

Closes #19. If a Primitive array type doesn't have missing values, we
can optimize the copy by just copying the underlying storage array. For
DictEncoded, we take a dependency on PooledArrays to convert the compact
DictEncoded form to the equally compact PooledArray form. The other
arrow array types are left as-is, since the default copy method is
good/performant enough.
  • Loading branch information
quinnj committed Oct 3, 2020
1 parent 035ac29 commit ac39f5cbe98f8bdd8faa77a3267ea4c10164019c
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 3 deletions.
@@ -7,12 +7,14 @@ version = "1.0.0"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
julia = "1"
DataAPI = "1"
PooledArrays = "0.5"
Tables = "1" # should be 1.1 for Tables.partitions
SentinelArrays = "1"

@@ -2,7 +2,7 @@ module Arrow

using Mmap
import Dates
using DataAPI, Tables, SentinelArrays
using DataAPI, Tables, SentinelArrays, PooledArrays

using Base: @propagate_inbounds
import Base: ==
@@ -1,6 +1,7 @@
# Abstract supertype for the Arrow array types; every ArrowVector{T} is an AbstractVector{T}.
abstract type ArrowVector{T} <: AbstractVector{T} end

# All ArrowVector subtypes are indexed with a single linear index.
Base.IndexStyle(::Type{A}) where {A <: ArrowVector} = Base.IndexLinear()
# `similar` on an ArrowVector type allocates an uninitialized plain Vector{T},
# not another ArrowVector.
Base.similar(::Type{A}, dims::Dims) where {T, A <: ArrowVector{T}} = Vector{T}(undef, dims)

struct ValidityBitmap <: ArrowVector{Bool}
bytes::Vector{UInt8} # arrow memory blob
@@ -40,6 +41,19 @@ end

# A Primitive is one-dimensional; its length is read from the `ℓ` field.
Base.size(p::Primitive) = (p.ℓ,)

# Materialize a Primitive as a plain Vector.
#
# When the element type `T` is identical to the storage type `S`, the backing
# `p.data` array is copied directly. Otherwise a `Vector{T}` of length `p.ℓ` is
# filled element by element, writing `missing` wherever `p.validity` is false.
#
# Returns the newly allocated vector in both cases.
function Base.copy(p::Primitive{T, S}) where {T, S}
    # Fast path: no per-element validity lookup is needed.
    T === S && return copy(p.data)
    out = Vector{T}(undef, p.ℓ)
    valid = p.validity
    data = p.data
    @inbounds for i in 1:p.ℓ
        out[i] = ifelse(valid[i], data[i], missing)
    end
    # Explicit return is required: a trailing `for` loop evaluates to `nothing`.
    return out
end

@propagate_inbounds function Base.getindex(p::Primitive{T, S}, i::Integer) where {T, S}
@boundscheck checkbounds(p, i)
if T !== S
@@ -204,3 +218,21 @@ Base.size(d::DictEncoded) = size(d.indices)
@inbounds idx = d.indices[i]
return d.encoding[idx + 1]
end

# Convert a DictEncoded array into a PooledArray.
#
# The pool is a copy of `x.encoding.data`; the refs are `x.indices` shifted by
# `one(S)` (the element lookup for DictEncoded uses `idx + 1`, so the stored
# indices are zero-based while the pool positions produced here are one-based).
# The inverse pool maps each pool value to its position from `enumerate(pool)`.
function Base.copy(x::DictEncoded{T, S}) where {T, S}
    pool = copy(x.encoding.data)
    validity = x.validity
    indices = x.indices
    n = length(indices)
    if T === S
        # Same element and index type: shift a copy of the indices in place.
        refs = copy(indices)
        @inbounds for k = 1:n
            refs[k] += one(S)
        end
    else
        refs = Vector{S}(undef, n)
        # NOTE(review): `refs` has element type `S`; storing `missing` here only
        # succeeds if `S` can hold `Missing` — confirm against DictEncoded's definition.
        @inbounds for k = 1:n
            refs[k] = ifelse(validity[k], indices[k] + one(S), missing)
        end
    end
    invpool = Dict{T, S}(value => position for (position, value) in enumerate(pool))
    return PooledArray(PooledArrays.RefArray(refs), invpool, pool)
end
@@ -31,7 +31,7 @@ schema(t::Table) = getfield(t, :schema)

# Tables.jl interface for Table: it is declared a table with column access.
Tables.istable(::Table) = true
Tables.columnaccess(::Table) = true
# NOTE(review): the next two lines define the same method, so the second
# definition replaces the first. This looks like the before/after pair of a
# diff — confirm which one is meant to remain.
Tables.columns(t::Table) = t
Tables.columns(t::Table) = Tables.CopiedColumns(t)
# The schema is built from the table's names and types.
Tables.schema(t::Table) = Tables.Schema(names(t), types(t))
Tables.columnnames(t::Table) = names(t)
# Integer column lookup indexes into `columns(t)`.
Tables.getcolumn(t::Table, i::Int) = columns(t)[i]
@@ -20,6 +20,8 @@ tt = Arrow.Table(io)
# The table read back should match the input in length, values and element type.
@test length(tt) == length(t)
@test tt.col1 == t.col1
@test eltype(tt.col1) === Int64
# Copying the column is expected to produce a plain Vector{Int64}.
col1 = copy(tt.col1)
@test typeof(col1) == Vector{Int64}

# missing values
t = (col1=Union{Int64, Missing}[1,2,3,4,5,6,7,8,9,missing],)
@@ -99,14 +101,16 @@ tt = Arrow.Table(io)

# dict encodings
t = (
# NOTE(review): `col1` is given twice below; this looks like the before/after
# pair of a diff (untyped literal vs. explicit Int64[]) — confirm which remains.
col1=Arrow.DictEncode([4, 5, 6]),
col1=Arrow.DictEncode(Int64[4, 5, 6]),
)
# Write the table to an in-memory buffer and read it back.
io = IOBuffer()
Arrow.write(io, t; debug=true)
seekstart(io)
tt = Arrow.Table(io; debug=true)
@test length(tt) == length(t)
@test all(isequal.(values(t), values(tt)))
# Copying the dict-encoded column is expected to yield a PooledVector with Int8 refs.
col1 = copy(tt.col1)
@test typeof(col1) == PooledVector{Int64, Int8, Vector{Int8}}

t = (
col1=Arrow.DictEncode(NamedTuple{(:a, :b), Tuple{Int64, Union{String, Missing}}}[(a=Int64(1), b=missing), (a=Int64(1), b=missing), (a=Int64(3), b="sailor"), (a=Int64(4), b="jo-bob")]),

0 comments on commit ac39f5c

Please sign in to comment.