Skip to content
Permalink
Browse files
Rework dict encoding of PooledArray/CategoricalArray to fix outstandi… (
#119)

* Rework dict encoding of PooledArray/CategoricalArray to fix outstanding issues

Fixes #117, #116, and #113. For #116, we just need to special case if user happens to pass in a DictEncoded themselves. We need to pass it through to the `toarrowvector` method that no-ops. For #113, we require the new functionality in PooledArrays that allows passing the `signed` and `compress` keyword arguments to ensure we get signed refs for our dict encoding. For #117, we add CategoricalArrays as a test dependency and ensure that if it contains any `missing` value, we *don't* recode the indices values down by 1, since the `missing` ref is 0, so other refs can already be considered "offsets". If there are no `missing`, then we still need to recode down since refs should always start from 0 in arrow format.

* PooledArrays 1.0 compat

* Update src/arraytypes/dictencoding.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>

* Check refpool

* Fix test

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
  • Loading branch information
quinnj and nalimilan committed Jan 31, 2021
1 parent da73d82 commit 62266b5919f6bc5b543f9cfed64d7c1bf4818558
Showing 5 changed files with 46 additions and 12 deletions.
@@ -5,3 +5,4 @@ Manifest.toml
*.jl.mem

test/_scrap.jl
.DS_STORE
@@ -21,17 +21,18 @@ BitIntegers = "0.2"
CodecLz4 = "0.4"
CodecZstd = "0.7"
DataAPI = "1"
PooledArrays = "0.5"
PooledArrays = "0.5, 1.0"
SentinelArrays = "1"
Tables = "1.1"
TimeZones = "1"
julia = "1.3"

[extras]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Random", "JSON3", "StructTypes"]
test = ["Test", "Random", "JSON3", "StructTypes", "CategoricalArrays"]
@@ -51,6 +51,8 @@ end
function arrowvector(x, i, nl, fi, de, ded, meta; dictencoding::Bool=false, dictencode::Bool=false, kw...)
if !(x isa DictEncode) && !dictencoding && (dictencode || (x isa AbstractArray && DataAPI.refarray(x) !== x))
x = DictEncode(x, dictencodeid(i, nl, fi))
elseif x isa DictEncoded
return arrowvector(DictEncodeType, x, i, nl, fi, de, ded, meta; dictencode=dictencode, kw...)
end
S = maybemissing(eltype(x))
return arrowvector(S, x, i, nl, fi, de, ded, meta; dictencode=dictencode, kw...)
@@ -94,6 +94,7 @@ signedtype(::Type{UInt8}) = Int8
signedtype(::Type{UInt16}) = Int16
signedtype(::Type{UInt32}) = Int32
signedtype(::Type{UInt64}) = Int64
signedtype(::Type{T}) where {T <: Signed} = T

indtype(d::DictEncoded{T, S, A}) where {T, S, A} = S
indtype(c::Compressed{Z, A}) where {Z, A <: DictEncoded} = indtype(c.data)
@@ -113,21 +114,31 @@ function arrowvector(::DictEncodedType, x, i, nl, fi, de, ded, meta; dictencode:
validity = ValidityBitmap(x)
if !haskey(de, id)
# dict encoding doesn't exist yet, so create for 1st time
if DataAPI.refarray(x) === x
if DataAPI.refarray(x) === x || DataAPI.refpool(x) === nothing
# need to encode ourselves
x = PooledArray(x, encodingtype(length(x)))
x = PooledArray(x; signed=true, compress=true)
inds = DataAPI.refarray(x)
pool = DataAPI.refpool(x)
else
inds = copy(DataAPI.refarray(x))
pool = DataAPI.refpool(x)
refa = DataAPI.refarray(x)
inds = copyto!(similar(Vector{signedtype(eltype(refa))}, length(refa)), refa)
end
# adjust to "offset" instead of index
for i = 1:length(inds)
@inbounds inds[i] -= 1
end
pool = DataAPI.refpool(x)
# horrible hack? yes. better than taking CategoricalArrays dependency? also yes.
if typeof(pool).name.name == :CategoricalRefPool
pool = [get(pool[i]) for i = 1:length(pool)]
if eltype(x) >: Missing
pool = vcat(missing, DataAPI.levels(x))
else
pool = DataAPI.levels(x)
for i = 1:length(inds)
@inbounds inds[i] -= 1
end
end
else
# adjust to "offset" instead of index
for i = 1:length(inds)
@inbounds inds[i] -= 1
end
end
data = arrowvector(pool, i, nl, fi, de, ded, nothing; dictencode=dictencodenested, dictencodenested=dictencodenested, dictencoding=true, kw...)
encoding = DictEncoding{eltype(data), typeof(data)}(id, data, false, getmetadata(data))
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

using Test, Arrow, Tables, Dates, PooledArrays, TimeZones, UUIDs
using Test, Arrow, Tables, Dates, PooledArrays, TimeZones, UUIDs, CategoricalArrays

include(joinpath(dirname(pathof(Arrow)), "../test/testtables.jl"))
include(joinpath(dirname(pathof(Arrow)), "../test/integrationtest.jl"))
@@ -216,6 +216,25 @@ x = PooledArray(["hey", missing])
x2 = Arrow.toarrowvector(x)
@test isequal(copy(x2), x)

# some dict encoding coverage

# signed indices for DictEncodedType #112 #113 #114
av = Arrow.toarrowvector(PooledArray(repeat(["a", "b"], inner = 5)))
@test isa(first(av.indices), Signed)

av = Arrow.toarrowvector(CategoricalArray(repeat(["a", "b"], inner = 5)))
@test isa(first(av.indices), Signed)

av = Arrow.toarrowvector(CategoricalArray(["a", "bb", missing]))
@test isa(first(av.indices), Signed)
@test length(av) == 3
@test eltype(av) == Union{String, Missing}

av = Arrow.toarrowvector(CategoricalArray(["a", "bb", "ccc"]))
@test isa(first(av.indices), Signed)
@test length(av) == 3
@test eltype(av) == String

end # @testset "misc"

end

0 comments on commit 62266b5

Please sign in to comment.