From 87aaf13f1538224df46adddf2e07119bd21042d2 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 1 Sep 2022 11:17:12 -0400 Subject: [PATCH 1/2] ARROW-17588: [Go] Implement Binary-like casting --- go/arrow/array/string.go | 16 +- go/arrow/compute/cast.go | 22 + go/arrow/compute/cast_test.go | 44 +- go/arrow/compute/datum.go | 1 - go/arrow/compute/internal/exec/utils.go | 4 + go/arrow/compute/internal/kernels/helpers.go | 4 + .../compute/internal/kernels/string_casts.go | 407 ++++++++++++++++++ go/arrow/datatype.go | 1 + go/arrow/datatype_binary.go | 4 + 9 files changed, 484 insertions(+), 19 deletions(-) create mode 100644 go/arrow/compute/internal/kernels/string_casts.go diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 5cb73a79b5dbf..2f5fb739c9554 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -482,9 +482,17 @@ func (b *LargeStringBuilder) UnmarshalJSON(data []byte) error { return b.unmarshal(dec) } +type StringLikeBuilder interface { + Builder + Append(string) + ReserveData(int) +} + var ( - _ arrow.Array = (*String)(nil) - _ arrow.Array = (*LargeString)(nil) - _ Builder = (*StringBuilder)(nil) - _ Builder = (*LargeStringBuilder)(nil) + _ arrow.Array = (*String)(nil) + _ arrow.Array = (*LargeString)(nil) + _ Builder = (*StringBuilder)(nil) + _ Builder = (*LargeStringBuilder)(nil) + _ StringLikeBuilder = (*StringBuilder)(nil) + _ StringLikeBuilder = (*LargeStringBuilder)(nil) ) diff --git a/go/arrow/compute/cast.go b/go/arrow/compute/cast.go index f8e6e27b9ab59..2480cbfb3583b 100644 --- a/go/arrow/compute/cast.go +++ b/go/arrow/compute/cast.go @@ -163,6 +163,7 @@ func initCastTable() { castTable = make(map[arrow.Type]*castFunction) addCastFuncs(getBooleanCasts()) addCastFuncs(getNumericCasts()) + addCastFuncs(getBinaryLikeCasts()) addCastFuncs(getTemporalCasts()) } @@ -273,6 +274,27 @@ func getNumericCasts() []*castFunction { return out } +func getBinaryLikeCasts() []*castFunction { + out := make([]*castFunction, 0) + + addFn := func(name string, ty arrow.Type, kns []exec.ScalarKernel) { + fn := newCastFunction(name, ty) + for _, k := range kns { + if err := fn.AddTypeCast(k.Signature.InputTypes[0].MatchID(), k); err != nil { + panic(err) + } + } + out = append(out, fn) + } + + addFn("cast_binary", arrow.BINARY, kernels.GetToBinaryKernels(arrow.BinaryTypes.Binary)) + addFn("cast_large_binary", arrow.LARGE_BINARY, kernels.GetToBinaryKernels(arrow.BinaryTypes.LargeBinary)) + addFn("cast_string", arrow.STRING, kernels.GetToBinaryKernels(arrow.BinaryTypes.String)) + addFn("cast_large_string", arrow.LARGE_STRING, kernels.GetToBinaryKernels(arrow.BinaryTypes.LargeString)) + addFn("cast_fixed_sized_binary", arrow.FIXED_SIZE_BINARY, kernels.GetFsbCastKernels()) + return out +} + // CastDatum is a convenience function for casting a Datum to another type. // It is equivalent to calling CallFunction(ctx, "cast", opts, Datum) and // should work for Scalar, Array or ChunkedArray Datums. diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go index a4b4338f1a889..a4a20e268eac7 100644 --- a/go/arrow/compute/cast_test.go +++ b/go/arrow/compute/cast_test.go @@ -299,7 +299,7 @@ func (c *CastSuite) TestCanCast() { canCast(arrow.Null, []arrow.DataType{arrow.FixedWidthTypes.Boolean}) canCast(arrow.Null, numericTypes) - cannotCast(arrow.Null, baseBinaryTypes) + canCast(arrow.Null, baseBinaryTypes) canCast(arrow.Null, []arrow.DataType{ arrow.FixedWidthTypes.Date32, arrow.FixedWidthTypes.Date64, arrow.FixedWidthTypes.Time32ms, arrow.FixedWidthTypes.Timestamp_s, }) @@ -307,7 +307,7 @@ func (c *CastSuite) TestCanCast() { canCast(arrow.FixedWidthTypes.Boolean, []arrow.DataType{arrow.FixedWidthTypes.Boolean}) canCast(arrow.FixedWidthTypes.Boolean, numericTypes) - // canCast(arrow.FixedWidthTypes.Boolean, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Boolean, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) // canCast(&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrow.FixedWidthTypes.Boolean}, []arrow.DataType{arrow.FixedWidthTypes.Boolean}) cannotCast(arrow.FixedWidthTypes.Boolean, []arrow.DataType{arrow.Null}) @@ -318,16 +318,16 @@ func (c *CastSuite) TestCanCast() { for _, from := range numericTypes { canCast(from, []arrow.DataType{arrow.FixedWidthTypes.Boolean}) canCast(from, numericTypes) - // canCast(from, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(from, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) // canCast(&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: from}, []arrow.DataType{from}) cannotCast(from, []arrow.DataType{arrow.Null}) } for _, from := range baseBinaryTypes { - // canCast(from, []arrow.DataType{arrow.FixedWidthTypes.Boolean}) + canCast(from, []arrow.DataType{arrow.FixedWidthTypes.Boolean}) // canCast(from, numericTypes) - // canCast(from, baseBinaryTypes) + canCast(from, baseBinaryTypes) // canCast(&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int64, ValueType: from}, []arrow.DataType{from}) // any cast which is valid for the dictionary is valid for the dictionary array @@ -342,9 +342,9 @@ func (c *CastSuite) TestCanCast() { // no formatting supported cannotCast(arrow.FixedWidthTypes.Timestamp_us, []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary}) - // canCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, []arrow.DataType{ - // arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary, arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString, - // &arrow.FixedSizeBinaryType{ByteWidth: 3}}) + canCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, []arrow.DataType{ + arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary, arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString, + &arrow.FixedSizeBinaryType{ByteWidth: 3}}) arrow.RegisterExtensionType(types.NewSmallintType()) defer arrow.UnregisterExtensionType("smallint") @@ -352,12 +352,12 @@ func (c *CastSuite) TestCanCast() { // canCast(types.NewSmallintType(), numericTypes) // any cast which is valid for storage is supported // canCast(arrow.Null, []arrow.DataType{types.NewSmallintType()}) - // canCast(arrow.FixedWidthTypes.Date32, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) - // canCast(arrow.FixedWidthTypes.Date64, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) - // canCast(arrow.FixedWidthTypes.Timestamp_ns, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) - // canCast(arrow.FixedWidthTypes.Timestamp_us, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) - // canCast(arrow.FixedWidthTypes.Time32ms, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) - // canCast(arrow.FixedWidthTypes.Time64ns, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Date32, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Date64, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Timestamp_ns, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Timestamp_us, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Time32ms, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) + canCast(arrow.FixedWidthTypes.Time64ns, []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString}) } func (c *CastSuite) checkCastFails(dt arrow.DataType, input string, opts *compute.CastOptions) { @@ -1220,6 +1220,22 @@ func (c *CastSuite) TestDecimalToFloating() { } } +func (c *CastSuite) TestDateToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.checkCast(arrow.FixedWidthTypes.Date32, stype, + `[0, null]`, `["1970-01-01", null]`) + c.checkCast(arrow.FixedWidthTypes.Date64, stype, + `[86400000, null]`, `["1970-01-02", null]`) + } +} + +func (c *CastSuite) TestTimeToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.checkCast(arrow.FixedWidthTypes.Time32s, stype, `[1, 62]`, `["00:00:01", "00:01:02"]`) + c.checkCast(arrow.FixedWidthTypes.Time64ns, stype, `[0, 1]`, `["00:00:00.000000000", "00:00:00.000000001"]`) + } +} + func (c *CastSuite) TestStringToInt() { for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { for _, dt := range signedIntTypes { diff --git a/go/arrow/compute/datum.go b/go/arrow/compute/datum.go index b5b88613b4447..005126ecad44e 100644 --- a/go/arrow/compute/datum.go +++ b/go/arrow/compute/datum.go @@ -125,7 +125,6 @@ func (d *ScalarDatum) Release() { if !d.Value.IsValid() { return } - if v, ok := d.Value.(releasable); ok { v.Release() } diff --git a/go/arrow/compute/internal/exec/utils.go b/go/arrow/compute/internal/exec/utils.go index da115f31fb24e..cbba2a8604479 100644 --- a/go/arrow/compute/internal/exec/utils.go +++ b/go/arrow/compute/internal/exec/utils.go @@ -95,6 +95,10 @@ func GetSpanOffsets[T int32 | int64](span *ArraySpan, i int) []T { return ret[span.Offset:] } +func GetBytes[T constraints.Integer](in []T) []byte { + return unsafe.Slice((*byte)(unsafe.Pointer(&in[0])), len(in)*int(unsafe.Sizeof(T(0)))) +} + func Min[T constraints.Ordered](a, b T) T { if a < b { return a diff --git a/go/arrow/compute/internal/kernels/helpers.go b/go/arrow/compute/internal/kernels/helpers.go index dbfbeef5f4124..b09a9f44089b9 100644 --- a/go/arrow/compute/internal/kernels/helpers.go +++ b/go/arrow/compute/internal/kernels/helpers.go @@ -440,3 +440,7 @@ func ResolveOutputFromOptions(ctx *exec.KernelCtx, _ []arrow.DataType) (arrow.Da } var OutputTargetType = exec.NewComputedOutputType(ResolveOutputFromOptions) + +func resolveToFirstType(_ *exec.KernelCtx, args []arrow.DataType) (arrow.DataType, error) { + return args[0], nil +} diff --git a/go/arrow/compute/internal/kernels/string_casts.go b/go/arrow/compute/internal/kernels/string_casts.go new file mode 100644 index 0000000000000..bcd2ba2b00d1a --- /dev/null +++ b/go/arrow/compute/internal/kernels/string_casts.go @@ -0,0 +1,407 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernels + +import ( + "fmt" + "strconv" + "unicode/utf8" + + "github.com/apache/arrow/go/v10/arrow" + "github.com/apache/arrow/go/v10/arrow/array" + "github.com/apache/arrow/go/v10/arrow/bitutil" + "github.com/apache/arrow/go/v10/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v10/arrow/float16" + "github.com/apache/arrow/go/v10/internal/bitutils" +) + +func validateUtf8Fsb(input *exec.ArraySpan) error { + var ( + inputData = input.Buffers[1].Buf + width = int64(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth) + bitmap = input.Buffers[0].Buf + ) + + return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, + func(pos int64) error { + pos += input.Offset + beg := pos * width + end := (pos + 1) * width + if !utf8.Valid(inputData[beg:end]) { + return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, inputData[beg:end]) + } + return nil + }, func() error { return nil }) +} + +func validateUtf8[OffsetT int32 | int64](input *exec.ArraySpan) error { + var ( + inputOffsets = exec.GetSpanOffsets[OffsetT](input, 1) + inputData = input.Buffers[2].Buf + bitmap = input.Buffers[0].Buf + ) + + return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, + func(pos int64) error { + v := inputData[inputOffsets[pos]:inputOffsets[pos+1]] + if !utf8.Valid(v) { + return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) + } + return nil + }, func() error { return nil }) +} + +func CastFsbToFsb(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + inputWidth := batch.Values[0].Array.Type.(*arrow.FixedSizeBinaryType).ByteWidth + outputWidth := out.Type.(*arrow.FixedSizeBinaryType).ByteWidth + + if inputWidth != outputWidth { + return fmt.Errorf("%w: failed casting from %s to %s: widths must match", + arrow.ErrInvalid, batch.Values[0].Array.Type, out.Type) + } + + return ZeroCopyCastExec(ctx, batch, out) +} + +func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + + if !input.Type.(arrow.BinaryDataType).IsUtf8() && out.Type.(arrow.BinaryDataType).IsUtf8() && !opts.AllowInvalidUtf8 { + if err := validateUtf8[InOffsetsT](input); err != nil { + return err + } + } + + // start with a zero-copy cast, then change the indices to the + // expected size + if err := ZeroCopyCastExec(ctx, batch, out); err != nil { + return err + } + + switch { + case SizeOf[InOffsetsT]() == SizeOf[OutOffsetsT](): + // offsets are the same width, nothing more to do + return nil + case SizeOf[InOffsetsT]() > SizeOf[OutOffsetsT](): + // downcast from int64 -> int32 + inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1) + + // binary offsets are ascending, so it's enough to check + // the last one for overflow + if inputOffsets[input.Len] > InOffsetsT(MaxOf[OutOffsetsT]()) { + return fmt.Errorf("%w: failed casting from %s to %s: input array too large", + arrow.ErrInvalid, input.Type, out.Type) + } + + buf := ctx.Allocate(out.Type.(arrow.OffsetTraits).BytesRequired(int(out.Len + out.Offset + 1))) + out.Buffers[1].WrapBuffer(buf) + + outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) + + castNumericUnsafe(arrow.INT64, arrow.INT32, + exec.GetBytes(inputOffsets), exec.GetBytes(outOffsets), len(inputOffsets)) + return nil + default: + // upcast from int32 -> int64 + buf := ctx.Allocate(out.Type.(arrow.OffsetTraits).BytesRequired(int(out.Len + out.Offset + 1))) + out.Buffers[1].WrapBuffer(buf) + + inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1) + outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) + + castNumericUnsafe(arrow.INT32, arrow.INT64, + exec.GetBytes(inputOffsets), exec.GetBytes(outOffsets), len(inputOffsets)) + return nil + } +} + +func CastFsbToBinary[OffsetsT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + + if out.Type.(arrow.BinaryDataType).IsUtf8() && !opts.AllowInvalidUtf8 { + if err := validateUtf8Fsb(input); err != nil { + return err + } + } + + // check for overflow + maxOffset := int64(MaxOf[OffsetsT]()) + width := OffsetsT(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth) + if (int64(width) * input.Len) > maxOffset { + return fmt.Errorf("%w: failed casting from %s to %s: input array too large", + arrow.ErrInvalid, input.Type, out.Type) + } + + out.Len = input.Len + out.Nulls = input.Nulls + if input.Offset == out.Offset { + out.Buffers[0].SetBuffer(input.GetBuffer(0)) + } else { + out.Buffers[0].WrapBuffer(ctx.AllocateBitmap(input.Len)) + bitutil.CopyBitmap(input.Buffers[0].Buf, int(input.Offset), int(input.Len), out.Buffers[0].Buf, int(out.Offset)) + } + + // this buffer is preallocated + offsets := exec.GetSpanOffsets[OffsetsT](out, 1) + offsets[0] = OffsetsT(input.Offset) * width + for i := 0; i < int(input.Len); i++ { + offsets[i+1] = offsets[i] + width + } + + if len(input.Buffers[1].Buf) > 0 { + out.Buffers[2] = input.Buffers[1] + } + + return nil +} + +func addBinaryToBinaryCast[InOffsetT, OutOffsetT int32 | int64](inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + return exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryToBinary[InOffsetT, OutOffsetT], nil) +} + +func addToBinaryKernels[OffsetsT int32 | int64](outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel { + return append(kernels, + addBinaryToBinaryCast[int32, OffsetsT](arrow.STRING, outType), + addBinaryToBinaryCast[int32, OffsetsT](arrow.BINARY, outType), + addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_STRING, outType), + addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_BINARY, outType), + exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)}, + outType, CastFsbToBinary[OffsetsT], nil), + ) +} + +func GetFsbCastKernels() []exec.ScalarKernel { + outputType := exec.NewComputedOutputType(resolveOutputFromOptions) + out := GetCommonCastKernels(arrow.FIXED_SIZE_BINARY, outputType) + kernel := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)}, + exec.NewComputedOutputType(resolveToFirstType), CastFsbToFsb, nil) + kernel.NullHandling = exec.NullComputedNoPrealloc + return append(out, kernel) +} + +func float16Formatter(v float16.Num) string { return v.String() } +func date32Formatter(v arrow.Date32) string { return v.FormattedString() } +func date64Formatter(v arrow.Date64) string { return v.FormattedString() } +func numericFormatterSigned[T exec.IntTypes](v T) string { return strconv.FormatInt(int64(v), 10) } +func numericFormatterUnsigned[T exec.UintTypes](v T) string { return strconv.FormatUint(uint64(v), 10) } +func float32Formatter(v float32) string { return strconv.FormatFloat(float64(v), 'g', -1, 32) } +func float64Formatter(v float64) string { return strconv.FormatFloat(v, 'g', -1, 64) } + +func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + var ( + input = &batch.Values[0].Array + bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) + ) + defer bldr.Release() + + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + bldr.Append(strconv.FormatBool(bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)))) + }, func() { bldr.AppendNull() }) + + arr := bldr.NewArray() + out.TakeOwnership(arr.Data()) + return nil +} + +type timeIntrinsic interface { + arrow.Time32 | arrow.Time64 + FormattedString(arrow.TimeUnit) string +} + +func timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + var ( + input = &batch.Values[0].Array + inputData = exec.GetSpanValues[T](input, 1) + bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) + inputType = input.Type.(arrow.TemporalWithUnit) + ) + defer bldr.Release() + + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + bldr.Append(inputData[pos].FormattedString(inputType.TimeUnit())) + }, func() { bldr.AppendNull() }) + + arr := bldr.NewArray() + out.TakeOwnership(arr.Data()) + return nil +} + +func numericToStringCastExec[T exec.IntTypes | exec.UintTypes | exec.FloatTypes](formatter func(T) string) exec.ArrayKernelExec { + return func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + var ( + input = &batch.Values[0].Array + inputData = exec.GetSpanValues[T](input, 1) + bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) + ) + defer bldr.Release() + + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + bldr.Append(formatter(inputData[pos])) + }, func() { bldr.AppendNull() }) + + arr := bldr.NewArray() + out.TakeOwnership(arr.Data()) + return nil + } +} + +func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + var ( + input = &batch.Values[0].Array + inputData = exec.GetSpanValues[arrow.Timestamp](input, 1) + inputType = input.Type.(*arrow.TimestampType) + bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) + ) + defer bldr.Release() + + toTime, err := inputType.GetToTimeFunc() + if err != nil { + return err + } + + // prealloc + fmtstring := "2006-01-02 15:04:05" + switch inputType.Unit { + case arrow.Millisecond: + fmtstring += ".000" + case arrow.Microsecond: + fmtstring += ".000000" + case arrow.Nanosecond: + fmtstring += ".000000000" + } + + switch inputType.TimeZone { + case "UTC": + fmtstring += "Z" + case "": + default: + fmtstring += "-0700" + } + + strlen := len(fmtstring) + bldr.Reserve(int(input.Len)) + bldr.ReserveData(int(input.Len-input.Nulls) * strlen) + + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + bldr.Append(toTime(inputData[pos]).Format(fmtstring)) + }, + func() { bldr.AppendNull() }) + + arr := bldr.NewArray() + out.TakeOwnership(arr.Data()) + return nil +} + +func getNumericToStringCastExec(inType arrow.Type) exec.ArrayKernelExec { + switch inType { + case arrow.INT8: + return numericToStringCastExec(numericFormatterSigned[int8]) + case arrow.UINT8: + return numericToStringCastExec(numericFormatterUnsigned[uint8]) + case arrow.INT16: + return numericToStringCastExec(numericFormatterSigned[int16]) + case arrow.UINT16: + return numericToStringCastExec(numericFormatterUnsigned[uint16]) + case arrow.INT32: + return numericToStringCastExec(numericFormatterSigned[int32]) + case arrow.UINT32: + return numericToStringCastExec(numericFormatterUnsigned[uint32]) + case arrow.INT64: + return numericToStringCastExec(numericFormatterSigned[int64]) + case arrow.UINT64: + return numericToStringCastExec(numericFormatterUnsigned[uint64]) + case arrow.FLOAT16: + return numericToStringCastExec(float16Formatter) + case arrow.FLOAT32: + return numericToStringCastExec(float32Formatter) + case arrow.FLOAT64: + return numericToStringCastExec(float64Formatter) + case arrow.BOOL: + return boolToStringCastExec + case arrow.DATE32: + return numericToStringCastExec(date32Formatter) + case arrow.DATE64: + return numericToStringCastExec(date64Formatter) + case arrow.TIME32: + return timeToStringCastExec[arrow.Time32] + case arrow.TIME64: + return timeToStringCastExec[arrow.Time64] + case arrow.TIMESTAMP: + return castTimestampToString + } + panic("unimplemented cast: " + inType.String()) +} + +func addNumericAndTemporalToStringCasts(outType exec.OutputType, out []exec.ScalarKernel) []exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(arrow.FixedWidthTypes.Boolean)}, outType, + getNumericToStringCastExec(arrow.BOOL), nil) + k.NullHandling = exec.NullComputedNoPrealloc + out = append(out, k) + + for _, dt := range numericTypes { + k = exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(dt)}, outType, + getNumericToStringCastExec(dt.ID()), nil) + k.NullHandling = exec.NullComputedNoPrealloc + out = append(out, k) + } + + for _, dt := range []arrow.DataType{arrow.FixedWidthTypes.Date32, arrow.FixedWidthTypes.Date64} { + k = exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(dt)}, outType, + getNumericToStringCastExec(dt.ID()), nil) + k.NullHandling = exec.NullComputedNoPrealloc + out = append(out, k) + } + + for _, id := range []arrow.Type{arrow.TIME32, arrow.TIME64, arrow.TIMESTAMP} { + k = exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(id)}, outType, + getNumericToStringCastExec(id), nil) + k.NullHandling = exec.NullComputedNoPrealloc + out = append(out, k) + } + + return out +} + +func GetToBinaryKernels(outType arrow.DataType) []exec.ScalarKernel { + if outType.ID() == arrow.FIXED_SIZE_BINARY { + return nil + } + + outputType := exec.NewOutputType(outType) + out := GetCommonCastKernels(outType.ID(), outputType) + + switch outType.ID() { + case arrow.BINARY: + return addToBinaryKernels[int32](outputType, out) + case arrow.LARGE_BINARY: + return addToBinaryKernels[int64](outputType, out) + case arrow.STRING: + out = addToBinaryKernels[int32](outputType, out) + return addNumericAndTemporalToStringCasts(outputType, out) + case arrow.LARGE_STRING: + out = addToBinaryKernels[int64](outputType, out) + return addNumericAndTemporalToStringCasts(outputType, out) + } + return nil +} diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index 4a7915f9b301a..2ffca317e7e06 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -196,6 +196,7 @@ type FixedWidthDataType interface { type BinaryDataType interface { DataType + IsUtf8() bool binary() } diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index fa6513693f8bd..a3a8568645052 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -39,6 +39,7 @@ func (t *BinaryType) Layout() DataTypeLayout { SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()}} } func (t *BinaryType) OffsetTypeTraits() OffsetTraits { return Int32Traits } +func (BinaryType) IsUtf8() bool { return false } type StringType struct{} @@ -52,6 +53,7 @@ func (t *StringType) Layout() DataTypeLayout { SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()}} } func (t *StringType) OffsetTypeTraits() OffsetTraits { return Int32Traits } +func (StringType) IsUtf8() bool { return true } type LargeBinaryType struct{} @@ -65,6 +67,7 @@ func (t *LargeBinaryType) Layout() DataTypeLayout { SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()}} } func (t *LargeBinaryType) OffsetTypeTraits() OffsetTraits { return Int64Traits } +func (LargeBinaryType) IsUtf8() bool { return false } type LargeStringType struct{} @@ -78,6 +81,7 @@ func (t *LargeStringType) Layout() DataTypeLayout { SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()}} } func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits } +func (LargeStringType) IsUtf8() bool { return true } var ( BinaryTypes = struct { From 84b1cfafaf871b89056a8e13b2bc9ce9ed102723 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 1 Sep 2022 17:59:30 -0400 Subject: [PATCH 2/2] rest of the tests and fixes --- go/arrow/compute/cast_test.go | 361 +++++++++++++++++- go/arrow/compute/internal/exec/span.go | 6 + .../compute/internal/kernels/string_casts.go | 6 +- go/arrow/scalar/scalar.go | 4 + 4 files changed, 368 insertions(+), 9 deletions(-) diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go index a4a20e268eac7..807a5c281d92f 100644 --- a/go/arrow/compute/cast_test.go +++ b/go/arrow/compute/cast_test.go @@ -19,6 +19,7 @@ package compute_test import ( "context" "fmt" + "math" "strconv" "strings" "testing" @@ -257,18 +258,45 @@ func (c *CastSuite) maskArrayWithNullsAt(input arrow.Array, toMask []int) arrow. } func (c *CastSuite) invalidUtf8Arr(dt arrow.DataType) arrow.Array { - arr, _, err := array.FromJSON(c.mem, dt, strings.NewReader(`["Hi", "olá mundo", "你好世界", "", "`+"\xa0\xa1"+`"]`)) - c.Require().NoError(err) - return arr + bldr := array.NewBinaryBuilder(c.mem, dt.(arrow.BinaryDataType)) + defer bldr.Release() + + bldr.AppendValues([][]byte{ + []byte("Hi"), + []byte("olá mundo"), + []byte("你好世界"), + []byte(""), + []byte("\xa0\xa1"), // invalid utf8! + }, nil) + + return bldr.NewArray() +} + +type binaryBuilderAppend interface { + array.Builder + AppendValues([][]byte, []bool) } func (c *CastSuite) fixedSizeInvalidUtf8(dt arrow.DataType) arrow.Array { + var bldr binaryBuilderAppend if dt.ID() == arrow.FIXED_SIZE_BINARY { c.Require().Equal(3, dt.(*arrow.FixedSizeBinaryType).ByteWidth) + bldr = array.NewFixedSizeBinaryBuilder(c.mem, dt.(*arrow.FixedSizeBinaryType)) + } else { + bldr = array.NewBinaryBuilder(c.mem, dt.(arrow.BinaryDataType)) } - arr, _, err := array.FromJSON(c.mem, dt, strings.NewReader(`["Hi!", "lá", "你", " ", "`+"\xa0\xa1\xa2"+`"]`)) - c.Require().NoError(err) - return arr + + defer bldr.Release() + + bldr.AppendValues([][]byte{ + []byte("Hi!"), + []byte("lá"), + []byte("你"), + []byte(" "), + []byte("\xa0\xa1\xa2"), // invalid utf8! + }, nil) + + return bldr.NewArray() } func (c *CastSuite) SetupTest() { @@ -1236,6 +1264,243 @@ func (c *CastSuite) TestTimeToString() { } } +func (c *CastSuite) TestTimestampToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.checkCast(&arrow.TimestampType{Unit: arrow.Second}, stype, + `[-30610224000, -5364662400]`, `["1000-01-01 00:00:00", "1800-01-01 00:00:00"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Millisecond}, stype, + `[-30610224000000, -5364662400000]`, `["1000-01-01 00:00:00.000", "1800-01-01 00:00:00.000"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Microsecond}, stype, + `[-30610224000000000, -5364662400000000]`, `["1000-01-01 00:00:00.000000", "1800-01-01 00:00:00.000000"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Nanosecond}, stype, + `[-596933876543210988, 349837323456789012]`, `["1951-02-01 01:02:03.456789012", "1981-02-01 01:02:03.456789012"]`) + } +} + +func (c *CastSuite) TestTimestampWithZoneToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.checkCast(arrow.FixedWidthTypes.Timestamp_s, stype, + `[-30610224000, -5364662400]`, `["1000-01-01 00:00:00Z", "1800-01-01 00:00:00Z"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Second, TimeZone: "America/Phoenix"}, stype, + `[-34226955, 1456767743]`, `["1968-11-30 13:30:45-0700", "2016-02-29 10:42:23-0700"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "America/Phoenix"}, stype, + `[-34226955877, 1456767743456]`, `["1968-11-30 13:30:44.123-0700", "2016-02-29 10:42:23.456-0700"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Microsecond, TimeZone: "America/Phoenix"}, stype, + `[-34226955877000, 1456767743456789]`, `["1968-11-30 13:30:44.123000-0700", "2016-02-29 10:42:23.456789-0700"]`) + + c.checkCast(&arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "America/Phoenix"}, stype, + `[-34226955876543211, 1456767743456789246]`, `["1968-11-30 13:30:44.123456789-0700", "2016-02-29 10:42:23.456789246-0700"]`) + } +} + +func (c *CastSuite) assertBinaryZeroCopy(lhs, rhs arrow.Array) { + // null bitmap and data buffers are always zero-copied + assertBufferSame(c.T(), lhs, rhs, 0) + assertBufferSame(c.T(), lhs, rhs, 2) + + lOffsetByteWidth := lhs.DataType().Layout().Buffers[1].ByteWidth + rOffsetByteWidth := rhs.DataType().Layout().Buffers[1].ByteWidth + if lOffsetByteWidth == rOffsetByteWidth { + assertBufferSame(c.T(), lhs, rhs, 1) + return + } + + offsets := make([]arrow.Array, 0, 2) + for _, arr := range []arrow.Array{lhs, rhs} { + length := arr.Len() + buffer := arr.Data().Buffers()[1] + + byteWidth := arr.DataType().Layout().Buffers[1].ByteWidth + switch byteWidth { + case 4: + data := array.NewData(arrow.PrimitiveTypes.Int32, length, []*memory.Buffer{nil, buffer}, nil, 0, 0) + defer data.Release() + i32 := array.NewInt32Data(data) + i64, err := compute.CastArray(context.Background(), i32, compute.SafeCastOptions(arrow.PrimitiveTypes.Int64)) + c.Require().NoError(err) + i32.Release() + defer i64.Release() + offsets = append(offsets, i64) + default: + data := array.NewData(arrow.PrimitiveTypes.Int64, length, []*memory.Buffer{nil, buffer}, nil, 0, 0) + defer data.Release() + i64 := array.NewInt64Data(data) + defer i64.Release() + offsets = append(offsets, i64) + } + } + c.Truef(array.Equal(offsets[0], offsets[1]), "lhs: %s\nrhs: %s", offsets[0], offsets[1]) +} + +func (c *CastSuite) TestBinaryToString() { + for _, btype := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run(btype.String(), func() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run(stype.String(), func() { + // empty -> empty always works + c.checkCast(btype, stype, `[]`, `[]`) + + invalidUtf8 := c.invalidUtf8Arr(btype) + defer invalidUtf8.Release() + + invalidutf8Str := c.invalidUtf8Arr(stype) + defer invalidutf8Str.Release() + + // invalid utf8 masked by a null bit is not an error + masked := c.maskArrayWithNullsAt(invalidUtf8, []int{4}) + expMasked := c.maskArrayWithNullsAt(invalidutf8Str, []int{4}) + defer masked.Release() + defer expMasked.Release() + + checkCast(c.T(), masked, expMasked, *compute.SafeCastOptions(stype)) + + opts := compute.SafeCastOptions(stype) + checkCastFails(c.T(), invalidUtf8, *opts) + + // override utf8 check + opts.AllowInvalidUtf8 = true + strs, err := compute.CastArray(context.Background(), invalidUtf8, opts) + c.NoError(err) + defer strs.Release() + c.assertBinaryZeroCopy(invalidUtf8, strs) + }) + } + }) + } + + c.Run("fixed size binary", func() { + fromType := &arrow.FixedSizeBinaryType{ByteWidth: 3} + invalidUtf8Arr := c.fixedSizeInvalidUtf8(fromType) + defer invalidUtf8Arr.Release() + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run(stype.String(), func() { + c.checkCast(fromType, stype, `[]`, `[]`) + + // invalid utf-8 masked by a null bit is not an error + strInvalidUtf8 := c.fixedSizeInvalidUtf8(stype) + defer strInvalidUtf8.Release() + + masked := c.maskArrayWithNullsAt(invalidUtf8Arr, []int{4}) + expMasked := c.maskArrayWithNullsAt(strInvalidUtf8, []int{4}) + defer masked.Release() + defer expMasked.Release() + + checkCast(c.T(), masked, expMasked, *compute.SafeCastOptions(stype)) + + opts := compute.SafeCastOptions(stype) + checkCastFails(c.T(), invalidUtf8Arr, *opts) + + // override utf8 check + opts.AllowInvalidUtf8 = true + strs, err := compute.CastArray(context.Background(), invalidUtf8Arr, opts) + c.NoError(err) + defer strs.Release() + + // null buffer is not always the same if input is sliced + assertBufferSame(c.T(), invalidUtf8Arr, strs, 0) + + c.Same(invalidUtf8Arr.Data().Buffers()[1], strs.Data().Buffers()[2]) + }) + } + }) +} + +func (c *CastSuite) TestBinaryOrStringToBinary() { + for _, fromType := range baseBinaryTypes { + c.Run(fromType.String(), func() { + for _, toType := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run(toType.String(), func() { + // empty -> empty always works + c.checkCast(fromType, toType, `[]`, `[]`) + + invalidUtf8 := c.invalidUtf8Arr(fromType) + defer invalidUtf8.Release() + + // invalid utf-8 is not an error for binary + out, err := compute.CastToType(context.Background(), invalidUtf8, toType) + c.NoError(err) + defer out.Release() + c.assertBinaryZeroCopy(invalidUtf8, out) + + // invalid utf-8 masked by a null is also not an erro + invalidutf8Bin := c.invalidUtf8Arr(toType) + defer invalidutf8Bin.Release() + + // invalid utf8 masked by a null bit is not an error + masked := c.maskArrayWithNullsAt(invalidUtf8, []int{4}) + expMasked := c.maskArrayWithNullsAt(invalidutf8Bin, []int{4}) + defer masked.Release() + defer expMasked.Release() + + checkCast(c.T(), masked, expMasked, *compute.SafeCastOptions(toType)) + }) + } + }) + } + + c.Run("fixed size binary", func() { + fromType := &arrow.FixedSizeBinaryType{ByteWidth: 3} + invalidUtf8Arr := c.fixedSizeInvalidUtf8(fromType) + defer invalidUtf8Arr.Release() + + checkCast(c.T(), invalidUtf8Arr, invalidUtf8Arr, *compute.DefaultCastOptions(true)) + checkCastFails(c.T(), invalidUtf8Arr, *compute.SafeCastOptions(&arrow.FixedSizeBinaryType{ByteWidth: 5})) + for _, toType := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run(toType.String(), func() { + c.checkCast(fromType, toType, `[]`, `[]`) + + out, err := compute.CastToType(context.Background(), invalidUtf8Arr, toType) + c.NoError(err) + defer out.Release() + assertBufferSame(c.T(), invalidUtf8Arr, out, 0) + + c.Same(invalidUtf8Arr.Data().Buffers()[1], out.Data().Buffers()[2]) + }) + } + }) +} + +func (c *CastSuite) TestStringToString() { + for _, fromType := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run("from "+fromType.String(), func() { + for _, toType := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run("to "+toType.String(), func() { + c.checkCast(fromType, toType, `[]`, `[]`) + + invalidUtf8 := c.invalidUtf8Arr(fromType) + defer invalidUtf8.Release() + + invalidutf8Str := c.invalidUtf8Arr(toType) + defer invalidutf8Str.Release() + + // invalid utf8 masked by a null bit is not an error + masked := c.maskArrayWithNullsAt(invalidUtf8, []int{4}) + expMasked := c.maskArrayWithNullsAt(invalidutf8Str, []int{4}) + defer masked.Release() + defer expMasked.Release() + + checkCast(c.T(), masked, expMasked, *compute.SafeCastOptions(toType)) + + opts := compute.SafeCastOptions(toType) + // override utf8 check + opts.AllowInvalidUtf8 = true + // utf-8 is not checked by cast when the origin (utf-8) guarantees utf-8 + strs, err := compute.CastArray(context.Background(), invalidUtf8, opts) + c.NoError(err) + defer strs.Release() + c.assertBinaryZeroCopy(invalidUtf8, strs) + }) + } + }) + } +} + func (c *CastSuite) TestStringToInt() { for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { for _, dt := range signedIntTypes { @@ -1889,8 +2154,92 @@ func (c *CastSuite) TestStringToTimestamp() { } } +func (c *CastSuite) TestIntToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run(stype.String(), func() { + c.checkCast(arrow.PrimitiveTypes.Int8, stype, + `[0, 1, 127, -128, null]`, `["0", "1", "127", "-128", null]`) + + c.checkCast(arrow.PrimitiveTypes.Uint8, stype, + `[0, 1, 255, null]`, `["0", "1", "255", null]`) + + c.checkCast(arrow.PrimitiveTypes.Int16, stype, + `[0, 1, 32767, -32768, null]`, `["0", "1", "32767", "-32768", null]`) + + c.checkCast(arrow.PrimitiveTypes.Uint16, stype, + `[0, 1, 65535, null]`, `["0", "1", "65535", null]`) + + c.checkCast(arrow.PrimitiveTypes.Int32, stype, + `[0, 1, 2147483647, -2147483648, null]`, + `["0", "1", "2147483647", "-2147483648", null]`) + + c.checkCast(arrow.PrimitiveTypes.Uint32, stype, + `[0, 1, 4294967295, null]`, `["0", "1", "4294967295", null]`) + + c.checkCast(arrow.PrimitiveTypes.Int64, stype, + `[0, 1, 9223372036854775807, -9223372036854775808, null]`, + `["0", "1", "9223372036854775807", "-9223372036854775808", null]`) + + c.checkCast(arrow.PrimitiveTypes.Uint64, stype, + `[0, 1, 18446744073709551615, null]`, `["0", "1", "18446744073709551615", null]`) + }) + } +} + +func (c *CastSuite) TestFloatingToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run(stype.String(), func() { + bldr := array.NewFloat32Builder(c.mem) + defer bldr.Release() + bldr.AppendValues([]float32{ + 0, float32(math.Copysign(0, -1)), 1.5, float32(math.Inf(-1)), + float32(math.Inf(0)), float32(math.NaN())}, nil) + bldr.AppendNull() + arr := bldr.NewArray() + defer arr.Release() + + bldr64 := array.NewFloat64Builder(c.mem) + defer bldr64.Release() + bldr64.AppendValues([]float64{ + 0, math.Copysign(0, -1), 1.5, math.Inf(-1), math.Inf(0), math.NaN()}, nil) + bldr64.AppendNull() + arr64 := bldr64.NewArray() + defer arr64.Release() + + c.checkCastArr(arr, stype, `["0", "-0", "1.5", "-Inf", "+Inf", "NaN", null]`, *compute.DefaultCastOptions(true)) + + c.checkCastArr(arr64, stype, `["0", "-0", "1.5", "-Inf", "+Inf", "NaN", null]`, *compute.DefaultCastOptions(true)) + }) + } +} + +func (c *CastSuite) TestBooleanToString() { + for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run(stype.String(), func() { + c.checkCast(arrow.FixedWidthTypes.Boolean, stype, + `[true, true, false, null]`, `["true", "true", "false", null]`) + }) + } +} + func (c *CastSuite) TestIdentityCasts() { c.checkCastSelfZeroCopy(arrow.FixedWidthTypes.Boolean, `[false, true, null, false]`) + + c.checkCastSelfZeroCopy(arrow.Null, `[null, null, null]`) + for _, typ := range numericTypes { + c.checkCastSelfZeroCopy(typ, `[1, 2, null, 4]`) + } + + // ["foo", "bar"] base64 encoded for binary + c.checkCastSelfZeroCopy(arrow.BinaryTypes.Binary, `["Zm9v", "YmFy"]`) + c.checkCastSelfZeroCopy(arrow.BinaryTypes.String, `["foo", "bar"]`) + c.checkCastSelfZeroCopy(&arrow.FixedSizeBinaryType{ByteWidth: 3}, `["Zm9v", "YmFy"]`) + + c.checkCastSelfZeroCopy(arrow.FixedWidthTypes.Time32ms, `[1, 2, 3, 4]`) + c.checkCastSelfZeroCopy(arrow.FixedWidthTypes.Time64us, `[1, 2, 3, 4]`) + c.checkCastSelfZeroCopy(arrow.FixedWidthTypes.Date32, `[1, 2, 3, 4]`) + c.checkCastSelfZeroCopy(arrow.FixedWidthTypes.Date64, `[86400000, 0]`) + c.checkCastSelfZeroCopy(arrow.FixedWidthTypes.Timestamp_s, `[1, 2, 3, 4]`) } func (c *CastSuite) smallIntArrayFromJSON(data string) arrow.Array { diff --git a/go/arrow/compute/internal/exec/span.go b/go/arrow/compute/internal/exec/span.go index c969897edab53..4b5a1cdc0b60d 100644 --- a/go/arrow/compute/internal/exec/span.go +++ b/go/arrow/compute/internal/exec/span.go @@ -300,6 +300,12 @@ func (a *ArraySpan) FillFromScalar(val scalar.Scalar) { a.Buffers[2].Buf = dataBuffer case typeID == arrow.FIXED_SIZE_BINARY: sc := val.(scalar.BinaryScalar) + if !sc.IsValid() { + a.Buffers[1].Buf = make([]byte, sc.DataType().(*arrow.FixedSizeBinaryType).ByteWidth) + a.Buffers[1].Owner = nil + a.Buffers[1].SelfAlloc = false + break + } a.Buffers[1].Buf = sc.Data() a.Buffers[1].Owner = sc.Buffer() a.Buffers[1].SelfAlloc = false diff --git a/go/arrow/compute/internal/kernels/string_casts.go b/go/arrow/compute/internal/kernels/string_casts.go index bcd2ba2b00d1a..568ee570a31a9 100644 --- a/go/arrow/compute/internal/kernels/string_casts.go +++ b/go/arrow/compute/internal/kernels/string_casts.go @@ -67,7 +67,7 @@ func validateUtf8[OffsetT int32 | int64](input *exec.ArraySpan) error { func CastFsbToFsb(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { inputWidth := batch.Values[0].Array.Type.(*arrow.FixedSizeBinaryType).ByteWidth - outputWidth := out.Type.(*arrow.FixedSizeBinaryType).ByteWidth + outputWidth := ctx.State.(CastState).ToType.(*arrow.FixedSizeBinaryType).ByteWidth if inputWidth != outputWidth { return fmt.Errorf("%w: failed casting from %s to %s: widths must match", @@ -108,7 +108,7 @@ func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelC arrow.ErrInvalid, input.Type, out.Type) } - buf := ctx.Allocate(out.Type.(arrow.OffsetTraits).BytesRequired(int(out.Len + out.Offset + 1))) + buf := ctx.Allocate(out.Type.(arrow.OffsetsDataType).OffsetTypeTraits().BytesRequired(int(out.Len + out.Offset + 1))) out.Buffers[1].WrapBuffer(buf) outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) @@ -118,7 +118,7 @@ func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelC return nil default: // upcast from int32 -> int64 - buf := ctx.Allocate(out.Type.(arrow.OffsetTraits).BytesRequired(int(out.Len + out.Offset + 1))) + buf := ctx.Allocate(out.Type.(arrow.OffsetsDataType).OffsetTypeTraits().BytesRequired(int(out.Len + out.Offset + 1))) out.Buffers[1].WrapBuffer(buf) inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1) diff --git a/go/arrow/scalar/scalar.go b/go/arrow/scalar/scalar.go index 8eb19d6d06e47..1417644362156 100644 --- a/go/arrow/scalar/scalar.go +++ b/go/arrow/scalar/scalar.go @@ -571,6 +571,10 @@ func GetScalar(arr arrow.Array, idx int) (Scalar, error) { buf := memory.NewBufferBytes(arr.Value(idx)) defer buf.Release() return NewBinaryScalar(buf, arr.DataType()), nil + case *array.LargeBinary: + buf := memory.NewBufferBytes(arr.Value(idx)) + defer buf.Release() + return NewLargeBinaryScalar(buf), nil case *array.Boolean: return NewBooleanScalar(arr.Value(idx)), nil case *array.Date32: