diff --git a/r/NEWS.md b/r/NEWS.md index 06b7bc0025c9b..ff7f66399cdf1 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ * `write_feather`, `write_arrow` and `write_parquet` now return their input similar to `write_*` functions from `readr` (#6387, @boshek) * Dataset filtering is now correctly supported for all Arrow date/time/timestamp column types. +* Can now infer the type of an R `list` and create a ListArray when all list elements are the same type (#6275, @michaelchirico) # arrow 0.16.0 diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 02a340397c6c0..f794e19636ad0 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -15,9 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include + #include "./arrow_types.h" + #if defined(ARROW_R_WITH_ARROW) +using arrow::internal::checked_cast; + namespace arrow { namespace r { @@ -41,75 +46,179 @@ inline bool is_na(int value) { return value == NA_INTEGER; } -std::shared_ptr MakeStringArray(Rcpp::StringVector_ vec) { - R_xlen_t n = vec.size(); +std::shared_ptr InferArrowType(SEXP x); - std::shared_ptr null_buffer; - std::shared_ptr offset_buffer; - std::shared_ptr value_buffer; +struct VectorToArrayConverter { + Status Visit(const arrow::NullType& type) { + auto* null_builder = checked_cast(builder); + return null_builder->AppendNulls(XLENGTH(x)); + } - // there is always an offset buffer - STOP_IF_NOT_OK(AllocateBuffer((n + 1) * sizeof(int32_t), &offset_buffer)); + Status Visit(const arrow::BooleanType& type) { + ARROW_RETURN_IF(TYPEOF(x) != LGLSXP, Status::RError("Expecting a logical vector")); + R_xlen_t n = XLENGTH(x); - R_xlen_t i = 0; - int current_offset = 0; - int64_t null_count = 0; - auto p_offset = reinterpret_cast(offset_buffer->mutable_data()); - *p_offset = 0; - for (++p_offset; i < n; i++, ++p_offset) { - SEXP s = STRING_ELT(vec, i); - if (s == NA_STRING) { - // break as we are going to need a null_bitmap buffer - break; + auto* bool_builder = checked_cast(builder); + auto* p = LOGICAL(x); + + RETURN_NOT_OK(bool_builder->Reserve(n)); + for (R_xlen_t i = 0; i < n; i++) { + auto value = p[i]; + if (value == NA_LOGICAL) { + bool_builder->UnsafeAppendNull(); + } else { + bool_builder->UnsafeAppend(value == 1); + } } + return Status::OK(); + } + + Status Visit(const arrow::Int32Type& type) { + ARROW_RETURN_IF(TYPEOF(x) != INTSXP, Status::RError("Expecting an integer vector")); - *p_offset = current_offset += LENGTH(s); + auto* int_builder = checked_cast(builder); + + R_xlen_t n = XLENGTH(x); + const auto* data = INTEGER(x); + + RETURN_NOT_OK(int_builder->Reserve(n)); + for (R_xlen_t i = 0; i < n; i++) { + const auto value = data[i]; + if (value == NA_INTEGER) { + int_builder->UnsafeAppendNull(); + } else { + int_builder->UnsafeAppend(value); + } + } + + return Status::OK(); } - if (i < n) { - STOP_IF_NOT_OK(AllocateBuffer(BitUtil::BytesForBits(n), &null_buffer)); - internal::FirstTimeBitmapWriter null_bitmap_writer(null_buffer->mutable_data(), 0, n); + Status Visit(const arrow::Int64Type& type) { + ARROW_RETURN_IF(TYPEOF(x) != REALSXP, Status::RError("Expecting a numeric vector")); + ARROW_RETURN_IF(Rf_inherits(x, "integer64"), + Status::RError("Expecting a vector that inherits integer64")); - // catch up - for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) { - null_bitmap_writer.Set(); + auto* int_builder = checked_cast(builder); + + R_xlen_t n = XLENGTH(x); + const auto* data = (REAL(x)); + + RETURN_NOT_OK(int_builder->Reserve(n)); + for (R_xlen_t i = 0; i < n; i++) { + const auto value = arrow::util::SafeCopy(data[i]); + if (value == NA_INT64) { + int_builder->UnsafeAppendNull(); + } else { + int_builder->UnsafeAppend(value); + } } - // resume offset filling - for (; i < n; i++, ++p_offset, null_bitmap_writer.Next()) { - SEXP s = STRING_ELT(vec, i); - if (s == NA_STRING) { - null_bitmap_writer.Clear(); - *p_offset = current_offset; - null_count++; + return Status::OK(); + } + + Status Visit(const arrow::DoubleType& type) { + ARROW_RETURN_IF(TYPEOF(x) != REALSXP, Status::RError("Expecting a numeric vector")); + + auto* double_builder = checked_cast(builder); + + R_xlen_t n = XLENGTH(x); + const auto* data = (REAL(x)); + + RETURN_NOT_OK(double_builder->Reserve(n)); + for (R_xlen_t i = 0; i < n; i++) { + const auto value = data[i]; + if (ISNA(value)) { + double_builder->UnsafeAppendNull(); } else { - null_bitmap_writer.Set(); - *p_offset = current_offset += LENGTH(s); + double_builder->UnsafeAppend(value); } } - null_bitmap_writer.Finish(); + return Status::OK(); } - // ----- data buffer - if (current_offset > 0) { - STOP_IF_NOT_OK(AllocateBuffer(current_offset, &value_buffer)); - p_offset = reinterpret_cast(offset_buffer->mutable_data()); - auto p_data = reinterpret_cast(value_buffer->mutable_data()); + template + arrow::enable_if_base_binary Visit(const T& type) { + using BuilderType = typename TypeTraits::BuilderType; + + ARROW_RETURN_IF(TYPEOF(x) != STRSXP, Status::RError("Expecting a character vector")); + auto* binary_builder = checked_cast(builder); + + R_xlen_t n = XLENGTH(x); + RETURN_NOT_OK(builder->Reserve(n)); for (R_xlen_t i = 0; i < n; i++) { - SEXP s = STRING_ELT(vec, i); - if (s != NA_STRING) { - auto ni = LENGTH(s); - std::copy_n(CHAR(s), ni, p_data); - p_data += ni; + SEXP s = STRING_ELT(x, i); + if (s == NA_STRING) { + RETURN_NOT_OK(binary_builder->AppendNull()); + continue; } + + RETURN_NOT_OK(binary_builder->Append(CHAR(s), LENGTH(s))); } + + return Status::OK(); } - auto data = ArrayData::Make(arrow::utf8(), n, - {null_buffer, offset_buffer, value_buffer}, null_count, 0); - return MakeArray(data); + template + arrow::enable_if_base_list Visit(const T& type) { + using BuilderType = typename TypeTraits::BuilderType; + + ARROW_RETURN_IF(TYPEOF(x) != VECSXP, Status::RError("Expecting a list vector")); + + auto* list_builder = checked_cast(builder); + auto* value_builder = list_builder->value_builder(); + auto value_type = type.value_type(); + + R_xlen_t n = XLENGTH(x); + RETURN_NOT_OK(builder->Reserve(n)); + for (R_xlen_t i = 0; i < n; i++) { + SEXP vector = VECTOR_ELT(x, i); + if (vector == R_NilValue) { + list_builder->AppendNull(); + continue; + } + + list_builder->Append(); + + auto vect_type = arrow::r::InferArrowType(vector); + if (!value_type->Equals(vect_type)) { + return Status::RError("List vector expecting elements vector of type ", + value_type->ToString(), " but got ", vect_type->ToString()); + } + + // Recurse. + VectorToArrayConverter converter{vector, value_builder}; + RETURN_NOT_OK(arrow::VisitTypeInline(*value_type, &converter)); + } + + return Status::OK(); + } + + Status Visit(const arrow::DataType& type) { + return Status::NotImplemented("Converting vector to arrow type ", type.ToString(), + " not implemented"); + } + + static std::shared_ptr Visit(SEXP x, const std::shared_ptr& type) { + std::unique_ptr builder; + STOP_IF_NOT_OK(MakeBuilder(arrow::default_memory_pool(), type, &builder)); + + VectorToArrayConverter converter{x, builder.get()}; + STOP_IF_NOT_OK(arrow::VisitTypeInline(*type, &converter)); + + std::shared_ptr result; + STOP_IF_NOT_OK(builder->Finish(&result)); + return result; + } + + SEXP x; + arrow::ArrayBuilder* builder; +}; + +std::shared_ptr MakeStringArray(SEXP x, const std::shared_ptr& type) { + return VectorToArrayConverter::Visit(x, type); } template @@ -163,7 +272,7 @@ std::shared_ptr MakeFactorArrayImpl(Rcpp::IntegerVector_ factor, auto array_indices = MakeArray(array_indices_data); SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol); - auto dict = MakeStringArray(levels); + auto dict = MakeStringArray(levels, utf8()); std::shared_ptr out; STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, dict, &out)); @@ -192,6 +301,10 @@ std::shared_ptr MakeStructArray(SEXP df, const std::shared_ptr& return std::make_shared(type, children[0]->length(), children); } +std::shared_ptr MakeListArray(SEXP x, const std::shared_ptr& type) { + return VectorToArrayConverter::Visit(x, type); +} + template int64_t time_cast(T value); @@ -211,7 +324,6 @@ inline int64_t time_cast(double value) { // ---------------- new api namespace arrow { -using internal::checked_cast; namespace internal { @@ -797,84 +909,118 @@ Status GetConverter(const std::shared_ptr& type, return Status::NotImplemented("type not implemented"); } -template -std::shared_ptr GetFactorTypeImpl(bool ordered) { - return dictionary(std::make_shared(), arrow::utf8(), ordered); +static inline std::shared_ptr IndexTypeForFactors(int n_factors) { + if (n_factors < INT8_MAX) { + return arrow::int8(); + } else if (n_factors < INT16_MAX) { + return arrow::int16(); + } else { + return arrow::int32(); + } } -std::shared_ptr GetFactorType(SEXP factor) { - SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol); +std::shared_ptr InferArrowTypeFromFactor(SEXP factor) { + SEXP factors = Rf_getAttrib(factor, R_LevelsSymbol); + auto index_type = IndexTypeForFactors(Rf_length(factors)); bool is_ordered = Rf_inherits(factor, "ordered"); - int n = Rf_length(levels); - if (n < 128) { - return GetFactorTypeImpl(is_ordered); - } else if (n < 32768) { - return GetFactorTypeImpl(is_ordered); + return dictionary(index_type, arrow::utf8(), is_ordered); +} + +template +std::shared_ptr InferArrowTypeFromVector(SEXP x) { + Rcpp::stop("Unknown vector type: ", VectorType); +} + +template <> +std::shared_ptr InferArrowTypeFromVector(SEXP x) { + if (Rf_inherits(x, "Array")) { + Rcpp::ConstReferenceSmartPtrInputParameter> array(x); + return static_cast>(array)->type(); + } + + Rcpp::stop("Unrecognized vector instance for type ENVSXP"); +} + +template <> +std::shared_ptr InferArrowTypeFromVector(SEXP x) { + return Rf_inherits(x, "vctrs_unspecified") ? null() : boolean(); +} + +template <> +std::shared_ptr InferArrowTypeFromVector(SEXP x) { + if (Rf_isFactor(x)) { + return InferArrowTypeFromFactor(x); + } else if (Rf_inherits(x, "Date")) { + return date32(); + } else if (Rf_inherits(x, "POSIXct")) { + return timestamp(TimeUnit::MICRO, "GMT"); + } + return int32(); +} + +template <> +std::shared_ptr InferArrowTypeFromVector(SEXP x) { + if (Rf_inherits(x, "Date")) { + return date32(); + } + if (Rf_inherits(x, "POSIXct")) { + return timestamp(TimeUnit::MICRO, "GMT"); + } + if (Rf_inherits(x, "integer64")) { + return int64(); + } + if (Rf_inherits(x, "difftime")) { + return time32(TimeUnit::SECOND); + } + return float64(); +} + +static inline std::shared_ptr InferArrowTypeFromDataFrame(SEXP x) { + R_xlen_t n = XLENGTH(x); + SEXP names = Rf_getAttrib(x, R_NamesSymbol); + std::vector> fields(n); + for (R_xlen_t i = 0; i < n; i++) { + const auto* field_name = CHAR(STRING_ELT(names, i)); + fields[i] = arrow::field(field_name, InferArrowType(VECTOR_ELT(x, i))); + } + return arrow::struct_(std::move(fields)); +} + +template <> +std::shared_ptr InferArrowTypeFromVector(SEXP x) { + if (Rf_inherits(x, "data.frame")) { + return InferArrowTypeFromDataFrame(x); } else { - return GetFactorTypeImpl(is_ordered); + if (XLENGTH(x) == 0) { + Rcpp::stop( + "Requires at least one element to infer the values' type of a list vector"); + } + + return arrow::list(InferArrowType(VECTOR_ELT(x, 0))); } } -std::shared_ptr InferType(SEXP x) { +std::shared_ptr InferArrowType(SEXP x) { switch (TYPEOF(x)) { case ENVSXP: - if (Rf_inherits(x, "Array")) { - Rcpp::ConstReferenceSmartPtrInputParameter> array( - x); - return static_cast>(array)->type(); - } - break; + return InferArrowTypeFromVector(x); case LGLSXP: - if (Rf_inherits(x, "vctrs_unspecified")) { - return null(); - } - return boolean(); + return InferArrowTypeFromVector(x); case INTSXP: - if (Rf_isFactor(x)) { - return GetFactorType(x); - } - if (Rf_inherits(x, "Date")) { - return date32(); - } - if (Rf_inherits(x, "POSIXct")) { - return timestamp(TimeUnit::MICRO, "GMT"); - } - return int32(); + return InferArrowTypeFromVector(x); case REALSXP: - if (Rf_inherits(x, "Date")) { - return date32(); - } - if (Rf_inherits(x, "POSIXct")) { - return timestamp(TimeUnit::MICRO, "GMT"); - } - if (Rf_inherits(x, "integer64")) { - return int64(); - } - if (Rf_inherits(x, "difftime")) { - return time32(TimeUnit::SECOND); - } - return float64(); + return InferArrowTypeFromVector(x); case RAWSXP: return int8(); case STRSXP: return utf8(); case VECSXP: - if (Rf_inherits(x, "data.frame")) { - R_xlen_t n = XLENGTH(x); - SEXP names = Rf_getAttrib(x, R_NamesSymbol); - std::vector> fields(n); - for (R_xlen_t i = 0; i < n; i++) { - fields[i] = std::make_shared(CHAR(STRING_ELT(names, i)), - InferType(VECTOR_ELT(x, i))); - } - return std::make_shared(std::move(fields)); - } - break; + return InferArrowTypeFromVector(x); default: break; } - Rcpp::stop("cannot infer type from data"); + Rcpp::stop("Cannot infer type from vector"); } // in some situations we can just use the memory of the R object in an RBuffer @@ -945,29 +1091,28 @@ std::shared_ptr MakeSimpleArray(SEXP x) { } std::shared_ptr Array__from_vector_reuse_memory(SEXP x) { - switch (TYPEOF(x)) { - case INTSXP: - return MakeSimpleArray(x); - case REALSXP: - if (Rf_inherits(x, "integer64")) { - return MakeSimpleArray(x); - } - return MakeSimpleArray(x); - case RAWSXP: - return MakeSimpleArray(x); - default: - break; + auto type = TYPEOF(x); + + if (type == INTSXP) { + return MakeSimpleArray(x); + } else if (type == REALSXP && Rf_inherits(x, "integer64")) { + return MakeSimpleArray(x); + } else if (type == REALSXP) { + return MakeSimpleArray(x); + } else if (type == RAWSXP) { + return MakeSimpleArray(x); } - Rcpp::stop("not implemented"); + Rcpp::stop("Unreachable: you might need to fix can_reuse_memory()"); } bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr& type) { - if (!Rf_inherits(obj, "factor")) return false; + if (!Rf_inherits(obj, "factor")) { + return false; + } - arrow::DictionaryType* dict_type = - arrow::checked_cast(type.get()); - return dict_type->value_type() == utf8(); + auto* dict_type = checked_cast(type.get()); + return dict_type->value_type()->Equals(utf8()); } arrow::Status CheckCompatibleStruct(SEXP obj, @@ -1016,8 +1161,7 @@ std::shared_ptr Array__from_vector( // treat strings separately for now if (type->id() == Type::STRING) { - STOP_IF_NOT(TYPEOF(x) == STRSXP, "Cannot convert R object to string array"); - return arrow::r::MakeStringArray(x); + return arrow::r::MakeStringArray(x, type); } // factors only when type has been inferred @@ -1029,6 +1173,10 @@ std::shared_ptr Array__from_vector( Rcpp::stop("Object incompatible with dictionary type"); } + if (type->id() == Type::LIST) { + return arrow::r::MakeListArray(x, type); + } + // struct types if (type->id() == Type::STRUCT) { if (!type_inferred) { @@ -1060,7 +1208,7 @@ std::shared_ptr Array__from_vector( // [[arrow::export]] std::shared_ptr Array__infer_type(SEXP x) { - return arrow::r::InferType(x); + return arrow::r::InferArrowType(x); } // [[arrow::export]] @@ -1070,7 +1218,7 @@ std::shared_ptr Array__from_vector(SEXP x, SEXP s_type) { bool type_inferred = Rf_isNull(s_type); std::shared_ptr type; if (type_inferred) { - type = arrow::r::InferType(x); + type = arrow::r::InferArrowType(x); } else { type = arrow::r::extract(s_type); } @@ -1093,7 +1241,7 @@ std::shared_ptr ChunkedArray__from_list(Rcpp::List chunks, if (n == 0) { Rcpp::stop("type must be specified for empty list"); } - type = arrow::r::InferType(VECTOR_ELT(chunks, 0)); + type = arrow::r::InferArrowType(VECTOR_ELT(chunks, 0)); } else { type = arrow::r::extract(s_type); } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 86f74c917f51b..5facffa10d5a9 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -210,7 +210,10 @@ inline std::shared_ptr extract(SEXP x) { #include #include #include +#include #include +#include +#include #include #include #include @@ -284,7 +287,7 @@ class RBuffer : public MutableBuffer { Vec vec_; }; -std::shared_ptr GetFactorType(SEXP); +std::shared_ptr InferArrowTypeFromFactor(SEXP); } // namespace r } // namespace arrow diff --git a/r/src/expression.cpp b/r/src/expression.cpp index 2615e21fd898b..d63aab86ce496 100644 --- a/r/src/expression.cpp +++ b/r/src/expression.cpp @@ -142,7 +142,7 @@ std::shared_ptr dataset___expr__scalar(SEXP x) { case INTSXP: if (Rf_inherits(x, "factor")) { // TODO: This does not use the actual value, just the levels - auto type = arrow::r::GetFactorType(x); + auto type = arrow::r::InferArrowTypeFromFactor(x); return ds::scalar(std::make_shared(type)); } return ds::scalar(Rf_asInteger(x)); diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index e3791862946c4..c8252d67eb20c 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -456,7 +456,68 @@ test_that("Array$create() can handle data frame with custom struct type (not inf expect_error(Array$create(df, type = type), regexp = "Field name in position.*does not match the name of the column of the data frame") type <- struct(x = float64(), y = utf8()) - expect_error(Array$create(df, type = type), regexp = "Cannot convert R object to string array") + expect_error(Array$create(df, type = type), regexp = "Expecting a character vector") +}) + +test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { + expect_list_array <- function(v, type) { + a <- Array$create(v) + expect_equal(a$type, list_of(type)) + expect_equivalent(a$as_vector(), v) + } + + # Should be able to create an empty list with a type hint. + Array$create(list(), list_of(bool())) + + # logical + expect_list_array(list(NA), bool()) + expect_list_array(list(logical(0)), bool()) + expect_list_array(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), bool()) + expect_list_array(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), bool()) + + # integer + expect_list_array(list(NA_integer_), int32()) + expect_list_array(list(integer(0)), int32()) + expect_list_array(list(1:2, 3:4, 12:18), int32()) + expect_list_array(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), int32()) + + # numeric + expect_list_array(list(NA_real_), float64()) + expect_list_array(list(numeric(0)), float64()) + expect_list_array(list(1, c(2, 3), 4), float64()) + expect_list_array(list(1, numeric(0), c(2, 3, NA_real_), 4), float64()) + + # character + expect_list_array(list(NA_character_), utf8()) + expect_list_array(list(character(0)), utf8()) + expect_list_array(list("itsy", c("bitsy", "spider"), c("is")), utf8()) + expect_list_array(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), utf8()) +}) + +test_that("Array$create() should have helpful error on lists with type hint", { + expect_error(Array$create(list(numeric(0)), list_of(bool())), + regexp = "List vector expecting elements vector of type") + expect_error(Array$create(list(numeric(0)), list_of(int32())), + regexp = "List vector expecting elements vector of type") + expect_error(Array$create(list(integer(0)), list_of(float64())), + regexp = "List vector expecting elements vector of type") +}) + +test_that("Array$create() should refuse heterogeneous lists", { + lgl <- logical(0) + int <- integer(0) + num <- numeric(0) + char <- character(0) + + expect_error(Array$create(list()), + regexp = "Requires at least one element to infer the values'") + + expect_error(Array$create(list(lgl, lgl, int)), + regexp = "List vector expecting elements vector of type") + expect_error(Array$create(list(char, num, char)), + regexp = "List vector expecting elements vector of type") + expect_error(Array$create(list(int, int, num)), + regexp = "List vector expecting elements vector of type") }) test_that("Array$View() (ARROW-6542)", { diff --git a/r/tests/testthat/test-array-data.R b/r/tests/testthat/test-array-data.R index 78904823d8967..881706c797d6b 100644 --- a/r/tests/testthat/test-array-data.R +++ b/r/tests/testthat/test-array-data.R @@ -22,7 +22,14 @@ test_that("string vectors with only empty strings and nulls don't allocate a dat expect_equal(a$length(), 1L) buffers <- a$data()$buffers - expect_null(buffers[[1]]) - expect_null(buffers[[3]]) + + # No nulls + expect_equal(buffers[[1]]$size, 1) + + # Offsets has 2 elements expect_equal(buffers[[2]]$size, 8L) + + # As per ARROW-2744, values buffer should preferably be non-null. + expect_equal(buffers[[3]]$size, 0L) + expect_equal(buffers[[3]]$capacity, 0L) }) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index d05602d9de23b..d46eb5e88a1e3 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -121,6 +121,21 @@ test_that("Factors are preserved when writing/reading from Parquet", { expect_identical(df, df_read) }) +test_that("Lists are preserved when writing/reading from Parquet", { + bool <- list(logical(0), NA, c(TRUE, FALSE)) + int <- list(integer(0), NA_integer_, 1:4) + num <- list(numeric(0), NA_real_, c(1, 2)) + char <- list(character(0), NA_character_, c("itsy", "bitsy")) + df <- tibble::tibble(bool = bool, int = int, num = num, char = char) + + pq_tmp_file <- tempfile() + on.exit(unlink(pq_tmp_file)) + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equivalent(df, df_read) +}) + test_that("write_parquet() to stream", { df <- tibble::tibble(x = 1:5) tf <- tempfile()