diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index ff160452845d5..03e565161129a 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp index c221825bc2a83..65341b9b77eff 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp new file mode 100644 index 0000000000000..146db69adfc80 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset-factory + * @section_id: dataset-factory + * @title: Dataset factory related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDatasetFactory is a base class for dataset factories. + * + * #GADatasetFileSystemDatasetFactory is a class for + * #GADatasetFileSystemDataset factory. + * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetFactoryPrivate_ { + std::shared_ptr factory; +} GADatasetDatasetFactoryPrivate; + +enum { + PROP_DATASET_FACTORY = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDatasetFactory, + gadataset_dataset_factory, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_factory_get_instance_private( \ + GADATASET_DATASET_FACTORY(obj))) + +static void +gadataset_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + priv->factory.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET_FACTORY: + { + auto arrow_factory_pointer = + static_cast *>( + g_value_get_pointer(value)); + if (arrow_factory_pointer) { + priv->factory = *arrow_factory_pointer; + } + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_factory_init(GADatasetDatasetFactory *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->factory) std::shared_ptr; +} + +static void +gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_factory_finalize; + gobject_class->set_property = gadataset_dataset_factory_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset-factory", + "Dataset factory", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET_FACTORY, spec); +} + +/** + * gadataset_dataset_factory_finish: + * @factory: A #GADatasetDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetDataset on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error) +{ + auto arrow_factory = gadataset_dataset_factory_get_raw(factory); + auto arrow_dataset_result = arrow_factory->Finish(); + if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) { + auto arrow_dataset = *arrow_dataset_result; + return gadataset_dataset_new_raw(&arrow_dataset); + } else { + return NULL; + } +} + + +typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; + GList *files; + arrow::dataset::FileSystemFactoryOptions options; +} GADatasetFileSystemDatasetFactoryPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET_TYPE_DATASET_FACTORY) + +#define GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_factory_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_FACTORY(obj))) + +static void +gadataset_file_system_dataset_factory_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->files) { + g_list_free_full(priv->files, g_object_unref); + priv->files = NULL; + } + + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + priv->options.~FileSystemFactoryOptions(); + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_file_system_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_init( + GADatasetFileSystemDatasetFactory *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->options) arrow::dataset::FileSystemFactoryOptions; +} + +static void +gadataset_file_system_dataset_factory_class_init( + GADatasetFileSystemDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_factory_dispose; + gobject_class->finalize = gadataset_file_system_dataset_factory_finalize; + gobject_class->set_property = gadataset_file_system_dataset_factory_set_property; + gobject_class->get_property = gadataset_file_system_dataset_factory_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetFactory:format: + * + * Format passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format passed to GADatasetFileSystemDataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDatasetFactory:file-system: + * + * File system passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system passed to GADatasetFileSystemDataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + +/** + * gadataset_file_system_factory_new: + * @format: A #GADatasetFileFormat. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GADatasetDatasetFileSystemFactory on success, + * %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *format) +{ + return GADATASET_FILE_SYSTEM_DATASET_FACTORY( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY, + "format", format, + NULL)); +} + +/** + * gadataset_file_system_dataset_factory_set_file_system: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @file_system: A #GArrowFileSystem. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + priv->file_system = file_system; + g_object_ref(priv->file_system); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_set_file_system_uri: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @uri: An URI for file system. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system-uri]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + std::string internal_path; + auto arrow_file_system_result = + arrow::fs::FileSystemFromUri(uri, &internal_path); + if (!garrow::check(error, arrow_file_system_result, context)) { + return FALSE; + } + auto arrow_file_system = *arrow_file_system_result; + auto arrow_file_info_result = arrow_file_system->GetFileInfo(internal_path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + priv->file_system = garrow_file_system_new_raw(&arrow_file_system); + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_add_path: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @path: A path to be added. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][add-path]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return FALSE; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_file_info_result = arrow_file_system->GetFileInfo(path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_finish: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetFileSystemDataset on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][finish]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return NULL; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_format = gadataset_file_format_get_raw(priv->format); + std::vector arrow_files; + priv->files = g_list_reverse(priv->files); + for (auto node = priv->files; node; node = node->next) { + auto file = GARROW_FILE_INFO(node->data); + arrow_files.push_back(*garrow_file_info_get_raw(file)); + } + priv->files = g_list_reverse(priv->files); + auto arrow_factory_result = + arrow::dataset::FileSystemDatasetFactory::Make(arrow_file_system, + arrow_files, + arrow_format, + priv->options); + if (!garrow::check(error, arrow_factory_result, context)) { + return NULL; + } + auto arrow_dataset_result = (*arrow_factory_result)->Finish(); + if (!garrow::check(error, arrow_dataset_result, context)) { + return NULL; + } + auto arrow_dataset = *arrow_dataset_result; + return GADATASET_FILE_SYSTEM_DATASET( + gadataset_dataset_new_raw(&arrow_dataset, + "dataset", &arrow_dataset, + "file-system", priv->file_system, + "format", priv->format, + NULL)); +} + + +G_END_DECLS + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(factory); + return priv->factory; +} diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h new file mode 100644 index 0000000000000..e2ee3ed9806e7 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +#define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory, + gadataset_dataset_factory, + GADATASET, + DATASET_FACTORY, + GObject) +struct _GADatasetDatasetFactoryClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY \ + (gadataset_file_system_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET, + FILE_SYSTEM_DATASET_FACTORY, + GADatasetDatasetFactory) +struct _GADatasetFileSystemDatasetFactoryClass +{ + GADatasetDatasetFactoryClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *file_format); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error); +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error); +/* +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_file( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileInfo *file, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_selector( + GADatasetFileSystemDatasetFactory *factory, + GArrorFileSelector *selector, + GError **error); +*/ + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp new file mode 100644 index 0000000000000..114db35bc59c8 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory); diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp new file mode 100644 index 0000000000000..3bd62f99ef3f5 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.cpp @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset + * @section_id: dataset + * @title: Dataset related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDataset is a base class for datasets. + * + * #GADatasetFileSystemDataset is a class for file system dataset. + * + * #GADatasetFileFormat is a base class for file formats. + * + * #GADatasetCSVFileFormat is a class for CSV file format. + * + * #GADatasetIPCFileFormat is a class for IPC file format. + * + * #GADatasetParquetFileFormat is a class for Apache Parquet file format. + * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetPrivate_ { + std::shared_ptr dataset; +} GADatasetDatasetPrivate; + +enum { + PROP_DATASET = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDataset, + gadataset_dataset, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_get_instance_private( \ + GADATASET_DATASET(obj))) + +static void +gadataset_dataset_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + priv->dataset.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_parent_class)->finalize(object); +} + +static void +gadataset_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET: + priv->dataset = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_init(GADatasetDataset *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + new(&priv->dataset) std::shared_ptr; +} + +static void +gadataset_dataset_class_init(GADatasetDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_finalize; + gobject_class->set_property = gadataset_dataset_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset", + "Dataset", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET, spec); +} + +/** + * gadataset_dataset_begin_scan: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScannerBuilder on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error) +{ + return gadataset_scanner_builder_new(dataset, error); +} + +/** + * gadataset_dataset_to_table: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A loaded #GArrowTable on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error) +{ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (!garrow::check(error, + arrow_scanner_builder_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner_builder = *arrow_scanner_builder_result; + auto arrow_scanner_result = arrow_scanner_builder->Finish(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner = *arrow_scanner_result; + auto arrow_table_result = arrow_scanner->ToTable(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + return garrow_table_new_raw(&(*arrow_table_result)); +} + +/** + * gadataset_dataset_get_type_name: + * @dataset: A #GADatasetDataset. + * + * Returns: The type name of @dataset. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 5.0.0 + */ +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset) +{ + const auto arrow_dataset = gadataset_dataset_get_raw(dataset); + const auto &type_name = arrow_dataset->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + + +typedef struct GADatasetFileSystemDatasetPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; +} GADatasetFileSystemDatasetPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET_TYPE_DATASET) + +#define GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET(obj))) + +static void +gadataset_file_system_dataset_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + G_OBJECT_CLASS(gadataset_file_system_dataset_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + case PROP_FILE_SYSTEM: + priv->file_system = GARROW_FILE_SYSTEM(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_init(GADatasetFileSystemDataset *object) +{ +} + +static void +gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_dispose; + gobject_class->set_property = gadataset_file_system_dataset_set_property; + gobject_class->get_property = gadataset_file_system_dataset_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDataset:format: + * + * Format of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format of the dataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDataset:file-system: + * + * File system of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system of the dataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + + +G_END_DECLS + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset) +{ + return gadataset_dataset_new_raw(arrow_dataset, + "dataset", arrow_dataset, + NULL); +} + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...) +{ + va_list args; + va_start(args, first_property_name); + auto array = gadataset_dataset_new_raw_valist(arrow_dataset, + first_property_name, + args); + va_end(args); + return array; +} + +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list args) +{ + GType type = GADATASET_TYPE_DATASET; + const auto type_name = (*arrow_dataset)->type_name(); + if (type_name == "filesystem") { + type = GADATASET_TYPE_FILE_SYSTEM_DATASET; + } + return GADATASET_DATASET(g_object_new_valist(type, + first_property_name, + args)); +} + +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(dataset); + return priv->dataset; +} diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h new file mode 100644 index 0000000000000..97cf35d74d7eb --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +typedef struct _GADatasetScannerBuilder GADatasetScannerBuilder; + +#define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, + gadataset_dataset, + GADATASET, + DATASET, + GObject) +struct _GADatasetDatasetClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET \ + (gadataset_file_system_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET, + FILE_SYSTEM_DATASET, + GADatasetDataset) +struct _GADatasetFileSystemDatasetClass +{ + GADatasetDatasetClass parent_class; +}; + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset.hpp b/c_glib/arrow-dataset-glib/dataset.hpp new file mode 100644 index 0000000000000..94dddd2eb7ac4 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset); +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...); +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list arg); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + +GADatasetFileFormat * +gadataset_file_format_new_raw( + std::shared_ptr *arrow_format); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + + diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 04dc420b05748..b3f617330cff5 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -18,6 +18,8 @@ # under the License. sources = files( + 'dataset-factory.cpp', + 'dataset.cpp', 'file-format.cpp', 'fragment.cpp', 'scanner.cpp', @@ -25,6 +27,8 @@ sources = files( c_headers = files( 'arrow-dataset-glib.h', + 'dataset-factory.h', + 'dataset.h', 'file-format.h', 'fragment.h', 'scanner.h', @@ -32,6 +36,8 @@ c_headers = files( cpp_headers = files( 'arrow-dataset-glib.hpp', + 'dataset-factory.hpp', + 'dataset.hpp', 'file-format.hpp', 'fragment.hpp', 'scanner.hpp', diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index 04778c8ae9977..7f8d8be5fdb1c 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -17,13 +17,10 @@ * under the License. */ -#include - #include -#include -#include +#include -#include +#include #include G_BEGIN_DECLS @@ -31,72 +28,55 @@ G_BEGIN_DECLS /** * SECTION: scanner * @section_id: scanner - * @title: Scanner classes + * @title: Scanner related classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADatasetScanOptions is a class for a set of scan options. - * - * #GADatasetScanTask is an abstract class for a scan task. + * #GADatasetScanner is a class for scanning dataset. * - * #GADatasetInMemoryScanTask is a class for a scan task of record batches. + * #GADatasetScannerBuilder is a class for building a scanner. * - * Since: 1.0.0 + * Since: 5.0.0 */ -/* arrow::dataset::ScanOptions */ - -typedef struct GADatasetScanOptionsPrivate_ { - std::shared_ptr scan_options; -} GADatasetScanOptionsPrivate; +typedef struct GADatasetScannerPrivate_ { + std::shared_ptr scanner; +} GADatasetScannerPrivate; enum { - PROP_SCAN_OPTIONS = 1, - PROP_FILTER, - PROP_EVALUATOR, - PROP_PROJECTOR, - PROP_BATCH_SIZE, - PROP_USE_THREADS, + PROP_SCANNER = 1, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanOptions, - gadataset_scan_options, +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner, + gadataset_scanner, G_TYPE_OBJECT) -#define GADATASET_SCAN_OPTIONS_GET_PRIVATE(obj) \ - static_cast( \ - gadataset_scan_options_get_instance_private( \ - GADATASET_SCAN_OPTIONS(obj))) +#define GADATASET_SCANNER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_get_instance_private( \ + GADATASET_SCANNER(obj))) static void -gadataset_scan_options_finalize(GObject *object) +gadataset_scanner_finalize(GObject *object) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - - priv->scan_options.~shared_ptr(); - - G_OBJECT_CLASS(gadataset_scan_options_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + priv->scanner.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_parent_class)->finalize(object); } static void -gadataset_scan_options_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_OPTIONS: - priv->scan_options = - *static_cast *>( + case PROP_SCANNER: + priv->scanner = + *static_cast *>( g_value_get_pointer(value)); break; - case PROP_BATCH_SIZE: - priv->scan_options->batch_size = g_value_get_int64(value); - break; - case PROP_USE_THREADS: - priv->scan_options->use_threads = g_value_get_boolean(value); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -104,193 +84,92 @@ gadataset_scan_options_set_property(GObject *object, } static void -gadataset_scan_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_init(GADatasetScanner *object) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_BATCH_SIZE: - g_value_set_int64(value, priv->scan_options->batch_size); - break; - case PROP_USE_THREADS: - g_value_set_boolean(value, priv->scan_options->use_threads); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + new(&priv->scanner) std::shared_ptr; } static void -gadataset_scan_options_init(GADatasetScanOptions *object) +gadataset_scanner_class_init(GADatasetScannerClass *klass) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - new(&priv->scan_options) std::shared_ptr; -} + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_scanner_finalize; + gobject_class->set_property = gadataset_scanner_set_property; -static void -gadataset_scan_options_class_init(GADatasetScanOptionsClass *klass) -{ - GObjectClass *gobject_class; GParamSpec *spec; - - gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gadataset_scan_options_finalize; - gobject_class->set_property = gadataset_scan_options_set_property; - gobject_class->get_property = gadataset_scan_options_get_property; - - auto scan_options = std::make_shared(); - - spec = g_param_spec_pointer("scan-options", - "ScanOptions", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner", + "Scanner", + "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_OPTIONS, spec); - - // TODO: PROP_FILTER - // TODO: PROP_EVALUATOR - // TODO: PROP_PROJECTOR - - /** - * GADatasetScanOptions:batch-size: - * - * Maximum row count for scanned batches. - * - * Since: 1.0.0 - */ - spec = g_param_spec_int64("batch-size", - "Batch size", - "Maximum row count for scanned batches", - 0, - G_MAXINT64, - scan_options->batch_size, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_BATCH_SIZE, spec); - - /** - * GADatasetScanOptions:use-threads: - * - * Indicate if the Scanner should make use of a ThreadPool. - * - * Since: 4.0.0 - */ - spec = g_param_spec_boolean("use-threads", - "Use threads", - "Indicate if the Scanner should make use of a ThreadPool", - scan_options->use_threads, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER, spec); } /** - * gadataset_scan_options_new: - * @schema: A #GArrowSchema. - * - * Returns: A newly created #GADatasetScanOptions. - * - * Since: 1.0.0 - */ -GADatasetScanOptions * -gadataset_scan_options_new(GArrowSchema *schema) -{ - auto arrow_schema = garrow_schema_get_raw(schema); - auto arrow_scan_options = std::make_shared(); - arrow_scan_options->dataset_schema = arrow_schema; - return gadataset_scan_options_new_raw(&arrow_scan_options); -} - -/** - * gadataset_scan_options_get_schema: - * @scan_options: A #GADatasetScanOptions. + * gadataset_scanner_to_table: + * @scanner: A #GADatasetScanner. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GArrowSchema. + * Returns: (transfer full) (nullable): + * A newly created #GArrowTable on success, %NULL on error. * - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowSchema * -gadataset_scan_options_get_schema(GADatasetScanOptions *scan_options) +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(scan_options); - auto arrow_schema = priv->scan_options->dataset_schema; - return garrow_schema_new_raw(&arrow_schema); + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_table_result = arrow_scanner->ToTable(); + if (garrow::check(error, arrow_table_result, "[scanner][to-table]")) { + auto arrow_table = *arrow_table_result; + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } } -/* arrow::dataset::ScanTask */ -typedef struct GADatasetScanTaskPrivate_ { - std::shared_ptr scan_task; - GADatasetScanOptions *options; - GADatasetFragment *fragment; -} GADatasetScanTaskPrivate; +typedef struct GADatasetScannerBuilderPrivate_ { + std::shared_ptr scanner_builder; +} GADatasetScannerBuilderPrivate; enum { - PROP_SCAN_TASK = 1, - PROP_OPTIONS, - PROP_FRAGMENT, + PROP_SCANNER_BUILDER = 1, }; -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetScanTask, - gadataset_scan_task, - G_TYPE_OBJECT) - -#define GADATASET_SCAN_TASK_GET_PRIVATE(obj) \ - static_cast( \ - gadataset_scan_task_get_instance_private( \ - GADATASET_SCAN_TASK(obj))) - -static void -gadataset_scan_task_dispose(GObject *object) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - if (priv->options) { - g_object_unref(priv->options); - priv->options = NULL; - } +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder, + gadataset_scanner_builder, + G_TYPE_OBJECT) - if (priv->fragment) { - g_object_unref(priv->fragment); - priv->fragment = NULL; - } - - G_OBJECT_CLASS(gadataset_scan_task_parent_class)->dispose(object); -} +#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_builder_get_instance_private( \ + GADATASET_SCANNER_BUILDER(obj))) static void -gadataset_scan_task_finalize(GObject *object) +gadataset_scanner_builder_finalize(GObject *object) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - priv->scan_task.~shared_ptr(); - - G_OBJECT_CLASS(gadataset_scan_task_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + priv->scanner_builder.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_builder_parent_class)->finalize(object); } static void -gadataset_scan_task_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_TASK: - priv->scan_task = - *static_cast *>( + case PROP_SCANNER_BUILDER: + priv->scanner_builder = + *static_cast *>( g_value_get_pointer(value)); break; - case PROP_OPTIONS: - priv->options = GADATASET_SCAN_OPTIONS(g_value_dup_object(value)); - break; - case PROP_FRAGMENT: - priv->fragment = GADATASET_FRAGMENT(g_value_dup_object(value)); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -298,230 +177,112 @@ gadataset_scan_task_set_property(GObject *object, } static void -gadataset_scan_task_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_init(GADatasetScannerBuilder *object) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_OPTIONS: - g_value_set_object(value, priv->options); - break; - case PROP_FRAGMENT: - g_value_set_object(value, priv->fragment); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + new(&priv->scanner_builder) std::shared_ptr; } static void -gadataset_scan_task_init(GADatasetScanTask *object) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - new(&priv->scan_task) std::shared_ptr; -} - -static void -gadataset_scan_task_class_init(GADatasetScanTaskClass *klass) +gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->dispose = gadataset_scan_task_dispose; - gobject_class->finalize = gadataset_scan_task_finalize; - gobject_class->set_property = gadataset_scan_task_set_property; - gobject_class->get_property = gadataset_scan_task_get_property; + gobject_class->finalize = gadataset_scanner_builder_finalize; + gobject_class->set_property = gadataset_scanner_builder_set_property; GParamSpec *spec; - spec = g_param_spec_pointer("scan-task", - "ScanTask", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner-builder", + "Scanner builder", + "The raw " + "std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_TASK, spec); - - /** - * GADatasetScanTask:options: - * - * The options of the scan task. - * - * Since: 1.0.0 - */ - spec = g_param_spec_object("options", - "Options", - "The options of the scan task", - GADATASET_TYPE_SCAN_OPTIONS, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); - - /** - * GADatasetScanTask:fragment: - * - * The fragment of the scan task. - * - * Since: 4.0.0 - */ - spec = g_param_spec_object("fragment", - "Fragment", - "The fragment of the scan task", - GADATASET_TYPE_FRAGMENT, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec); } /** - * gadataset_scan_task_get_options: - * @scan_task: A #GADatasetScanTask. - * - * Returns: (transfer full): A #GADatasetScanOptions. - * - * Since: 1.0.0 - */ -GADatasetScanOptions * -gadataset_scan_task_get_options(GADatasetScanTask *scan_task) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->options) { - g_object_ref(priv->options); - return priv->options; - } - - auto arrow_options = priv->scan_task->options(); - return gadataset_scan_options_new_raw(&arrow_options); -} - -/** - * gadataset_scan_task_get_fragment: - * @scan_task: A #GADatasetFragment. + * gadataset_scanner_builder_new: + * @dataset: A #GADatasetDatast to be scanned. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GADatasetFragment. + * Returns: (nullable): A newly created #GADatasetScannerBuilder on success, + * %NULL on error. * - * Since: 4.0.0 + * Since: 5.0.0 */ -GADatasetFragment * -gadataset_scan_task_get_fragment(GADatasetScanTask *scan_task) +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->fragment) { - g_object_ref(priv->fragment); - return priv->fragment; + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (garrow::check(error, + arrow_scanner_builder_result, + "[scanner-builder][new]")) { + auto arrow_scanner_builder = *arrow_scanner_builder_result; + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); + } else { + return NULL; } - - auto arrow_fragment = priv->scan_task->fragment(); - return gadataset_fragment_new_raw(&arrow_fragment); } /** - * gadataset_scan_task_execute: - * @scan_task: A #GADatasetScanTask. + * gadataset_scanner_builder_finish: + * @builder: A #GADatasetScannerBuilder. * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (nullable) (transfer full): A newly created #GArrowRecordBatchIterator, - * or %NULL on error. + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScanner on success, %NULL on error. * - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowRecordBatchIterator * -gadataset_scan_task_execute(GADatasetScanTask *scan_task, - GError **error) +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - auto arrow_result = priv->scan_task->Execute(); - if (garrow::check(error, arrow_result, "[datasets][scan-task][execute]")) { - auto arrow_record_batch_iteraor = std::move(*arrow_result); - return garrow_record_batch_iterator_new_raw(&arrow_record_batch_iteraor); + auto arrow_builder = gadataset_scanner_builder_get_raw(builder); + auto arrow_scanner_result = arrow_builder->Finish(); + if (garrow::check(error, arrow_scanner_result, "[scanner-builder][finish]")) { + auto arrow_scanner = *arrow_scanner_result; + return gadataset_scanner_new_raw(&arrow_scanner); } else { return NULL; } } -/* arrow::dataset::InMemoryScanTask */ - -G_DEFINE_TYPE(GADatasetInMemoryScanTask, - gadataset_in_memory_scan_task, - GADATASET_TYPE_SCAN_TASK) - -static void -gadataset_in_memory_scan_task_init(GADatasetInMemoryScanTask *object) -{ -} -static void -gadataset_in_memory_scan_task_class_init(GADatasetInMemoryScanTaskClass *klass) -{ -} +G_END_DECLS -/** - * gadataset_in_memory_scan_task_new: - * @record_batches: (array length=n_record_batches): - * (element-type GArrowRecordBatch): The record batches of the table. - * @n_record_batches: The number of record batches. - * @options: A #GADatasetScanOptions. - * @fragment: A #GADatasetInMemoryFragment. - * - * Returns: A newly created #GADatasetInMemoryScanTask. - * - * Since: 1.0.0 - */ -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment) +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner) { - std::vector> arrow_record_batches; - arrow_record_batches.reserve(n_record_batches); - for (gsize i = 0; i < n_record_batches; ++i) { - auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); - arrow_record_batches.push_back(arrow_record_batch); - } - auto arrow_options = gadataset_scan_options_get_raw(options); - auto arrow_fragment = gadataset_fragment_get_raw(GADATASET_FRAGMENT(fragment)); - auto arrow_in_memory_scan_task = - std::make_shared(arrow_record_batches, - arrow_options, - arrow_fragment); - return gadataset_in_memory_scan_task_new_raw(&arrow_in_memory_scan_task, - options, - fragment); + auto scanner = + GADATASET_SCANNER(g_object_new(GADATASET_TYPE_SCANNER, + "scanner", arrow_scanner, + NULL)); + return scanner; } -G_END_DECLS - -GADatasetScanOptions * -gadataset_scan_options_new_raw( - std::shared_ptr *arrow_scan_options) +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner) { - auto scan_options = - GADATASET_SCAN_OPTIONS(g_object_new(GADATASET_TYPE_SCAN_OPTIONS, - "scan-options", arrow_scan_options, - NULL)); - return scan_options; + auto priv = GADATASET_SCANNER_GET_PRIVATE(scanner); + return priv->scanner; } -std::shared_ptr -gadataset_scan_options_get_raw(GADatasetScanOptions *scan_options) +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(scan_options); - return priv->scan_options; + return GADATASET_SCANNER_BUILDER( + g_object_new(GADATASET_TYPE_SCANNER_BUILDER, + "scanner-builder", arrow_scanner_builder, + NULL)); } -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new_raw( - std::shared_ptr *arrow_in_memory_scan_task, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment) +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder) { - auto in_memory_scan_task = - GADATASET_IN_MEMORY_SCAN_TASK(g_object_new(GADATASET_TYPE_IN_MEMORY_SCAN_TASK, - "scan-task", arrow_in_memory_scan_task, - "options", options, - "fragment", fragment, - NULL)); - return in_memory_scan_task; + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(scanner_builder); + return priv->scanner_builder; } diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index 90a60363e8227..446815d6db102 100644 --- a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -19,76 +19,45 @@ #pragma once -#include - +#include #include G_BEGIN_DECLS -/* arrow::dataset::ScanOptions */ - -#define GADATASET_TYPE_SCAN_OPTIONS (gadataset_scan_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetScanOptions, - gadataset_scan_options, +#define GADATASET_TYPE_SCANNER (gadataset_scanner_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScanner, + gadataset_scanner, GADATASET, - SCAN_OPTIONS, + SCANNER, GObject) -struct _GADatasetScanOptionsClass +struct _GADatasetScannerClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error); -GARROW_AVAILABLE_IN_1_0 -GADatasetScanOptions * -gadataset_scan_options_new(GArrowSchema *schema); -GARROW_AVAILABLE_IN_1_0 -GArrowSchema * -gadataset_scan_options_get_schema(GADatasetScanOptions *scan_options); - -/* arrow::dataset::ScanTask */ - -#define GADATASET_TYPE_SCAN_TASK (gadataset_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetScanTask, - gadataset_scan_task, +#define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScannerBuilder, + gadataset_scanner_builder, GADATASET, - SCAN_TASK, + SCANNER_BUILDER, GObject) -struct _GADatasetScanTaskClass +struct _GADatasetScannerBuilderClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_1_0 -GADatasetScanOptions * -gadataset_scan_task_get_options(GADatasetScanTask *scan_task); -GARROW_AVAILABLE_IN_4_0 -GADatasetFragment * -gadataset_scan_task_get_fragment(GADatasetScanTask *scan_task); -GARROW_AVAILABLE_IN_1_0 -GArrowRecordBatchIterator * -gadataset_scan_task_execute(GADatasetScanTask *scan_task, - GError **error); - -/* arrow::dataset::InMemoryScanTask */ - -#define GADATASET_TYPE_IN_MEMORY_SCAN_TASK \ - (gadataset_in_memory_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetInMemoryScanTask, - gadataset_in_memory_scan_task, - GADATASET, - IN_MEMORY_SCAN_TASK, - GADatasetScanTask) -struct _GADatasetInMemoryScanTaskClass -{ - GADatasetScanTaskClass parent_class; -}; - -GARROW_AVAILABLE_IN_1_0 -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment); +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/scanner.hpp b/c_glib/arrow-dataset-glib/scanner.hpp index ad3ac6a03cdbb..663ab6fc44b82 100644 --- a/c_glib/arrow-dataset-glib/scanner.hpp +++ b/c_glib/arrow-dataset-glib/scanner.hpp @@ -24,14 +24,14 @@ #include #include -GADatasetScanOptions * -gadataset_scan_options_new_raw( - std::shared_ptr *arrow_scan_options); -std::shared_ptr -gadataset_scan_options_get_raw(GADatasetScanOptions *scan_options); +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner); +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner); -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new_raw( - std::shared_ptr *arrow_in_memory_scan_task, - GADatasetScanOptions *scan_options, - GADatasetInMemoryFragment *fragment); +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder); +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder); diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index d5b221a36b00b..1eb65b88964a9 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -221,9 +221,9 @@ garrow_equal_options_set_property(GObject *object, static void garrow_equal_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) + guint prop_id, + GValue *value, + GParamSpec *pspec) { auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index 9a1ae059378b5..3e8da5bd9d184 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -36,9 +36,15 @@ - - Read - + + Data + + Dataset + + Dataset factory + + + Scan Fragment @@ -60,6 +66,10 @@ Index of deprecated API + + Index of new symbols in 4.0.0 + + Index of new symbols in 4.0.0 @@ -68,9 +78,5 @@ Index of new symbols in 3.0.0 - - Index of new symbols in 1.0.0 - - diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb new file mode 100644 index 0000000000000..9ef629c222eed --- /dev/null +++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, @path) + yield + end + end + + def test_file_system + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end + + def test_file_system_uri + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(@path) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end +end diff --git a/ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-file-system-dataset.rb similarity index 64% rename from ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb rename to c_glib/test/dataset/test-file-system-dataset.rb index 37f041d315986..6d6ec3b18c6d0 100644 --- a/ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. -class TestInMemoryScanTask < Test::Unit::TestCase +class TestDatasetFileSystemDataset < Test::Unit::TestCase def setup - @record_batches = [ - Arrow::RecordBatch.new(visible: [true, false, true], - point: [1, 2, 3]), - ] + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + format = ArrowDataset::IPCFileFormat.new + factory = ArrowDataset::FileSystemDatasetFactory.new(format) + factory.file_system = Arrow::LocalFileSystem.new + @dataset = factory.finish + yield + end end - sub_test_case(".new") do - test("[[Arrow::RecordBatch]]") do - scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches) - assert_equal(@record_batches, - scan_task.execute.to_a) - end + def test_type_name + assert_equal("filesystem", @dataset.type_name) end end diff --git a/c_glib/test/dataset/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-in-memory-scan-task.rb deleted file mode 100644 index 06e3d0d2424d6..0000000000000 --- a/c_glib/test/dataset/test-in-memory-scan-task.rb +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetInMemoryScanTask < Test::Unit::TestCase - include Helper::Buildable - - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - fields = [ - Arrow::Field.new("visible", Arrow::BooleanDataType.new), - Arrow::Field.new("point", Arrow::Int32DataType.new), - ] - @schema = Arrow::Schema.new(fields) - @record_batches = [ - [ - build_boolean_array([true, false, true]), - build_int32_array([1, 2, 3]), - ], - [ - build_boolean_array([false, true, false, true]), - build_int32_array([-1, -2, -3, -4]), - ] - ].collect do |columns| - Arrow::RecordBatch.new(@schema, columns[0].length, columns) - end - - @scan_options = ArrowDataset::ScanOptions.new(@schema) - - @fragment = ArrowDataset::InMemoryFragment.new(@schema, - @record_batches) - - @scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches, - @scan_options, - @fragment) - end - - def test_scan_options - assert_equal(@scan_options, @scan_task.options) - end - - def test_execute - assert_equal(@record_batches, - @scan_task.execute.to_list) - end -end diff --git a/c_glib/test/dataset/test-scan-options.rb b/c_glib/test/dataset/test-scan-options.rb deleted file mode 100644 index 0536b2a7cca57..0000000000000 --- a/c_glib/test/dataset/test-scan-options.rb +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetScanOptions < Test::Unit::TestCase - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - @schema = Arrow::Schema.new([]) - @scan_options = ArrowDataset::ScanOptions.new(@schema) - end - - def test_schema - assert_equal(@schema, - @scan_options.schema) - end - - def test_batch_size - assert_equal(1<<20, - @scan_options.batch_size) - @scan_options.batch_size = 42 - assert_equal(42, - @scan_options.batch_size) - end - - def test_use_threads - assert do - not @scan_options.use_threads? - end - @scan_options.use_threads = true - assert do - @scan_options.use_threads? - end - end -end diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb new file mode 100644 index 0000000000000..f7702d4905fb6 --- /dev/null +++ b/c_glib/test/dataset/test-scanner.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetScanner < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + path = File.join(tmpdir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, path) + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(path) + @dataset = factory.finish + builder = @dataset.begin_scan + @scanner = builder.finish + yield + end + end + + def test_to_table + assert_equal(@table, @scanner.to_table) + end +end diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index 04ae22f871580..356fa651c6a18 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -205,7 +205,15 @@ def append_to_builder(builder, value) def build_table(columns) fields = [] chunked_arrays = [] - columns.each do |name, chunked_array| + columns.each do |name, data| + case data + when Arrow::Array + chunked_array = Arrow::ChunkedArray.new([data]) + when Array + chunked_array = Arrow::ChunkedArray.new(data) + else + chunked_array = data + end fields << Arrow::Field.new(name, chunked_array.value_data_type) chunked_arrays << chunked_array end @@ -222,6 +230,15 @@ def build_record_batch(columns) Arrow::RecordBatch.new(schema, n_rows, columns.values) end + def build_file_uri(path) + absolute_path = File.expand_path(path) + if absolute_path.start_with?("/") + "file://#{absolute_path}" + else + "file:///#{absolute_path}" + end + end + private def build_array(builder, values) values.each do |value| diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb b/c_glib/test/helper/writable.rb similarity index 63% rename from ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb rename to c_glib/test/helper/writable.rb index 917d6c79d0d71..0053e972f9177 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb +++ b/c_glib/test/helper/writable.rb @@ -15,18 +15,25 @@ # specific language governing permissions and limitations # under the License. -module ArrowDataset - class InMemoryFragment - alias_method :initialize_raw, :initialize - private :initialize_raw - def initialize(schema, record_batches) - record_batches = record_batches.collect do |record_batch| - unless record_batch.is_a?(Arrow::RecordBatch) - record_batch = Arrow::RecordBatch.new(record_batch) +module Helper + module Writable + def write_table(table, path, type: :file) + output = Arrow::FileOutputStream.new(path, false) + begin + if type == :file + writer_class = Arrow::RecordBatchFileWriter + else + writer_class = Arrow::RecordBatchStreamWriter end - record_batch + writer = writer_class.new(output, table.schema) + begin + writer.write_table(table) + ensure + writer.close + end + ensure + output.close end - initialize_raw(schema, record_batches) end end end diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 044cb33a0198b..9c6af05224e02 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -83,10 +83,11 @@ class BooleanScalar require_relative "helper/buildable" require_relative "helper/data-type" require_relative "helper/fixture" -require_relative "helper/omittable" -require_relative "helper/plasma-store" if defined?(ArrowFlight) require_relative "helper/flight-server" end +require_relative "helper/omittable" +require_relative "helper/plasma-store" +require_relative "helper/writable" exit(Test::Unit::AutoRunner.run(true, test_dir.to_s)) diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h index 5559638448fe3..40c020519555f 100644 --- a/cpp/src/arrow/dataset/discovery.h +++ b/cpp/src/arrow/dataset/discovery.h @@ -237,16 +237,23 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { std::shared_ptr format, FileSystemFactoryOptions options); + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// file information. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] files passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, const std::vector& files, + std::shared_ptr format, FileSystemFactoryOptions options); + Result>> InspectSchemas( InspectOptions options) override; Result> Finish(FinishOptions options) override; protected: - static Result> Make( - std::shared_ptr filesystem, const std::vector& files, - std::shared_ptr format, FileSystemFactoryOptions options); - FileSystemDatasetFactory(std::vector files, std::shared_ptr filesystem, std::shared_ptr format, diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb similarity index 69% rename from ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb rename to ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb index 1467743655bd9..a658fc3f2e0c8 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb @@ -16,21 +16,13 @@ # under the License. module ArrowDataset - class ScanOptions + class Dataset class << self - def try_convert(value) - case value - when Hash - return nil unless value.key?(:schema) - options = new(value[:schema]) - value.each do |name, value| - next if name == :schema - options.__send__("#{name}=", value) - end - options - else - nil - end + def build(*args) + factory_class = ArrowDataset.const_get("#{name}Factory") + factory = factory_class.new(*args) + yield(factory) + factory.finish end end end diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb deleted file mode 100644 index 5e127e179c658..0000000000000 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module ArrowDataset - class InMemoryScanTask - alias_method :initialize_raw, :initialize - private :initialize_raw - def initialize(record_batches, **options) - record_batches = record_batches.collect do |record_batch| - unless record_batch.is_a?(Arrow::RecordBatch) - record_batch = Arrow::RecordBatch.new(record_batch) - end - record_batch - end - options[:schema] ||= record_batches.first.schema - fragment = options.delete(:fragment) - fragment ||= InMemoryFragment.new(options[:schema], record_batches) - initialize_raw(record_batches, options, fragment) - end - end -end diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb index fcac52d268f86..6a0dc5079d869 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb @@ -29,8 +29,7 @@ def post_load(repository, namespace) end def require_libraries - require "arrow-dataset/in-memory-scan-task" - require "arrow-dataset/scan-options" + require "arrow-dataset/dataset" end end end diff --git a/ruby/red-arrow-dataset/test/helper.rb b/ruby/red-arrow-dataset/test/helper.rb index 795df3beb0144..7231eb1cb64d6 100644 --- a/ruby/red-arrow-dataset/test/helper.rb +++ b/ruby/red-arrow-dataset/test/helper.rb @@ -17,4 +17,6 @@ require "arrow-dataset" +require "tmpdir" + require "test-unit" diff --git a/ruby/red-arrow-dataset/test/test-scan-options.rb b/ruby/red-arrow-dataset/test/test-file-system-dataset.rb similarity index 58% rename from ruby/red-arrow-dataset/test/test-scan-options.rb rename to ruby/red-arrow-dataset/test/test-file-system-dataset.rb index a9a947ff88de4..17cbcb88d744a 100644 --- a/ruby/red-arrow-dataset/test/test-scan-options.rb +++ b/ruby/red-arrow-dataset/test/test-file-system-dataset.rb @@ -15,22 +15,24 @@ # specific language governing permissions and limitations # under the License. -class TestScanOptions < Test::Unit::TestCase +class TestFileSystemDataset < Test::Unit::TestCase def setup - @record_batches = [ - Arrow::RecordBatch.new(visible: [true, false, true], - point: [1, 2, 3]), - ] - @schema = @record_batches.first.schema + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = Arrow::Table.new(visible: [true, false, true], + point: [1, 2, 3]) + @table.save(@path) + @format = ArrowDataset::IPCFileFormat.new + yield + end end - sub_test_case(".try_convert") do - def test_hash - batch_size = 1024 - context = ArrowDataset::ScanOptions.try_convert(schema: @schema, - batch_size: batch_size) - assert_equal([@schema, batch_size], - [context.schema, context.batch_size]) + test(".build") do + dataset = ArrowDataset::FileSystemDataset.build(@format) do |factory| + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) end + assert_equal(@table, dataset.to_table) end end