From 5b70f4b0bb3f32972ce349e53afcce3e123312de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Fri, 1 May 2020 10:24:21 -0400 Subject: [PATCH] Address comments --- cpp/src/arrow/dataset/discovery.cc | 11 +++++------ cpp/src/arrow/dataset/discovery.h | 2 +- cpp/src/arrow/dataset/file_base.cc | 13 ++++--------- cpp/src/arrow/dataset/file_base.h | 3 ++- cpp/src/arrow/dataset/filter.h | 6 +++++- cpp/src/arrow/dataset/test_util.h | 2 +- python/pyarrow/_dataset.pyx | 11 +++++++---- python/pyarrow/includes/libarrow_dataset.pxd | 2 +- 8 files changed, 26 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index 1449716778f32..412208c6c10eb 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -110,7 +110,7 @@ FileSystemDatasetFactory::FileSystemDatasetFactory( format_(std::move(format)), options_(std::move(options)) {} -util::optional FileSystemDatasetFactory::BaselessPath( +util::optional FileSystemDatasetFactory::RemovePartitionBaseDir( util::string_view path) { const util::string_view partition_base_dir{options_.partition_base_dir}; return fs::internal::RemoveAncestor(partition_base_dir, path); @@ -193,7 +193,7 @@ Result> FileSystemDatasetFactory::PartitionSchema() { std::vector relative_paths; for (const auto& path : paths_) { - if (auto relative = BaselessPath(path)) { + if (auto relative = RemovePartitionBaseDir(path)) { relative_paths.push_back(*relative); } } @@ -241,12 +241,11 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions ARROW_ASSIGN_OR_RAISE(partitioning, factory->Finish(schema)); } - FragmentVector fragments; + std::vector> fragments; for (const auto& path : paths_) { std::shared_ptr partition = scalar(true); - if (auto relative = BaselessPath(path)) { - std::string path_string{*relative}; - partition = partitioning->Parse(path_string).ValueOr(scalar(true)); + if (auto relative = RemovePartitionBaseDir(path)) { + partition = partitioning->Parse(relative->to_string()).ValueOr(scalar(true)); } ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({path, fs_}, partition)); diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h index 5030b7ee540b7..8c8508c6cddb8 100644 --- a/cpp/src/arrow/dataset/discovery.h +++ b/cpp/src/arrow/dataset/discovery.h @@ -233,7 +233,7 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { FileSystemFactoryOptions options_; private: - util::optional BaselessPath(util::string_view path); + util::optional RemovePartitionBaseDir(util::string_view path); }; } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 02ad596cecd22..9d3a427ccc7cd 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -91,16 +91,11 @@ FileSystemDataset::FileSystemDataset(std::shared_ptr schema, Result> FileSystemDataset::Make( std::shared_ptr schema, std::shared_ptr root_partition, - std::shared_ptr format, FragmentVector fragments) { - std::vector> file_fragments; - for (const auto& fragment : fragments) { - auto file_fragment = internal::checked_pointer_cast(fragment); - file_fragments.push_back(std::move(file_fragment)); - } - + std::shared_ptr format, + std::vector> fragments) { return std::shared_ptr( new FileSystemDataset(std::move(schema), std::move(root_partition), - std::move(format), std::move(file_fragments))); + std::move(format), std::move(fragments))); } Result> FileSystemDataset::ReplaceSchema( @@ -164,7 +159,7 @@ Result> FileSystemDataset::Write( auto partition_base_dir = fs::internal::EnsureTrailingSlash(plan.partition_base_dir); auto extension = "." + plan.format->type_name(); - FragmentVector fragments; + std::vector> fragments; for (size_t i = 0; i < plan.paths.size(); ++i) { const auto& op = plan.fragment_or_partition_expressions[i]; if (util::holds_alternative>(op)) { diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index f3243e09aa297..85538b7832dd3 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -202,7 +202,8 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset { /// \return A constructed dataset. static Result> Make( std::shared_ptr schema, std::shared_ptr root_partition, - std::shared_ptr format, FragmentVector fragments); + std::shared_ptr format, + std::vector> fragments); /// \brief Write to a new format and filesystem location, preserving partitioning. /// diff --git a/cpp/src/arrow/dataset/filter.h b/cpp/src/arrow/dataset/filter.h index d8ef4f7f890ab..4a00bf09478c9 100644 --- a/cpp/src/arrow/dataset/filter.h +++ b/cpp/src/arrow/dataset/filter.h @@ -188,7 +188,11 @@ class ARROW_DS_EXPORT Expression { /// This is a shortcut to check if the expression is neither null nor false. bool IsSatisfiable() const { return !IsNull() && !Equals(false); } - bool IsSatisfiableWith(const std::shared_ptr other) const { + /// Indicates if the expression is satisfiable given an other expression. + /// + /// This behaves like IsSatisfiable, but it simplifies the current expression + /// with the given `other` information. + bool IsSatisfiableWith(const std::shared_ptr& other) const { return Assume(other)->IsSatisfiable(); } diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 26b43ef78162d..75a45618244ee 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -300,7 +300,7 @@ struct MakeFileSystemDatasetMixin { MakeFileSystem(infos); auto format = std::make_shared(); - FragmentVector fragments; + std::vector> fragments; for (size_t i = 0; i < n_fragments; i++) { const auto& info = infos[i]; if (!info.IsFile()) { diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 182a3a49d0c10..7c8f006f41f45 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -275,10 +275,11 @@ cdef class FileSystemDataset(Dataset): cdef: FileInfo info Expression expr - Fragment fragment + FileFragment fragment vector[CFileInfo] c_file_infos vector[shared_ptr[CExpression]] c_partitions - vector[shared_ptr[CFragment]] c_fragments + shared_ptr[CFileFragment] c_fragment + vector[shared_ptr[CFileFragment]] c_fragments CResult[shared_ptr[CDataset]] result # validate required arguments @@ -296,7 +297,7 @@ cdef class FileSystemDataset(Dataset): infos = filesystem.get_file_info(paths_or_selector) if partitions is None: - partitions = [ScalarExpression(True) for _ in range(len(infos))] + partitions = [ScalarExpression(True)] * len(infos) if len(infos) != len(partitions): raise ValueError( @@ -308,7 +309,9 @@ cdef class FileSystemDataset(Dataset): if info.is_file: fragment = format.make_fragment(info.path, filesystem, partitions[i]) - c_fragments.push_back(fragment.unwrap()) + c_fragments.push_back( + static_pointer_cast[CFileFragment, CFragment]( + fragment.unwrap())) if root_partition is None: root_partition = ScalarExpression(True) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 75e835b3d88d6..43d8d63df15c7 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -260,7 +260,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: shared_ptr[CSchema] schema, shared_ptr[CExpression] source_partition, shared_ptr[CFileFormat] format, - CFragmentVector fragments) + vector[shared_ptr[CFileFragment]] fragments) c_string type() vector[c_string] files() const shared_ptr[CFileFormat]& format() const