Skip to content

Commit

Permalink
ARROW-15121: [C++] Implement max recursion on GcsFileSystem
Browse files Browse the repository at this point in the history
Recursive listing without limit is about as expensive as only listing
the top-level directories in GCS. Therefore, it is just *more* efficient
to filter the results on the client-side, as this requires fewer request
than listing only only 0, 1, or N levels in a directory hierarchy.

I also improved the tests to verify no objects with similar prefixes are
returned, for example, when listing objects starting with 'aaa' we do
not want 'aaab', but we want 'aaa/b'.

Closes #11969 from coryan/ARROW-15121-gcsfs-implement-max-recursion

Authored-by: Carlos O'Ryan <coryan@google.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
coryan authored and pitrou committed Dec 20, 2021
1 parent e7c2ead commit 61745be
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 14 deletions.
9 changes: 6 additions & 3 deletions cpp/src/arrow/filesystem/gcsfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,9 @@ class GcsFileSystem::Impl {

Result<FileInfoVector> GetFileInfo(const FileSelector& select) {
ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir));
auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(p.object);
const auto canonical = internal::EnsureTrailingSlash(p.object);
const auto max_depth = internal::Depth(canonical) + select.max_recursion;
auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical);
auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/");
bool found_directory = false;
FileInfoVector result;
Expand All @@ -311,8 +313,9 @@ class GcsFileSystem::Impl {
return internal::ToArrowStatus(o.status());
}
found_directory = true;
// Skip the directory itself from the results
if (o->name() == p.object) {
// Skip the directory itself from the results, and any result that is "too deep"
// into the recursion.
if (o->name() == p.object || internal::Depth(o->name()) > max_depth) {
continue;
}
auto path = internal::ConcatAbstractPath(o->bucket(), o->name());
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/filesystem/gcsfs_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <unordered_map>
#include <vector>

#include "arrow/filesystem/path_util.h"
#include "arrow/util/key_value_metadata.h"

namespace arrow {
Expand Down Expand Up @@ -258,6 +259,10 @@ Result<std::shared_ptr<const KeyValueMetadata>> FromObjectMetadata(
return result;
}

std::int64_t Depth(arrow::util::string_view path) {
return std::count(path.begin(), path.end(), fs::internal::kSep);
}

} // namespace internal
} // namespace fs
} // namespace arrow
2 changes: 2 additions & 0 deletions cpp/src/arrow/filesystem/gcsfs_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ Result<google::cloud::storage::WithObjectMetadata> ToObjectMetadata(
Result<std::shared_ptr<const KeyValueMetadata>> FromObjectMetadata(
google::cloud::storage::ObjectMetadata const& m);

std::int64_t Depth(arrow::util::string_view path);

} // namespace internal
} // namespace fs
} // namespace arrow
72 changes: 61 additions & 11 deletions cpp/src/arrow/filesystem/gcsfs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -199,16 +199,26 @@ class GcsIntegrationTest : public ::testing::Test {

Result<Hierarchy> CreateHierarchy(std::shared_ptr<arrow::fs::FileSystem> fs) {
const char* const kTestFolders[] = {
"a/", "a/0/", "a/0/0/", "a/1/", "a/2/",
"b/",
"b/0/",
"b/0/0/",
"b/1/",
"b/2/",
// Create some additional folders that should not appear in any listing of b/
"aa/",
"ba/",
"c/",
};
constexpr auto kFilesPerFolder = 4;
auto result = Hierarchy{PreexistingBucketPath() + "a/", {}};
constexpr auto kFilesPerFolder = 2;
auto base_dir = internal::ConcatAbstractPath(PreexistingBucketPath(), "b/");
auto result = Hierarchy{base_dir, {}};
for (auto const* f : kTestFolders) {
const auto folder = PreexistingBucketPath() + f;
const auto folder = internal::ConcatAbstractPath(PreexistingBucketPath(), f);
RETURN_NOT_OK(fs->CreateDir(folder, true));
result.contents.push_back(arrow::fs::Dir(folder));
for (int i = 0; i != kFilesPerFolder; ++i) {
const auto filename = folder + "test-file-" + std::to_string(i);
const auto filename =
internal::ConcatAbstractPath(folder, "test-file-" + std::to_string(i));
CreateFile(fs.get(), filename, filename);
result.contents.push_back(arrow::fs::File(filename));
}
Expand Down Expand Up @@ -483,14 +493,19 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) {
auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs));
std::vector<arrow::fs::FileInfo> expected;
std::copy_if(
hierarchy.contents.begin(), hierarchy.contents.end(), std::back_inserter(expected),
[&](const arrow::fs::FileInfo& info) { return hierarchy.base_dir != info.path(); });
std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(),
std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) {
if (!fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) {
return false;
}
return hierarchy.base_dir != info.path();
});

auto selector = FileSelector();
selector.base_dir = hierarchy.base_dir;
selector.allow_not_found = false;
selector.recursive = true;
selector.max_recursion = 16;
ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector));
EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end()));
}
Expand All @@ -515,6 +530,34 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorNonRecursive) {
EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end()));
}

TEST_F(GcsIntegrationTest, GetFileInfoSelectorLimitedRecursion) {
auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs));

for (const auto max_recursion : {0, 1, 2, 3}) {
SCOPED_TRACE("Testing with max_recursion=" + std::to_string(max_recursion));
const auto max_depth =
internal::Depth(internal::EnsureTrailingSlash(hierarchy.base_dir)) +
max_recursion;
std::vector<arrow::fs::FileInfo> expected;
std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(),
std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) {
if (info.path() == hierarchy.base_dir) return false;
if (!fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) {
return false;
}
return internal::Depth(info.path()) <= max_depth;
});
auto selector = FileSelector();
selector.base_dir = hierarchy.base_dir;
selector.allow_not_found = true;
selector.recursive = true;
selector.max_recursion = max_recursion;
ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector));
EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end()));
}
}

TEST_F(GcsIntegrationTest, GetFileInfoSelectorNotFoundTrue) {
auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());

Expand Down Expand Up @@ -591,7 +634,10 @@ TEST_F(GcsIntegrationTest, DeleteDirSuccess) {
arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory);
arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File);
for (auto const& info : hierarchy.contents) {
arrow::fs::AssertFileInfo(fs.get(), info.path(), FileType::NotFound);
const auto expected_type = fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())
? FileType::NotFound
: info.type();
arrow::fs::AssertFileInfo(fs.get(), info.path(), expected_type);
}
}

Expand All @@ -604,8 +650,12 @@ TEST_F(GcsIntegrationTest, DeleteDirContentsSuccess) {
arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory);
arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File);
for (auto const& info : hierarchy.contents) {
if (info.path() == hierarchy.base_dir) continue;
arrow::fs::AssertFileInfo(fs.get(), info.path(), FileType::NotFound);
auto expected_type = FileType::NotFound;
if (info.path() == hierarchy.base_dir ||
!fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) {
expected_type = info.type();
}
arrow::fs::AssertFileInfo(fs.get(), info.path(), expected_type);
}
}

Expand Down

0 comments on commit 61745be

Please sign in to comment.