Skip to content

Commit

Permalink
Changed recursive directory listing. Instead of "walk"ing the directo…
Browse files Browse the repository at this point in the history
…ries we now rely on S3's ability to do a recursive find. This significantly reduces the number of requests made to S3.

Fix an invalid lifecycle issue.

Minor tweak to the async task scheduler.  Tasks should all be destroyed before the scheduler ends

Add missing headers

Need to use auto to avoid API inconsistency in S3

Addressing PR review comments

Add a stress test for GetFileInfoGenerator.  Fix an old bug in DeleteDirContents.  Fix a bug in GetFileInfoGenerator.

Addressing review comments

Applying suggestion from pitrou

Remove yields added for debugging

Remove accidentally added include
  • Loading branch information
westonpace committed Jul 10, 2023
1 parent 05620b1 commit 5219550
Show file tree
Hide file tree
Showing 7 changed files with 592 additions and 399 deletions.
41 changes: 41 additions & 0 deletions cpp/src/arrow/filesystem/filesystem_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,34 @@ TEST(PathUtil, SplitAbstractPath) {
AssertPartsEqual(parts, {"abc", "def.ghi"});
}

TEST(PathUtil, SliceAbstractPath) {
std::string path = "abc";
ASSERT_EQ("abc", SliceAbstractPath(path, 0, 1));
ASSERT_EQ("abc", SliceAbstractPath(path, 0, 2));
ASSERT_EQ("", SliceAbstractPath(path, 0, 0));
ASSERT_EQ("", SliceAbstractPath(path, 1, 0));

path = "abc/def\\x/y.ext";
ASSERT_EQ("abc/def\\x/y.ext", SliceAbstractPath(path, 0, 4));
ASSERT_EQ("abc/def\\x/y.ext", SliceAbstractPath(path, 0, 3));
ASSERT_EQ("abc/def\\x", SliceAbstractPath(path, 0, 2));
ASSERT_EQ("abc", SliceAbstractPath(path, 0, 1));
ASSERT_EQ("def\\x/y.ext", SliceAbstractPath(path, 1, 2));
ASSERT_EQ("def\\x/y.ext", SliceAbstractPath(path, 1, 3));
ASSERT_EQ("def\\x", SliceAbstractPath(path, 1, 1));
ASSERT_EQ("y.ext", SliceAbstractPath(path, 2, 1));
ASSERT_EQ("", SliceAbstractPath(path, 3, 1));

path = "x/y\\z";
ASSERT_EQ("x", SliceAbstractPath(path, 0, 1));
ASSERT_EQ("x/y", SliceAbstractPath(path, 0, 1, /*sep=*/'\\'));

// Invalid cases but we shouldn't crash
ASSERT_EQ("", SliceAbstractPath(path, -1, 1));
ASSERT_EQ("", SliceAbstractPath(path, 0, -1));
ASSERT_EQ("", SliceAbstractPath(path, -1, -1));
}

TEST(PathUtil, GetAbstractPathExtension) {
ASSERT_EQ(GetAbstractPathExtension("abc.txt"), "txt");
ASSERT_EQ(GetAbstractPathExtension("dir/abc.txt"), "txt");
Expand All @@ -98,6 +126,19 @@ TEST(PathUtil, GetAbstractPathExtension) {
ASSERT_EQ(GetAbstractPathExtension("/run.d/abc"), "");
}

TEST(PathUtil, GetAbstractPathDepth) {
ASSERT_EQ(0, GetAbstractPathDepth(""));
ASSERT_EQ(0, GetAbstractPathDepth("/"));
ASSERT_EQ(1, GetAbstractPathDepth("foo"));
ASSERT_EQ(1, GetAbstractPathDepth("foo/"));
ASSERT_EQ(1, GetAbstractPathDepth("/foo"));
ASSERT_EQ(1, GetAbstractPathDepth("/foo/"));
ASSERT_EQ(2, GetAbstractPathDepth("/foo/bar"));
ASSERT_EQ(2, GetAbstractPathDepth("/foo/bar/"));
ASSERT_EQ(2, GetAbstractPathDepth("foo/bar"));
ASSERT_EQ(2, GetAbstractPathDepth("foo/bar/"));
}

TEST(PathUtil, GetAbstractPathParent) {
std::pair<std::string, std::string> pair;

Expand Down
37 changes: 37 additions & 0 deletions cpp/src/arrow/filesystem/path_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <algorithm>
#include <regex>
#include <sstream>

#include "arrow/filesystem/path_util.h"
#include "arrow/filesystem/util_internal.h"
Expand Down Expand Up @@ -66,6 +67,42 @@ std::vector<std::string> SplitAbstractPath(const std::string& path, char sep) {
return parts;
}

std::string SliceAbstractPath(const std::string& s, int offset, int length, char sep) {
if (offset < 0 || length < 0) {
return "";
}
std::vector<std::string> components = SplitAbstractPath(s, sep);
std::stringstream combined;
if (offset >= static_cast<int>(components.size())) {
return "";
}
int end = offset + length;
if (end > static_cast<int>(components.size())) {
end = static_cast<int>(components.size());
}
for (int i = offset; i < end; i++) {
combined << components[i];
if (i < end - 1) {
combined << sep;
}
}
return combined.str();
}

int GetAbstractPathDepth(std::string_view path) {
if (path.empty()) {
return 0;
}
int depth = static_cast<int>(std::count(path.begin(), path.end(), kSep)) + 1;
if (path.back() == kSep) {
depth -= 1;
}
if (path.front() == kSep) {
depth -= 1;
}
return depth;
}

std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s) {
// XXX should strip trailing slash?

Expand Down
20 changes: 18 additions & 2 deletions cpp/src/arrow/filesystem/path_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,25 @@ constexpr char kSep = '/';
ARROW_EXPORT
std::vector<std::string> SplitAbstractPath(const std::string& path, char sep = kSep);

// Return the extension of the file
// Slice the individual components of an abstract path and combine them
//
// If offset or length are negative then an empty string is returned
// If offset is >= the number of components then an empty string is returned
// If offset + length is >= the number of components then length is truncated
ARROW_EXPORT
std::string GetAbstractPathExtension(const std::string& s);
std::string SliceAbstractPath(const std::string& path, int offset, int length,
char sep = kSep);

// Return the extension of the file
ARROW_EXPORT std::string GetAbstractPathExtension(const std::string& s);

// Return the depth (number of components) of an abstract path
//
// Trailing slashes do not count towards depth
// Leading slashes do not count towards depth
//
// The root path ("/") has depth 0
ARROW_EXPORT int GetAbstractPathDepth(std::string_view path);

// Return the parent directory and basename of an abstract path. Both values may be
// empty.
Expand Down

0 comments on commit 5219550

Please sign in to comment.