Skip to content

Commit

Permalink
ARROW-7977: [C++] Rename fs::FileStats to fs::FileInfo
Browse files Browse the repository at this point in the history
If we use FileInfo instead of FileStats, we can use singular form
"info" and plural form "infos" as variable names instead of "stats"
and "stats_vector". It will help writing readable code.

Closes #6514 from kou/cpp-file-system-stat and squashes the following commits:

de69164 <Sutou Kouhei>  Rename fs::FileStats to fs::FileInfo

Authored-by: Sutou Kouhei <kou@clear-code.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
kou committed Mar 5, 2020
1 parent 6fa6c91 commit 34a7522
Show file tree
Hide file tree
Showing 50 changed files with 1,213 additions and 1,205 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/dataset_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ TEST_F(TestEndToEnd, EndToEndSingleDataset) {
// A selector is used to crawl files and directories of a
// filesystem. If the options in FileSelector are not enough, the
// FileSystemDatasetFactory class also supports an explicit list of
// fs::FileStats instead of the selector.
// fs::FileInfo instead of the selector.
fs::FileSelector s;
s.base_dir = "/dataset";
s.recursive = true;
Expand Down
32 changes: 16 additions & 16 deletions cpp/src/arrow/dataset/discovery.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,25 +127,25 @@ Result<fs::PathForest> FileSystemDatasetFactory::Filter(
const std::shared_ptr<fs::FileSystem>& filesystem,
const std::shared_ptr<FileFormat>& format, const FileSystemFactoryOptions& options,
fs::PathForest forest) {
fs::FileStatsVector out;
std::vector<fs::FileInfo> out;

auto& stats = forest.stats();
auto& infos = forest.infos();
RETURN_NOT_OK(forest.Visit([&](fs::PathForest::Ref ref) -> fs::PathForest::MaybePrune {
const auto& path = ref.stats().path();
const auto& path = ref.info().path();

if (StartsWithAnyOf(options.ignore_prefixes, path)) {
return fs::PathForest::Prune;
}

if (ref.stats().IsFile() && options.exclude_invalid_files) {
if (ref.info().IsFile() && options.exclude_invalid_files) {
ARROW_ASSIGN_OR_RAISE(auto supported,
format->IsSupported(FileSource(path, filesystem.get())));
if (!supported) {
return fs::PathForest::Continue;
}
}

out.push_back(std::move(stats[ref.i]));
out.push_back(std::move(infos[ref.i]));
return fs::PathForest::Continue;
}));

Expand All @@ -155,25 +155,25 @@ Result<fs::PathForest> FileSystemDatasetFactory::Filter(
Result<std::shared_ptr<DatasetFactory>> FileSystemDatasetFactory::Make(
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options) {
ARROW_ASSIGN_OR_RAISE(auto files, filesystem->GetTargetStats(paths));
ARROW_ASSIGN_OR_RAISE(auto files, filesystem->GetTargetInfos(paths));
ARROW_ASSIGN_OR_RAISE(auto forest, fs::PathForest::Make(std::move(files)));

std::unordered_set<fs::FileStats, fs::FileStats::ByPath> missing;
std::unordered_set<fs::FileInfo, fs::FileInfo::ByPath> missing;
DCHECK_OK(forest.Visit([&](fs::PathForest::Ref ref) {
util::string_view parent_path = options.partition_base_dir;
if (auto parent = ref.parent()) {
parent_path = parent.stats().path();
parent_path = parent.info().path();
}

for (auto&& path :
fs::internal::AncestorsFromBasePath(parent_path, ref.stats().path())) {
ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetTargetStats(std::move(path)));
fs::internal::AncestorsFromBasePath(parent_path, ref.info().path())) {
ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetTargetInfo(std::move(path)));
missing.insert(std::move(file));
}
return Status::OK();
}));

files = std::move(forest).stats();
files = std::move(forest).infos();
std::move(missing.begin(), missing.end(), std::back_inserter(files));

ARROW_ASSIGN_OR_RAISE(forest, fs::PathForest::Make(std::move(files)));
Expand All @@ -187,7 +187,7 @@ Result<std::shared_ptr<DatasetFactory>> FileSystemDatasetFactory::Make(
Result<std::shared_ptr<DatasetFactory>> FileSystemDatasetFactory::Make(
std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options) {
ARROW_ASSIGN_OR_RAISE(auto files, filesystem->GetTargetStats(selector));
ARROW_ASSIGN_OR_RAISE(auto files, filesystem->GetTargetInfos(selector));

ARROW_ASSIGN_OR_RAISE(auto forest, fs::PathForest::Make(std::move(files)));

Expand All @@ -210,9 +210,9 @@ Result<std::shared_ptr<Schema>> FileSystemDatasetFactory::PartitionSchema() {
}

std::vector<util::string_view> paths;
for (const auto& stats : forest_.stats()) {
for (const auto& info : forest_.infos()) {
if (auto relative =
fs::internal::RemoveAncestor(options_.partition_base_dir, stats.path())) {
fs::internal::RemoveAncestor(options_.partition_base_dir, info.path())) {
paths.push_back(*relative);
}
}
Expand All @@ -223,7 +223,7 @@ Result<std::shared_ptr<Schema>> FileSystemDatasetFactory::PartitionSchema() {
Result<std::vector<std::shared_ptr<Schema>>> FileSystemDatasetFactory::InspectSchemas() {
std::vector<std::shared_ptr<Schema>> schemas;

for (const auto& f : forest_.stats()) {
for (const auto& f : forest_.infos()) {
if (!f.IsFile()) continue;
FileSource src(f.path(), fs_.get());
ARROW_ASSIGN_OR_RAISE(auto schema, format_->Inspect(src));
Expand Down Expand Up @@ -259,7 +259,7 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(
// apply partitioning to forest to derive partitions
auto apply_partitioning = [&](fs::PathForest::Ref ref) {
if (auto relative = fs::internal::RemoveAncestor(options_.partition_base_dir,
ref.stats().path())) {
ref.info().path())) {
auto segments = fs::internal::SplitAbstractPath(relative->to_string());

if (segments.size() > 0) {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/dataset/discovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ struct FileSystemFactoryOptions {
};

/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
/// fs::FileStats or a fs::FileSelector.
/// fs::FileInfo or a fs::FileSelector.
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
public:
/// \brief Build a FileSystemDatasetFactory from an explicit list of
Expand All @@ -165,7 +165,7 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {

/// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
///
/// The selector will expand to a vector of FileStats. The expansion/crawling
/// The selector will expand to a vector of FileInfo. The expansion/crawling
/// is performed in this function call. Thus, the finalized Dataset is
/// working with a snapshot of the filesystem.
//
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/discovery_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ TEST_F(MockDatasetFactoryTest, UnifySchemas) {

class FileSystemDatasetFactoryTest : public DatasetFactoryTest {
public:
void MakeFactory(const std::vector<fs::FileStats>& files) {
void MakeFactory(const std::vector<fs::FileInfo>& files) {
MakeFileSystem(files);
ASSERT_OK_AND_ASSIGN(factory_, FileSystemDatasetFactory::Make(fs_, selector_, format_,
factory_options_));
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/arrow/dataset/file_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,17 @@ FileSystemDataset::FileSystemDataset(std::shared_ptr<Schema> schema,
Result<std::shared_ptr<Dataset>> FileSystemDataset::Make(
std::shared_ptr<Schema> schema, std::shared_ptr<Expression> root_partition,
std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
fs::FileStatsVector stats) {
ExpressionVector partitions(stats.size(), scalar(true));
std::vector<fs::FileInfo> infos) {
ExpressionVector partitions(infos.size(), scalar(true));
return Make(std::move(schema), std::move(root_partition), std::move(format),
std::move(filesystem), std::move(stats), std::move(partitions));
std::move(filesystem), std::move(infos), std::move(partitions));
}

Result<std::shared_ptr<Dataset>> FileSystemDataset::Make(
std::shared_ptr<Schema> schema, std::shared_ptr<Expression> root_partition,
std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
fs::FileStatsVector stats, ExpressionVector partitions) {
ARROW_ASSIGN_OR_RAISE(auto forest, fs::PathForest::Make(std::move(stats), &partitions));
std::vector<fs::FileInfo> infos, ExpressionVector partitions) {
ARROW_ASSIGN_OR_RAISE(auto forest, fs::PathForest::Make(std::move(infos), &partitions));
return Make(std::move(schema), std::move(root_partition), std::move(format),
std::move(filesystem), std::move(forest), std::move(partitions));
}
Expand All @@ -88,8 +88,8 @@ std::vector<std::string> FileSystemDataset::files() const {
std::vector<std::string> files;

DCHECK_OK(forest_.Visit([&](fs::PathForest::Ref ref) {
if (ref.stats().IsFile()) {
files.push_back(ref.stats().path());
if (ref.info().IsFile()) {
files.push_back(ref.info().path());
}
return Status::OK();
}));
Expand All @@ -105,7 +105,7 @@ std::string FileSystemDataset::ToString() const {
}

DCHECK_OK(forest_.Visit([&](fs::PathForest::Ref ref) {
repr += "\n" + ref.stats().path();
repr += "\n" + ref.info().path();

if (!partitions_[ref.i]->Equals(true)) {
repr += ": " + partitions_[ref.i]->ToString();
Expand Down Expand Up @@ -177,9 +177,9 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl(
}
}

if (ref.stats().IsFile()) {
if (ref.info().IsFile()) {
// generate a fragment for this file
FileSource src(ref.stats().path(), filesystem_.get());
FileSource src(ref.info().path(), filesystem_.get());
ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment(src, options[ref.i]));
fragments.push_back(std::move(fragment));
}
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/arrow/dataset/file_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,34 +161,34 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
/// \param[in] root_partition the top-level partition of the DataDataset
/// \param[in] format file format to create fragments from.
/// \param[in] filesystem the filesystem which files are from.
/// \param[in] stats a list of files/directories to consume.
/// attach additional partition expressions to FileStats found in `stats`.
/// \param[in] infos a list of files/directories to consume.
/// attach additional partition expressions to FileInfo found in `infos`.
///
/// The caller is not required to provide a complete coverage of nodes and
/// partitions.
static Result<std::shared_ptr<Dataset>> Make(std::shared_ptr<Schema> schema,
std::shared_ptr<Expression> root_partition,
std::shared_ptr<FileFormat> format,
std::shared_ptr<fs::FileSystem> filesystem,
fs::FileStatsVector stats);
std::vector<fs::FileInfo> infos);

/// \brief Create a FileSystemDataset with file-level partitions.
///
/// \param[in] schema the top-level schema of the DataDataset
/// \param[in] root_partition the top-level partition of the DataDataset
/// \param[in] format file format to create fragments from.
/// \param[in] filesystem the filesystem which files are from.
/// \param[in] stats a list of files/directories to consume.
/// \param[in] partitions partition information associated with `stats`.
/// attach additional partition expressions to FileStats found in `stats`.
/// \param[in] infos a list of files/directories to consume.
/// \param[in] partitions partition information associated with `infos`.
/// attach additional partition expressions to FileInfo found in `infos`.
///
/// The caller is not required to provide a complete coverage of nodes and
/// partitions.
static Result<std::shared_ptr<Dataset>> Make(std::shared_ptr<Schema> schema,
std::shared_ptr<Expression> root_partition,
std::shared_ptr<FileFormat> format,
std::shared_ptr<fs::FileSystem> filesystem,
fs::FileStatsVector stats,
std::vector<fs::FileInfo> infos,
ExpressionVector partitions);

/// \brief Create a FileSystemDataset with file-level partitions.
Expand All @@ -199,7 +199,7 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
/// \param[in] filesystem the filesystem which files are from.
/// \param[in] forest a PathForest of files/directories to consume.
/// \param[in] partitions partition information associated with `forest`.
/// attach additional partition expressions to FileStats found in `forest`.
/// attach additional partition expressions to FileInfo found in `forest`.
///
/// The caller is not required to provide a complete coverage of nodes and
/// partitions.
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/file_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ TEST_F(TestFileSystemDataset, RootPartitionPruning) {

TEST_F(TestFileSystemDataset, TreePartitionPruning) {
auto source_partition = ("country"_ == "US").Copy();
std::vector<fs::FileStats> regions = {
std::vector<fs::FileInfo> regions = {
fs::Dir("NY"), fs::File("NY/New York"), fs::File("NY/Franklin"),
fs::Dir("CA"), fs::File("CA/San Francisco"), fs::File("CA/Franklin"),
};
Expand Down
5 changes: 0 additions & 5 deletions cpp/src/arrow/dataset/partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,6 @@

namespace arrow {

namespace fs {
struct FileStats;
struct FileSelector;
} // namespace fs

namespace dataset {

// ----------------------------------------------------------------------
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/arrow/dataset/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,29 +286,29 @@ Result<std::shared_ptr<Fragment>> JSONRecordBatchFileFormat::MakeFragment(

class TestFileSystemDataset : public ::testing::Test {
public:
void MakeFileSystem(const std::vector<fs::FileStats>& stats) {
ASSERT_OK_AND_ASSIGN(fs_, fs::internal::MockFileSystem::Make(fs::kNoTime, stats));
void MakeFileSystem(const std::vector<fs::FileInfo>& infos) {
ASSERT_OK_AND_ASSIGN(fs_, fs::internal::MockFileSystem::Make(fs::kNoTime, infos));
}

void MakeFileSystem(const std::vector<std::string>& paths) {
std::vector<fs::FileStats> stats{paths.size()};
std::transform(paths.cbegin(), paths.cend(), stats.begin(),
std::vector<fs::FileInfo> infos{paths.size()};
std::transform(paths.cbegin(), paths.cend(), infos.begin(),
[](const std::string& p) { return fs::File(p); });

ASSERT_OK_AND_ASSIGN(fs_, fs::internal::MockFileSystem::Make(fs::kNoTime, stats));
ASSERT_OK_AND_ASSIGN(fs_, fs::internal::MockFileSystem::Make(fs::kNoTime, infos));
}

void MakeDataset(const std::vector<fs::FileStats>& stats,
void MakeDataset(const std::vector<fs::FileInfo>& infos,
std::shared_ptr<Expression> source_partition = scalar(true),
ExpressionVector partitions = {}) {
if (partitions.empty()) {
partitions.resize(stats.size(), scalar(true));
partitions.resize(infos.size(), scalar(true));
}

MakeFileSystem(stats);
MakeFileSystem(infos);
auto format = std::make_shared<DummyFileFormat>();
ASSERT_OK_AND_ASSIGN(
source_, FileSystemDataset::Make(schema({}), source_partition, format, fs_, stats,
source_, FileSystemDataset::Make(schema({}), source_partition, format, fs_, infos,
partitions));
}

Expand Down
3 changes: 1 addition & 2 deletions cpp/src/arrow/dataset/type_fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ namespace fs {

class FileSystem;

struct FileStats;
using FileStatsVector = std::vector<FileStats>;
struct FileInfo;

} // namespace fs

Expand Down
Loading

0 comments on commit 34a7522

Please sign in to comment.