Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-453: [C++] Filesystem implementation for Amazon S3 #5167

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ matrix:
- ARROW_TRAVIS_ORC=1
- ARROW_TRAVIS_PARQUET=1
- ARROW_TRAVIS_PLASMA=1
- ARROW_TRAVIS_S3=1
- ARROW_TRAVIS_USE_SYSTEM_JAVA=1
- ARROW_TRAVIS_USE_TOOLCHAIN=1
# TODO: This fails in re2 code
Expand Down
3 changes: 3 additions & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ environment:
CLCACHE_SERVER: 1
CLCACHE_COMPRESS: 1
CLCACHE_COMPRESSLEVEL: 6
ARROW_BUILD_FLIGHT: "OFF"
ARROW_BUILD_GANDIVA: "OFF"
ARROW_LLVM_VERSION: "7.0.*"
ARROW_S3: "OFF"
PYTHON: "3.6"
ARCH: "64"

Expand All @@ -67,6 +69,7 @@ environment:
- JOB: "Toolchain"
GENERATOR: Ninja
CONFIGURATION: "Release"
ARROW_S3: "ON"
ARROW_BUILD_FLIGHT: "ON"
ARROW_BUILD_GANDIVA: "ON"
# NOTE: Since ARROW-5403 we have disabled the static CRT build
Expand Down
5 changes: 5 additions & 0 deletions ci/appveyor-cpp-setup.bat
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,8 @@ if "%USE_CLCACHE%" == "true" (
clcache -s
powershell.exe -Command "Start-Process clcache-server"
)

@rem Only jobs that set ARROW_S3 to ON need the Minio server binary.
if "%ARROW_S3%" == "ON" (
@rem Download Minio somewhere on PATH, for unit tests
@rem A failed download aborts this setup script through exit /B.
appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/minio.exe -FileName C:\Windows\Minio.exe || exit /B
)
1 change: 1 addition & 0 deletions ci/conda_env_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

aws-sdk-cpp
benchmark=1.4.1
boost-cpp>=1.68.0
brotli
Expand Down
1 change: 1 addition & 0 deletions ci/cpp-msvc-build-main.bat
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^
-DCMAKE_CXX_FLAGS_RELEASE="/MD %CMAKE_CXX_FLAGS_RELEASE%" ^
-DARROW_FLIGHT=%ARROW_BUILD_FLIGHT% ^
-DARROW_GANDIVA=%ARROW_BUILD_GANDIVA% ^
-DARROW_S3=%ARROW_S3% ^
-DARROW_PARQUET=ON ^
-DPARQUET_BUILD_EXECUTABLES=ON ^
-DARROW_PYTHON=ON ^
Expand Down
4 changes: 4 additions & 0 deletions ci/travis_before_script_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ if [ "$ARROW_TRAVIS_PYTHON" == "1" ]; then
fi
fi

# Enable the S3 filesystem in the CMake configuration when the Travis job
# opts in with ARROW_TRAVIS_S3=1.
if [ "$ARROW_TRAVIS_S3" == "1" ]; then
CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_S3=ON"
fi

if [ "$ARROW_TRAVIS_PARQUET" == "1" ]; then
CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS \
-DARROW_PARQUET=ON \
Expand Down
8 changes: 8 additions & 0 deletions ci/travis_install_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ if [ "$ARROW_TRAVIS_GANDIVA" == "1" ]; then
sudo apt-get install -y -qq llvm-$ARROW_LLVM_MAJOR_VERSION-dev
fi

if [ "$ARROW_TRAVIS_S3" == "1" ]; then
  # Download the Minio S3 server into PATH
  # (assumes ~/.local/bin is already on PATH on the CI image -- TODO confirm).
  S3FS_DIR=~/.local/bin/
  # The expansions below are quoted so that a home directory containing
  # spaces or glob characters cannot split or rewrite the arguments.
  mkdir -p "$S3FS_DIR"
  wget --directory-prefix "$S3FS_DIR" https://dl.min.io/server/minio/release/linux-amd64/minio
  chmod +x "$S3FS_DIR/minio"
fi

if [ "$ARROW_TRAVIS_USE_SYSTEM" == "1" ]; then
if [ "$DISTRO_CODENAME" == "xenial" ]; then
# TODO(ARROW-4761): Install libzstd-dev once we support zstd<1
Expand Down
4 changes: 4 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,10 @@ if(ARROW_USE_GLOG)
add_definitions("-DARROW_USE_GLOG")
endif()

# With S3 support enabled, append the AWS SDK libraries to ARROW_LINK_LIBS.
if(ARROW_S3)
list(APPEND ARROW_LINK_LIBS ${AWSSDK_LINK_LIBRARIES})
endif()

add_custom_target(arrow_dependencies)
add_custom_target(arrow_benchmark_dependencies)
add_custom_target(arrow_test_dependencies)
Expand Down
2 changes: 2 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")

define_option(ARROW_JSON "Build Arrow with JSON support (requires RapidJSON)" ON)

define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF)

#----------------------------------------------------------------------
set_option_category("Thirdparty toolchain")

Expand Down
87 changes: 85 additions & 2 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ endmacro()
# Resolve the dependencies

set(ARROW_THIRDPARTY_DEPENDENCIES
AWSSDK
benchmark
BOOST
Brotli
Expand Down Expand Up @@ -127,7 +128,9 @@ foreach(DEPENDENCY ${ARROW_THIRDPARTY_DEPENDENCIES})
endforeach()

macro(build_dependency DEPENDENCY_NAME)
if("${DEPENDENCY_NAME}" STREQUAL "benchmark")
if("${DEPENDENCY_NAME}" STREQUAL "AWSSDK")
build_awssdk()
elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark")
build_benchmark()
elseif("${DEPENDENCY_NAME}" STREQUAL "Brotli")
build_brotli()
Expand Down Expand Up @@ -255,6 +258,13 @@ foreach(_VERSION_ENTRY ${TOOLCHAIN_VERSIONS_TXT})
set(${_LIB_NAME} "${_LIB_VERSION}")
endforeach()

# Source archive for the AWS SDK for C++.  The ARROW_AWSSDK_URL environment
# variable, when defined, overrides the default GitHub archive URL built from
# AWSSDK_VERSION.
if(DEFINED ENV{ARROW_AWSSDK_URL})
set(AWSSDK_SOURCE_URL "$ENV{ARROW_AWSSDK_URL}")
else()
set(AWSSDK_SOURCE_URL
"https://github.com/aws/aws-sdk-cpp/archive/${AWSSDK_VERSION}.tar.gz")
endif()

if(DEFINED ENV{ARROW_BOOST_URL})
set(BOOST_SOURCE_URL "$ENV{ARROW_BOOST_URL}")
else()
Expand Down Expand Up @@ -910,7 +920,7 @@ if(BREW_BIN AND NOT OPENSSL_ROOT_DIR)
set(OPENSSL_ROOT_DIR ${OPENSSL_BREW_PREFIX})
endif()
endif()
if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT)
if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT OR ARROW_S3)
# This must work
find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED)
set(ARROW_USE_OPENSSL ON)
Expand Down Expand Up @@ -2370,6 +2380,79 @@ if(ARROW_ORC)
message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}")
endif()

# ----------------------------------------------------------------------
# AWS SDK for C++

macro(build_awssdk)
message(
FATAL_ERROR "FIXME: Building AWS C++ SDK from source will link with wrong libcrypto")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's super fun.

message("Building AWS C++ SDK from source")

set(AWSSDK_PREFIX "${THIRDPARTY_DIR}/awssdk_ep-install")
set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include")

if(WIN32)
# On Windows, need to match build types
set(AWSSDK_BUILD_TYPE ${CMAKE_BUILD_TYPE})
else()
# Otherwise, always build in release mode.
# Especially with gcc, debug builds can fail with "asm constraint" errors:
# https://github.com/TileDB-Inc/TileDB/issues/1351
set(AWSSDK_BUILD_TYPE Release)
endif()

# Arguments forwarded to the AWS SDK's own CMake configure step.
# The build type comes from AWSSDK_BUILD_TYPE (chosen just above) instead of a
# hard-coded "Release", so that Windows builds really match Arrow's build type.
# NOTE(review): the unquoted semicolons in BUILD_ONLY are list separators to
# CMake, so "core" and "config" probably end up as separate arguments --
# confirm and escape them before re-enabling the from-source build.
set(AWSSDK_CMAKE_ARGS
    -DCMAKE_BUILD_TYPE=${AWSSDK_BUILD_TYPE}
    -DCMAKE_INSTALL_LIBDIR=lib
    -DBUILD_ONLY=s3;core;config
    -DENABLE_UNITY_BUILD=on
    -DENABLE_TESTING=off
    "-DCMAKE_C_FLAGS=${EP_C_FLAGS}"
    "-DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX}")

set(
AWSSDK_CORE_SHARED_LIB
"${AWSSDK_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}aws-cpp-sdk-core${CMAKE_SHARED_LIBRARY_SUFFIX}"
)
set(
AWSSDK_S3_SHARED_LIB
"${AWSSDK_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}aws-cpp-sdk-s3${CMAKE_SHARED_LIBRARY_SUFFIX}"
)
set(AWSSDK_SHARED_LIBS "${AWSSDK_CORE_SHARED_LIB}" "${AWSSDK_S3_SHARED_LIB}")

externalproject_add(awssdk_ep
${EP_LOG_OPTIONS}
URL ${AWSSDK_SOURCE_URL}
CMAKE_ARGS ${AWSSDK_CMAKE_ARGS}
BUILD_BYPRODUCTS ${AWSSDK_SHARED_LIBS})

file(MAKE_DIRECTORY ${AWSSDK_INCLUDE_DIR})

add_dependencies(toolchain awssdk_ep)
set(AWSSDK_LINK_LIBRARIES ${AWSSDK_SHARED_LIBS})
set(AWSSDK_VENDORED TRUE)
endmacro()

# Resolve the AWS SDK dependency according to AWSSDK_SOURCE:
#   AUTO    - look for an installed SDK, build from source if none is found
#   BUNDLED - always build from source
#   SYSTEM  - require an installed SDK
# Any other value of AWSSDK_SOURCE matches none of the branches below.
if(ARROW_S3)
# See https://aws.amazon.com/blogs/developer/developer-experience-of-the-aws-sdk-for-c-now-simplified-by-cmake/

# Need to customize the find_package() call, so cannot call resolve_dependency()
if(AWSSDK_SOURCE STREQUAL "AUTO")
# Not REQUIRED here: a missing package falls through to build_awssdk().
find_package(AWSSDK COMPONENTS config s3 transfer)
if(NOT AWSSDK_FOUND)
build_awssdk()
endif()
elseif(AWSSDK_SOURCE STREQUAL "BUNDLED")
build_awssdk()
elseif(AWSSDK_SOURCE STREQUAL "SYSTEM")
find_package(AWSSDK REQUIRED COMPONENTS config s3 transfer)
endif()

# NOTE(review): presumably find_package(AWSSDK) defines AWSSDK_INCLUDE_DIR and
# AWSSDK_LINK_LIBRARIES in the non-vendored case -- confirm against the SDK's
# CMake package config.
include_directories(SYSTEM ${AWSSDK_INCLUDE_DIR})
message(STATUS "Found AWS SDK headers: ${AWSSDK_INCLUDE_DIR}")
message(STATUS "Found AWS SDK libraries: ${AWSSDK_LINK_LIBRARIES}")
endif()

# Write out the package configurations.

configure_file("src/arrow/util/config.h.cmake" "src/arrow/util/config.h")
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ if(ARROW_JSON)
json/reader.cc)
endif()

# The S3 filesystem source is only added to ARROW_SRCS when S3 support is on.
if(ARROW_S3)
set(ARROW_SRCS ${ARROW_SRCS} filesystem/s3fs.cc)
endif()

if(ARROW_WITH_URIPARSER)
set(ARROW_SRCS ${ARROW_SRCS} util/uri.cc)
endif()
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,14 @@ arrow_install_all_headers("arrow/filesystem")

add_arrow_test(filesystem_test)
add_arrow_test(localfs_test)

# S3 filesystem tests, only defined when S3 support is enabled.
if(ARROW_S3)
add_arrow_test(s3fs_test)

# The narrative test is a plain executable rather than an add_arrow_test()
# target; it is built together with the other tests through the arrow-tests
# dependency below.
if(ARROW_BUILD_TESTS)
add_executable(arrow-s3fs-narrative-test s3fs_narrative_test.cc)
target_link_libraries(arrow-s3fs-narrative-test ${ARROW_TEST_LINK_LIBS}
${GFLAGS_LIBRARIES} GTest::GTest)
add_dependencies(arrow-tests arrow-s3fs-narrative-test)
endif()
endif()
21 changes: 21 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/path_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace fs {
Expand All @@ -45,6 +46,26 @@ std::string ToString(FileType ftype) {
}
}

// For googletest
// Stream insertion for FileType (used by googletest when printing values).
//
// Writes "FileType::<EnumeratorName>" to `os` and returns `os`.  A value that
// matches none of the known enumerators is reported via ARROW_LOG(FATAL).
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType ftype) {
  switch (ftype) {
    case FileType::NonExistent:
      os << "FileType::" ARROW_STRINGIFY(NonExistent);
      break;
    case FileType::Unknown:
      os << "FileType::" ARROW_STRINGIFY(Unknown);
      break;
    case FileType::File:
      os << "FileType::" ARROW_STRINGIFY(File);
      break;
    case FileType::Directory:
      os << "FileType::" ARROW_STRINGIFY(Directory);
      break;
    default:
      ARROW_LOG(FATAL) << "Invalid FileType value: " << static_cast<int>(ftype);
  }
  return os;
}

std::string FileStats::base_name() const {
return internal::GetAbstractPathParent(path_).second;
}
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <chrono>
#include <cstdint>
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>
Expand Down Expand Up @@ -70,6 +71,8 @@ enum class ARROW_EXPORT FileType {

ARROW_EXPORT std::string ToString(FileType);

ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType);

static const int64_t kNoSize = -1;
static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1));

Expand Down
12 changes: 4 additions & 8 deletions cpp/src/arrow/filesystem/filesystem_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -542,17 +542,13 @@ TEST_F(TestSubTreeFileSystem, GetTargetStatsSingle) {
FileStats st;
ASSERT_OK(subfs_->CreateDir("AB/CD"));

ASSERT_OK(subfs_->GetTargetStats("AB", &st));
AssertFileStats(st, "AB", FileType::Directory, time_);
ASSERT_OK(subfs_->GetTargetStats("AB/CD", &st));
AssertFileStats(st, "AB/CD", FileType::Directory, time_);
AssertFileStats(subfs_.get(), "AB", FileType::Directory, time_);
AssertFileStats(subfs_.get(), "AB/CD", FileType::Directory, time_);

CreateFile("ab", "data");
ASSERT_OK(subfs_->GetTargetStats("ab", &st));
AssertFileStats(st, "ab", FileType::File, time_, 4);
AssertFileStats(subfs_.get(), "ab", FileType::File, time_, 4);

ASSERT_OK(subfs_->GetTargetStats("non-existent", &st));
AssertFileStats(st, "non-existent", FileType::NonExistent);
AssertFileStats(subfs_.get(), "non-existent", FileType::NonExistent);
}

TEST_F(TestSubTreeFileSystem, GetTargetStatsVector) {
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/arrow/filesystem/path_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ std::string EnsureTrailingSlash(const std::string& s) {
}
}

// Return `key` without its final path separator, if it ends with one.
// At most one trailing separator is dropped; an empty view is returned as is.
util::string_view RemoveTrailingSlash(util::string_view key) {
  const bool ends_with_sep = !key.empty() && key.back() == kSep;
  return ends_with_sep ? key.substr(0, key.size() - 1) : key;
}

} // namespace internal
} // namespace fs
} // namespace arrow
9 changes: 9 additions & 0 deletions cpp/src/arrow/filesystem/path_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <vector>

#include "arrow/status.h"
#include "arrow/util/string_view.h"

namespace arrow {
namespace fs {
Expand Down Expand Up @@ -56,6 +57,9 @@ std::string ConcatAbstractPath(const std::string& base, const std::string& stem)
ARROW_EXPORT
std::string EnsureTrailingSlash(const std::string& s);

ARROW_EXPORT
util::string_view RemoveTrailingSlash(util::string_view s);

// Join the components of an abstract path.
template <class StringIt>
std::string JoinAbstractPath(StringIt it, StringIt end) {
Expand All @@ -69,6 +73,11 @@ std::string JoinAbstractPath(StringIt it, StringIt end) {
return path;
}

// Convenience overload: join every component of `range` by forwarding its
// begin()/end() iterators to the iterator-pair overload.
template <class StringRange>
std::string JoinAbstractPath(const StringRange& range) {
  auto first = range.begin();
  auto last = range.end();
  return JoinAbstractPath(first, last);
}

} // namespace internal
} // namespace fs
} // namespace arrow