Skip to content

Commit

Permalink
ARROW-5510: [C++][Python][R][GLib] Implement Feather "V2" using Arrow…
Browse files Browse the repository at this point in the history
… IPC file format

This is based on top of ARROW-7979, so I will need to rebase once that is merged.

Excluding the changes from ARROW-7979, this patch is a substantial code reduction in Feather-related code. I removed a lot of cruft from the V1 implementation and made things a lot simpler without altering the user-facing functionality.

To summarize:

* V2 is exactly the Arrow IPC file format, with the option for the experimental "trivial" body buffer compression implemented in ARROW-7979. `read_feather` functions distinguish the files based on the magic bytes at the beginning of the file ("FEA1" versus "ARROW1")
* An `ipc::feather::WriteProperties` struct has been introduced to allow setting the file version, as well as chunksize (since large tables are broken up into smaller chunks when writing), compression type, and compression level (compressor-specific)
* LZ4 and ZSTD are the only codecs intended to be supported (also in line with mailing list discussion about IPC compression). The default is LZ4, unless -DARROW_WITH_LZ4=OFF is set, in which case it's uncompressed
* Unit tests in Python now test both versions
* R tests only run the V2 version. I'll need some help adding options to set the version as well as the compression type and compression level

Since 0.17.0 is likely to be released without formalizing IPC compression, I plan to support an "ARROW:experimental_compression" metadata member in 0.17.0 Feather files.

Other notes:

* Column decompression is currently serial. I'll work on making this parallel ASAP as it will impact benchmarks significantly.
* Compression (both chunk-level and column-level) is serial. Write performance would be much improved, especially at higher compression levels, by compressing in parallel at least at the column level
* Write performance could be improved by compressing chunks and writing them to disk concurrently. It's done serially at the moment, so I will open a follow-up JIRA about this

Closes #6694 from wesm/feather-v2

Lead-authored-by: Wes McKinney <wesm+git@apache.org>
Co-authored-by: Sutou Kouhei <kou@clear-code.com>
Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
3 people committed Mar 30, 2020
1 parent db50352 commit e03251c
Show file tree
Hide file tree
Showing 38 changed files with 1,894 additions and 3,079 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ruby.yml
Expand Up @@ -97,6 +97,7 @@ jobs:
ARROW_WITH_LZ4: ON
ARROW_WITH_SNAPPY: ON
ARROW_WITH_ZLIB: ON
ARROW_WITH_ZSTD: ON
XML_CATALOG_FILES: /usr/local/etc/xml/catalog
steps:
- name: Checkout Arrow
Expand Down
49 changes: 25 additions & 24 deletions c_glib/arrow-glib/file-system.cpp
Expand Up @@ -85,27 +85,27 @@ garrow_file_info_set_property(GObject *object,
const GValue *value,
GParamSpec *pspec)
{
auto &arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object));
auto arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object));

switch (prop_id) {
case PROP_FILE_INFO_TYPE:
{
auto arrow_file_type =
static_cast<arrow::fs::FileType>(g_value_get_enum(value));
arrow_file_info.set_type(arrow_file_type);
arrow_file_info->set_type(arrow_file_type);
}
break;
case PROP_FILE_INFO_PATH:
arrow_file_info.set_path(g_value_get_string(value));
arrow_file_info->set_path(g_value_get_string(value));
break;
case PROP_FILE_INFO_SIZE:
arrow_file_info.set_size(g_value_get_int64(value));
arrow_file_info->set_size(g_value_get_int64(value));
break;
case PROP_FILE_INFO_MTIME:
{
const gint64 mtime = g_value_get_int64(value);
const arrow::fs::TimePoint::duration duration(mtime);
arrow_file_info.set_mtime(arrow::fs::TimePoint(duration));
arrow_file_info->set_mtime(arrow::fs::TimePoint(duration));
}
break;
default:
Expand All @@ -120,35 +120,35 @@ garrow_file_info_get_property(GObject *object,
GValue *value,
GParamSpec *pspec)
{
const auto &arrow_file_info =
const auto arrow_file_info =
garrow_file_info_get_raw(GARROW_FILE_INFO(object));

switch (prop_id) {
case PROP_FILE_INFO_TYPE:
{
const auto arrow_file_type = arrow_file_info.type();
const auto arrow_file_type = arrow_file_info->type();
const auto file_type = static_cast<GArrowFileType>(arrow_file_type);
g_value_set_enum(value, file_type);
}
break;
case PROP_FILE_INFO_PATH:
g_value_set_string(value, arrow_file_info.path().c_str());
g_value_set_string(value, arrow_file_info->path().c_str());
break;
case PROP_FILE_INFO_BASE_NAME:
g_value_set_string(value, arrow_file_info.base_name().c_str());
g_value_set_string(value, arrow_file_info->base_name().c_str());
break;
case PROP_FILE_INFO_DIR_NAME:
g_value_set_string(value, arrow_file_info.dir_name().c_str());
g_value_set_string(value, arrow_file_info->dir_name().c_str());
break;
case PROP_FILE_INFO_EXTENSION:
g_value_set_string(value, arrow_file_info.extension().c_str());
g_value_set_string(value, arrow_file_info->extension().c_str());
break;
case PROP_FILE_INFO_SIZE:
g_value_set_int64(value, arrow_file_info.size());
g_value_set_int64(value, arrow_file_info->size());
break;
case PROP_FILE_INFO_MTIME:
{
const auto arrow_mtime = arrow_file_info.mtime();
const auto arrow_mtime = arrow_file_info->mtime();
const auto mtime = arrow_mtime.time_since_epoch().count();
g_value_set_int64(value, mtime);
}
Expand Down Expand Up @@ -317,9 +317,9 @@ gboolean
garrow_file_info_equal(GArrowFileInfo *file_info,
GArrowFileInfo *other_file_info)
{
const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
const auto &arrow_other_file_info = garrow_file_info_get_raw(other_file_info);
return arrow_file_info.Equals(arrow_other_file_info);
const auto arrow_file_info = garrow_file_info_get_raw(file_info);
const auto arrow_other_file_info = garrow_file_info_get_raw(other_file_info);
return arrow_file_info->Equals(*arrow_other_file_info);
}

/**
Expand All @@ -333,8 +333,8 @@ garrow_file_info_equal(GArrowFileInfo *file_info,
gboolean
garrow_file_info_is_file(GArrowFileInfo *file_info)
{
const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
return arrow_file_info.IsFile();
const auto arrow_file_info = garrow_file_info_get_raw(file_info);
return arrow_file_info->IsFile();
}

/**
Expand All @@ -348,8 +348,8 @@ garrow_file_info_is_file(GArrowFileInfo *file_info)
gboolean
garrow_file_info_is_dir(GArrowFileInfo *file_info)
{
const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
return arrow_file_info.IsDirectory();
const auto arrow_file_info = garrow_file_info_get_raw(file_info);
return arrow_file_info->IsDirectory();
}

/**
Expand All @@ -365,8 +365,8 @@ garrow_file_info_is_dir(GArrowFileInfo *file_info)
gchar *
garrow_file_info_to_string(GArrowFileInfo *file_info)
{
const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
auto string = arrow_file_info.ToString();
const auto arrow_file_info = garrow_file_info_get_raw(file_info);
auto string = arrow_file_info->ToString();
return g_strndup(string.data(), string.size());
}

Expand Down Expand Up @@ -1308,10 +1308,11 @@ garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info)
return file_info;
}

arrow::fs::FileInfo &
arrow::fs::FileInfo *
garrow_file_info_get_raw(GArrowFileInfo *file_info)
{
return GARROW_FILE_INFO_GET_PRIVATE(file_info)->file_info;
auto priv = GARROW_FILE_INFO_GET_PRIVATE(file_info);
return &(priv->file_info);
}

std::shared_ptr<arrow::fs::FileSystem>
Expand Down
2 changes: 1 addition & 1 deletion c_glib/arrow-glib/file-system.hpp
Expand Up @@ -26,7 +26,7 @@
GArrowFileInfo *
garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info);

arrow::fs::FileInfo &
arrow::fs::FileInfo *
garrow_file_info_get_raw(GArrowFileInfo *file_info);

std::shared_ptr<arrow::fs::FileSystem>
Expand Down

0 comments on commit e03251c

Please sign in to comment.