ARROW-7839: [Python][Dataset] Expose IPC format in python bindings
Closes #6409 from jorisvandenbossche/ARROW-7839-dataset-ipc and squashes the following commits:

3ceffe5 <Joris Van den Bossche> ARROW-7839: [Python][Dataset] Expose IPC format in python bindings

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
jorisvandenbossche authored and kszucs committed Feb 18, 2020
1 parent d014bc6 commit 01190ab
Showing 4 changed files with 36 additions and 7 deletions.
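
In short: after this commit the Dataset API can read Arrow IPC files, not just Parquet. A minimal sketch of what the change enables, modeled on the new test added below (the file name is illustrative):

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Write a one-batch Arrow IPC file, then open it through the dataset layer.
    table = pa.table({'a': pa.array([1, 2, 3], type="int8")})
    with pa.output_stream('example.arrow') as sink:
        writer = pa.RecordBatchFileWriter(sink, table.schema)
        writer.write_batch(table.to_batches()[0])
        writer.close()

    # The explicit format object and the "ipc" string shortcut are equivalent.
    dataset = ds.dataset('example.arrow', format=ds.IpcFileFormat())
    assert dataset.to_table().equals(table)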
12 changes: 10 additions & 2 deletions python/pyarrow/_dataset.pyx
@@ -61,6 +61,8 @@ cdef class FileFormat:

         typ = frombytes(sp.get().type_name())
         if typ == 'parquet':
             self = ParquetFileFormat.__new__(ParquetFileFormat)
+        elif typ == 'ipc':
+            self = IpcFileFormat.__new__(IpcFileFormat)
         else:
             raise TypeError(typ)

@@ -77,6 +79,12 @@ cdef class ParquetFileFormat(FileFormat):
         self.init(shared_ptr[CFileFormat](new CParquetFileFormat()))


+cdef class IpcFileFormat(FileFormat):
+
+    def __init__(self):
+        self.init(shared_ptr[CFileFormat](new CIpcFileFormat()))
+
+
 cdef class Partitioning:

     cdef:
@@ -479,7 +487,7 @@ cdef class FileSystemSourceFactory(SourceFactory):
     paths_or_selector: pyarrow.fs.Selector or list of path-likes
         Either a Selector object or a list of path-like objects.
     format : FileFormat
-        Currently only ParquetFileFormat is supported.
+        Currently only ParquetFileFormat and IpcFileFormat are supported.
     options : FileSystemFactoryOptions, optional
         Various flags influencing the discovery of filesystem paths.
     """
Expand Down Expand Up @@ -629,7 +637,7 @@ cdef class FileSystemSource(Source):
The top-level partition of the DataSource.
file_format : FileFormat
File format to create fragments from, currently only
ParquetFileFormat is supported.
ParquetFileFormat and IpcFileFormat are supported.
filesystem : FileSystem
The filesystem which files are from.
paths_or_selector : Union[FileSelector, List[FileStats]]
Expand Down
3 changes: 3 additions & 0 deletions python/pyarrow/dataset.py
@@ -36,6 +36,7 @@
     FileSystemFactoryOptions,
     HivePartitioning,
     InExpression,
+    IpcFileFormat,
     IsValidExpression,
     NotExpression,
     OrExpression,
@@ -210,6 +211,8 @@ def _ensure_format(obj):
     if isinstance(obj, FileFormat):
         return obj
     elif obj == "parquet":
         return ParquetFileFormat()
+    elif obj == "ipc":
+        return IpcFileFormat()
     else:
         raise ValueError("format '{}' is not supported".format(obj))

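The _ensure_format helper above is also where unsupported format strings are rejected. A quick illustration of the failure mode; "csv" is just an example of a string that is not recognized at this point:

    import pyarrow.dataset as ds

    # Anything other than a FileFormat instance, "parquet", or "ipc" raises:
    ds.dataset('example.arrow', format="csv")
    # ValueError: format 'csv' is not supported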
9 changes: 4 additions & 5 deletions python/pyarrow/includes/libarrow_dataset.pxd
@@ -260,11 +260,6 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
         shared_ptr[CFileFormat] format()
         shared_ptr[CScanOptions] scan_options()

-    cdef cppclass CParquetFragment "arrow::dataset::ParquetFragment"(
-            CFileFragment):
-        CParquetFragment(const CFileSource& source,
-                         shared_ptr[CScanOptions] options)
-
     cdef cppclass CFileSystemSource \
             "arrow::dataset::FileSystemSource"(CSource):
         @staticmethod
@@ -297,6 +292,10 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
                          shared_ptr[CScanOptions] options)
         c_bool splittable()

+    cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"(
+            CFileFormat):
+        pass
+
     cdef cppclass CPartitioning "arrow::dataset::Partitioning":
         c_string type_name() const
         CResult[shared_ptr[CExpression]] Parse(const c_string& path) const
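
An aside for readers less used to Cython: the quoted string in the declaration above (e.g. "arrow::dataset::IpcFileFormat") tells Cython the real C++ name behind the C-level alias; the extern block only declares the class, it generates no wrapper code. A self-contained sketch of the same pattern, with hypothetical names:

    # hypothetical .pxd: expose an existing C++ class to Cython under an alias
    cdef extern from "some_lib/format.h" namespace "some_lib" nogil:
        cdef cppclass CSomeFormat "some_lib::SomeFormat":
            pass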
19 changes: 19 additions & 0 deletions python/pyarrow/tests/test_dataset.py
@@ -767,3 +767,22 @@ def test_multiple_sources_with_selectors(multisourcefs):
         ('year', pa.int32())
     ])
     assert dataset.schema.equals(expected_schema, check_metadata=False)
+
+
+def test_ipc_format(tempdir):
+    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
+                      'b': pa.array([.1, .2, .3], type="float64")})
+
+    path = str(tempdir / 'test.arrow')
+    with pa.output_stream(path) as sink:
+        writer = pa.RecordBatchFileWriter(sink, table.schema)
+        writer.write_batch(table.to_batches()[0])
+        writer.close()
+
+    dataset = ds.dataset(path, format=ds.IpcFileFormat())
+    result = dataset.to_table()
+    assert result.equals(table)
+
+    dataset = ds.dataset(path, format="ipc")
+    result = dataset.to_table()
+    assert result.equals(table)

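As a cross-check outside the dataset layer, a file written this way also reads back with the long-standing IPC file reader. A sketch, assuming the same file as in the example near the top:

    import pyarrow as pa

    # Read the Arrow IPC file directly; the result should match dataset.to_table().
    table = pa.ipc.open_file('example.arrow').read_all()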