ARROW-8290: [Python] Improve FileSystemDataset constructor #6913

Closed
43 changes: 28 additions & 15 deletions python/pyarrow/_dataset.pyx
```diff
@@ -276,55 +276,68 @@ cdef class FileSystemDataset(Dataset):

     Parameters
     ----------
+    paths_or_selector : Union[FileSelector, List[FileInfo]]
+        List of files/directories to consume.
     schema : Schema
         The top-level schema of the Dataset.
-    root_partition : Expression
-        The top-level partition of the Dataset.
-    file_format : FileFormat
+    format : FileFormat
         File format to create fragments from, currently only
         ParquetFileFormat and IpcFileFormat are supported.
     filesystem : FileSystem
         The filesystem which files are from.
-    paths_or_selector : Union[FileSelector, List[FileInfo]]
-        List of files/directories to consume.
-    partitions : List[Expression]
+    partitions : List[Expression], optional
         Attach additional partition information for the file paths.
+    root_partition : Expression, optional
+        The top-level partition of the Dataset.
     """

     cdef:
         CFileSystemDataset* filesystem_dataset

-    def __init__(self, Schema schema not None, Expression root_partition,
-                 FileFormat file_format not None,
-                 FileSystem filesystem not None,
-                 paths_or_selector, partitions):
+    def __init__(self, paths_or_selector, schema=None, format=None,
+                 filesystem=None, partitions=None, root_partition=None):
```

Review thread on the new `__init__` signature:

Member: @jorisvandenbossche I think you can leave the type annotations here; just the `not None` part needs to be removed.

Member Author: The problem is that they then become positional arguments to Cython. And although you can still call them as keyword arguments, you get all kinds of uninformative error messages once you do something wrong. (Maybe there is a better way to do this with Cython, but I didn't directly figure one out; there is also not much documentation about this.)

Member Author: Like "TypeError: `__init__()` takes exactly 6 positional arguments (1 given)" when using all keyword arguments but having a typo in one of them. This was actually the reason I first thought you needed to pass everything as positional arguments: I used `format=` instead of `file_format=` and got such an error. So I wanted to solve that in this PR by allowing keyword arguments, only to find out that the cause was actually a wrong name and that keyword arguments were already supported. :)

kszucs (Member, Apr 13, 2020): Indeed, that's annoying. Do you want to report this upstream?

Member Author: Yes, I can do that. I need to make a small reproducer first; I'll put it on my to-do list.

Member: @jorisvandenbossche schema, format and filesystem arguments are mandatory, please remove the default values.

Member Author: I know it might be confusing from the signature, but I did this to ensure proper error messages (the docstring indicates which arguments are optional, and you get a proper error message when not passing a required value).
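The approach defended in this thread can be sketched in plain Python (the classes below are hypothetical stand-ins, not the real pyarrow types): required arguments default to `None` so Cython accepts them as keywords, and an explicit `isinstance` loop then raises an informative `TypeError` naming the offending argument.

```python
# Hypothetical stand-ins for the real pyarrow types, for illustration only.
class Schema: pass
class FileFormat: pass
class FileSystem: pass


def validate_required(schema=None, format=None, filesystem=None):
    # Mirror of the validation loop in the diff: check each "required"
    # keyword against its expected class and name it in the error message.
    for arg, class_, name in [
        (schema, Schema, 'schema'),
        (format, FileFormat, 'format'),
        (filesystem, FileSystem, 'filesystem'),
    ]:
        if not isinstance(arg, class_):
            raise TypeError(
                "Argument '{0}' has incorrect type (expected {1}, "
                "got {2})".format(name, class_.__name__, type(arg))
            )


# All required arguments present: passes silently.
validate_required(schema=Schema(), format=FileFormat(), filesystem=FileSystem())

# A missing (or mistyped) argument produces a message naming it.
try:
    validate_required(schema=Schema(), filesystem=FileSystem())
except TypeError as exc:
    print(exc)  # message names the 'format' argument
```

This trades signature-level enforcement for error messages that point at the actual problem, which is exactly the trade-off discussed above.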
```diff
     cdef:
         FileInfo info
         Expression expr
         vector[CFileInfo] c_file_infos
         vector[shared_ptr[CExpression]] c_partitions
         CResult[shared_ptr[CDataset]] result

+        # validate required arguments
+        for arg, class_, name in [
+            (schema, Schema, 'schema'),
+            (format, FileFormat, 'format'),
+            (filesystem, FileSystem, 'filesystem')
+        ]:
+            if not isinstance(arg, class_):
+                raise TypeError(
+                    "Argument '{0}' has incorrect type (expected {1}, "
+                    "got {2})".format(name, class_.__name__, type(arg))
+                )
+
         for info in filesystem.get_file_info(paths_or_selector):
             c_file_infos.push_back(info.unwrap())

+        if partitions is None:
+            partitions = [
+                ScalarExpression(True) for _ in range(c_file_infos.size())]
         for expr in partitions:
             c_partitions.push_back(expr.unwrap())

         if c_file_infos.size() != c_partitions.size():
             raise ValueError(
-                'The number of files resulting from paths_or_selector must be '
-                'equal to the number of partitions.'
+                'The number of files resulting from paths_or_selector '
+                'must be equal to the number of partitions.'
             )

+        if root_partition is None:
+            root_partition = ScalarExpression(True)
+
         result = CFileSystemDataset.Make(
             pyarrow_unwrap_schema(schema),
-            root_partition.unwrap(),
-            file_format.unwrap(),
-            filesystem.unwrap(),
+            (<Expression> root_partition).unwrap(),
+            (<FileFormat> format).unwrap(),
+            (<FileSystem> filesystem).unwrap(),
             c_file_infos,
             c_partitions
         )
```

Review thread on the `(<Expression> root_partition).unwrap()` cast:

kszucs (Member, Apr 13, 2020): If I pass an object which is not castable to `<Expression>` then I get a segfault, e.g. `root_partition=1`.

Member: If you change the cast to `<Expression?>` then it raises a "cannot convert" type error, although it might be better to set the default values before the casts.

Member Author: Ah, yes, all the other keywords are validated to be of the correct class before casting, except this one. Thanks for the catch!

Member Author: OK, fixed this.
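The defaulting and cross-checking logic added above can be illustrated with a small Python sketch (`resolve_partitions` is a hypothetical helper; plain `True` stands in for `ScalarExpression(True)`):

```python
def resolve_partitions(n_files, partitions=None):
    # When no partitions are given, default each file to an always-true
    # partition expression, mirroring the diff above.
    if partitions is None:
        partitions = [True] * n_files  # stand-in for ScalarExpression(True)
    # Then cross-check that the counts line up, as the constructor does.
    if n_files != len(partitions):
        raise ValueError(
            'The number of files resulting from paths_or_selector '
            'must be equal to the number of partitions.')
    return partitions


print(resolve_partitions(2))  # → [True, True]
```

Computing the default from the already-unwrapped file count is what lets the length check stay meaningful whether or not the caller supplied partitions.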
20 changes: 17 additions & 3 deletions python/pyarrow/tests/test_dataset.py
```diff
@@ -191,15 +191,29 @@ def test_filesystem_dataset(mockfs):
     partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

     dataset = ds.FileSystemDataset(
-        schema,
+        schema=schema,
         root_partition=None,
-        file_format=file_format,
+        format=file_format,
         filesystem=mockfs,
         paths_or_selector=paths,
         partitions=partitions
     )
     assert isinstance(dataset.format, ds.ParquetFileFormat)

+    # the root_partition and partitions keywords have defaults
+    dataset = ds.FileSystemDataset(
+        paths, schema, format=file_format, filesystem=mockfs,
+    )
+    assert isinstance(dataset.format, ds.ParquetFileFormat)
+
+    # validation of required arguments
+    with pytest.raises(TypeError, match="incorrect type"):
+        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
+    with pytest.raises(TypeError, match="incorrect type"):
+        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
+    with pytest.raises(TypeError, match="incorrect type"):
+        ds.FileSystemDataset(paths, schema=schema, format=file_format)
+
     root_partition = ds.ComparisonExpression(
         ds.CompareOperator.Equal,
         ds.FieldExpression('level'),
@@ -223,7 +237,7 @@ def test_filesystem_dataset(mockfs):
         root_partition=root_partition,
         filesystem=mockfs,
         partitions=partitions,
-        file_format=file_format
+        format=file_format
     )
     assert dataset.partition_expression.equals(root_partition)
     assert set(dataset.files) == set(paths)
```
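As a standalone illustration of the `pytest.raises(..., match=...)` pattern used in these tests (`require_schema` is a hypothetical toy function, not the pyarrow constructor): `match` is treated as a regular expression searched in the string form of the raised exception, so a distinctive substring such as "incorrect type" is enough to pin down which error was raised.

```python
import pytest


def require_schema(schema=None):
    # Toy stand-in for a constructor with explicit argument validation.
    if schema is None:
        raise TypeError("Argument 'schema' has incorrect type")
    return schema


# The context manager passes only if the expected error (with a matching
# message) is raised inside the block.
with pytest.raises(TypeError, match="incorrect type"):
    require_schema()
```

Matching on the message, not just the exception class, guards against the test passing for an unrelated `TypeError` such as the uninformative positional-argument one discussed earlier in the review.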