ARROW-3094: [Python] Easier construction of schemas and struct types
Allow calling `pa.schema()` and `pa.struct()` with a list of tuples or a mapping of strings to data types, instead of having to call `pa.field()` explicitly for each field. Calling `pa.field()` explicitly is still possible, e.g. when field metadata needs to be attached.
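
A minimal sketch of the resulting shorthand (illustrative field names; assumes a pyarrow build that includes this change):

    import pyarrow as pa
    from collections import OrderedDict

    # (name, type) tuples instead of explicit pa.field() calls
    sch = pa.schema([('some_int', pa.int32()),
                     ('some_string', pa.string())])

    # a mapping of names to types also works; OrderedDict preserves field order
    ty = pa.struct(OrderedDict([('x', pa.int8()),
                                ('y', pa.bool_())]))

    # pa.field() remains available when extras such as metadata are needed
    f = pa.field('some_int', pa.int32()).add_metadata({b'key': b'value'})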

Author: Antoine Pitrou <antoine@python.org>

Closes #2450 from pitrou/ARROW-3094-easier-struct-schema-construction and squashes the following commits:

4b27541 <Antoine Pitrou> Use shorthand notation more often in docs
1215123 <Antoine Pitrou> Fix for Python 2.7
39a21df <Antoine Pitrou> ARROW-3094:  Easier construction of schemas and struct types
pitrou authored and wesm committed Aug 21, 2018
1 parent dbf531b commit e8e82d0
Showing 4 changed files with 133 additions and 25 deletions.
36 changes: 20 additions & 16 deletions python/doc/source/data.rst
@@ -18,8 +18,8 @@
.. currentmodule:: pyarrow
.. _data:

In-Memory Data Model
====================
Data Types and In-Memory Data Model
===================================

Apache Arrow defines columnar array data structures by composing type metadata
with memory buffers, like the ones explained in the documentation on
@@ -107,12 +107,22 @@ A `struct` is a collection of named fields:
pa.field('s0', t1),
pa.field('s1', t2),
pa.field('s2', t4),
pa.field('s3', t6)
pa.field('s3', t6),
]
t7 = pa.struct(fields)
print(t7)
For convenience, you can pass ``(name, type)`` tuples directly instead of
:class:`~pyarrow.Field` instances:

.. ipython:: python
t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)])
print(t8)
t8 == t7
See :ref:`Data Types API <api.types>` for a full listing of data type
functions.

@@ -123,19 +133,15 @@ Schemas

The :class:`~pyarrow.Schema` type is similar to the ``struct`` array type; it
defines the column names and types in a record batch or table data
structure. The ``pyarrow.schema`` factory function makes new Schema objects in
structure. The :func:`pyarrow.schema` factory function makes new Schema objects in
Python:

.. ipython:: python
fields = [
pa.field('s0', t1),
pa.field('s1', t2),
pa.field('s2', t4),
pa.field('s3', t6)
]
my_schema = pa.schema(fields)
my_schema = pa.schema([('field0', t1),
('field1', t2),
('field2', t4),
('field3', t6)])
my_schema
In some applications, you may not create schemas directly, only using the ones
@@ -233,10 +239,8 @@ sequence of Python dicts or tuples:

.. ipython:: python
ty = pa.struct([
pa.field('x', pa.int8()),
pa.field('y', pa.bool_()),
])
ty = pa.struct([('x', pa.int8()),
('y', pa.bool_())])
pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty)
pa.array([(3, True), (4, False)], type=ty)
36 changes: 36 additions & 0 deletions python/pyarrow/tests/test_schema.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from collections import OrderedDict
import pickle

import pytest
@@ -206,6 +207,7 @@ def test_schema():
sch = pa.schema(fields)

assert sch.names == ['foo', 'bar', 'baz']
assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

assert len(sch) == 3
assert sch[0].name == 'foo'
@@ -220,6 +222,40 @@ def test_schema():
child 0, item: int8"""


def test_schema_from_tuples():
fields = [
('foo', pa.int32()),
('bar', pa.string()),
('baz', pa.list_(pa.int8())),
]
sch = pa.schema(fields)
assert sch.names == ['foo', 'bar', 'baz']
assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
assert len(sch) == 3
assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
child 0, item: int8"""


def test_schema_from_mapping():
fields = OrderedDict([
('foo', pa.int32()),
('bar', pa.string()),
('baz', pa.list_(pa.int8())),
])
sch = pa.schema(fields)
assert sch.names == ['foo', 'bar', 'baz']
assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
assert len(sch) == 3
assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
child 0, item: int8"""


def test_field_flatten():
f0 = pa.field('foo', pa.int32()).add_metadata({b'foo': b'bar'})
assert f0.flatten() == [f0]
18 changes: 18 additions & 0 deletions python/pyarrow/tests/test_types.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from collections import OrderedDict
import pickle

import pytest
@@ -233,6 +234,23 @@ def test_struct_type():
for a, b in zip(ty, fields):
assert a == b

# Construct from list of tuples
ty = pa.struct([('a', pa.int64()),
('a', pa.int32()),
('b', pa.int32())])
assert list(ty) == fields
for a, b in zip(ty, fields):
assert a == b

# Construct from mapping
fields = [pa.field('a', pa.int64()),
pa.field('b', pa.int32())]
ty = pa.struct(OrderedDict([('a', pa.int64()),
('b', pa.int32())]))
assert list(ty) == fields
for a, b in zip(ty, fields):
assert a == b


def test_union_type():
def check_fields(ty, fields):
68 changes: 59 additions & 9 deletions python/pyarrow/types.pxi
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

import collections
import re

# These are imprecise because the type (in pandas 0.x) depends on the presence
@@ -498,13 +499,31 @@ cdef class Schema:

@property
def names(self):
"""
The schema's field names.
Returns
-------
list of str
"""
cdef int i
result = []
for i in range(self.schema.num_fields()):
name = frombytes(self.schema.field(i).get().name())
result.append(name)
return result

@property
def types(self):
"""
The schema's field types.
Returns
-------
list of DataType
"""
return [field.type for field in self]
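
A brief usage sketch of the `names` and `types` properties defined above (hypothetical schema; the reprs in the comments are illustrative):

    import pyarrow as pa

    sch = pa.schema([('foo', pa.int32()), ('bar', pa.string())])
    sch.names   # ['foo', 'bar']
    sch.types   # [DataType(int32), DataType(string)]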

@property
def metadata(self):
cdef shared_ptr[const CKeyValueMetadata] metadata = (
@@ -1221,16 +1240,22 @@ def struct(fields):
Parameters
----------
fields : sequence of Field values
fields : iterable of Fields or tuples, or mapping of strings to DataTypes
Examples
--------
::
import pyarrow as pa
fields = [
('f1', pa.int32()),
('f2', pa.string()),
]
struct_type = pa.struct(fields)
fields = [
pa.field('f1', pa.int32()),
pa.field('f2', pa.string())
pa.field('f2', pa.string(), nullable=False),
]
struct_type = pa.struct(fields)
@@ -1239,12 +1264,19 @@ def struct(fields):
type : DataType
"""
cdef:
Field field
Field py_field
vector[shared_ptr[CField]] c_fields
cdef shared_ptr[CDataType] struct_type

for field in fields:
c_fields.push_back(field.sp_field)
if isinstance(fields, collections.Mapping):
fields = fields.items()

for item in fields:
if isinstance(item, tuple):
py_field = field(*item)
else:
py_field = item
c_fields.push_back(py_field.sp_field)

struct_type.reset(new CStructType(c_fields))
return pyarrow_wrap_data_type(struct_type)
@@ -1368,10 +1400,21 @@ def schema(fields, dict metadata=None):
Parameters
----------
field : list or iterable
fields : iterable of Fields or tuples, or mapping of strings to DataTypes
metadata : dict, default None
Keys and values must be coercible to bytes
Examples
--------
::
import pyarrow as pa
fields = [
('some_int', pa.int32()),
('some_string', pa.string()),
]
schema = pa.schema(fields)
Returns
-------
schema : pyarrow.Schema
@@ -1380,11 +1423,18 @@ def schema(fields, dict metadata=None):
shared_ptr[CKeyValueMetadata] c_meta
shared_ptr[CSchema] c_schema
Schema result
Field field
Field py_field
vector[shared_ptr[CField]] c_fields

for i, field in enumerate(fields):
c_fields.push_back(field.sp_field)
if isinstance(fields, collections.Mapping):
fields = fields.items()

for item in fields:
if isinstance(item, tuple):
py_field = field(*item)
else:
py_field = item
c_fields.push_back(py_field.sp_field)

if metadata is not None:
convert_metadata(metadata, &c_meta)
