In [1]:
%load_ext memory_profiler

In [2]:
import pyarrow as pa

# DataType

Arrow data types are *logical* types: several logical types can share the same physical storage. They are created by [factory functions](https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions) and expose the methods `equals()` and `to_pandas_dtype()` and the properties `bit_width`, `id`, `num_buffers`, `num_children`. See the [documentation](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType).

In [23]:
t0 = pa.float64()
t0

DataType(double)

In [20]:
t1 = pa.int32()
t1, t1.bit_width

(DataType(int32), 32)

In [52]:
t2 = pa.string()
t2

DataType(string)

In [51]:
t3 = pa.binary(10)
t3, t3.bit_width, pa.binary(1).bit_width

(FixedSizeBinaryType(fixed_size_binary[10]), 80, 8)

In [56]:
t4 = pa.timestamp('ms')
t4

TimestampType(timestamp[ms])

## Fields

The Field type is a type plus a name and optional user-defined metadata.

See [xarray approach to metadata](http://xarray.pydata.org/en/stable/faq.html?highlight=%20metadata#what-is-your-approach-to-metadata) for an example of metadata management

In [37]:
# Nullable field of type t0 carrying user-defined metadata; the str keys and
# values are returned as UTF-8 encoded bytes when read through `f0.metadata`.
f0 = pa.field(
    'signal 0',
    t0,
    nullable=True,
    metadata={
        'label': 'A simple label',
        'unit': "A label containing utf-8 character: µ",
    },
)

In [38]:
f0.metadata

OrderedDict([(b'label', b'A simple label'),
             (b'unit', b'A label containing utf-8 character: \xc2\xb5')])

In [57]:
f1 = pa.field('integer', t1)
f2 = pa.field('text', t2)
f3 = pa.field('binary', t3)
f4 = pa.field('time', t4)

## Nested value types 

**list**

Created from a `pyarrow.DataType` or a `pyarrow.Field`.

In [74]:
l0 = pa.list_(t0)
l0

ListType(list<item: double>)

In [59]:
pa.list_(pa.field('type', t0))

ListType(list<type: double>)

**union**

In [60]:
u0 = pa.union([f0, f1], mode='dense')
u0

UnionType(union[dense]<signal 0: double=0, integer: int32=1>)

**struct**

In [77]:
s0 = pa.struct([f1, f2])
s0

StructType(struct<integer: int32, text: string>)

In [67]:
s0[0]

pyarrow.Field<integer: int32>

In [80]:
pa.array([(3, "True"), (4, "False")], type=s0)

<pyarrow.lib.StructArray object at 0x7f28e8581728>
-- is_valid: all not null
-- child 0 type: int32
  [
    3,
    4
  ]
-- child 1 type: string
  [
    "True",
    "False"
  ]

# `Schema`

In [63]:
# Four-column schema, spelled out as explicit fields (name + data type).
schema0 = pa.schema([
    pa.field('integer', pa.int32()),
    pa.field('text', pa.string()),
    pa.field('binary', pa.binary(10)),
    pa.field('Time', pa.timestamp('ms')),
])

In [64]:
schema0

integer: int32
text: string
binary: fixed_size_binary[10]
Time: timestamp[ms]

In [72]:
pa.schema([*s0, f3, f4])

integer: int32
text: string
binary: fixed_size_binary[10]
time: timestamp[ms]

# Record Batches

# `Table`

In [76]:
pa.Table?

Init signature: pa.Table(self, /, *args, **kwargs)
Docstring:
Table()

A collection of top-level named, equal length Arrow arrays.

-------
Do not call this class's constructor directly, use one of the ``from_*``
methods instead.
File:           /opt/conda/lib/python3.7/site-packages/pyarrow/lib.cpython-37m-x86_64-linux-gnu.so
Type:           type
Subclasses:


In [10]:
from datetime import datetime

In [16]:
pa.Table.from_batches?

Docstring:
Table.from_batches(batches, Schema schema=None)

Construct a Table from a sequence or iterator of Arrow RecordBatches

Parameters
----------
batches : sequence or iterator of RecordBatch
    Sequence of RecordBatch to be converted, all schemas must be equal
schema : Schema, default None
    If not passed, will be inferred from the first RecordBatch

Returns
-------
table : Table
Type:      builtin_function_or_method


In [20]:
# Flat tuple of four scalars (int, str, int, datetime). Note that `(1)` is
# just `1`, not a 1-tuple, so no per-item parentheses are needed.
batches = (1, "aze", 0b010010100, datetime.now())

In [32]:
# Four equal-length columns; None entries stand for missing values.
data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([0b0100, None, 0b0100, 0b0100]),
    # One timestamp per day, 1st to 4th of August 2019.
    pa.array([datetime(2019, 8, day) for day in range(1, 5)]),
]

In [47]:
batch0 = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2', 'f4'])

In [48]:
batch0.schema

f0: int64
f1: string
f2: int64
f4: timestamp[us]

In [45]:
# Build the batch against `schema0`, the schema defined in the Schema section.
# The name `my_schema` is never assigned in this notebook, so using it here
# raises NameError on a fresh kernel (Restart & Run All).
batch1 = pa.RecordBatch.from_arrays(data, schema0)

In [46]:
batch1.schema

integer: int32
text: string
binary: fixed_size_binary[10]
Time: timestamp[ms]

In [44]:
# Display the schema defined in the Schema section (`schema0`); the name
# `my_schema` is not assigned anywhere in this notebook.
schema0

integer: int32
text: string
binary: fixed_size_binary[10]
Time: timestamp[ms]

In [51]:
# Table.from_batches expects a sequence (or iterator) of RecordBatch objects.
# Passing the bare batch makes pyarrow iterate over its columns and fail with
# "TypeError: Cannot convert pyarrow.lib.Int64Array to pyarrow.lib.RecordBatch",
# so the single batch is wrapped in a list.
pa.Table.from_batches([batch1])