ARROW-3444: [Python] Add Array/ChunkedArray/Table nbytes attribute
https://issues.apache.org/jira/browse/ARROW-3444

The question then is what the expected result should be for a sliced array (because the buffers do not take the slice into account).

Closes #5793 from jorisvandenbossche/ARROW-3444-nbytes and squashes the following commits:

9b648e6 <Joris Van den Bossche> ARROW-3444:  Add Array/ChunkedArray/Table nbytes attribute

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
jorisvandenbossche authored and pitrou committed Nov 12, 2019
1 parent 76cebfa commit 417febc
Showing 4 changed files with 54 additions and 3 deletions.
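
To illustrate the sliced-array question from the commit message (this sketch is not part of the commit and assumes a pyarrow build that includes this change): nbytes sums the sizes of the underlying buffers, and a zero-copy slice shares its parent's buffers, so the slice reports the same total.

import pyarrow as pa

a = pa.array([1, None, 3], type='int64')
# nbytes sums buffer sizes: a 1-byte validity bitmap plus 3 * 8 bytes of values
assert a.nbytes == 8 * 3 + 1

# A zero-copy slice still points at the parent's buffers, so the total is unchanged
s = a.slice(1, 2)
assert s.nbytes == a.nbytes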
python/pyarrow/array.pxi: 9 changes (9 additions, 0 deletions)
@@ -728,6 +728,15 @@ cdef class Array(_PandasConvertible):
     def null_count(self):
         return self.sp_array.get().null_count()
 
+    @property
+    def nbytes(self):
+        """Total number of bytes consumed by the elements of the array."""
+        size = 0
+        for buf in self.buffers():
+            if buf is not None:
+                size += buf.size
+        return size
+
     def __iter__(self):
         for i in range(len(self)):
             yield self.getitem(i)
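For reference, the property above is equivalent to summing Buffer.size over Array.buffers(), skipping absent (None) buffers; a small sketch, with sizes matching the test_nbytes expectations further down:

import pyarrow as pa

a = pa.array([1, None, 3], type='int64')
# buffers() returns the validity bitmap and the values buffer for an int64 array
sizes = [buf.size for buf in a.buffers() if buf is not None]
print(sizes)      # [1, 24] here: 1-byte bitmap + 3 * 8 value bytes
print(a.nbytes)   # 25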
python/pyarrow/table.pxi: 32 changes (31 additions, 1 deletion)
@@ -99,6 +99,16 @@ cdef class ChunkedArray(_PandasConvertible):
         """
         return self.chunked_array.null_count()
 
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the chunked array.
+        """
+        size = 0
+        for chunk in self.iterchunks():
+            size += chunk.nbytes
+        return size
+
     def __iter__(self):
         for chunk in self.iterchunks():
             for item in chunk:
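
A quick usage sketch for the chunked case (illustrative only; the array values are made up): the total is simply the sum of the per-chunk totals.

import pyarrow as pa

chunked = pa.chunked_array([[1, 2, 3], [4, 5]], type=pa.int64())
assert chunked.nbytes == sum(chunk.nbytes for chunk in chunked.iterchunks())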
@@ -588,7 +598,7 @@ cdef class RecordBatch(_PandasConvertible):
         Returns
         -------
-        list of pa.ChunkedArray
+        list of pa.Array
         """
         return [self.column(i) for i in range(self.num_columns)]
 
@@ -605,6 +615,16 @@
         result._name = self.schema[index].name
         return result
 
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the record batch.
+        """
+        size = 0
+        for i in range(self.num_columns):
+            size += self.column(i).nbytes
+        return size
+
     def __getitem__(self, key):
         if isinstance(key, slice):
             return _normalize_slice(self, key)
@@ -1420,6 +1440,16 @@ cdef class Table(_PandasConvertible):
         """
         return (self.num_rows, self.num_columns)
 
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the table.
+        """
+        size = 0
+        for column in self.itercolumns():
+            size += column.nbytes
+        return size
+
     def add_column(self, int i, field_, column):
         """
         Add column to Table at position. Returns new table
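The same pattern repeats one level up: RecordBatch.nbytes sums its columns' Array totals and Table.nbytes sums its columns' ChunkedArray totals. An illustrative sketch (column names and values are made up):

import pyarrow as pa

arrays = [pa.array([1, 2, 3], type='int64'), pa.array([4.0, 5.0, 6.0], type='float64')]
batch = pa.RecordBatch.from_arrays(arrays, ['a', 'b'])
table = pa.table(arrays, names=('a', 'b'))

assert batch.nbytes == sum(batch.column(i).nbytes for i in range(batch.num_columns))
assert table.nbytes == sum(column.nbytes for column in table.itercolumns())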
python/pyarrow/tests/test_array.py: 9 changes (9 additions, 0 deletions)
@@ -1512,6 +1512,15 @@ def test_buffers_nested():
     assert struct.unpack('4xh', values) == (43,)
 
 
+def test_nbytes():
+    a = pa.array(np.array([4, 5, 6], dtype='int64'))
+    assert a.nbytes == 8 * 3
+    a = pa.array([1, None, 3], type='int64')
+    assert a.nbytes == 8*3 + 1
+    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
+    assert a.nbytes == 1 + 4 * 4 + 1 + 6 * 8
+
+
 def test_invalid_tensor_constructor_repr():
     # ARROW-2638: prevent calling extension class constructors directly
     with pytest.raises(TypeError):
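The list-array expectation in test_nbytes above (1 + 4 * 4 + 1 + 6 * 8 == 66) corresponds to the four buffers involved; the breakdown below is a sketch of that arithmetic, assuming int32 list offsets and the usual validity bitmaps:

import pyarrow as pa

a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
# 1        validity bitmap of the list array (3 slots, one null)
# 4 * 4    int32 offsets buffer (len(a) + 1 == 4 entries)
# 1        validity bitmap of the child int64 array (6 slots, one null)
# 6 * 8    child values buffer (6 int64 slots)
assert a.nbytes == 1 + 4 * 4 + 1 + 6 * 8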
python/pyarrow/tests/test_table.py: 7 changes (5 additions, 2 deletions)
@@ -44,6 +44,7 @@ def test_chunked_array_basics():
     assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks)
     assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks())
     assert len(data.chunks) == 3
+    assert data.nbytes == sum(c.nbytes for c in data.iterchunks())
     data.validate()
 
 
@@ -274,6 +275,7 @@ def test_recordbatch_basics():
     assert len(batch) == 5
     assert batch.num_rows == 5
     assert batch.num_columns == len(data)
+    assert batch.nbytes == 5 * 2 + 1 + 5 * 4 + 1
     pydict = batch.to_pydict()
     assert pydict == OrderedDict([
         ('c0', [0, 1, 2, 3, 4]),
@@ -493,15 +495,16 @@ def test_table_to_batches():
 
 def test_table_basics():
     data = [
-        pa.array(range(5)),
-        pa.array([-10, -5, 0, 5, 10])
+        pa.array(range(5), type='int64'),
+        pa.array([-10, -5, 0, 5, 10], type='int64')
     ]
     table = pa.table(data, names=('a', 'b'))
     table.validate()
     assert len(table) == 5
     assert table.num_rows == 5
     assert table.num_columns == 2
     assert table.shape == (5, 2)
+    assert table.nbytes == 2 * (5 * 8 + 1)
     pydict = table.to_pydict()
     assert pydict == OrderedDict([
         ('a', [0, 1, 2, 3, 4]),
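For the table assertion above, the expected 2 * (5 * 8 + 1) = 82 reads as: each int64 column contributes 5 * 8 = 40 bytes of values plus one extra byte (presumably a validity-bitmap buffer that the conversion from a Python sequence allocates here even though neither column contains nulls).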
