ARROW-3444: [Python] Add Array/ChunkedArray/Table nbytes attribute
https://issues.apache.org/jira/browse/ARROW-3444

The question then is what the expected result should be for a sliced array (because the buffers do not take the slice into account).

Closes #5793 from jorisvandenbossche/ARROW-3444-nbytes and squashes the following commits:

9b648e6 <Joris Van den Bossche> ARROW-3444:  Add Array/ChunkedArray/Table nbytes attribute

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
jorisvandenbossche authored and pitrou committed Nov 12, 2019
1 parent 76cebfa commit 417febc
Showing 4 changed files with 54 additions and 3 deletions.
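
To illustrate the sliced-array question from the commit message (this sketch is not part of the commit and assumes a pyarrow build that includes this change): nbytes sums the sizes of the underlying buffers, and a zero-copy slice shares its parent's buffers, so the slice reports the same total.

import pyarrow as pa

a = pa.array([1, None, 3], type='int64')
# nbytes sums buffer sizes: a 1-byte validity bitmap plus 3 * 8 bytes of values
assert a.nbytes == 8 * 3 + 1

# A zero-copy slice still points at the parent's buffers, so the total is unchanged
s = a.slice(1, 2)
assert s.nbytes == a.nbytes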
python/pyarrow/array.pxi: 9 changes (9 additions, 0 deletions)
@@ -728,6 +728,15 @@ cdef class Array(_PandasConvertible):
     def null_count(self):
         return self.sp_array.get().null_count()
 
+    @property
+    def nbytes(self):
+        """Total number of bytes consumed by the elements of the array."""
+        size = 0
+        for buf in self.buffers():
+            if buf is not None:
+                size += buf.size
+        return size
+
     def __iter__(self):
         for i in range(len(self)):
             yield self.getitem(i)
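For reference, the property above is equivalent to summing Buffer.size over Array.buffers(), skipping absent (None) buffers; a small sketch, with sizes matching the test_nbytes expectations further down:

import pyarrow as pa

a = pa.array([1, None, 3], type='int64')
# buffers() returns the validity bitmap and the values buffer for an int64 array
sizes = [buf.size for buf in a.buffers() if buf is not None]
print(sizes)      # [1, 24] here: 1-byte bitmap + 3 * 8 value bytes
print(a.nbytes)   # 25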
python/pyarrow/table.pxi: 32 changes (31 additions, 1 deletion)
@@ -99,6 +99,16 @@ cdef class ChunkedArray(_PandasConvertible):
         """
         return self.chunked_array.null_count()
 
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the chunked array.
+        """
+        size = 0
+        for chunk in self.iterchunks():
+            size += chunk.nbytes
+        return size
+
     def __iter__(self):
         for chunk in self.iterchunks():
             for item in chunk:
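
A quick usage sketch for the chunked case (illustrative only; the array values are made up): the total is simply the sum of the per-chunk totals.

import pyarrow as pa

chunked = pa.chunked_array([[1, 2, 3], [4, 5]], type=pa.int64())
assert chunked.nbytes == sum(chunk.nbytes for chunk in chunked.iterchunks())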
@@ -588,7 +598,7 @@ cdef class RecordBatch(_PandasConvertible):
         Returns
         -------
-        list of pa.ChunkedArray
+        list of pa.Array
         """
         return [self.column(i) for i in range(self.num_columns)]
 
@@ -605,6 +615,16 @@
         result._name = self.schema[index].name
         return result
 
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the record batch.
+        """
+        size = 0
+        for i in range(self.num_columns):
+            size += self.column(i).nbytes
+        return size
+
     def __getitem__(self, key):
         if isinstance(key, slice):
             return _normalize_slice(self, key)
@@ -1420,6 +1440,16 @@ cdef class Table(_PandasConvertible):
         """
         return (self.num_rows, self.num_columns)
 
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the table.
+        """
+        size = 0
+        for column in self.itercolumns():
+            size += column.nbytes
+        return size
+
     def add_column(self, int i, field_, column):
         """
         Add column to Table at position. Returns new table
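The same pattern repeats one level up: RecordBatch.nbytes sums its columns' Array totals and Table.nbytes sums its columns' ChunkedArray totals. An illustrative sketch (column names and values are made up):

import pyarrow as pa

arrays = [pa.array([1, 2, 3], type='int64'), pa.array([4.0, 5.0, 6.0], type='float64')]
batch = pa.RecordBatch.from_arrays(arrays, ['a', 'b'])
table = pa.table(arrays, names=('a', 'b'))

assert batch.nbytes == sum(batch.column(i).nbytes for i in range(batch.num_columns))
assert table.nbytes == sum(column.nbytes for column in table.itercolumns())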
python/pyarrow/tests/test_array.py: 9 changes (9 additions, 0 deletions)
@@ -1512,6 +1512,15 @@ def test_buffers_nested():
     assert struct.unpack('4xh', values) == (43,)
 
 
+def test_nbytes():
+    a = pa.array(np.array([4, 5, 6], dtype='int64'))
+    assert a.nbytes == 8 * 3
+    a = pa.array([1, None, 3], type='int64')
+    assert a.nbytes == 8*3 + 1
+    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
+    assert a.nbytes == 1 + 4 * 4 + 1 + 6 * 8
+
+
 def test_invalid_tensor_constructor_repr():
     # ARROW-2638: prevent calling extension class constructors directly
     with pytest.raises(TypeError):
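The list-array expectation in test_nbytes above (1 + 4 * 4 + 1 + 6 * 8 == 66) corresponds to the four buffers involved; the breakdown below is a sketch of that arithmetic, assuming int32 list offsets and the usual validity bitmaps:

import pyarrow as pa

a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
# 1        validity bitmap of the list array (3 slots, one null)
# 4 * 4    int32 offsets buffer (len(a) + 1 == 4 entries)
# 1        validity bitmap of the child int64 array (6 slots, one null)
# 6 * 8    child values buffer (6 int64 slots)
assert a.nbytes == 1 + 4 * 4 + 1 + 6 * 8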
python/pyarrow/tests/test_table.py: 7 changes (5 additions, 2 deletions)
@@ -44,6 +44,7 @@ def test_chunked_array_basics():
     assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks)
     assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks())
     assert len(data.chunks) == 3
+    assert data.nbytes == sum(c.nbytes for c in data.iterchunks())
     data.validate()
 
 
@@ -274,6 +275,7 @@ def test_recordbatch_basics():
     assert len(batch) == 5
     assert batch.num_rows == 5
     assert batch.num_columns == len(data)
+    assert batch.nbytes == 5 * 2 + 1 + 5 * 4 + 1
     pydict = batch.to_pydict()
     assert pydict == OrderedDict([
         ('c0', [0, 1, 2, 3, 4]),
@@ -493,15 +495,16 @@ def test_table_to_batches():
 
 def test_table_basics():
     data = [
-        pa.array(range(5)),
-        pa.array([-10, -5, 0, 5, 10])
+        pa.array(range(5), type='int64'),
+        pa.array([-10, -5, 0, 5, 10], type='int64')
     ]
     table = pa.table(data, names=('a', 'b'))
     table.validate()
     assert len(table) == 5
     assert table.num_rows == 5
     assert table.num_columns == 2
     assert table.shape == (5, 2)
+    assert table.nbytes == 2 * (5 * 8 + 1)
     pydict = table.to_pydict()
     assert pydict == OrderedDict([
         ('a', [0, 1, 2, 3, 4]),
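For the table assertion above, the expected 2 * (5 * 8 + 1) = 82 reads as: each int64 column contributes 5 * 8 = 40 bytes of values plus one extra byte (presumably a validity-bitmap buffer that the conversion from a Python sequence allocates here even though neither column contains nulls).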
