Skip to content

Commit

Permalink
ARROW-6279: [Python] Add Table.slice, __getitem__ support to match Re…
Browse files Browse the repository at this point in the history
…cordBatch, Array, others

Closes #5181 from wesm/ARROW-6279 and squashes the following commits:

1aecee2 <Wes McKinney> Add Table.slice, __getitem__ support to match RecordBatch, Array, etc.

Authored-by: Wes McKinney <wesm+git@apache.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
wesm committed Aug 24, 2019
1 parent 03186b0 commit 5a53e31
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 21 deletions.
3 changes: 3 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Expand Up @@ -608,6 +608,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CTable] ReplaceSchemaMetadata(
const shared_ptr[CKeyValueMetadata]& metadata)

shared_ptr[CTable] Slice(int64_t offset)
shared_ptr[CTable] Slice(int64_t offset, int64_t length)

cdef cppclass CRecordBatchReader" arrow::RecordBatchReader":
shared_ptr[CSchema] schema()
CStatus ReadNext(shared_ptr[CRecordBatch]* batch)
Expand Down
39 changes: 35 additions & 4 deletions python/pyarrow/table.pxi
Expand Up @@ -572,7 +572,7 @@ cdef class RecordBatch(_PandasConvertible):
Parameters
----------
offset : int, default 0
Offset from start of array to slice
Offset from start of record batch to slice
length : int, default None
Length of slice (default is until end of batch starting from
offset)
Expand Down Expand Up @@ -766,6 +766,40 @@ cdef class Table(_PandasConvertible):
columns = [col for col in self.columns]
return _reconstruct_table, (columns, self.schema)

def __getitem__(self, key):
    # Anything that is not a slice object is treated as a column
    # selector and handed to column() unchanged.
    if not isinstance(key, slice):
        return self.column(key)
    # Slice objects are delegated to the shared _normalize_slice helper.
    return _normalize_slice(self, key)

def slice(self, offset=0, length=None):
    """
    Compute zero-copy slice of this Table

    Parameters
    ----------
    offset : int, default 0
        Offset from start of table to slice. Values larger than the
        number of rows are clamped, producing an empty table.
    length : int, default None
        Length of slice (default is until end of table starting from
        offset)

    Returns
    -------
    sliced : Table

    Raises
    ------
    IndexError
        If offset is negative
    """
    cdef shared_ptr[CTable] result

    if offset < 0:
        raise IndexError('Offset must be non-negative')

    # Clamp the offset to the table length so that slicing past the end
    # yields an empty table instead of handing an out-of-range offset to
    # the C++ Slice implementation.
    offset = min(self.num_rows, offset)

    if length is None:
        result = self.table.Slice(offset)
    else:
        result = self.table.Slice(offset, length)

    return pyarrow_wrap_table(result)

def replace_schema_metadata(self, metadata=None):
"""
EXPERIMENTAL: Create shallow copy of table by replacing schema
Expand Down Expand Up @@ -1241,9 +1275,6 @@ cdef class Table(_PandasConvertible):
cdef int index = <int> _normalize_index(i, self.num_columns)
return pyarrow_wrap_chunked_array(self.table.column(index))

def __getitem__(self, key):
return self._column(key)

def itercolumns(self):
"""
Iterator over all columns in their numerical order
Expand Down
40 changes: 23 additions & 17 deletions python/pyarrow/tests/test_table.py
Expand Up @@ -334,43 +334,49 @@ def test_recordbatch_pickle():
assert result.schema == schema


def test_recordbatch_slice_getitem():
def _table_like_slice_tests(factory):
data = [
pa.array(range(5)),
pa.array([-10, -5, 0, 5, 10])
]
names = ['c0', 'c1']

batch = pa.RecordBatch.from_arrays(data, names)
obj = factory(data, names=names)

sliced = batch.slice(2)
sliced = obj.slice(2)
assert sliced.num_rows == 3

expected = pa.RecordBatch.from_arrays(
[x.slice(2) for x in data], names)
expected = factory([x.slice(2) for x in data], names)
assert sliced.equals(expected)

sliced2 = batch.slice(2, 2)
expected2 = pa.RecordBatch.from_arrays(
[x.slice(2, 2) for x in data], names)
sliced2 = obj.slice(2, 2)
expected2 = factory([x.slice(2, 2) for x in data], names)
assert sliced2.equals(expected2)

# 0 offset
assert batch.slice(0).equals(batch)
assert obj.slice(0).equals(obj)

# Slice past end of array
assert len(batch.slice(len(batch))) == 0
assert len(obj.slice(len(obj))) == 0

with pytest.raises(IndexError):
batch.slice(-1)
obj.slice(-1)

# Check __getitem__-based slicing
assert batch.slice(0, 0).equals(batch[:0])
assert batch.slice(0, 2).equals(batch[:2])
assert batch.slice(2, 2).equals(batch[2:4])
assert batch.slice(2, len(batch) - 2).equals(batch[2:])
assert batch.slice(len(batch) - 2, 2).equals(batch[-2:])
assert batch.slice(len(batch) - 4, 2).equals(batch[-4:-2])
assert obj.slice(0, 0).equals(obj[:0])
assert obj.slice(0, 2).equals(obj[:2])
assert obj.slice(2, 2).equals(obj[2:4])
assert obj.slice(2, len(obj) - 2).equals(obj[2:])
assert obj.slice(len(obj) - 2, 2).equals(obj[-2:])
assert obj.slice(len(obj) - 4, 2).equals(obj[-4:-2])


def test_recordbatch_slice_getitem():
    # Run the shared slice / __getitem__ checks with RecordBatch objects
    # built by RecordBatch.from_arrays.
    make_batch = pa.RecordBatch.from_arrays
    return _table_like_slice_tests(make_batch)


def test_table_slice_getitem():
    # Run the shared slice / __getitem__ checks with Table objects built
    # by pa.table.
    make_table = pa.table
    return _table_like_slice_tests(make_table)


def test_recordbatchlist_schema_equals():
Expand Down

0 comments on commit 5a53e31

Please sign in to comment.