Skip to content

Commit

Permalink
Remove ColumnBase.__getitem__ (rapidsai#10516)
Browse files Browse the repository at this point in the history
This PR removes the `ColumnBase.__getitem__` method, which is heavily overloaded and easy to abuse, and updates every call site to use the appropriate, more specific method instead. There turn out to be exactly two places in the code that actually require the current implementation of this method — Series and Index indexing — so this functionality has been moved into `SingleColumnFrame`. Removing `__getitem__` also means that we no longer need to explicitly fail on `__iter__` and `__array__`, since there is no longer an implicit fallback.

The one downside now is that it _is_ possible to create a numpy array from a column. However, it won't make a numpy array from the column's data; instead, it will create an object-dtype 0-D array _containing_ the column. This is how `np.array` and `np.asarray` behave for arbitrary objects, so I think this behavior is fine for us too. `ColumnBase` is (for now) an internal object, so we don't need to provide especially helpful error messages, and for developers it should be fairly obvious what is wrong if they attempt this conversion:
```
>>> import cudf
>>> import numpy as np
>>> s = cudf.Series([1])
>>> arr = np.array(s._column)
>>> arr.ndim
0
>>> arr[0]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed
>>> arr
array(<cudf.core.column.numerical.NumericalColumn object at 0x7efeddf634c0>
[
  1
]
dtype: int64, dtype=object)
```

That being said, I can put back the `__array__` implementation if people feel strongly about it.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: rapidsai#10516
  • Loading branch information
vyasr authored and abellina committed Apr 11, 2022
1 parent aa03473 commit a75d38a
Show file tree
Hide file tree
Showing 17 changed files with 132 additions and 106 deletions.
22 changes: 17 additions & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,8 +1066,18 @@ def find_and_replace(
)
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
fill_value = df._data["new"][df._data["old"].isnull()][0]
if fill_value in self.categories:
fill_value = (
df._data["new"]
.apply_boolean_mask(df._data["old"].isnull())
.element_indexing(0)
)
# TODO: This line of code does not work because we cannot use the
# `in` operator on self.categories (which is a column). mypy
# realizes that this is wrong because __iter__ is not implemented.
# However, it seems that this functionality has been broken for a
# long time so for now we're just having mypy ignore and we'll come
# back to this.
if fill_value in self.categories: # type: ignore
replaced = self.fillna(fill_value)
else:
new_categories = self.categories.append(
Expand All @@ -1081,11 +1091,13 @@ def find_and_replace(
else:
replaced = self
if df._data["new"].null_count > 0:
drop_values = df._data["old"][df._data["new"].isnull()]
drop_values = df._data["old"].apply_boolean_mask(
df._data["new"].isnull()
)
cur_categories = replaced.categories
new_categories = cur_categories[
new_categories = cur_categories.apply_boolean_mask(
~cudf.Series(cur_categories.isin(drop_values))
]
)
replaced = replaced._set_categories(new_categories)
df = df.dropna(subset=["new"])
to_replace_col = df._data["old"]
Expand Down
28 changes: 2 additions & 26 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
infer_dtype,
is_bool_dtype,
is_categorical_dtype,
Expand Down Expand Up @@ -78,15 +77,15 @@
pandas_dtypes_alias_to_cudf_alias,
pandas_dtypes_to_np_dtypes,
)
from cudf.utils.utils import NotIterable, _array_ufunc, mask_dtype
from cudf.utils.utils import _array_ufunc, mask_dtype

T = TypeVar("T", bound="ColumnBase")
# TODO: This workaround allows type hints for `slice`, since `slice` is a
# method in ColumnBase.
Slice = TypeVar("Slice", bound=slice)


class ColumnBase(Column, Serializable, BinaryOperand, Reducible, NotIterable):
class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
_VALID_REDUCTIONS = {
"any",
"all",
Expand Down Expand Up @@ -480,22 +479,6 @@ def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase:
)
return self.take(gather_map)

def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]:
if _is_scalar_or_zero_d_array(arg):
return self.element_indexing(int(arg))
elif isinstance(arg, slice):
start, stop, stride = arg.indices(len(self))
return self.slice(start, stop, stride)
else:
arg = as_column(arg)
if len(arg) == 0:
arg = as_column([], dtype="int32")
if is_integer_dtype(arg.dtype):
return self.take(arg)
if is_bool_dtype(arg.dtype):
return self.apply_boolean_mask(arg)
raise NotImplementedError(type(arg))

def __setitem__(self, key: Any, value: Any):
"""
Set the value of ``self[key]`` to ``value``.
Expand Down Expand Up @@ -1028,13 +1011,6 @@ def __arrow_array__(self, type=None):
"consider using .to_arrow()"
)

def __array__(self, dtype=None):
raise TypeError(
"Implicit conversion to a host NumPy array via __array__ is not "
"allowed. To explicitly construct a host array, consider using "
".to_numpy()"
)

@property
def __cuda_array_interface__(self):
raise NotImplementedError(
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,15 +473,19 @@ def find_first_value(
Returns offset of first value that matches
"""
value = pd.to_datetime(value)
value = column.as_column(value, dtype=self.dtype).as_numerical[0]
value = column.as_column(
value, dtype=self.dtype
).as_numerical.element_indexing(0)
return self.as_numerical.find_first_value(value, closest=closest)

def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
"""
Returns offset of last value that matches
"""
value = pd.to_datetime(value)
value = column.as_column(value, dtype=self.dtype).as_numerical[0]
value = column.as_column(
value, dtype=self.dtype
).as_numerical.element_indexing(0)
return self.as_numerical.find_last_value(value, closest=closest)

@property
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,9 @@ def find_and_replace(
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
replaced = replaced.fillna(
df._data["new"][df._data["old"].isnull()][0]
df._data["new"]
.apply_boolean_mask(df._data["old"].isnull())
.element_indexing(0)
)
df = df.dropna(subset=["old"])

Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,11 @@ def quantile(
else:
result = self._numeric_quantile(q, interpolation, exact)
if return_scalar:
scalar_result = result.element_indexing(0)
return (
cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
if result[0] is cudf.NA
else result[0]
if scalar_result is cudf.NA
else scalar_result
)
return result

Expand Down Expand Up @@ -160,7 +161,9 @@ def _numeric_quantile(
sorted_indices = self.as_frame()._get_sorted_inds(
ascending=True, na_position="first"
)
sorted_indices = sorted_indices[self.null_count :]
sorted_indices = sorted_indices.slice(
self.null_count, len(sorted_indices)
)

return libcudf.quantiles.quantile(
self, q, interpolation, sorted_indices, exact
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5205,7 +5205,7 @@ def sum(
result_col,
sep=cudf.Scalar(""),
na_rep=cudf.Scalar(None, "str"),
)[0]
).element_indexing(0)
else:
return result_col

Expand Down Expand Up @@ -5432,7 +5432,11 @@ def find_and_replace(
)
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
res = self.fillna(df._data["new"][df._data["old"].isnull()][0])
res = self.fillna(
df._data["new"]
.apply_boolean_mask(df._data["old"].isnull())
.element_indexing(0)
)
df = df.dropna(subset=["old"])
else:
res = self
Expand Down
16 changes: 7 additions & 9 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from __future__ import annotations

import pandas as pd
Expand Down Expand Up @@ -91,14 +91,12 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
pd_series.index = index
return pd_series

def __getitem__(self, args):
result = super().__getitem__(args)
if isinstance(result, dict):
return {
field: value
for field, value in zip(self.dtype.fields, result.values())
}
return result
def element_indexing(self, index: int):
result = super().element_indexing(index)
return {
field: value
for field, value in zip(self.dtype.fields, result.values())
}

def __setitem__(self, key, value):
if isinstance(value, dict):
Expand Down
35 changes: 14 additions & 21 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from cudf._lib.search import search_sorted
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
is_categorical_dtype,
is_dtype_equal,
is_interval_dtype,
Expand Down Expand Up @@ -342,29 +341,22 @@ def __len__(self):

@_cudf_nvtx_annotate
def __getitem__(self, index):
len_self = len(self)
if isinstance(index, slice):
sl_start, sl_stop, sl_step = index.indices(len_self)
sl_start, sl_stop, sl_step = index.indices(len(self))

lo = self._start + sl_start * self._step
hi = self._start + sl_stop * self._step
st = self._step * sl_step
return RangeIndex(start=lo, stop=hi, step=st, name=self._name)

elif isinstance(index, Number):
len_self = len(self)
if index < 0:
index = len_self + index
index += len_self
if not (0 <= index < len_self):
raise IndexError("out-of-bound")
index = min(index, len_self)
index = self._start + index * self._step
return index
else:
if _is_scalar_or_zero_d_array(index):
index = np.min_scalar_type(index).type(index)
index = column.as_column(index)

return as_index(self._values[index], name=self.name)
raise IndexError("Index out of bounds")
return self._start + index * self._step
return self._as_int64()[index]

@_cudf_nvtx_annotate
def equals(self, other):
Expand Down Expand Up @@ -1183,11 +1175,7 @@ def __repr__(self):

@_cudf_nvtx_annotate
def __getitem__(self, index):
if type(self) == IntervalIndex:
raise NotImplementedError(
"Getting a scalar from an IntervalIndex is not yet supported"
)
res = self._values[index]
res = self._get_elements_from_column(index)
if not isinstance(index, int):
res = as_index(res)
res.name = self.name
Expand Down Expand Up @@ -2457,8 +2445,8 @@ def interval_range(
init=start.device_value,
step=freq_step.device_value,
)
left_col = bin_edges[:-1]
right_col = bin_edges[1:]
left_col = bin_edges.slice(0, len(bin_edges) - 1)
right_col = bin_edges.slice(1, len(bin_edges))
elif freq and periods:
if end:
start = end - (freq * periods)
Expand Down Expand Up @@ -2614,6 +2602,11 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None):

return IntervalIndex(interval_col, name=name)

def __getitem__(self, index):
raise NotImplementedError(
"Getting a scalar from an IntervalIndex is not yet supported"
)

def is_interval(self):
return True

Expand Down
16 changes: 10 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,9 +1173,9 @@ def _n_largest_or_smallest(self, largest, n, columns, keep):

# argsort the `by` column
return self._gather(
self._get_columns_by_label(columns)._get_sorted_inds(
ascending=not largest
)[:n],
self._get_columns_by_label(columns)
._get_sorted_inds(ascending=not largest)
.slice(*slice(None, n).indices(len(self))),
keep_index=True,
check_bounds=False,
)
Expand All @@ -1186,9 +1186,11 @@ def _n_largest_or_smallest(self, largest, n, columns, keep):

if n <= 0:
# Empty slice.
indices = indices[0:0]
indices = indices.slice(0, 0)
else:
indices = indices[: -n - 1 : -1]
indices = indices.slice(
*slice(None, -n - 1, -1).indices(len(self))
)
return self._gather(indices, keep_index=True, check_bounds=False)
else:
raise ValueError('keep must be either "first", "last"')
Expand Down Expand Up @@ -1808,7 +1810,9 @@ def _first_or_last(
return self.copy()

pd_offset = pd.tseries.frequencies.to_offset(offset)
to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
to_search = op(
pd.Timestamp(self._index._column.element_indexing(idx)), pd_offset
)
if (
idx == 0
and not isinstance(pd_offset, pd.tseries.offsets.Tick)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class _SeriesIlocIndexer(_FrameIndexer):
def __getitem__(self, arg):
if isinstance(arg, tuple):
arg = list(arg)
data = self._frame._column[arg]
data = self._frame._get_elements_from_column(arg)

if (
isinstance(data, (dict, list))
Expand Down
28 changes: 26 additions & 2 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@
import pandas as pd

import cudf
from cudf._typing import Dtype
from cudf.api.types import _is_scalar_or_zero_d_array
from cudf._typing import Dtype, ScalarLike
from cudf.api.types import (
_is_scalar_or_zero_d_array,
is_bool_dtype,
is_integer_dtype,
)
from cudf.core.column import ColumnBase, as_column
from cudf.core.frame import Frame
from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
Expand Down Expand Up @@ -359,3 +363,23 @@ def nunique(self, dropna: bool = True):
if self._column.null_count == len(self):
return 0
return self._column.distinct_count(dropna=dropna)

def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
# A generic method for getting elements from a column that supports a
# wide range of different inputs. This method should only used where
# _absolutely_ necessary, since in almost all cases a more specific
# method can be used e.g. element_indexing or slice.
if _is_scalar_or_zero_d_array(arg):
return self._column.element_indexing(int(arg))
elif isinstance(arg, slice):
start, stop, stride = arg.indices(len(self))
return self._column.slice(start, stop, stride)
else:
arg = as_column(arg)
if len(arg) == 0:
arg = as_column([], dtype="int32")
if is_integer_dtype(arg.dtype):
return self._column.take(arg)
if is_bool_dtype(arg.dtype):
return self._column.apply_boolean_mask(arg)
raise NotImplementedError(f"Unknown indexer {type(arg)}")
Loading

0 comments on commit a75d38a

Please sign in to comment.