Skip to content

Commit

Permalink
Remove ColumnBase.__getitem__ (rapidsai#10516)
Browse files Browse the repository at this point in the history
This PR removes the `ColumnBase.__getitem__` method, which is heavily overloaded and easy to abuse, and updates every call site to use the appropriate, more specific method instead. There turn out to be exactly two places in the code that actually require the current implementation of this method — Series and Index indexing — so this functionality has been moved into `SingleColumnFrame`. Removing `__getitem__` also means that we no longer need to explicitly fail on `__iter__` and `__array__`, since there is no longer an implicit fallback.

The one downside now is that it _is_ possible to create a numpy array from a column. However, it won't make a numpy array from the column's data; instead, it will create an object-dtype 0-D array _containing_ the column. This is how `np.array` and `np.asarray` behave for arbitrary objects, so I think this behavior is fine for us too. `ColumnBase` is (for now) an internal object, so we don't need to provide especially helpful error messages, and for developers it should be fairly obvious what is wrong if they attempt this conversion:
```
>>> import cudf
>>> import numpy as np
>>> s = cudf.Series([1])
>>> arr = np.array(s._column)
>>> arr.ndim
0
>>> arr[0]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed
>>> arr
array(<cudf.core.column.numerical.NumericalColumn object at 0x7efeddf634c0>
[
  1
]
dtype: int64, dtype=object)
```

That being said, I can put back the `__array__` implementation if people feel strongly about it.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: rapidsai#10516
  • Loading branch information
vyasr authored and abellina committed Apr 11, 2022
1 parent aa03473 commit a75d38a
Show file tree
Hide file tree
Showing 17 changed files with 132 additions and 106 deletions.
22 changes: 17 additions & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,8 +1066,18 @@ def find_and_replace(
)
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
fill_value = df._data["new"][df._data["old"].isnull()][0]
if fill_value in self.categories:
fill_value = (
df._data["new"]
.apply_boolean_mask(df._data["old"].isnull())
.element_indexing(0)
)
# TODO: This line of code does not work because we cannot use the
# `in` operator on self.categories (which is a column). mypy
# realizes that this is wrong because __iter__ is not implemented.
# However, it seems that this functionality has been broken for a
# long time so for now we're just having mypy ignore and we'll come
# back to this.
if fill_value in self.categories: # type: ignore
replaced = self.fillna(fill_value)
else:
new_categories = self.categories.append(
Expand All @@ -1081,11 +1091,13 @@ def find_and_replace(
else:
replaced = self
if df._data["new"].null_count > 0:
drop_values = df._data["old"][df._data["new"].isnull()]
drop_values = df._data["old"].apply_boolean_mask(
df._data["new"].isnull()
)
cur_categories = replaced.categories
new_categories = cur_categories[
new_categories = cur_categories.apply_boolean_mask(
~cudf.Series(cur_categories.isin(drop_values))
]
)
replaced = replaced._set_categories(new_categories)
df = df.dropna(subset=["new"])
to_replace_col = df._data["old"]
Expand Down
28 changes: 2 additions & 26 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
infer_dtype,
is_bool_dtype,
is_categorical_dtype,
Expand Down Expand Up @@ -78,15 +77,15 @@
pandas_dtypes_alias_to_cudf_alias,
pandas_dtypes_to_np_dtypes,
)
from cudf.utils.utils import NotIterable, _array_ufunc, mask_dtype
from cudf.utils.utils import _array_ufunc, mask_dtype

T = TypeVar("T", bound="ColumnBase")
# TODO: This workaround allows type hints for `slice`, since `slice` is a
# method in ColumnBase.
Slice = TypeVar("Slice", bound=slice)


class ColumnBase(Column, Serializable, BinaryOperand, Reducible, NotIterable):
class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
_VALID_REDUCTIONS = {
"any",
"all",
Expand Down Expand Up @@ -480,22 +479,6 @@ def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase:
)
return self.take(gather_map)

def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]:
if _is_scalar_or_zero_d_array(arg):
return self.element_indexing(int(arg))
elif isinstance(arg, slice):
start, stop, stride = arg.indices(len(self))
return self.slice(start, stop, stride)
else:
arg = as_column(arg)
if len(arg) == 0:
arg = as_column([], dtype="int32")
if is_integer_dtype(arg.dtype):
return self.take(arg)
if is_bool_dtype(arg.dtype):
return self.apply_boolean_mask(arg)
raise NotImplementedError(type(arg))

def __setitem__(self, key: Any, value: Any):
"""
Set the value of ``self[key]`` to ``value``.
Expand Down Expand Up @@ -1028,13 +1011,6 @@ def __arrow_array__(self, type=None):
"consider using .to_arrow()"
)

def __array__(self, dtype=None):
raise TypeError(
"Implicit conversion to a host NumPy array via __array__ is not "
"allowed. To explicitly construct a host array, consider using "
".to_numpy()"
)

@property
def __cuda_array_interface__(self):
raise NotImplementedError(
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,15 +473,19 @@ def find_first_value(
Returns offset of first value that matches
"""
value = pd.to_datetime(value)
value = column.as_column(value, dtype=self.dtype).as_numerical[0]
value = column.as_column(
value, dtype=self.dtype
).as_numerical.element_indexing(0)
return self.as_numerical.find_first_value(value, closest=closest)

def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
"""
Returns offset of last value that matches
"""
value = pd.to_datetime(value)
value = column.as_column(value, dtype=self.dtype).as_numerical[0]
value = column.as_column(
value, dtype=self.dtype
).as_numerical.element_indexing(0)
return self.as_numerical.find_last_value(value, closest=closest)

@property
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,9 @@ def find_and_replace(
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
replaced = replaced.fillna(
df._data["new"][df._data["old"].isnull()][0]
df._data["new"]
.apply_boolean_mask(df._data["old"].isnull())
.element_indexing(0)
)
df = df.dropna(subset=["old"])

Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,11 @@ def quantile(
else:
result = self._numeric_quantile(q, interpolation, exact)
if return_scalar:
scalar_result = result.element_indexing(0)
return (
cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
if result[0] is cudf.NA
else result[0]
if scalar_result is cudf.NA
else scalar_result
)
return result

Expand Down Expand Up @@ -160,7 +161,9 @@ def _numeric_quantile(
sorted_indices = self.as_frame()._get_sorted_inds(
ascending=True, na_position="first"
)
sorted_indices = sorted_indices[self.null_count :]
sorted_indices = sorted_indices.slice(
self.null_count, len(sorted_indices)
)

return libcudf.quantiles.quantile(
self, q, interpolation, sorted_indices, exact
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5205,7 +5205,7 @@ def sum(
result_col,
sep=cudf.Scalar(""),
na_rep=cudf.Scalar(None, "str"),
)[0]
).element_indexing(0)
else:
return result_col

Expand Down Expand Up @@ -5432,7 +5432,11 @@ def find_and_replace(
)
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
res = self.fillna(df._data["new"][df._data["old"].isnull()][0])
res = self.fillna(
df._data["new"]
.apply_boolean_mask(df._data["old"].isnull())
.element_indexing(0)
)
df = df.dropna(subset=["old"])
else:
res = self
Expand Down
16 changes: 7 additions & 9 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from __future__ import annotations

import pandas as pd
Expand Down Expand Up @@ -91,14 +91,12 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
pd_series.index = index
return pd_series

def __getitem__(self, args):
result = super().__getitem__(args)
if isinstance(result, dict):
return {
field: value
for field, value in zip(self.dtype.fields, result.values())
}
return result
def element_indexing(self, index: int):
result = super().element_indexing(index)
return {
field: value
for field, value in zip(self.dtype.fields, result.values())
}

def __setitem__(self, key, value):
if isinstance(value, dict):
Expand Down
35 changes: 14 additions & 21 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from cudf._lib.search import search_sorted
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
is_categorical_dtype,
is_dtype_equal,
is_interval_dtype,
Expand Down Expand Up @@ -342,29 +341,22 @@ def __len__(self):

@_cudf_nvtx_annotate
def __getitem__(self, index):
len_self = len(self)
if isinstance(index, slice):
sl_start, sl_stop, sl_step = index.indices(len_self)
sl_start, sl_stop, sl_step = index.indices(len(self))

lo = self._start + sl_start * self._step
hi = self._start + sl_stop * self._step
st = self._step * sl_step
return RangeIndex(start=lo, stop=hi, step=st, name=self._name)

elif isinstance(index, Number):
len_self = len(self)
if index < 0:
index = len_self + index
index += len_self
if not (0 <= index < len_self):
raise IndexError("out-of-bound")
index = min(index, len_self)
index = self._start + index * self._step
return index
else:
if _is_scalar_or_zero_d_array(index):
index = np.min_scalar_type(index).type(index)
index = column.as_column(index)

return as_index(self._values[index], name=self.name)
raise IndexError("Index out of bounds")
return self._start + index * self._step
return self._as_int64()[index]

@_cudf_nvtx_annotate
def equals(self, other):
Expand Down Expand Up @@ -1183,11 +1175,7 @@ def __repr__(self):

@_cudf_nvtx_annotate
def __getitem__(self, index):
if type(self) == IntervalIndex:
raise NotImplementedError(
"Getting a scalar from an IntervalIndex is not yet supported"
)
res = self._values[index]
res = self._get_elements_from_column(index)
if not isinstance(index, int):
res = as_index(res)
res.name = self.name
Expand Down Expand Up @@ -2457,8 +2445,8 @@ def interval_range(
init=start.device_value,
step=freq_step.device_value,
)
left_col = bin_edges[:-1]
right_col = bin_edges[1:]
left_col = bin_edges.slice(0, len(bin_edges) - 1)
right_col = bin_edges.slice(1, len(bin_edges))
elif freq and periods:
if end:
start = end - (freq * periods)
Expand Down Expand Up @@ -2614,6 +2602,11 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None):

return IntervalIndex(interval_col, name=name)

def __getitem__(self, index):
raise NotImplementedError(
"Getting a scalar from an IntervalIndex is not yet supported"
)

def is_interval(self):
return True

Expand Down
16 changes: 10 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,9 +1173,9 @@ def _n_largest_or_smallest(self, largest, n, columns, keep):

# argsort the `by` column
return self._gather(
self._get_columns_by_label(columns)._get_sorted_inds(
ascending=not largest
)[:n],
self._get_columns_by_label(columns)
._get_sorted_inds(ascending=not largest)
.slice(*slice(None, n).indices(len(self))),
keep_index=True,
check_bounds=False,
)
Expand All @@ -1186,9 +1186,11 @@ def _n_largest_or_smallest(self, largest, n, columns, keep):

if n <= 0:
# Empty slice.
indices = indices[0:0]
indices = indices.slice(0, 0)
else:
indices = indices[: -n - 1 : -1]
indices = indices.slice(
*slice(None, -n - 1, -1).indices(len(self))
)
return self._gather(indices, keep_index=True, check_bounds=False)
else:
raise ValueError('keep must be either "first", "last"')
Expand Down Expand Up @@ -1808,7 +1810,9 @@ def _first_or_last(
return self.copy()

pd_offset = pd.tseries.frequencies.to_offset(offset)
to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
to_search = op(
pd.Timestamp(self._index._column.element_indexing(idx)), pd_offset
)
if (
idx == 0
and not isinstance(pd_offset, pd.tseries.offsets.Tick)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class _SeriesIlocIndexer(_FrameIndexer):
def __getitem__(self, arg):
if isinstance(arg, tuple):
arg = list(arg)
data = self._frame._column[arg]
data = self._frame._get_elements_from_column(arg)

if (
isinstance(data, (dict, list))
Expand Down
28 changes: 26 additions & 2 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@
import pandas as pd

import cudf
from cudf._typing import Dtype
from cudf.api.types import _is_scalar_or_zero_d_array
from cudf._typing import Dtype, ScalarLike
from cudf.api.types import (
_is_scalar_or_zero_d_array,
is_bool_dtype,
is_integer_dtype,
)
from cudf.core.column import ColumnBase, as_column
from cudf.core.frame import Frame
from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
Expand Down Expand Up @@ -359,3 +363,23 @@ def nunique(self, dropna: bool = True):
if self._column.null_count == len(self):
return 0
return self._column.distinct_count(dropna=dropna)

def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
# A generic method for getting elements from a column that supports a
# wide range of different inputs. This method should only used where
# _absolutely_ necessary, since in almost all cases a more specific
# method can be used e.g. element_indexing or slice.
if _is_scalar_or_zero_d_array(arg):
return self._column.element_indexing(int(arg))
elif isinstance(arg, slice):
start, stop, stride = arg.indices(len(self))
return self._column.slice(start, stop, stride)
else:
arg = as_column(arg)
if len(arg) == 0:
arg = as_column([], dtype="int32")
if is_integer_dtype(arg.dtype):
return self._column.take(arg)
if is_bool_dtype(arg.dtype):
return self._column.apply_boolean_mask(arg)
raise NotImplementedError(f"Unknown indexer {type(arg)}")
Loading

0 comments on commit a75d38a

Please sign in to comment.