Refactor and remove SamplerBase class (#2775)

* init * fix tests and add release notes * refactor undersampler init * remove and comment out tests * clean up impl * more cleanup * add back initialize_sampler because it is an abstract method * add init error tests * clean up and remove test file * linting * remove pop * clean up via comments
alteryx · Sep 14, 2021 · a09d9f0 · a09d9f0
1 parent 2719871
commit a09d9f0
Show file tree

Hide file tree

Showing 7 changed files with 135 additions and 660 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -5,8 +5,9 @@ Release Notes
     * Fixes
         * Fixed bug where warnings during ``make_pipeline`` were not being raised to the user :pr:`2765`
     * Changes
-        * Added docstring linting packages ``pydocstyle`` and ``darglint`` to `make-lint` command :pr:`2670`
+        * Refactored and removed ``SamplerBase`` class :pr:`2775`
     * Documentation Changes
+        * Added docstring linting packages ``pydocstyle`` and ``darglint`` to ``make-lint`` command :pr:`2670`
     * Testing Changes
 
 .. warning::

diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py
@@ -1,12 +1,11 @@
 """An undersampling transformer to downsample the majority classes in the dataset."""
+import numpy as np
 import pandas as pd
 
 from evalml.pipelines.components.transformers.samplers.base_sampler import (
     BaseSampler,
 )
-from evalml.preprocessing.data_splitters.balanced_classification_sampler import (
-    BalancedClassificationSampler,
-)
+from evalml.utils.woodwork_utils import infer_feature_types
 
 
 class Undersampler(BaseSampler):
@@ -27,6 +26,11 @@ class Undersampler(BaseSampler):
             If min_percentage and min_samples are not met, treat this as severely imbalanced, and we will not resample the data.
             Must be between 0 and 0.5, inclusive. Defaults to 0.1.
         random_seed (int): The seed to use for random sampling. Defaults to 0.
+
+    Raises:
+        ValueError: If sampling_ratio is not in the range (0, 1].
+        ValueError: If min_samples is not greater than 0.
+        ValueError: If min_percentage is not in the range (0, 0.5].
     """
 
     name = "Undersampler"
@@ -42,12 +46,32 @@ def __init__(
         random_seed=0,
         **kwargs,
     ):
+        if sampling_ratio <= 0 or sampling_ratio > 1:
+            raise ValueError(
+                f"sampling_ratio must be within (0, 1], but received {sampling_ratio}"
+            )
+        if min_samples <= 0:
+            raise ValueError(
+                f"min_sample must be greater than 0, but received {min_samples}"
+            )
+        if min_percentage <= 0 or min_percentage > 0.5:
+            raise ValueError(
+                f"min_percentage must be between 0 and 0.5, inclusive, but received {min_percentage}"
+            )
+
         parameters = {
             "sampling_ratio": sampling_ratio,
             "min_samples": min_samples,
             "min_percentage": min_percentage,
             "sampling_ratio_dict": sampling_ratio_dict,
         }
+        self.sampling_ratio = sampling_ratio
+        self.min_samples = min_samples
+        self.min_percentage = min_percentage
+        self.random_seed = random_seed
+        self.random_state = np.random.RandomState(self.random_seed)
+        self.sampling_ratio_dict = sampling_ratio_dict or {}
+
         parameters.update(kwargs)
         super().__init__(
             parameters=parameters, component_obj=None, random_seed=random_seed
@@ -60,14 +84,7 @@ def _initialize_sampler(self, X, y):
             X (pd.DataFrame): Ignored.
             y (pd.Series): The target data.
         """
-        param_dic = self._dictionary_to_params(
-            self.parameters["sampling_ratio_dict"], y
-        )
-        param_dic.pop("n_jobs", None)
-        sampler = BalancedClassificationSampler(
-            **param_dic, random_seed=self.random_seed
-        )
-        self._component_obj = sampler
+        pass
 
     def transform(self, X, y=None):
         """Transforms the input data by sampling the data.
@@ -80,9 +97,92 @@ def transform(self, X, y=None):
             pd.DataFrame, pd.Series: Transformed features and target.
         """
         X_ww, y_ww = self._prepare_data(X, y)
-        self._initialize_sampler(X, y_ww)
         index_df = pd.Series(y_ww.index)
-        indices = self._component_obj.fit_resample(X_ww, y_ww)
+        indices = self.fit_resample(X_ww, y_ww)
 
         train_indices = index_df[index_df.isin(indices)].index.values.tolist()
         return X_ww.iloc[train_indices], y_ww.iloc[train_indices]
+
+    def _find_ideal_samples(self, y):
+        """Returns dictionary of examples to drop for each class if we need to resample.
+
+        Arguments:
+            y (pd.Series): Target data passed in.
+
+        Returns:
+            dict: Dictionary with undersample target class as key, and number of samples to remove as the value.
+                If we don't need to resample, returns empty dictionary.
+        """
+        counts = y.value_counts()
+        normalized_counts = y.value_counts(normalize=True)
+        minority_class_count = min(normalized_counts)
+        class_ratios = minority_class_count / normalized_counts
+        # if every class ratio is at least sampling_ratio, then the target is already balanced
+        if all(class_ratios >= self.sampling_ratio):
+            return {}
+        # if any class has fewer than min_samples samples and any class is less than min_percentage of the total data,
+        # then the target is severely imbalanced and we do not resample
+        if any(counts < self.min_samples) and any(
+            normalized_counts < self.min_percentage
+        ):
+            return {}
+        # otherwise, the target is imbalanced enough to undersample
+        undersample_classes = counts[class_ratios <= self.sampling_ratio].index.values
+        # find goal size, round it down if it's a float
+        minority_class = min(counts.values)
+        goal_value = max(
+            int((minority_class / self.sampling_ratio) // 1), self.min_samples
+        )
+        # never drop a negative number of rows
+        drop_values = {k: max(0, counts[k] - goal_value) for k in undersample_classes}
+        return {k: v for k, v in drop_values.items() if v > 0}
+
+    def _sampling_dict_to_remove_dict(self, y):
+        """Turns the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples.
+
+        Arguments:
+            y (pd.Series): Training data targets.
+
+        Returns:
+            dict: Dictionary with undersample target class as key, and number of samples to remove as the value.
+                If we don't need to resample, returns empty dictionary.
+        """
+        y_dict = y.value_counts().to_dict()
+        new_dic = {}
+        for k, v in self.sampling_ratio_dict.items():
+            new_dic[k] = max(y_dict[k] - v, 0)
+        return new_dic
+
+    def fit_resample(self, X, y):
+        """Resampling technique for this sampler.
+
+        Arguments:
+            X (pd.DataFrame): Training data to fit and resample.
+            y (pd.Series): Training data targets to fit and resample.
+
+        Returns:
+            list: Indices to keep for training data.
+        """
+        if self.parameters["sampling_ratio_dict"]:
+            self.sampling_ratio_dict = self._convert_dictionary(
+                self.parameters["sampling_ratio_dict"], y
+            )
+
+        y = infer_feature_types(y)
+
+        if len(self.sampling_ratio_dict):
+            result = self._sampling_dict_to_remove_dict(y)
+        else:
+            result = self._find_ideal_samples(y)
+        indices_to_drop = []
+        if len(result):
+            # for each class we need to undersample, randomly choose the required number of samples to drop
+            for key, value in result.items():
+                indices = y.index[y == key].values
+                indices_to_remove = self.random_state.choice(
+                    indices, value, replace=False
+                )
+                indices_to_drop.extend(indices_to_remove)
+        # keep every index of y that was not selected for removal
+        original_indices = list(set(y.index.values).difference(set(indices_to_drop)))
+        return original_indices
diff --git a/evalml/preprocessing/data_splitters/__init__.py b/evalml/preprocessing/data_splitters/__init__.py
@@ -1,5 +1,3 @@
 """Data splitter classes."""
 from .training_validation_split import TrainingValidationSplit
 from .time_series_split import TimeSeriesSplit
-from .balanced_classification_sampler import BalancedClassificationSampler
-from .sampler_base import SamplerBase
diff --git a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
diff --git a/evalml/preprocessing/data_splitters/sampler_base.py b/evalml/preprocessing/data_splitters/sampler_base.py
diff --git a/evalml/tests/component_tests/test_undersampler.py b/evalml/tests/component_tests/test_undersampler.py
@@ -17,6 +17,26 @@ def test_init():
     assert undersampler.parameters == parameters
 
 
+def test_undersampler_errors():
+    with pytest.raises(ValueError, match="sampling_ratio must be"):
+        Undersampler(sampling_ratio=1.01)
+
+    with pytest.raises(ValueError, match="sampling_ratio must be"):
+        Undersampler(sampling_ratio=-1)
+
+    with pytest.raises(ValueError, match="min_sample must be"):
+        Undersampler(min_samples=0)
+
+    with pytest.raises(ValueError, match="min_percentage must be"):
+        Undersampler(min_percentage=0)
+
+    with pytest.raises(ValueError, match="min_percentage must be"):
+        Undersampler(min_percentage=0.6)
+
+    with pytest.raises(ValueError, match="min_percentage must be"):
+        Undersampler(min_percentage=-1.3)
+
+
 def test_undersampler_raises_error_if_y_is_None():
     X = pd.DataFrame([[i] for i in range(5)])
     undersampler = Undersampler()
@@ -105,7 +125,6 @@ def test_undersampler_sampling_dict(sampling_ratio_dict, expected_dict_values):
     assert len(new_X) == sum(expected_dict_values.values())
     assert new_y.value_counts().to_dict() == expected_dict_values
     assert undersampler.random_seed == 12
-    assert undersampler._component_obj.random_seed == 12
 
 
 def test_undersampler_dictionary_overrides_ratio():