Skip to content

Commit

Permalink
Refactor and remove SamplerBase class (#2775)
Browse files Browse the repository at this point in the history
* init

* fix tests and add release notes

* refactor undersampler init

* remove and comment out tests

* clean up impl

* more cleanup

* add back initialize_sampler because it is an abstract method

* add init error tests

* clean up and remove test file

* linting

* remove pop

* clean up via comments
  • Loading branch information
angela97lin committed Sep 14, 2021
1 parent 2719871 commit a09d9f0
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 660 deletions.
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ Release Notes
* Fixes
* Fixed bug where warnings during ``make_pipeline`` were not being raised to the user :pr:`2765`
* Changes
* Added docstring linting packages ``pydocstyle`` and ``darglint`` to `make-lint` command :pr:`2670`
* Refactored and removed ``SamplerBase`` class :pr:`2775`
* Documentation Changes
* Added docstring linting packages ``pydocstyle`` and ``darglint`` to `make-lint` command :pr:`2670`
* Testing Changes

.. warning::
Expand Down
126 changes: 113 additions & 13 deletions evalml/pipelines/components/transformers/samplers/undersampler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""An undersampling transformer to downsample the majority classes in the dataset."""
import numpy as np
import pandas as pd

from evalml.pipelines.components.transformers.samplers.base_sampler import (
BaseSampler,
)
from evalml.preprocessing.data_splitters.balanced_classification_sampler import (
BalancedClassificationSampler,
)
from evalml.utils.woodwork_utils import infer_feature_types


class Undersampler(BaseSampler):
Expand All @@ -27,6 +26,11 @@ class Undersampler(BaseSampler):
If min_percentage and min_samples are not met, treat this as severely imbalanced, and we will not resample the data.
Must be greater than 0 and at most 0.5. Defaults to 0.1.
random_seed (int): The seed to use for random sampling. Defaults to 0.
Raises:
ValueError: If sampling_ratio is not in the range (0, 1].
ValueError: If min_samples is not greater than 0.
ValueError: If min_percentage is not in the range (0, 0.5].
"""

name = "Undersampler"
Expand All @@ -42,12 +46,32 @@ def __init__(
random_seed=0,
**kwargs,
):
if sampling_ratio <= 0 or sampling_ratio > 1:
raise ValueError(
f"sampling_ratio must be within (0, 1], but received {sampling_ratio}"
)
if min_samples <= 0:
raise ValueError(
f"min_sample must be greater than 0, but received {min_samples}"
)
if min_percentage <= 0 or min_percentage > 0.5:
raise ValueError(
f"min_percentage must be between 0 and 0.5, inclusive, but received {min_percentage}"
)

parameters = {
"sampling_ratio": sampling_ratio,
"min_samples": min_samples,
"min_percentage": min_percentage,
"sampling_ratio_dict": sampling_ratio_dict,
}
self.sampling_ratio = sampling_ratio
self.min_samples = min_samples
self.min_percentage = min_percentage
self.random_seed = random_seed
self.random_state = np.random.RandomState(self.random_seed)
self.sampling_ratio_dict = sampling_ratio_dict or {}

parameters.update(kwargs)
super().__init__(
parameters=parameters, component_obj=None, random_seed=random_seed
Expand All @@ -60,14 +84,7 @@ def _initialize_sampler(self, X, y):
X (pd.DataFrame): Ignored.
y (pd.Series): The target data.
"""
param_dic = self._dictionary_to_params(
self.parameters["sampling_ratio_dict"], y
)
param_dic.pop("n_jobs", None)
sampler = BalancedClassificationSampler(
**param_dic, random_seed=self.random_seed
)
self._component_obj = sampler
pass

def transform(self, X, y=None):
"""Transforms the input data by sampling the data.
Expand All @@ -80,9 +97,92 @@ def transform(self, X, y=None):
pd.DataFrame, pd.Series: Transformed features and target.
"""
X_ww, y_ww = self._prepare_data(X, y)
self._initialize_sampler(X, y_ww)
index_df = pd.Series(y_ww.index)
indices = self._component_obj.fit_resample(X_ww, y_ww)
indices = self.fit_resample(X_ww, y_ww)

train_indices = index_df[index_df.isin(indices)].index.values.tolist()
return X_ww.iloc[train_indices], y_ww.iloc[train_indices]

def _find_ideal_samples(self, y):
    """Work out how many rows to drop from each over-represented class.

    Arguments:
        y (pd.Series): Target data passed in.

    Returns:
        dict: Maps each class to undersample to the number of rows to remove from it.
            Empty when the target is already balanced, or when it is treated as
            severely imbalanced and therefore left untouched.
    """
    class_counts = y.value_counts()
    class_fractions = y.value_counts(normalize=True)
    ratios_to_minority = min(class_fractions) / class_fractions

    # Every class is within the accepted ratio of the smallest class: nothing to do.
    if all(ratios_to_minority >= self.sampling_ratio):
        return {}

    # Some class has fewer than min_samples rows and some class makes up less than
    # min_percentage of the data: treated as severely imbalanced, so no resampling.
    has_tiny_class = any(class_counts < self.min_samples)
    if has_tiny_class and any(class_fractions < self.min_percentage):
        return {}

    # Classes whose ratio to the minority class is at or below the threshold.
    oversized_classes = class_counts[
        ratios_to_minority <= self.sampling_ratio
    ].index.values

    # Size each oversized class is cut down to: floor(minority / ratio),
    # but never below min_samples.
    smallest_count = min(class_counts.values)
    target_size = max(
        int((smallest_count / self.sampling_ratio) // 1), self.min_samples
    )

    # Only classes that actually exceed the target size are reported.
    rows_to_drop = {}
    for label in oversized_classes:
        surplus = class_counts[label] - target_size
        if surplus > 0:
            rows_to_drop[label] = surplus
    return rows_to_drop

def _sampling_dict_to_remove_dict(self, y):
    """Convert the per-class sampling dict into per-class removal counts.

    The result has the same shape as the return value of _find_ideal_samples.

    Arguments:
        y (pd.Series): Training data targets.

    Returns:
        (dict): Maps each class in the sampling dict to the number of rows to remove,
            i.e. its observed count minus the dict's value, floored at 0.
    """
    observed_counts = y.value_counts().to_dict()
    return {
        label: max(observed_counts[label] - requested, 0)
        for label, requested in self.sampling_ratio_dict.items()
    }

def fit_resample(self, X, y):
    """Choose which row indices survive undersampling.

    Arguments:
        X (pd.DataFrame): Training data to fit and resample. Not read by this method.
        y (pd.Series): Training data targets to fit and resample.

    Returns:
        list: Indices to keep for training data.
    """
    # A user-supplied sampling dict is converted against y and then takes
    # precedence over the ratio-based computation below.
    user_sampling_dict = self.parameters["sampling_ratio_dict"]
    if user_sampling_dict:
        self.sampling_ratio_dict = self._convert_dictionary(user_sampling_dict, y)

    y = infer_feature_types(y)

    if len(self.sampling_ratio_dict):
        removal_counts = self._sampling_dict_to_remove_dict(y)
    else:
        removal_counts = self._find_ideal_samples(y)

    # For each class to shrink, draw the requested number of its row labels
    # without replacement; an empty removal dict simply skips the loop.
    dropped = []
    for label, n_remove in removal_counts.items():
        candidates = y.index[y == label].values
        dropped.extend(self.random_state.choice(candidates, n_remove, replace=False))

    # Whatever index labels of y were not drawn are the ones to keep.
    return list(set(y.index.values).difference(set(dropped)))
2 changes: 0 additions & 2 deletions evalml/preprocessing/data_splitters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
"""Data splitter classes."""
from .training_validation_split import TrainingValidationSplit
from .time_series_split import TimeSeriesSplit
from .balanced_classification_sampler import BalancedClassificationSampler
from .sampler_base import SamplerBase
129 changes: 0 additions & 129 deletions evalml/preprocessing/data_splitters/balanced_classification_sampler.py

This file was deleted.

25 changes: 0 additions & 25 deletions evalml/preprocessing/data_splitters/sampler_base.py

This file was deleted.

21 changes: 20 additions & 1 deletion evalml/tests/component_tests/test_undersampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,26 @@ def test_init():
assert undersampler.parameters == parameters


def test_undersampler_errors():
    """Each out-of-range constructor argument raises a ValueError naming the parameter."""
    invalid_cases = [
        ({"sampling_ratio": 1.01}, "sampling_ratio must be"),
        ({"sampling_ratio": -1}, "sampling_ratio must be"),
        ({"min_samples": 0}, "min_sample must be"),
        ({"min_percentage": 0}, "min_percentage must be"),
        ({"min_percentage": 0.6}, "min_percentage must be"),
        ({"min_percentage": -1.3}, "min_percentage must be"),
    ]
    # Cases run in the listed order; the first one that fails to raise aborts the test.
    for bad_kwargs, expected_message in invalid_cases:
        with pytest.raises(ValueError, match=expected_message):
            Undersampler(**bad_kwargs)


def test_undersampler_raises_error_if_y_is_None():
X = pd.DataFrame([[i] for i in range(5)])
undersampler = Undersampler()
Expand Down Expand Up @@ -105,7 +125,6 @@ def test_undersampler_sampling_dict(sampling_ratio_dict, expected_dict_values):
assert len(new_X) == sum(expected_dict_values.values())
assert new_y.value_counts().to_dict() == expected_dict_values
assert undersampler.random_seed == 12
assert undersampler._component_obj.random_seed == 12


def test_undersampler_dictionary_overrides_ratio():
Expand Down

0 comments on commit a09d9f0

Please sign in to comment.