diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 555a7274c3..40481bc60c 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -17,6 +17,7 @@ Release Notes * Fixed ``boosting type='rf'`` for LightGBM Classifier, as well as ``num_leaves`` error :pr:`1302` * Fixed bug in ``explain_predictions_best_worst`` where a custom index in the target variable would cause a ``ValueError`` :pr:`1318` * Added stacked ensemble estimators to to ``evalml.pipelines.__init__`` file :pr:`1326` + * Fixed bug in OHE where calls to transform were not deterministic if ``top_n`` was less than the number of categories in a column :pr:`1324` * Changes * Allow ``add_to_rankings`` to be called before AutoMLSearch is called :pr:`1250` * Removed ``max_pipelines`` parameter from ``AutoMLSearch`` :pr:`1264` diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 8c663cdfce..562953a909 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -69,6 +69,7 @@ def __init__(self, super().__init__(parameters=parameters, component_obj=None, random_state=random_state) + self._initial_state = self.random_state.get_state() @staticmethod def _get_cat_cols(X): @@ -111,7 +112,9 @@ def fit(self, X, y=None): if top_n is None or len(value_counts) <= top_n: unique_values = value_counts.index.tolist() else: - value_counts = value_counts.sample(frac=1, random_state=self.random_state) + new_random_state = np.random.RandomState() + new_random_state.set_state(self._initial_state) + value_counts = value_counts.sample(frac=1, random_state=new_random_state) value_counts = value_counts.sort_values([col], ascending=False, kind='mergesort') unique_values = value_counts.head(top_n).index.tolist() unique_values = np.sort(unique_values) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index b0fd471558..cf9549eccd 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -224,18 +224,17 @@ def test_more_top_n_unique_values(): "col_4": [2, 0, 1, 3, 0, 1, 2]}) random_seed = 2 - test_random_state = get_random_state(random_seed) encoder = OneHotEncoder(top_n=5, random_state=random_seed) encoder.fit(X) X_t = encoder.transform(X) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() - col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state) + col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist() col_2_counts = X["col_2"].value_counts(dropna=False).to_frame() - col_2_counts = col_2_counts.sample(frac=1, random_state=test_random_state) + col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed) col_2_counts = col_2_counts.sort_values(["col_2"], ascending=False, kind='mergesort') col_2_samples = col_2_counts.head(encoder.parameters['top_n']).index.tolist() @@ -428,3 +427,17 @@ def test_ohe_features_to_encode_no_col_names(): col_names = set(X_t.columns) assert (col_names == expected_col_names) assert ([X_t[col].dtype == "uint8" for col in X_t]) + + +def test_ohe_top_n_categories_always_the_same(): + df = pd.DataFrame({"categories": ["cat_1"] * 5 + ["cat_2"] * 4 + ["cat_3"] * 3 + ["cat_4"] * 3 + ["cat_5"] * 3, + "numbers": range(18)}) + + def check_df_equality(random_state): + ohe = OneHotEncoder(top_n=4, random_state=random_state) + df1 = ohe.fit_transform(df) + df2 = ohe.fit_transform(df) + pd.testing.assert_frame_equal(df1, df2) + + check_df_equality(5) + check_df_equality(get_random_state(5))