alteryx · freddyaboulton · Oct 22, 2020 · Oct 20, 2020 · Oct 20, 2020 · Oct 20, 2020
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -17,6 +17,7 @@ Release Notes
         * Fixed ``boosting type='rf'`` for LightGBM Classifier, as well as ``num_leaves`` error :pr:`1302`
         * Fixed bug in ``explain_predictions_best_worst`` where a custom index in the target variable would cause a ``ValueError`` :pr:`1318`
         * Added stacked ensemble estimators to to ``evalml.pipelines.__init__`` file :pr:`1326`
+        * Fixed bug in OHE where calls to transform were not deterministic if ``top_n`` was less than the number of categories in a column :pr:`1324`
     * Changes
         * Allow ``add_to_rankings`` to be called before AutoMLSearch is called :pr:`1250`
         * Removed ``max_pipelines`` parameter from ``AutoMLSearch`` :pr:`1264`

diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
@@ -69,6 +69,7 @@ def __init__(self,
         super().__init__(parameters=parameters,
                          component_obj=None,
                          random_state=random_state)
+        self._initial_state = self.random_state.get_state()
 
     @staticmethod
     def _get_cat_cols(X):
@@ -111,7 +112,9 @@ def fit(self, X, y=None):
                 if top_n is None or len(value_counts) <= top_n:
                     unique_values = value_counts.index.tolist()
                 else:
-                    value_counts = value_counts.sample(frac=1, random_state=self.random_state)
+                    new_random_state = np.random.RandomState()
+                    new_random_state.set_state(self._initial_state)
+                    value_counts = value_counts.sample(frac=1, random_state=new_random_state)
                     value_counts = value_counts.sort_values([col], ascending=False, kind='mergesort')
                     unique_values = value_counts.head(top_n).index.tolist()
                 unique_values = np.sort(unique_values)

diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -224,18 +224,17 @@ def test_more_top_n_unique_values():
                       "col_4": [2, 0, 1, 3, 0, 1, 2]})
 
     random_seed = 2
-    test_random_state = get_random_state(random_seed)
 
     encoder = OneHotEncoder(top_n=5, random_state=random_seed)
     encoder.fit(X)
     X_t = encoder.transform(X)
     col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
-    col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state)
+    col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
     col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort')
     col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist()
 
     col_2_counts = X["col_2"].value_counts(dropna=False).to_frame()
-    col_2_counts = col_2_counts.sample(frac=1, random_state=test_random_state)
+    col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed)
     col_2_counts = col_2_counts.sort_values(["col_2"], ascending=False, kind='mergesort')
     col_2_samples = col_2_counts.head(encoder.parameters['top_n']).index.tolist()
 
@@ -428,3 +427,17 @@ def test_ohe_features_to_encode_no_col_names():
     col_names = set(X_t.columns)
     assert (col_names == expected_col_names)
     assert ([X_t[col].dtype == "uint8" for col in X_t])
+
+
+def test_ohe_top_n_categories_always_the_same():
+    df = pd.DataFrame({"categories": ["cat_1"] * 5 + ["cat_2"] * 4 + ["cat_3"] * 3 + ["cat_4"] * 3 + ["cat_5"] * 3,
+                       "numbers": range(18)})
+
+    def check_df_equality(random_state):
+        ohe = OneHotEncoder(top_n=4, random_state=random_state)
+        df1 = ohe.fit_transform(df)
+        df2 = ohe.fit_transform(df)
+        pd.testing.assert_frame_equal(df1, df2)
+
+    check_df_equality(5)
+    check_df_equality(get_random_state(5))