Fix #970, sort feature types (#975)

When encoding a pandas array in autosklearn.data.validation, the columns are re-ordered by the ColumnTransformer. This PR re-orders the feature types in the same way, so that when the data is passed to the actual ML pipeline, the columns and the feature types are in the same order.
automl · Oct 8, 2020 · 08d099b · 08d099b
1 parent 49b3750
commit 08d099b
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 2 deletions.
diff --git a/autosklearn/data/validation.py b/autosklearn/data/validation.py
@@ -1,5 +1,6 @@
 # -*- encoding: utf-8 -*-
 
+import functools
 import warnings
 from typing import List, Optional, Tuple, Union
 
@@ -365,6 +366,25 @@ def _check_and_encode_features(
                 assert self.feature_encoder is not None
                 self.feature_encoder.fit(X)
 
+                # The column transformer reorders the columns - we therefore need to reorder
+                # the feature types as well
+                def comparator(cmp1, cmp2):
+                    if (
+                        cmp1 == 'categorical' and cmp2 == 'categorical'
+                        or cmp1 == 'numerical' and cmp2 == 'numerical'
+                    ):
+                        return 0
+                    elif cmp1 == 'categorical' and cmp2 == 'numerical':
+                        return -1
+                    elif cmp1 == 'numerical' and cmp2 == 'categorical':
+                        return 1
+                    else:
+                        raise ValueError((cmp1, cmp2))
+                self.feature_types = sorted(
+                    self.feature_types,
+                    key=functools.cmp_to_key(comparator)
+                )
+
         if self.feature_encoder:
             try:
                 X = self.feature_encoder.transform(X)

diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py
@@ -563,7 +563,7 @@ def test_cv_regression(self):
         self._setUp(tmp)
         self._setUp(output)
 
-        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston', train_size_maximum=300)
+        X_train, Y_train, X_test, Y_test = putil.get_dataset('diabetes', train_size_maximum=400)
         automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                       per_run_time_limit=10,
                                       resampling_strategy='cv',
@@ -572,7 +572,7 @@ def test_cv_regression(self):
 
         automl.fit(X_train, Y_train)
         predictions = automl.predict(X_test)
-        self.assertEqual(predictions.shape, (206,))
+        self.assertEqual(predictions.shape, (148,))
         score = r2(Y_test, predictions)
         print(Y_test)
         print(predictions)

diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py
@@ -538,6 +538,14 @@ def test_big_dataset_encoding(self):
         # No change to numerical columns
         np.testing.assert_array_equal(x['carbon'].to_numpy(), x_t[:, 3])
 
+        # Categorical columns are sorted to the beginning
+        self.assertEqual(
+            validator.feature_types,
+            (['categorical'] * 3) + (['numerical'] * 7)
+        )
+        self.assertEqual(x.iloc[0, 6], 610)
+        np.testing.assert_array_equal(x_t[0], [0, 0, 0, 8, 0, 0, 0.7, 610, 0, np.NaN])
+
         return
 
     def test_join_and_check(self):