Skip to content

Commit

Permalink
Fix #970, sort feature types (#975)
Browse files Browse the repository at this point in the history
When encoding a pandas array in autosklearn.data.validator,
the columns are re-ordered by the ColumnTransformer. This PR
re-orders the feature types so that when passing the data to
the actual ML pipeline, columns and feature types are sorted
the same way.
  • Loading branch information
mfeurer committed Oct 8, 2020
1 parent 49b3750 commit 08d099b
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 2 deletions.
20 changes: 20 additions & 0 deletions autosklearn/data/validation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- encoding: utf-8 -*-

import functools
import warnings
from typing import List, Optional, Tuple, Union

Expand Down Expand Up @@ -365,6 +366,25 @@ def _check_and_encode_features(
assert self.feature_encoder is not None
self.feature_encoder.fit(X)

# The column transformer reorders the columns - we therefore need to reorder
# the feature types in the same way
def comparator(cmp1, cmp2):
    """Old-style comparison that orders 'categorical' ahead of 'numerical'.

    Returns 0 when both arguments name the same feature type, -1 when
    ``cmp1`` is 'categorical' and ``cmp2`` is 'numerical', and 1 for the
    opposite pairing. Intended to be wrapped with ``functools.cmp_to_key``.

    Raises:
        ValueError: if the pair contains anything other than the two known
            feature types; the offending pair is the exception argument.
    """
    pair = (cmp1, cmp2)
    # Matched by equality (not hashing) so any unexpected value falls through
    # to the ValueError below.
    known_pairs = (
        (('categorical', 'categorical'), 0),
        (('numerical', 'numerical'), 0),
        (('categorical', 'numerical'), -1),
        (('numerical', 'categorical'), 1),
    )
    for expected, verdict in known_pairs:
        if pair == expected:
            return verdict
    raise ValueError(pair)
self.feature_types = sorted(
self.feature_types,
key=functools.cmp_to_key(comparator)
)

if self.feature_encoder:
try:
X = self.feature_encoder.transform(X)
Expand Down
4 changes: 2 additions & 2 deletions test/test_automl/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ def test_cv_regression(self):
self._setUp(tmp)
self._setUp(output)

X_train, Y_train, X_test, Y_test = putil.get_dataset('boston', train_size_maximum=300)
X_train, Y_train, X_test, Y_test = putil.get_dataset('diabetes', train_size_maximum=400)
automl = AutoSklearnRegressor(time_left_for_this_task=60,
per_run_time_limit=10,
resampling_strategy='cv',
Expand All @@ -572,7 +572,7 @@ def test_cv_regression(self):

automl.fit(X_train, Y_train)
predictions = automl.predict(X_test)
self.assertEqual(predictions.shape, (206,))
self.assertEqual(predictions.shape, (148,))
score = r2(Y_test, predictions)
print(Y_test)
print(predictions)
Expand Down
8 changes: 8 additions & 0 deletions test/test_data/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,14 @@ def test_big_dataset_encoding(self):
# No change to numerical columns
np.testing.assert_array_equal(x['carbon'].to_numpy(), x_t[:, 3])

# Categorical columns are sorted to the beginning
self.assertEqual(
validator.feature_types,
(['categorical'] * 3) + (['numerical'] * 7)
)
self.assertEqual(x.iloc[0, 6], 610)
np.testing.assert_array_equal(x_t[0], [0, 0, 0, 8, 0, 0, 0.7, 610, 0, np.NaN])

return

def test_join_and_check(self):
Expand Down

0 comments on commit 08d099b

Please sign in to comment.