automl · mfeurer · Jan 12, 2021 · Dec 19, 2020 · Dec 20, 2020 · Jan 8, 2021
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,23 +3,27 @@ repos:
     rev: v0.761
     hooks:
       - id: mypy
+        args: [--show-error-codes]
         name: mypy auto-sklearn-ensembles
         files: autosklearn/ensembles
       - id: mypy
+        args: [--show-error-codes]
         name: mypy auto-sklearn-metrics
         files: autosklearn/metrics
       - id: mypy
+        args: [--show-error-codes]
         name: mypy auto-sklearn-data
         files: autosklearn/data
       - id: mypy
+        args: [--show-error-codes]
         name: mypy auto-sklearn-util
         files: autosklearn/util
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.8.3
     hooks:
       - id: flake8
         name: flake8 auto-sklearn
-        files: autosklearn/*
+        files: autosklearn/.*
       - id: flake8
         name: flake8 file-order-data
         files: autosklearn/data
@@ -42,4 +46,4 @@ repos:
           - flake8-import-order
       - id: flake8
         name: flake8 autosklearn-test
-        files: test/*
+        files: test/.*
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -9,6 +9,7 @@
 import sys
 import time
 from typing import Any, Dict, Optional, List, Union
+import uuid
 import unittest.mock
 import warnings
 import tempfile
@@ -29,8 +30,8 @@
 import joblib
 import sklearn.utils
 import scipy.sparse
-from sklearn.metrics._classification import type_of_target
 from sklearn.utils.validation import check_is_fitted
+from sklearn.metrics._classification import type_of_target
 from sklearn.dummy import DummyClassifier, DummyRegressor
 
 from autosklearn.metrics import Scorer
@@ -51,7 +52,6 @@
 from autosklearn.ensemble_builder import EnsembleBuilderManager
 from autosklearn.ensembles.singlebest_ensemble import SingleBest
 from autosklearn.smbo import AutoMLSMBO
-from autosklearn.util.hash import hash_array_or_matrix
 from autosklearn.metrics import f1_macro, accuracy, r2
 from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \
     REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, \
@@ -217,7 +217,7 @@ def __init__(self,
 
         self._debug_mode = debug_mode
 
-        self.InputValidator = InputValidator()
+        self.InputValidator = None  # type: Optional[InputValidator]
 
         # The ensemble performance history through time
         self.ensemble_performance_history = []
@@ -436,29 +436,34 @@ def fit(
         dataset_name: Optional[str] = None,
         only_return_configuration_space: Optional[bool] = False,
         load_models: bool = True,
+        is_classification: bool = False,
     ):
         if dataset_name is None:
-            dataset_name = hash_array_or_matrix(X)
-        # The first thing we have to do is create the logger to update the backend
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # By default try to use the TCP logging port or get a new port
+        self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
         self._logger = self._get_logger(dataset_name)
+
+        # The first thing we have to do is create the logger to update the backend
         self._backend.setup_logger(self._logger_port)
 
         self._backend.save_start_time(self._seed)
         self._stopwatch = StopWatch()
 
-        # Employ the user feature types if provided
-        self.InputValidator.register_user_feat_type(feat_type, X)
-
         # Make sure that input is valid
         # Performs Ordinal one hot encoding to the target
         # both for train and test data
-        X, y = self.InputValidator.validate(X, y)
+        self.InputValidator = InputValidator(
+            is_classification=is_classification,
+            feat_type=feat_type,
+            logger_port=self._logger_port,
+        )
+        self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test)
+        X, y = self.InputValidator.transform(X, y)
 
         if X_test is not None:
-            X_test, y_test = self.InputValidator.validate(X_test, y_test)
-            if len(y.shape) != len(y_test.shape):
-                raise ValueError('Target value shapes do not match: %s vs %s'
-                                 % (y.shape, y_test.shape))
+            X_test, y_test = self.InputValidator.transform(X_test, y_test)
 
         X, y = self.subsample_if_too_large(
             X=X,
@@ -496,8 +501,8 @@ def fit(
         self._dataset_name = dataset_name
         self._stopwatch.start_task(self._dataset_name)
 
-        if feat_type is None and self.InputValidator.feature_types:
-            feat_type = self.InputValidator.feature_types
+        if feat_type is None and self.InputValidator.feature_validator.feat_type:
+            feat_type = self.InputValidator.feature_validator.feat_type
 
         # Produce debug information to the logfile
         self._logger.debug('Starting to print environment information')
@@ -847,7 +852,10 @@ def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
     def refit(self, X, y):
 
         # Make sure input data is valid
-        X, y = self.InputValidator.validate(X, y)
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
+            raise ValueError("refit() is only supported after calling fit. Kindly call first "
+                             "the estimator fit() method.")
+        X, y = self.InputValidator.transform(X, y)
 
         if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
             self._load_models()
@@ -929,7 +937,10 @@ def predict(self, X, batch_size=None, n_jobs=1):
                              "if 'ensemble_size != 0'")
 
         # Make sure that input is valid
-        X = self.InputValidator.validate_features(X)
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
+            raise ValueError("predict() can only be called after performing fit(). Kindly call "
+                             "the estimator fit() method first.")
+        X = self.InputValidator.feature_validator.transform(X)
 
         # Parallelize predictions across models with n_jobs processes.
         # Each process computes predictions in chunks of batch_size rows.
@@ -987,7 +998,10 @@ def fit_ensemble(self, y, task=None, precision=32,
             self._logger = self._get_logger(dataset_name)
 
         # Make sure that input is valid
-        y = self.InputValidator.validate_target(y, is_classification=True)
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
+            raise ValueError("fit_ensemble() can only be called after fit. Please call the "
+                             "estimator fit() method prior to fit_ensemble().")
+        y = self.InputValidator.target_validator.transform(y)
 
         # Create a client if needed
         if self._dask_client is None:
@@ -1110,15 +1124,18 @@ def score(self, X, y):
         prediction = self.predict(X)
 
         # Make sure that input is valid
-        X, y = self.InputValidator.validate(X, y)
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
+            raise ValueError("score() is only supported after calling fit. Kindly call first "
+                             "the estimator fit() method.")
+        y = self.InputValidator.target_validator.transform(y)
 
         # Encode the prediction using the input validator
         # We train autosklearn with a encoded version of y,
         # which is decoded by predict().
         # Above call to validate() encodes the y given for score()
         # Below call encodes the prediction, so we compare in the
         # same representation domain
-        prediction = self.InputValidator.encode_target(prediction)
+        prediction = self.InputValidator.target_validator.transform(prediction)
 
         return calculate_score(solution=y,
                                prediction=prediction,
@@ -1366,15 +1383,7 @@ def fit(
         only_return_configuration_space: bool = False,
         load_models: bool = True,
     ):
-
-        # We first validate the dtype of the target provided by the user
-        # In doing so, we also fit the internal encoder for classification
-        # In case y_test is provided, we proactively check their type
-        # and make sure the enconding accounts for both y_test/y_train classes
-        input_y = self.InputValidator.join_and_check(y, y_test) if y_test is not None else y
-        y_task = type_of_target(
-            self.InputValidator.validate_target(input_y, is_classification=True)
-        )
+        y_task = type_of_target(y)
         task = self._task_mapping.get(y_task)
         if task is None:
             raise ValueError('Cannot work on data of type %s' % y_task)
@@ -1394,18 +1403,22 @@ def fit(
             dataset_name=dataset_name,
             only_return_configuration_space=only_return_configuration_space,
             load_models=load_models,
+            is_classification=True,
         )
 
     def predict(self, X, batch_size=None, n_jobs=1):
         predicted_probabilities = super().predict(X, batch_size=batch_size,
                                                   n_jobs=n_jobs)
 
-        if self.InputValidator.is_single_column_target() == 1:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
+            raise ValueError("predict() is only supported after calling fit. Kindly call first "
+                             "the estimator fit() method.")
+        if self.InputValidator.target_validator.is_single_column_target():
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
         else:
             predicted_indexes = (predicted_probabilities > 0.5).astype(int)
 
-        return self.InputValidator.decode_target(predicted_indexes)
+        return self.InputValidator.target_validator.inverse_transform(predicted_indexes)
 
     def predict_proba(self, X, batch_size=None, n_jobs=1):
         return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)
@@ -1433,9 +1446,7 @@ def fit(
         # Check the data provided in y
         # After the y data type is validated,
         # check the task type
-        y_task = type_of_target(
-            self.InputValidator.validate_target(y)
-        )
+        y_task = type_of_target(y)
         task = self._task_mapping.get(y_task)
         if task is None:
             raise ValueError('Cannot work on data of type %s' % y_task)
@@ -1451,4 +1462,5 @@ def fit(
             dataset_name=dataset_name,
             only_return_configuration_space=only_return_configuration_space,
             load_models=load_models,
+            is_classification=False,
         )