Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,27 @@ repos:
rev: v0.761
hooks:
- id: mypy
args: [--show-error-codes]
name: mypy auto-sklearn-ensembles
files: autosklearn/ensembles
- id: mypy
args: [--show-error-codes]
name: mypy auto-sklearn-metrics
files: autosklearn/metrics
- id: mypy
args: [--show-error-codes]
name: mypy auto-sklearn-data
files: autosklearn/data
- id: mypy
args: [--show-error-codes]
name: mypy auto-sklearn-util
files: autosklearn/util
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.3
hooks:
- id: flake8
name: flake8 auto-sklearn
files: autosklearn/*
files: autosklearn/.*
- id: flake8
name: flake8 file-order-data
files: autosklearn/data
Expand All @@ -42,4 +46,4 @@ repos:
- flake8-import-order
- id: flake8
name: flake8 autosklearn-test
files: test/*
files: test/.*
80 changes: 46 additions & 34 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
import time
from typing import Any, Dict, Optional, List, Union
import uuid
import unittest.mock
import warnings
import tempfile
Expand All @@ -29,8 +30,8 @@
import joblib
import sklearn.utils
import scipy.sparse
from sklearn.metrics._classification import type_of_target
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics._classification import type_of_target
from sklearn.dummy import DummyClassifier, DummyRegressor

from autosklearn.metrics import Scorer
Expand All @@ -51,7 +52,6 @@
from autosklearn.ensemble_builder import EnsembleBuilderManager
from autosklearn.ensembles.singlebest_ensemble import SingleBest
from autosklearn.smbo import AutoMLSMBO
from autosklearn.util.hash import hash_array_or_matrix
from autosklearn.metrics import f1_macro, accuracy, r2
from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \
REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, \
Expand Down Expand Up @@ -217,7 +217,7 @@ def __init__(self,

self._debug_mode = debug_mode

self.InputValidator = InputValidator()
self.InputValidator = None # type: Optional[InputValidator]

# The ensemble performance history through time
self.ensemble_performance_history = []
Expand Down Expand Up @@ -436,29 +436,34 @@ def fit(
dataset_name: Optional[str] = None,
only_return_configuration_space: Optional[bool] = False,
load_models: bool = True,
is_classification: bool = False,
):
if dataset_name is None:
dataset_name = hash_array_or_matrix(X)
# The first thing we have to do is create the logger to update the backend
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))

# By default try to use the TCP logging port or get a new port
self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
self._logger = self._get_logger(dataset_name)

# The first thing we have to do is create the logger to update the backend
self._backend.setup_logger(self._logger_port)

self._backend.save_start_time(self._seed)
self._stopwatch = StopWatch()

# Employ the user feature types if provided
self.InputValidator.register_user_feat_type(feat_type, X)

# Make sure that input is valid
# Performs Ordinal one hot encoding to the target
# both for train and test data
X, y = self.InputValidator.validate(X, y)
self.InputValidator = InputValidator(
is_classification=is_classification,
feat_type=feat_type,
logger_port=self._logger_port,
)
self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test)
X, y = self.InputValidator.transform(X, y)

if X_test is not None:
X_test, y_test = self.InputValidator.validate(X_test, y_test)
if len(y.shape) != len(y_test.shape):
raise ValueError('Target value shapes do not match: %s vs %s'
% (y.shape, y_test.shape))
X_test, y_test = self.InputValidator.transform(X_test, y_test)

X, y = self.subsample_if_too_large(
X=X,
Expand Down Expand Up @@ -496,8 +501,8 @@ def fit(
self._dataset_name = dataset_name
self._stopwatch.start_task(self._dataset_name)

if feat_type is None and self.InputValidator.feature_types:
feat_type = self.InputValidator.feature_types
if feat_type is None and self.InputValidator.feature_validator.feat_type:
feat_type = self.InputValidator.feature_validator.feat_type

# Produce debug information to the logfile
self._logger.debug('Starting to print environment information')
Expand Down Expand Up @@ -847,7 +852,10 @@ def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
def refit(self, X, y):

# Make sure input data is valid
X, y = self.InputValidator.validate(X, y)
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("refit() is only supported after calling fit. Kindly call first "
"the estimator fit() method.")
X, y = self.InputValidator.transform(X, y)

if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()
Expand Down Expand Up @@ -929,7 +937,10 @@ def predict(self, X, batch_size=None, n_jobs=1):
"if 'ensemble_size != 0'")

# Make sure that input is valid
X = self.InputValidator.validate_features(X)
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() can only be called after performing fit(). Kindly call "
"the estimator fit() method first.")
X = self.InputValidator.feature_validator.transform(X)

# Parallelize predictions across models with n_jobs processes.
# Each process computes predictions in chunks of batch_size rows.
Expand Down Expand Up @@ -987,7 +998,10 @@ def fit_ensemble(self, y, task=None, precision=32,
self._logger = self._get_logger(dataset_name)

# Make sure that input is valid
y = self.InputValidator.validate_target(y, is_classification=True)
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("fit_ensemble() can only be called after fit. Please call the "
"estimator fit() method prior to fit_ensemble().")
y = self.InputValidator.target_validator.transform(y)

# Create a client if needed
if self._dask_client is None:
Expand Down Expand Up @@ -1110,15 +1124,18 @@ def score(self, X, y):
prediction = self.predict(X)

# Make sure that input is valid
X, y = self.InputValidator.validate(X, y)
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("score() is only supported after calling fit. Kindly call first "
"the estimator fit() method.")
y = self.InputValidator.target_validator.transform(y)

# Encode the prediction using the input validator
# We train autosklearn with a encoded version of y,
# which is decoded by predict().
# Above call to validate() encodes the y given for score()
# Below call encodes the prediction, so we compare in the
# same representation domain
prediction = self.InputValidator.encode_target(prediction)
prediction = self.InputValidator.target_validator.transform(prediction)

return calculate_score(solution=y,
prediction=prediction,
Expand Down Expand Up @@ -1366,15 +1383,7 @@ def fit(
only_return_configuration_space: bool = False,
load_models: bool = True,
):

# We first validate the dtype of the target provided by the user
# In doing so, we also fit the internal encoder for classification
# In case y_test is provided, we proactively check their type
# and make sure the enconding accounts for both y_test/y_train classes
input_y = self.InputValidator.join_and_check(y, y_test) if y_test is not None else y
y_task = type_of_target(
self.InputValidator.validate_target(input_y, is_classification=True)
)
y_task = type_of_target(y)
task = self._task_mapping.get(y_task)
if task is None:
raise ValueError('Cannot work on data of type %s' % y_task)
Expand All @@ -1394,18 +1403,22 @@ def fit(
dataset_name=dataset_name,
only_return_configuration_space=only_return_configuration_space,
load_models=load_models,
is_classification=True,
)

def predict(self, X, batch_size=None, n_jobs=1):
predicted_probabilities = super().predict(X, batch_size=batch_size,
n_jobs=n_jobs)

if self.InputValidator.is_single_column_target() == 1:
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling fit. Kindly call first "
"the estimator fit() method.")
if self.InputValidator.target_validator.is_single_column_target():
predicted_indexes = np.argmax(predicted_probabilities, axis=1)
else:
predicted_indexes = (predicted_probabilities > 0.5).astype(int)

return self.InputValidator.decode_target(predicted_indexes)
return self.InputValidator.target_validator.inverse_transform(predicted_indexes)

def predict_proba(self, X, batch_size=None, n_jobs=1):
return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)
Expand Down Expand Up @@ -1433,9 +1446,7 @@ def fit(
# Check the data provided in y
# After the y data type is validated,
# check the task type
y_task = type_of_target(
self.InputValidator.validate_target(y)
)
y_task = type_of_target(y)
task = self._task_mapping.get(y_task)
if task is None:
raise ValueError('Cannot work on data of type %s' % y_task)
Expand All @@ -1451,4 +1462,5 @@ def fit(
dataset_name=dataset_name,
only_return_configuration_space=only_return_configuration_space,
load_models=load_models,
is_classification=False,
)
Loading